diff --git a/lib/libkvm/kvm_proc.c b/lib/libkvm/kvm_proc.c
index 63f7c2a8a824..eed2f3de6075 100644
--- a/lib/libkvm/kvm_proc.c
+++ b/lib/libkvm/kvm_proc.c
@@ -1,788 +1,788 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software developed by the Computer Systems
 * Engineering group at Lawrence Berkeley Laboratory under DARPA contract
 * BG 91-66 and contributed to Berkeley.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");
__SCCSID("@(#)kvm_proc.c 8.3 (Berkeley) 9/23/93");

/*
 * Proc traversal interface for kvm.  ps and w are (probably) the exclusive
 * users of this code, so we've factored it out into a separate module.
 * Thus, we keep this grunge out of the other kvm applications (i.e.,
 * most other applications are interested only in open/close/read/nlist).
 */

#include
#define _WANT_UCRED	/* make ucred.h give us 'struct ucred' */
#include
#include
#include
#include
#include
#include
#include
#include
#define _WANT_PRISON	/* make jail.h give us 'struct prison' */
#include
#include
#include
#include
#include
#include
#include
#include
#define _WANT_KW_EXITCODE
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "kvm_private.h"

#define KREAD(kd, addr, obj) \
	(kvm_read(kd, addr, (char *)(obj), sizeof(*obj)) != sizeof(*obj))

static int ticks;
static int hz;
static uint64_t cpu_tick_frequency;

/*
 * From sys/kern/kern_tc.c.  Depends on cpu_tick_frequency, which is
 * read/initialized before this function is ever called.
 */
static uint64_t
cputick2usec(uint64_t tick)
{
	if (cpu_tick_frequency == 0)
		return (0);
	if (tick > 18446744073709551)		/* floor(2^64 / 1000) */
		return (tick / (cpu_tick_frequency / 1000000));
	else if (tick > 18446744073709)		/* floor(2^64 / 1000000) */
		return ((tick * 1000) / (cpu_tick_frequency / 1000));
	else
		return ((tick * 1000000) / cpu_tick_frequency);
}

/*
 * Read proc's from memory file into buffer bp, which has space to hold
 * at most maxcnt procs.
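 *
 * A distilled view of the loop below (illustrative sketch only): each
 * struct proc is copied out of the image with the KREAD() macro defined
 * above, and the walk then follows the kernel's p_list linkage:
 *
 *	if (KREAD(kd, (u_long)p, &proc))	/* copy in one struct proc */
 *		return (-1);
 *	p = LIST_NEXT(&proc, p_list);		/* advance to the next one */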
*/ static int kvm_proclist(kvm_t *kd, int what, int arg, struct proc *p, struct kinfo_proc *bp, int maxcnt) { int cnt = 0; struct kinfo_proc kinfo_proc, *kp; struct pgrp pgrp; struct session sess; struct cdev t_cdev; struct tty tty; struct vmspace vmspace; struct sigacts sigacts; #if 0 struct pstats pstats; #endif struct ucred ucred; struct prison pr; struct thread mtd; struct proc proc; struct proc pproc; struct sysentvec sysent; char svname[KI_EMULNAMELEN]; struct thread *td = NULL; bool first_thread; kp = &kinfo_proc; kp->ki_structsize = sizeof(kinfo_proc); /* * Loop on the processes, then threads within the process if requested. */ if (what == KERN_PROC_ALL) what |= KERN_PROC_INC_THREAD; for (; cnt < maxcnt && p != NULL; p = LIST_NEXT(&proc, p_list)) { memset(kp, 0, sizeof *kp); if (KREAD(kd, (u_long)p, &proc)) { _kvm_err(kd, kd->program, "can't read proc at %p", p); return (-1); } if (proc.p_state == PRS_NEW) continue; if (KREAD(kd, (u_long)proc.p_ucred, &ucred) == 0) { kp->ki_ruid = ucred.cr_ruid; kp->ki_svuid = ucred.cr_svuid; kp->ki_rgid = ucred.cr_rgid; kp->ki_svgid = ucred.cr_svgid; kp->ki_cr_flags = ucred.cr_flags; if (ucred.cr_ngroups > KI_NGROUPS) { kp->ki_ngroups = KI_NGROUPS; kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW; } else kp->ki_ngroups = ucred.cr_ngroups; kvm_read(kd, (u_long)ucred.cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_uid = ucred.cr_uid; if (ucred.cr_prison != NULL) { if (KREAD(kd, (u_long)ucred.cr_prison, &pr)) { _kvm_err(kd, kd->program, "can't read prison at %p", ucred.cr_prison); return (-1); } kp->ki_jid = pr.pr_id; } } switch(what & ~KERN_PROC_INC_THREAD) { case KERN_PROC_GID: if (kp->ki_groups[0] != (gid_t)arg) continue; break; case KERN_PROC_PID: if (proc.p_pid != (pid_t)arg) continue; break; case KERN_PROC_RGID: if (kp->ki_rgid != (gid_t)arg) continue; break; case KERN_PROC_UID: if (kp->ki_uid != (uid_t)arg) continue; break; case KERN_PROC_RUID: if (kp->ki_ruid != (uid_t)arg) continue; break; } /* * We're going to add another proc to the set. If this * will overflow the buffer, assume the reason is because * nprocs (or the proc list) is corrupt and declare an error. */ if (cnt >= maxcnt) { _kvm_err(kd, kd->program, "nprocs corrupt"); return (-1); } /* * gather kinfo_proc */ kp->ki_paddr = p; kp->ki_addr = 0; /* XXX uarea */ /* kp->ki_kstack = proc.p_thread.td_kstack; XXXKSE */ kp->ki_args = proc.p_args; kp->ki_numthreads = proc.p_numthreads; kp->ki_tracep = proc.p_tracevp; kp->ki_textvp = proc.p_textvp; kp->ki_fd = proc.p_fd; kp->ki_pd = proc.p_pd; kp->ki_vmspace = proc.p_vmspace; if (proc.p_sigacts != NULL) { if (KREAD(kd, (u_long)proc.p_sigacts, &sigacts)) { _kvm_err(kd, kd->program, "can't read sigacts at %p", proc.p_sigacts); return (-1); } kp->ki_sigignore = sigacts.ps_sigignore; kp->ki_sigcatch = sigacts.ps_sigcatch; } #if 0 if ((proc.p_flag & P_INMEM) && proc.p_stats != NULL) { if (KREAD(kd, (u_long)proc.p_stats, &pstats)) { _kvm_err(kd, kd->program, "can't read stats at %x", proc.p_stats); return (-1); } kp->ki_start = pstats.p_start; /* * XXX: The times here are probably zero and need * to be calculated from the raw data in p_rux and * p_crux. 
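	 *
	 * Illustrative note: the accumulated run time is instead taken
	 * further below from the raw cpu tick counter,
	 *
	 *	kp->ki_runtime = cputick2usec(proc.p_rux.rux_runtime);
	 *
	 * so with cpu_tick_frequency at 2 GHz, 2 * 10^9 raw ticks come out
	 * as 1,000,000 microseconds (one second) of run time.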
*/ kp->ki_rusage = pstats.p_ru; kp->ki_childstime = pstats.p_cru.ru_stime; kp->ki_childutime = pstats.p_cru.ru_utime; /* Some callers want child-times in a single value */ timeradd(&kp->ki_childstime, &kp->ki_childutime, &kp->ki_childtime); } #endif if (proc.p_oppid) kp->ki_ppid = proc.p_oppid; else if (proc.p_pptr) { if (KREAD(kd, (u_long)proc.p_pptr, &pproc)) { _kvm_err(kd, kd->program, "can't read pproc at %p", proc.p_pptr); return (-1); } kp->ki_ppid = pproc.p_pid; } else kp->ki_ppid = 0; if (proc.p_pgrp == NULL) goto nopgrp; if (KREAD(kd, (u_long)proc.p_pgrp, &pgrp)) { _kvm_err(kd, kd->program, "can't read pgrp at %p", proc.p_pgrp); return (-1); } kp->ki_pgid = pgrp.pg_id; kp->ki_jobc = -1; /* Or calculate? Arguably not. */ if (KREAD(kd, (u_long)pgrp.pg_session, &sess)) { _kvm_err(kd, kd->program, "can't read session at %p", pgrp.pg_session); return (-1); } kp->ki_sid = sess.s_sid; (void)memcpy(kp->ki_login, sess.s_login, sizeof(kp->ki_login)); if ((proc.p_flag & P_CONTROLT) && sess.s_ttyp != NULL) { if (KREAD(kd, (u_long)sess.s_ttyp, &tty)) { _kvm_err(kd, kd->program, "can't read tty at %p", sess.s_ttyp); return (-1); } if (tty.t_dev != NULL) { if (KREAD(kd, (u_long)tty.t_dev, &t_cdev)) { _kvm_err(kd, kd->program, "can't read cdev at %p", tty.t_dev); return (-1); } #if 0 kp->ki_tdev = t_cdev.si_udev; #else kp->ki_tdev = NODEV; #endif } if (tty.t_pgrp != NULL) { if (KREAD(kd, (u_long)tty.t_pgrp, &pgrp)) { _kvm_err(kd, kd->program, "can't read tpgrp at %p", tty.t_pgrp); return (-1); } kp->ki_tpgid = pgrp.pg_id; } else kp->ki_tpgid = -1; if (tty.t_session != NULL) { if (KREAD(kd, (u_long)tty.t_session, &sess)) { _kvm_err(kd, kd->program, "can't read session at %p", tty.t_session); return (-1); } kp->ki_tsid = sess.s_sid; } } else { nopgrp: kp->ki_tdev = NODEV; } (void)kvm_read(kd, (u_long)proc.p_vmspace, (char *)&vmspace, sizeof(vmspace)); kp->ki_size = vmspace.vm_map.size; /* * Approximate the kernel's method of calculating * this field. */ #define pmap_resident_count(pm) ((pm)->pm_stats.resident_count) kp->ki_rssize = pmap_resident_count(&vmspace.vm_pmap); kp->ki_swrss = vmspace.vm_swrss; kp->ki_tsize = vmspace.vm_tsize; kp->ki_dsize = vmspace.vm_dsize; kp->ki_ssize = vmspace.vm_ssize; switch (what & ~KERN_PROC_INC_THREAD) { case KERN_PROC_PGRP: if (kp->ki_pgid != (pid_t)arg) continue; break; case KERN_PROC_SESSION: if (kp->ki_sid != (pid_t)arg) continue; break; case KERN_PROC_TTY: if ((proc.p_flag & P_CONTROLT) == 0 || kp->ki_tdev != (dev_t)arg) continue; break; } if (proc.p_comm[0] != 0) strlcpy(kp->ki_comm, proc.p_comm, MAXCOMLEN); (void)kvm_read(kd, (u_long)proc.p_sysent, (char *)&sysent, sizeof(sysent)); (void)kvm_read(kd, (u_long)sysent.sv_name, (char *)&svname, sizeof(svname)); if (svname[0] != 0) strlcpy(kp->ki_emul, svname, KI_EMULNAMELEN); kp->ki_runtime = cputick2usec(proc.p_rux.rux_runtime); kp->ki_pid = proc.p_pid; kp->ki_xstat = KW_EXITCODE(proc.p_xexit, proc.p_xsig); kp->ki_acflag = proc.p_acflag; kp->ki_lock = proc.p_lock; kp->ki_tdev_freebsd11 = kp->ki_tdev; /* truncate */ /* Per-thread items; iterate as appropriate. 
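		 *
		 * In the loop below the run state of each thread copy (mtd)
		 * is examined through the TD_*() accessors, including
		 * TD_GET_STATE(), rather than by reading td_state directly;
		 * condensed (illustrative), the classification is:
		 *
		 *	if (TD_ON_RUNQ(&mtd) || TD_CAN_RUN(&mtd) ||
		 *	    TD_IS_RUNNING(&mtd))
		 *		kp->ki_stat = SRUN;
		 *	else if (TD_GET_STATE(&mtd) == TDS_INHIBITED)
		 *		kp->ki_stat = TD_IS_SLEEPING(&mtd) ?
		 *		    SSLEEP : SWAIT;	/* or SSTOP/SLOCK */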
*/ td = TAILQ_FIRST(&proc.p_threads); for (first_thread = true; cnt < maxcnt && td != NULL && (first_thread || (what & KERN_PROC_INC_THREAD)); first_thread = false) { if (proc.p_state != PRS_ZOMBIE) { if (KREAD(kd, (u_long)td, &mtd)) { _kvm_err(kd, kd->program, "can't read thread at %p", td); return (-1); } if (what & KERN_PROC_INC_THREAD) td = TAILQ_NEXT(&mtd, td_plist); } else td = NULL; if ((proc.p_state != PRS_ZOMBIE) && mtd.td_wmesg) (void)kvm_read(kd, (u_long)mtd.td_wmesg, kp->ki_wmesg, WMESGLEN); else memset(kp->ki_wmesg, 0, WMESGLEN); if (proc.p_pgrp == NULL) { kp->ki_kiflag = 0; } else { kp->ki_kiflag = sess.s_ttyvp ? KI_CTTY : 0; if (sess.s_leader == p) kp->ki_kiflag |= KI_SLEADER; } if ((proc.p_state != PRS_ZOMBIE) && (mtd.td_blocked != 0)) { kp->ki_kiflag |= KI_LOCKBLOCK; if (mtd.td_lockname) (void)kvm_read(kd, (u_long)mtd.td_lockname, kp->ki_lockname, LOCKNAMELEN); else memset(kp->ki_lockname, 0, LOCKNAMELEN); kp->ki_lockname[LOCKNAMELEN] = 0; } else kp->ki_kiflag &= ~KI_LOCKBLOCK; kp->ki_siglist = proc.p_siglist; if (proc.p_state != PRS_ZOMBIE) { SIGSETOR(kp->ki_siglist, mtd.td_siglist); kp->ki_sigmask = mtd.td_sigmask; kp->ki_swtime = (ticks - proc.p_swtick) / hz; kp->ki_flag = proc.p_flag; kp->ki_sflag = 0; kp->ki_nice = proc.p_nice; kp->ki_traceflag = proc.p_traceflag; if (proc.p_state == PRS_NORMAL) { if (TD_ON_RUNQ(&mtd) || TD_CAN_RUN(&mtd) || TD_IS_RUNNING(&mtd)) { kp->ki_stat = SRUN; - } else if (mtd.td_state == + } else if (TD_GET_STATE(&mtd) == TDS_INHIBITED) { if (P_SHOULDSTOP(&proc)) { kp->ki_stat = SSTOP; } else if ( TD_IS_SLEEPING(&mtd)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(&mtd)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } } else { kp->ki_stat = SIDL; } /* Stuff from the thread */ kp->ki_pri.pri_level = mtd.td_priority; kp->ki_pri.pri_native = mtd.td_base_pri; kp->ki_lastcpu = mtd.td_lastcpu; kp->ki_wchan = mtd.td_wchan; kp->ki_oncpu = mtd.td_oncpu; if (mtd.td_name[0] != '\0') strlcpy(kp->ki_tdname, mtd.td_name, sizeof(kp->ki_tdname)); else memset(kp->ki_tdname, 0, sizeof(kp->ki_tdname)); kp->ki_pctcpu = 0; kp->ki_rqindex = 0; /* * Note: legacy fields; wraps at NO_CPU_OLD * or the old max CPU value as appropriate */ if (mtd.td_lastcpu == NOCPU) kp->ki_lastcpu_old = NOCPU_OLD; else if (mtd.td_lastcpu > MAXCPU_OLD) kp->ki_lastcpu_old = MAXCPU_OLD; else kp->ki_lastcpu_old = mtd.td_lastcpu; if (mtd.td_oncpu == NOCPU) kp->ki_oncpu_old = NOCPU_OLD; else if (mtd.td_oncpu > MAXCPU_OLD) kp->ki_oncpu_old = MAXCPU_OLD; else kp->ki_oncpu_old = mtd.td_oncpu; kp->ki_tid = mtd.td_tid; } else { memset(&kp->ki_sigmask, 0, sizeof(kp->ki_sigmask)); kp->ki_stat = SZOMB; kp->ki_tid = 0; } bcopy(&kinfo_proc, bp, sizeof(kinfo_proc)); ++bp; ++cnt; } } return (cnt); } /* * Build proc info array by reading in proc list from a crash dump. * Return number of procs read. maxcnt is the max we will read. 
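 *
 * Illustrative only -- a typical consumer drives this dead-kernel path
 * through the public kvm(3) interface; the kernel and core file paths here
 * are placeholders:
 *
 *	char errbuf[_POSIX2_LINE_MAX];
 *	kvm_t *kd = kvm_openfiles("/boot/kernel/kernel",
 *	    "/var/crash/vmcore.0", NULL, O_RDONLY, errbuf);
 *	int cnt;
 *	struct kinfo_proc *kp = kvm_getprocs(kd, KERN_PROC_PROC, 0, &cnt);
 *	for (int i = 0; i < cnt; i++)
 *		printf("%5d %s\n", kp[i].ki_pid, kp[i].ki_comm);
 *	kvm_close(kd);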
*/ static int kvm_deadprocs(kvm_t *kd, int what, int arg, u_long a_allproc, u_long a_zombproc, int maxcnt) { struct kinfo_proc *bp = kd->procbase; int acnt, zcnt = 0; struct proc *p; if (KREAD(kd, a_allproc, &p)) { _kvm_err(kd, kd->program, "cannot read allproc"); return (-1); } acnt = kvm_proclist(kd, what, arg, p, bp, maxcnt); if (acnt < 0) return (acnt); if (a_zombproc != 0) { if (KREAD(kd, a_zombproc, &p)) { _kvm_err(kd, kd->program, "cannot read zombproc"); return (-1); } zcnt = kvm_proclist(kd, what, arg, p, bp + acnt, maxcnt - acnt); if (zcnt < 0) zcnt = 0; } return (acnt + zcnt); } struct kinfo_proc * kvm_getprocs(kvm_t *kd, int op, int arg, int *cnt) { int mib[4], st, nprocs; size_t size, osize; int temp_op; if (kd->procbase != 0) { free((void *)kd->procbase); /* * Clear this pointer in case this call fails. Otherwise, * kvm_close() will free it again. */ kd->procbase = 0; } if (ISALIVE(kd)) { size = 0; mib[0] = CTL_KERN; mib[1] = KERN_PROC; mib[2] = op; mib[3] = arg; temp_op = op & ~KERN_PROC_INC_THREAD; st = sysctl(mib, temp_op == KERN_PROC_ALL || temp_op == KERN_PROC_PROC ? 3 : 4, NULL, &size, NULL, 0); if (st == -1) { _kvm_syserr(kd, kd->program, "kvm_getprocs"); return (0); } /* * We can't continue with a size of 0 because we pass * it to realloc() (via _kvm_realloc()), and passing 0 * to realloc() results in undefined behavior. */ if (size == 0) { /* * XXX: We should probably return an invalid, * but non-NULL, pointer here so any client * program trying to dereference it will * crash. However, _kvm_freeprocs() calls * free() on kd->procbase if it isn't NULL, * and free()'ing a junk pointer isn't good. * Then again, _kvm_freeprocs() isn't used * anywhere . . . */ kd->procbase = _kvm_malloc(kd, 1); goto liveout; } do { size += size / 10; kd->procbase = (struct kinfo_proc *) _kvm_realloc(kd, kd->procbase, size); if (kd->procbase == NULL) return (0); osize = size; st = sysctl(mib, temp_op == KERN_PROC_ALL || temp_op == KERN_PROC_PROC ? 3 : 4, kd->procbase, &size, NULL, 0); } while (st == -1 && errno == ENOMEM && size == osize); if (st == -1) { _kvm_syserr(kd, kd->program, "kvm_getprocs"); return (0); } /* * We have to check the size again because sysctl() * may "round up" oldlenp if oldp is NULL; hence it * might've told us that there was data to get when * there really isn't any. */ if (size > 0 && kd->procbase->ki_structsize != sizeof(struct kinfo_proc)) { _kvm_err(kd, kd->program, "kinfo_proc size mismatch (expected %zu, got %d)", sizeof(struct kinfo_proc), kd->procbase->ki_structsize); return (0); } liveout: nprocs = size == 0 ? 0 : size / kd->procbase->ki_structsize; } else { struct nlist nl[6], *p; struct nlist nlz[2]; nl[0].n_name = "_nprocs"; nl[1].n_name = "_allproc"; nl[2].n_name = "_ticks"; nl[3].n_name = "_hz"; nl[4].n_name = "_cpu_tick_frequency"; nl[5].n_name = 0; nlz[0].n_name = "_zombproc"; nlz[1].n_name = 0; if (!kd->arch->ka_native(kd)) { _kvm_err(kd, kd->program, "cannot read procs from non-native core"); return (0); } if (kvm_nlist(kd, nl) != 0) { for (p = nl; p->n_type != 0; ++p) ; _kvm_err(kd, kd->program, "%s: no such symbol", p->n_name); return (0); } (void) kvm_nlist(kd, nlz); /* attempt to get zombproc */ if (KREAD(kd, nl[0].n_value, &nprocs)) { _kvm_err(kd, kd->program, "can't read nprocs"); return (0); } /* * If returning all threads, we don't know how many that * might be. Presume that there are, on average, no more * than 10 threads per process. 
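		 *
		 * For example, an nprocs of 500 read from the image leaves
		 * room for 5000 kinfo_proc records (500 * 10); if the walk
		 * still overflows that, kvm_proclist() treats the process
		 * list as corrupt.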
*/ if (op == KERN_PROC_ALL || (op & KERN_PROC_INC_THREAD)) nprocs *= 10; /* XXX */ if (KREAD(kd, nl[2].n_value, &ticks)) { _kvm_err(kd, kd->program, "can't read ticks"); return (0); } if (KREAD(kd, nl[3].n_value, &hz)) { _kvm_err(kd, kd->program, "can't read hz"); return (0); } if (KREAD(kd, nl[4].n_value, &cpu_tick_frequency)) { _kvm_err(kd, kd->program, "can't read cpu_tick_frequency"); return (0); } size = nprocs * sizeof(struct kinfo_proc); kd->procbase = (struct kinfo_proc *)_kvm_malloc(kd, size); if (kd->procbase == NULL) return (0); nprocs = kvm_deadprocs(kd, op, arg, nl[1].n_value, nlz[0].n_value, nprocs); if (nprocs <= 0) { _kvm_freeprocs(kd); nprocs = 0; } #ifdef notdef else { size = nprocs * sizeof(struct kinfo_proc); kd->procbase = realloc(kd->procbase, size); } #endif } *cnt = nprocs; return (kd->procbase); } void _kvm_freeprocs(kvm_t *kd) { free(kd->procbase); kd->procbase = NULL; } void * _kvm_realloc(kvm_t *kd, void *p, size_t n) { void *np; np = reallocf(p, n); if (np == NULL) _kvm_err(kd, kd->program, "out of memory"); return (np); } /* * Get the command args or environment. */ static char ** kvm_argv(kvm_t *kd, const struct kinfo_proc *kp, int env, int nchr) { int oid[4]; int i; size_t bufsz; static int buflen; static char *buf, *p; static char **bufp; static int argc; char **nbufp; if (!ISALIVE(kd)) { _kvm_err(kd, kd->program, "cannot read user space from dead kernel"); return (NULL); } if (nchr == 0 || nchr > ARG_MAX) nchr = ARG_MAX; if (buflen == 0) { buf = malloc(nchr); if (buf == NULL) { _kvm_err(kd, kd->program, "cannot allocate memory"); return (NULL); } argc = 32; bufp = malloc(sizeof(char *) * argc); if (bufp == NULL) { free(buf); buf = NULL; _kvm_err(kd, kd->program, "cannot allocate memory"); return (NULL); } buflen = nchr; } else if (nchr > buflen) { p = realloc(buf, nchr); if (p != NULL) { buf = p; buflen = nchr; } } oid[0] = CTL_KERN; oid[1] = KERN_PROC; oid[2] = env ? KERN_PROC_ENV : KERN_PROC_ARGS; oid[3] = kp->ki_pid; bufsz = buflen; if (sysctl(oid, 4, buf, &bufsz, 0, 0) == -1) { /* * If the supplied buf is too short to hold the requested * value the sysctl returns with ENOMEM. The buf is filled * with the truncated value and the returned bufsz is equal * to the requested len. */ if (errno != ENOMEM || bufsz != (size_t)buflen) return (NULL); buf[bufsz - 1] = '\0'; errno = 0; } else if (bufsz == 0) return (NULL); i = 0; p = buf; do { bufp[i++] = p; p += strlen(p) + 1; if (i >= argc) { argc += argc; nbufp = realloc(bufp, sizeof(char *) * argc); if (nbufp == NULL) return (NULL); bufp = nbufp; } } while (p < buf + bufsz); bufp[i++] = 0; return (bufp); } char ** kvm_getargv(kvm_t *kd, const struct kinfo_proc *kp, int nchr) { return (kvm_argv(kd, kp, 0, nchr)); } char ** kvm_getenvv(kvm_t *kd, const struct kinfo_proc *kp, int nchr) { return (kvm_argv(kd, kp, 1, nchr)); } diff --git a/sys/compat/linprocfs/linprocfs.c b/sys/compat/linprocfs/linprocfs.c index 79ffc4dfd5aa..fc2c29240893 100644 --- a/sys/compat/linprocfs/linprocfs.c +++ b/sys/compat/linprocfs/linprocfs.c @@ -1,2106 +1,2106 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2000 Dag-Erling Coïdan Smørgrav * Copyright (c) 1999 Pierre Beyssac * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.4 (Berkeley) 6/15/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif /* __i386__ || __amd64__ */ #include #include #include #include #include #include /* * Various conversion macros */ #define T2J(x) ((long)(((x) * 100ULL) / (stathz ? stathz : hz))) /* ticks to jiffies */ #define T2CS(x) ((unsigned long)(((x) * 100ULL) / (stathz ? stathz : hz))) /* ticks to centiseconds */ #define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */ #define B2K(x) ((x) >> 10) /* bytes to kbytes */ #define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */ #define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */ #define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */ #define TV2J(x) ((x)->tv_sec * 100UL + (x)->tv_usec / 10000) /** * @brief Mapping of ki_stat in struct kinfo_proc to the linux state * * The linux procfs state field displays one of the characters RSDZTW to * denote running, sleeping in an interruptible wait, waiting in an * uninterruptible disk sleep, a zombie process, process is being traced * or stopped, or process is paging respectively. * * Our struct kinfo_proc contains the variable ki_stat which contains a * value out of SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT and SLOCK. * * This character array is used with ki_stati-1 as an index and tries to * map our states to suitable linux states. 
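 *
 * With the usual SIDL..SLOCK ordering of those state values, indexing the
 * array with ki_stat - 1 works out to (illustrative):
 *
 *	SIDL  -> 'R'	SRUN -> 'R'	SSLEEP -> 'S'	SSTOP -> 'T'
 *	SZOMB -> 'Z'	SWAIT -> 'D'	SLOCK -> 'D'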
*/ static char linux_state[] = "RRSTZDD"; /* * Filler function for proc/meminfo */ static int linprocfs_domeminfo(PFS_FILL_ARGS) { unsigned long memtotal; /* total memory in bytes */ unsigned long memfree; /* free memory in bytes */ unsigned long cached; /* page cache */ unsigned long buffers; /* buffer cache */ unsigned long long swaptotal; /* total swap space in bytes */ unsigned long long swapused; /* used swap space in bytes */ unsigned long long swapfree; /* free swap space in bytes */ size_t sz; int error, i, j; memtotal = physmem * PAGE_SIZE; memfree = (unsigned long)vm_free_count() * PAGE_SIZE; swap_pager_status(&i, &j); swaptotal = (unsigned long long)i * PAGE_SIZE; swapused = (unsigned long long)j * PAGE_SIZE; swapfree = swaptotal - swapused; /* * This value may exclude wired pages, but we have no good way of * accounting for that. */ cached = (vm_active_count() + vm_inactive_count() + vm_laundry_count()) * PAGE_SIZE; sz = sizeof(buffers); error = kernel_sysctlbyname(curthread, "vfs.bufspace", &buffers, &sz, NULL, 0, 0, 0); if (error != 0) buffers = 0; sbuf_printf(sb, "MemTotal: %9lu kB\n" "MemFree: %9lu kB\n" "Buffers: %9lu kB\n" "Cached: %9lu kB\n" "SwapTotal:%9llu kB\n" "SwapFree: %9llu kB\n", B2K(memtotal), B2K(memfree), B2K(buffers), B2K(cached), B2K(swaptotal), B2K(swapfree)); return (0); } #if defined(__i386__) || defined(__amd64__) /* * Filler function for proc/cpuinfo (i386 & amd64 version) */ static int linprocfs_docpuinfo(PFS_FILL_ARGS) { int hw_model[2]; char model[128]; uint64_t freq; size_t size; u_int cache_size[4]; int fqmhz, fqkhz; int i, j; /* * We default the flags to include all non-conflicting flags, * and the Intel versions of conflicting flags. */ static char *cpu_feature_names[] = { /* 0 */ "fpu", "vme", "de", "pse", /* 4 */ "tsc", "msr", "pae", "mce", /* 8 */ "cx8", "apic", "", "sep", /* 12 */ "mtrr", "pge", "mca", "cmov", /* 16 */ "pat", "pse36", "pn", "clflush", /* 20 */ "", "dts", "acpi", "mmx", /* 24 */ "fxsr", "sse", "sse2", "ss", /* 28 */ "ht", "tm", "ia64", "pbe" }; static char *amd_feature_names[] = { /* 0 */ "", "", "", "", /* 4 */ "", "", "", "", /* 8 */ "", "", "", "syscall", /* 12 */ "", "", "", "", /* 16 */ "", "", "", "mp", /* 20 */ "nx", "", "mmxext", "", /* 24 */ "", "fxsr_opt", "pdpe1gb", "rdtscp", /* 28 */ "", "lm", "3dnowext", "3dnow" }; static char *cpu_feature2_names[] = { /* 0 */ "pni", "pclmulqdq", "dtes64", "monitor", /* 4 */ "ds_cpl", "vmx", "smx", "est", /* 8 */ "tm2", "ssse3", "cid", "sdbg", /* 12 */ "fma", "cx16", "xtpr", "pdcm", /* 16 */ "", "pcid", "dca", "sse4_1", /* 20 */ "sse4_2", "x2apic", "movbe", "popcnt", /* 24 */ "tsc_deadline_timer", "aes", "xsave", "", /* 28 */ "avx", "f16c", "rdrand", "hypervisor" }; static char *amd_feature2_names[] = { /* 0 */ "lahf_lm", "cmp_legacy", "svm", "extapic", /* 4 */ "cr8_legacy", "abm", "sse4a", "misalignsse", /* 8 */ "3dnowprefetch", "osvw", "ibs", "xop", /* 12 */ "skinit", "wdt", "", "lwp", /* 16 */ "fma4", "tce", "", "nodeid_msr", /* 20 */ "", "tbm", "topoext", "perfctr_core", /* 24 */ "perfctr_nb", "", "bpext", "ptsc", /* 28 */ "perfctr_llc", "mwaitx", "", "" }; static char *cpu_stdext_feature_names[] = { /* 0 */ "fsgsbase", "tsc_adjust", "", "bmi1", /* 4 */ "hle", "avx2", "", "smep", /* 8 */ "bmi2", "erms", "invpcid", "rtm", /* 12 */ "cqm", "", "mpx", "rdt_a", /* 16 */ "avx512f", "avx512dq", "rdseed", "adx", /* 20 */ "smap", "avx512ifma", "", "clflushopt", /* 24 */ "clwb", "intel_pt", "avx512pf", "avx512er", /* 28 */ "avx512cd", "sha_ni", "avx512bw", "avx512vl" }; static char 
*power_flags[] = { "ts", "fid", "vid", "ttp", "tm", "stc", "100mhzsteps", "hwpstate", "", "cpb", "eff_freq_ro", "proc_feedback", "acc_power", }; hw_model[0] = CTL_HW; hw_model[1] = HW_MODEL; model[0] = '\0'; size = sizeof(model); if (kernel_sysctl(td, hw_model, 2, &model, &size, 0, 0, 0, 0) != 0) strcpy(model, "unknown"); #ifdef __i386__ switch (cpu_vendor_id) { case CPU_VENDOR_AMD: if (cpu_class < CPUCLASS_686) cpu_feature_names[16] = "fcmov"; break; case CPU_VENDOR_CYRIX: cpu_feature_names[24] = "cxmmx"; break; } #endif if (cpu_exthigh >= 0x80000006) do_cpuid(0x80000006, cache_size); else memset(cache_size, 0, sizeof(cache_size)); for (i = 0; i < mp_ncpus; ++i) { fqmhz = 0; fqkhz = 0; freq = atomic_load_acq_64(&tsc_freq); if (freq != 0) { fqmhz = (freq + 4999) / 1000000; fqkhz = ((freq + 4999) / 10000) % 100; } sbuf_printf(sb, "processor\t: %d\n" "vendor_id\t: %.20s\n" "cpu family\t: %u\n" "model\t\t: %u\n" "model name\t: %s\n" "stepping\t: %u\n" "cpu MHz\t\t: %d.%02d\n" "cache size\t: %d KB\n" "physical id\t: %d\n" "siblings\t: %d\n" "core id\t\t: %d\n" "cpu cores\t: %d\n" "apicid\t\t: %d\n" "initial apicid\t: %d\n" "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: %s\n", i, cpu_vendor, CPUID_TO_FAMILY(cpu_id), CPUID_TO_MODEL(cpu_id), model, cpu_id & CPUID_STEPPING, fqmhz, fqkhz, (cache_size[2] >> 16), 0, mp_ncpus, i, mp_ncpus, i, i, /*cpu_id & CPUID_LOCAL_APIC_ID ??*/ (cpu_feature & CPUID_FPU) ? "yes" : "no", "yes", CPUID_TO_FAMILY(cpu_id), "yes"); sbuf_cat(sb, "flags\t\t:"); for (j = 0; j < nitems(cpu_feature_names); j++) if (cpu_feature & (1 << j) && cpu_feature_names[j][0] != '\0') sbuf_printf(sb, " %s", cpu_feature_names[j]); for (j = 0; j < nitems(amd_feature_names); j++) if (amd_feature & (1 << j) && amd_feature_names[j][0] != '\0') sbuf_printf(sb, " %s", amd_feature_names[j]); for (j = 0; j < nitems(cpu_feature2_names); j++) if (cpu_feature2 & (1 << j) && cpu_feature2_names[j][0] != '\0') sbuf_printf(sb, " %s", cpu_feature2_names[j]); for (j = 0; j < nitems(amd_feature2_names); j++) if (amd_feature2 & (1 << j) && amd_feature2_names[j][0] != '\0') sbuf_printf(sb, " %s", amd_feature2_names[j]); for (j = 0; j < nitems(cpu_stdext_feature_names); j++) if (cpu_stdext_feature & (1 << j) && cpu_stdext_feature_names[j][0] != '\0') sbuf_printf(sb, " %s", cpu_stdext_feature_names[j]); sbuf_cat(sb, "\n"); sbuf_printf(sb, "bugs\t\t: %s\n" "bogomips\t: %d.%02d\n" "clflush size\t: %d\n" "cache_alignment\t: %d\n" "address sizes\t: %d bits physical, %d bits virtual\n", #if defined(I586_CPU) && !defined(NO_F00F_HACK) (has_f00f_bug) ? "Intel F00F" : "", #else "", #endif fqmhz * 2, fqkhz, cpu_clflush_line_size, cpu_clflush_line_size, cpu_maxphyaddr, (cpu_maxphyaddr > 32) ? 48 : 0); sbuf_cat(sb, "power management: "); for (j = 0; j < nitems(power_flags); j++) if (amd_pminfo & (1 << j)) sbuf_printf(sb, " %s", power_flags[j]); sbuf_cat(sb, "\n\n"); /* XXX per-cpu vendor / class / model / id? 
*/ } sbuf_cat(sb, "\n"); return (0); } #else /* ARM64TODO: implement non-stubbed linprocfs_docpuinfo */ static int linprocfs_docpuinfo(PFS_FILL_ARGS) { int i; for (i = 0; i < mp_ncpus; ++i) { sbuf_printf(sb, "processor\t: %d\n" "BogoMIPS\t: %d.%02d\n", i, 0, 0); sbuf_cat(sb, "Features\t: "); sbuf_cat(sb, "\n"); sbuf_printf(sb, "CPU implementer\t: \n" "CPU architecture: \n" "CPU variant\t: 0x%x\n" "CPU part\t: 0x%x\n" "CPU revision\t: %d\n", 0, 0, 0); sbuf_cat(sb, "\n"); } return (0); } #endif /* __i386__ || __amd64__ */ static const char *path_slash_sys = "/sys"; static const char *fstype_sysfs = "sysfs"; static int _mtab_helper(const struct pfs_node *pn, const struct statfs *sp, const char **mntfrom, const char **mntto, const char **fstype) { /* determine device name */ *mntfrom = sp->f_mntfromname; /* determine mount point */ *mntto = sp->f_mntonname; /* determine fs type */ *fstype = sp->f_fstypename; if (strcmp(*fstype, pn->pn_info->pi_name) == 0) *mntfrom = *fstype = "proc"; else if (strcmp(*fstype, "procfs") == 0) return (ECANCELED); if (strcmp(*fstype, "autofs") == 0) { /* * FreeBSD uses eg "map -hosts", whereas Linux * expects just "-hosts". */ if (strncmp(*mntfrom, "map ", 4) == 0) *mntfrom += 4; } if (strcmp(*fstype, "linsysfs") == 0) { *mntfrom = path_slash_sys; *fstype = fstype_sysfs; } else { /* For Linux msdosfs is called vfat */ if (strcmp(*fstype, "msdosfs") == 0) *fstype = "vfat"; } return (0); } static void _sbuf_mntoptions_helper(struct sbuf *sb, uint64_t f_flags) { sbuf_cat(sb, (f_flags & MNT_RDONLY) ? "ro" : "rw"); #define ADD_OPTION(opt, name) \ if (f_flags & (opt)) sbuf_cat(sb, "," name); ADD_OPTION(MNT_SYNCHRONOUS, "sync"); ADD_OPTION(MNT_NOEXEC, "noexec"); ADD_OPTION(MNT_NOSUID, "nosuid"); ADD_OPTION(MNT_UNION, "union"); ADD_OPTION(MNT_ASYNC, "async"); ADD_OPTION(MNT_SUIDDIR, "suiddir"); ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow"); ADD_OPTION(MNT_NOATIME, "noatime"); #undef ADD_OPTION } /* * Filler function for proc/mtab and proc//mounts. * * /proc/mtab doesn't exist in Linux' procfs, but is included here so * users can symlink /compat/linux/etc/mtab to /proc/mtab */ static int linprocfs_domtab(PFS_FILL_ARGS) { struct nameidata nd; const char *lep, *mntto, *mntfrom, *fstype; char *dlep, *flep; size_t lep_len; int error; struct statfs *buf, *sp; size_t count; /* resolve symlinks etc. in the emulation tree prefix */ /* * Ideally, this would use the current chroot rather than some * hardcoded path. 
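	 *
	 * The loop below then emits one Linux-style mtab line per mounted
	 * filesystem, along the lines of (device and mount point
	 * hypothetical):
	 *
	 *	/dev/ada0p2 / ufs rw,noatime 0 0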
*/ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path, td); flep = NULL; error = namei(&nd); lep = linux_emul_path; if (error == 0) { if (vn_fullpath(nd.ni_vp, &dlep, &flep) == 0) lep = dlep; vrele(nd.ni_vp); } lep_len = strlen(lep); buf = NULL; error = kern_getfsstat(td, &buf, SIZE_T_MAX, &count, UIO_SYSSPACE, MNT_WAIT); if (error != 0) { free(buf, M_TEMP); free(flep, M_TEMP); return (error); } for (sp = buf; count > 0; sp++, count--) { error = _mtab_helper(pn, sp, &mntfrom, &mntto, &fstype); if (error != 0) { MPASS(error == ECANCELED); continue; } /* determine mount point */ if (strncmp(mntto, lep, lep_len) == 0 && mntto[lep_len] == '/') mntto += lep_len; sbuf_printf(sb, "%s %s %s ", mntfrom, mntto, fstype); _sbuf_mntoptions_helper(sb, sp->f_flags); /* a real Linux mtab will also show NFS options */ sbuf_printf(sb, " 0 0\n"); } free(buf, M_TEMP); free(flep, M_TEMP); return (error); } static int linprocfs_doprocmountinfo(PFS_FILL_ARGS) { struct nameidata nd; const char *mntfrom, *mntto, *fstype; const char *lep; char *dlep, *flep; struct statfs *buf, *sp; size_t count, lep_len; int error; /* * Ideally, this would use the current chroot rather than some * hardcoded path. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path, td); flep = NULL; error = namei(&nd); lep = linux_emul_path; if (error == 0) { if (vn_fullpath(nd.ni_vp, &dlep, &flep) == 0) lep = dlep; vrele(nd.ni_vp); } lep_len = strlen(lep); buf = NULL; error = kern_getfsstat(td, &buf, SIZE_T_MAX, &count, UIO_SYSSPACE, MNT_WAIT); if (error != 0) goto out; for (sp = buf; count > 0; sp++, count--) { error = _mtab_helper(pn, sp, &mntfrom, &mntto, &fstype); if (error != 0) { MPASS(error == ECANCELED); continue; } if (strncmp(mntto, lep, lep_len) == 0 && mntto[lep_len] == '/') mntto += lep_len; #if 0 /* * If the prefix is a chroot, and this mountpoint is not under * the prefix, we should skip it. Leave it for now for * consistency with procmtab above. */ else continue; #endif /* * (1) mount id * * (2) parent mount id -- we don't have this cheaply, so * provide a dummy value * * (3) major:minor -- ditto * * (4) root filesystem mount -- probably a namespaces thing * * (5) mountto path */ sbuf_printf(sb, "%u 0 0:0 / %s ", sp->f_fsid.val[0] ^ sp->f_fsid.val[1], mntto); /* (6) mount options */ _sbuf_mntoptions_helper(sb, sp->f_flags); /* * (7) zero or more optional fields -- again, namespace related * * (8) End of variable length fields separator ("-") * * (9) fstype * * (10) mount from * * (11) "superblock" options -- like (6), but different * semantics in Linux */ sbuf_printf(sb, " - %s %s %s\n", fstype, mntfrom, (sp->f_flags & MNT_RDONLY) ? 
"ro" : "rw"); } error = 0; out: free(buf, M_TEMP); free(flep, M_TEMP); return (error); } /* * Filler function for proc/partitions */ static int linprocfs_dopartitions(PFS_FILL_ARGS) { struct g_class *cp; struct g_geom *gp; struct g_provider *pp; int major, minor; g_topology_lock(); sbuf_printf(sb, "major minor #blocks name rio rmerge rsect " "ruse wio wmerge wsect wuse running use aveq\n"); LIST_FOREACH(cp, &g_classes, class) { if (strcmp(cp->name, "DISK") == 0 || strcmp(cp->name, "PART") == 0) LIST_FOREACH(gp, &cp->geom, geom) { LIST_FOREACH(pp, &gp->provider, provider) { if (linux_driver_get_major_minor( pp->name, &major, &minor) != 0) { major = 0; minor = 0; } sbuf_printf(sb, "%d %d %lld %s " "%d %d %d %d %d " "%d %d %d %d %d %d\n", major, minor, (long long)pp->mediasize, pp->name, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } } } g_topology_unlock(); return (0); } /* * Filler function for proc/stat * * Output depends on kernel version: * * v2.5.40 <= * user nice system idle * v2.5.41 * user nice system idle iowait * v2.6.11 * user nice system idle iowait irq softirq steal * v2.6.24 * user nice system idle iowait irq softirq steal guest * v2.6.33 >= * user nice system idle iowait irq softirq steal guest guest_nice */ static int linprocfs_dostat(PFS_FILL_ARGS) { struct pcpu *pcpu; long cp_time[CPUSTATES]; long *cp; struct timeval boottime; int i; char *zero_pad; bool has_intr = true; if (linux_kernver(td) >= LINUX_KERNVER(2,6,33)) { zero_pad = " 0 0 0 0\n"; } else if (linux_kernver(td) >= LINUX_KERNVER(2,6,24)) { zero_pad = " 0 0 0\n"; } else if (linux_kernver(td) >= LINUX_KERNVER(2,6,11)) { zero_pad = " 0 0\n"; } else if (linux_kernver(td) >= LINUX_KERNVER(2,5,41)) { has_intr = false; zero_pad = " 0\n"; } else { has_intr = false; zero_pad = "\n"; } read_cpu_time(cp_time); getboottime(&boottime); /* Parameters common to all versions */ sbuf_printf(sb, "cpu %lu %lu %lu %lu", T2J(cp_time[CP_USER]), T2J(cp_time[CP_NICE]), T2J(cp_time[CP_SYS]), T2J(cp_time[CP_IDLE])); /* Print interrupt stats if available */ if (has_intr) { sbuf_printf(sb, " 0 %lu", T2J(cp_time[CP_INTR])); } /* Pad out remaining fields depending on version */ sbuf_printf(sb, "%s", zero_pad); CPU_FOREACH(i) { pcpu = pcpu_find(i); cp = pcpu->pc_cp_time; sbuf_printf(sb, "cpu%d %lu %lu %lu %lu", i, T2J(cp[CP_USER]), T2J(cp[CP_NICE]), T2J(cp[CP_SYS]), T2J(cp[CP_IDLE])); if (has_intr) { sbuf_printf(sb, " 0 %lu", T2J(cp[CP_INTR])); } sbuf_printf(sb, "%s", zero_pad); } sbuf_printf(sb, "disk 0 0 0 0\n" "page %ju %ju\n" "swap %ju %ju\n" "intr %ju\n" "ctxt %ju\n" "btime %lld\n", (uintmax_t)VM_CNT_FETCH(v_vnodepgsin), (uintmax_t)VM_CNT_FETCH(v_vnodepgsout), (uintmax_t)VM_CNT_FETCH(v_swappgsin), (uintmax_t)VM_CNT_FETCH(v_swappgsout), (uintmax_t)VM_CNT_FETCH(v_intr), (uintmax_t)VM_CNT_FETCH(v_swtch), (long long)boottime.tv_sec); return (0); } static int linprocfs_doswaps(PFS_FILL_ARGS) { struct xswdev xsw; uintmax_t total, used; int n; char devname[SPECNAMELEN + 1]; sbuf_printf(sb, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); for (n = 0; ; n++) { if (swap_dev_info(n, &xsw, devname, sizeof(devname)) != 0) break; total = (uintmax_t)xsw.xsw_nblks * PAGE_SIZE / 1024; used = (uintmax_t)xsw.xsw_used * PAGE_SIZE / 1024; /* * The space and not tab after the device name is on * purpose. Linux does so. 
*/ sbuf_printf(sb, "/dev/%-34s unknown\t\t%jd\t%jd\t-1\n", devname, total, used); } return (0); } /* * Filler function for proc/uptime */ static int linprocfs_douptime(PFS_FILL_ARGS) { long cp_time[CPUSTATES]; struct timeval tv; getmicrouptime(&tv); read_cpu_time(cp_time); sbuf_printf(sb, "%lld.%02ld %ld.%02lu\n", (long long)tv.tv_sec, tv.tv_usec / 10000, T2S(cp_time[CP_IDLE] / mp_ncpus), T2CS(cp_time[CP_IDLE] / mp_ncpus) % 100); return (0); } /* * Get OS build date */ static void linprocfs_osbuild(struct thread *td, struct sbuf *sb) { #if 0 char osbuild[256]; char *cp1, *cp2; strncpy(osbuild, version, 256); osbuild[255] = '\0'; cp1 = strstr(osbuild, "\n"); cp2 = strstr(osbuild, ":"); if (cp1 && cp2) { *cp1 = *cp2 = '\0'; cp1 = strstr(osbuild, "#"); } else cp1 = NULL; if (cp1) sbuf_printf(sb, "%s%s", cp1, cp2 + 1); else #endif sbuf_cat(sb, "#4 Sun Dec 18 04:30:00 CET 1977"); } /* * Get OS builder */ static void linprocfs_osbuilder(struct thread *td, struct sbuf *sb) { #if 0 char builder[256]; char *cp; cp = strstr(version, "\n "); if (cp) { strncpy(builder, cp + 5, 256); builder[255] = '\0'; cp = strstr(builder, ":"); if (cp) *cp = '\0'; } if (cp) sbuf_cat(sb, builder); else #endif sbuf_cat(sb, "des@freebsd.org"); } /* * Filler function for proc/version */ static int linprocfs_doversion(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s version %s (", osname, osrelease); linprocfs_osbuilder(td, sb); sbuf_cat(sb, ") (gcc version " __VERSION__ ") "); linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/loadavg */ static int linprocfs_doloadavg(PFS_FILL_ARGS) { sbuf_printf(sb, "%d.%02d %d.%02d %d.%02d %d/%d %d\n", (int)(averunnable.ldavg[0] / averunnable.fscale), (int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[1] / averunnable.fscale), (int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[2] / averunnable.fscale), (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), 1, /* number of running tasks */ nprocs, /* number of tasks */ lastpid /* the last pid */ ); return (0); } static int linprocfs_get_tty_nr(struct proc *p) { struct session *sp; const char *ttyname; int error, major, minor, nr; PROC_LOCK_ASSERT(p, MA_OWNED); sx_assert(&proctree_lock, SX_LOCKED); if ((p->p_flag & P_CONTROLT) == 0) return (-1); sp = p->p_pgrp->pg_session; if (sp == NULL) return (-1); ttyname = devtoname(sp->s_ttyp->t_dev); error = linux_driver_get_major_minor(ttyname, &major, &minor); if (error != 0) return (-1); nr = makedev(major, minor); return (nr); } /* * Filler function for proc/pid/stat */ static int linprocfs_doprocstat(PFS_FILL_ARGS) { struct kinfo_proc kp; struct timeval boottime; char state; static int ratelimit = 0; int tty_nr; vm_offset_t startcode, startdata; getboottime(&boottime); sx_slock(&proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); tty_nr = linprocfs_get_tty_nr(p); sx_sunlock(&proctree_lock); if (p->p_vmspace) { startcode = (vm_offset_t)p->p_vmspace->vm_taddr; startdata = (vm_offset_t)p->p_vmspace->vm_daddr; } else { startcode = 0; startdata = 0; } sbuf_printf(sb, "%d", p->p_pid); #define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg) PS_ADD("comm", "(%s)", p->p_comm); if (kp.ki_stat > sizeof(linux_state)) { state = 'R'; if (ratelimit == 0) { printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n", kp.ki_stat, 
sizeof(linux_state)); ++ratelimit; } } else state = linux_state[kp.ki_stat - 1]; PS_ADD("state", "%c", state); PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0); PS_ADD("pgrp", "%d", p->p_pgid); PS_ADD("session", "%d", p->p_session->s_sid); PROC_UNLOCK(p); PS_ADD("tty", "%d", tty_nr); PS_ADD("tpgid", "%d", kp.ki_tpgid); PS_ADD("flags", "%u", 0); /* XXX */ PS_ADD("minflt", "%lu", kp.ki_rusage.ru_minflt); PS_ADD("cminflt", "%lu", kp.ki_rusage_ch.ru_minflt); PS_ADD("majflt", "%lu", kp.ki_rusage.ru_majflt); PS_ADD("cmajflt", "%lu", kp.ki_rusage_ch.ru_majflt); PS_ADD("utime", "%ld", TV2J(&kp.ki_rusage.ru_utime)); PS_ADD("stime", "%ld", TV2J(&kp.ki_rusage.ru_stime)); PS_ADD("cutime", "%ld", TV2J(&kp.ki_rusage_ch.ru_utime)); PS_ADD("cstime", "%ld", TV2J(&kp.ki_rusage_ch.ru_stime)); PS_ADD("priority", "%d", kp.ki_pri.pri_user); PS_ADD("nice", "%d", kp.ki_nice); /* 19 (nicest) to -19 */ PS_ADD("0", "%d", 0); /* removed field */ PS_ADD("itrealvalue", "%d", 0); /* XXX */ PS_ADD("starttime", "%lu", TV2J(&kp.ki_start) - TV2J(&boottime)); PS_ADD("vsize", "%ju", P2K((uintmax_t)kp.ki_size)); PS_ADD("rss", "%ju", (uintmax_t)kp.ki_rssize); PS_ADD("rlim", "%lu", kp.ki_rusage.ru_maxrss); PS_ADD("startcode", "%ju", (uintmax_t)startcode); PS_ADD("endcode", "%ju", (uintmax_t)startdata); PS_ADD("startstack", "%u", 0); /* XXX */ PS_ADD("kstkesp", "%u", 0); /* XXX */ PS_ADD("kstkeip", "%u", 0); /* XXX */ PS_ADD("signal", "%u", 0); /* XXX */ PS_ADD("blocked", "%u", 0); /* XXX */ PS_ADD("sigignore", "%u", 0); /* XXX */ PS_ADD("sigcatch", "%u", 0); /* XXX */ PS_ADD("wchan", "%u", 0); /* XXX */ PS_ADD("nswap", "%lu", kp.ki_rusage.ru_nswap); PS_ADD("cnswap", "%lu", kp.ki_rusage_ch.ru_nswap); PS_ADD("exitsignal", "%d", 0); /* XXX */ PS_ADD("processor", "%u", kp.ki_lastcpu); PS_ADD("rt_priority", "%u", 0); /* XXX */ /* >= 2.5.19 */ PS_ADD("policy", "%u", kp.ki_pri.pri_class); /* >= 2.5.19 */ #undef PS_ADD sbuf_putc(sb, '\n'); return (0); } /* * Filler function for proc/pid/statm */ static int linprocfs_doprocstatm(PFS_FILL_ARGS) { struct kinfo_proc kp; segsz_t lsize; sx_slock(&proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); PROC_UNLOCK(p); sx_sunlock(&proctree_lock); /* * See comments in linprocfs_doprocstatus() regarding the * computation of lsize. */ /* size resident share trs drs lrs dt */ sbuf_printf(sb, "%ju ", B2P((uintmax_t)kp.ki_size)); sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_rssize); sbuf_printf(sb, "%ju ", (uintmax_t)0); /* XXX */ sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_tsize); sbuf_printf(sb, "%ju ", (uintmax_t)(kp.ki_dsize + kp.ki_ssize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "%ju ", (uintmax_t)lsize); sbuf_printf(sb, "%ju\n", (uintmax_t)0); /* XXX */ return (0); } /* * Filler function for proc/pid/status */ static int linprocfs_doprocstatus(PFS_FILL_ARGS) { struct kinfo_proc kp; char *state; segsz_t lsize; struct thread *td2; struct sigacts *ps; l_sigset_t siglist, sigignore, sigcatch; int i; sx_slock(&proctree_lock); PROC_LOCK(p); td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */ if (P_SHOULDSTOP(p)) { state = "T (stopped)"; } else { switch(p->p_state) { case PRS_NEW: state = "I (idle)"; break; case PRS_NORMAL: if (p->p_flag & P_WEXIT) { state = "X (exiting)"; break; } - switch(td2->td_state) { + switch(TD_GET_STATE(td2)) { case TDS_INHIBITED: state = "S (sleeping)"; break; case TDS_RUNQ: case TDS_RUNNING: state = "R (running)"; break; default: state = "? 
(unknown)"; break; } break; case PRS_ZOMBIE: state = "Z (zombie)"; break; default: state = "? (unknown)"; break; } } fill_kinfo_proc(p, &kp); sx_sunlock(&proctree_lock); sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */ sbuf_printf(sb, "State:\t%s\n", state); /* * Credentials */ sbuf_printf(sb, "Tgid:\t%d\n", p->p_pid); sbuf_printf(sb, "Pid:\t%d\n", p->p_pid); sbuf_printf(sb, "PPid:\t%d\n", kp.ki_ppid ); sbuf_printf(sb, "TracerPid:\t%d\n", kp.ki_tracer ); sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid, p->p_ucred->cr_uid, p->p_ucred->cr_svuid, /* FreeBSD doesn't have fsuid */ p->p_ucred->cr_uid); sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid, p->p_ucred->cr_gid, p->p_ucred->cr_svgid, /* FreeBSD doesn't have fsgid */ p->p_ucred->cr_gid); sbuf_cat(sb, "Groups:\t"); for (i = 0; i < p->p_ucred->cr_ngroups; i++) sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]); PROC_UNLOCK(p); sbuf_putc(sb, '\n'); /* * Memory * * While our approximation of VmLib may not be accurate (I * don't know of a simple way to verify it, and I'm not sure * it has much meaning anyway), I believe it's good enough. * * The same code that could (I think) accurately compute VmLib * could also compute VmLck, but I don't really care enough to * implement it. Submissions are welcome. */ sbuf_printf(sb, "VmSize:\t%8ju kB\n", B2K((uintmax_t)kp.ki_size)); sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */ sbuf_printf(sb, "VmRSS:\t%8ju kB\n", P2K((uintmax_t)kp.ki_rssize)); sbuf_printf(sb, "VmData:\t%8ju kB\n", P2K((uintmax_t)kp.ki_dsize)); sbuf_printf(sb, "VmStk:\t%8ju kB\n", P2K((uintmax_t)kp.ki_ssize)); sbuf_printf(sb, "VmExe:\t%8ju kB\n", P2K((uintmax_t)kp.ki_tsize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "VmLib:\t%8ju kB\n", P2K((uintmax_t)lsize)); /* * Signal masks */ PROC_LOCK(p); bsd_to_linux_sigset(&p->p_siglist, &siglist); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); bsd_to_linux_sigset(&ps->ps_sigignore, &sigignore); bsd_to_linux_sigset(&ps->ps_sigcatch, &sigcatch); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); sbuf_printf(sb, "SigPnd:\t%016jx\n", siglist.__mask); /* * XXX. SigBlk - target thread's signal mask, td_sigmask. * To implement SigBlk pseudofs should support proc/tid dir entries. */ sbuf_printf(sb, "SigBlk:\t%016x\n", 0); sbuf_printf(sb, "SigIgn:\t%016jx\n", sigignore.__mask); sbuf_printf(sb, "SigCgt:\t%016jx\n", sigcatch.__mask); /* * Linux also prints the capability masks, but we don't have * capabilities yet, and when we do get them they're likely to * be meaningless to Linux programs, so we lie. XXX */ sbuf_printf(sb, "CapInh:\t%016x\n", 0); sbuf_printf(sb, "CapPrm:\t%016x\n", 0); sbuf_printf(sb, "CapEff:\t%016x\n", 0); return (0); } /* * Filler function for proc/pid/cwd */ static int linprocfs_doproccwd(PFS_FILL_ARGS) { struct pwd *pwd; char *fullpath = "unknown"; char *freepath = NULL; pwd = pwd_hold(td); vn_fullpath(pwd->pwd_cdir, &fullpath, &freepath); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); pwd_drop(pwd); return (0); } /* * Filler function for proc/pid/root */ static int linprocfs_doprocroot(PFS_FILL_ARGS) { struct pwd *pwd; struct vnode *vp; char *fullpath = "unknown"; char *freepath = NULL; pwd = pwd_hold(td); vp = jailed(p->p_ucred) ? 
pwd->pwd_jdir : pwd->pwd_rdir; vn_fullpath(vp, &fullpath, &freepath); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); pwd_drop(pwd); return (0); } /* * Filler function for proc/pid/cmdline */ static int linprocfs_doproccmdline(PFS_FILL_ARGS) { int ret; PROC_LOCK(p); if ((ret = p_cansee(td, p)) != 0) { PROC_UNLOCK(p); return (ret); } /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwize. */ if (p->p_vmspace == &vmspace0) { PROC_UNLOCK(p); return (0); } if (p->p_args != NULL) { sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length); PROC_UNLOCK(p); return (0); } if ((p->p_flag & P_SYSTEM) != 0) { PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); ret = proc_getargv(td, p, sb); return (ret); } /* * Filler function for proc/pid/environ */ static int linprocfs_doprocenviron(PFS_FILL_ARGS) { /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwize. */ if (p->p_vmspace == &vmspace0) return (0); return (proc_getenvv(td, p, sb)); } static char l32_map_str[] = "%08lx-%08lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n"; static char l64_map_str[] = "%016lx-%016lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n"; static char vdso_str[] = " [vdso]"; static char stack_str[] = " [stack]"; /* * Filler function for proc/pid/maps */ static int linprocfs_doprocmaps(PFS_FILL_ARGS) { struct vmspace *vm; vm_map_t map; vm_map_entry_t entry, tmp_entry; vm_object_t obj, tobj, lobj; vm_offset_t e_start, e_end; vm_ooffset_t off; vm_prot_t e_prot; unsigned int last_timestamp; char *name = "", *freename = NULL; const char *l_map_str; ino_t ino; int ref_count, shadow_count, flags; int error; struct vnode *vp; struct vattr vat; bool private; PROC_LOCK(p); error = p_candebug(td, p); PROC_UNLOCK(p); if (error) return (error); if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); error = 0; vm = vmspace_acquire_ref(p); if (vm == NULL) return (ESRCH); if (SV_CURPROC_FLAG(SV_LP64)) l_map_str = l64_map_str; else l_map_str = l32_map_str; map = &vm->vm_map; vm_map_lock_read(map); VM_MAP_ENTRY_FOREACH(entry, map) { name = ""; freename = NULL; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; e_prot = entry->protection; e_start = entry->start; e_end = entry->end; obj = entry->object.vm_object; off = entry->offset; for (lobj = tobj = obj; tobj != NULL; lobj = tobj, tobj = tobj->backing_object) { VM_OBJECT_RLOCK(tobj); off += lobj->backing_object_offset; if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); } private = (entry->eflags & MAP_ENTRY_COW) != 0 || obj == NULL || (obj->flags & OBJ_ANON) != 0; last_timestamp = map->timestamp; vm_map_unlock_read(map); ino = 0; if (lobj) { vp = vm_object_vnode(lobj); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(vp, &name, &freename); vn_lock(vp, LK_SHARED | LK_RETRY); VOP_GETATTR(vp, &vat, td->td_ucred); ino = vat.va_fileid; vput(vp); } else if (SV_PROC_ABI(p) == SV_ABI_LINUX) { if (e_start == p->p_sysent->sv_shared_page_base) name = vdso_str; if (e_end == p->p_sysent->sv_usrstack) name = stack_str; } } else { flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, access, offset, major, minor, inode, name. */ error = sbuf_printf(sb, l_map_str, (u_long)e_start, (u_long)e_end, (e_prot & VM_PROT_READ)?"r":"-", (e_prot & VM_PROT_WRITE)?"w":"-", (e_prot & VM_PROT_EXECUTE)?"x":"-", private ? 
"p" : "s", (u_long)off, 0, 0, (u_long)ino, *name ? " " : " ", name ); if (freename) free(freename, M_TEMP); vm_map_lock_read(map); if (error == -1) { error = 0; break; } if (last_timestamp != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. Specifically, * the entry may have been clipped, merged, or deleted. */ vm_map_lookup_entry(map, e_end - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); return (error); } /* * Filler function for proc/pid/mem */ static int linprocfs_doprocmem(PFS_FILL_ARGS) { ssize_t resid; int error; resid = uio->uio_resid; error = procfs_doprocmem(PFS_FILL_ARGNAMES); if (uio->uio_rw == UIO_READ && resid != uio->uio_resid) return (0); if (error == EFAULT) error = EIO; return (error); } /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER) static int linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen) { struct ifnet *ifscan; int ethno; IFNET_RLOCK_ASSERT(); /* Short-circuit non ethernet interfaces */ if (!IFP_IS_ETH(ifp)) return (strlcpy(buffer, ifp->if_xname, buflen)); /* Determine the (relative) unit number for ethernet interfaces */ ethno = 0; CK_STAILQ_FOREACH(ifscan, &V_ifnet, if_link) { if (ifscan == ifp) return (snprintf(buffer, buflen, "eth%d", ethno)); if (IFP_IS_ETH(ifscan)) ethno++; } return (0); } /* * Filler function for proc/net/dev */ static int linprocfs_donetdev(PFS_FILL_ARGS) { char ifname[16]; /* XXX LINUX_IFNAMSIZ */ struct ifnet *ifp; sbuf_printf(sb, "%6s|%58s|%s\n" "%6s|%58s|%58s\n", "Inter-", " Receive", " Transmit", " face", "bytes packets errs drop fifo frame compressed multicast", "bytes packets errs drop fifo colls carrier compressed"); CURVNET_SET(TD_TO_VNET(curthread)); IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { linux_ifname(ifp, ifname, sizeof ifname); sbuf_printf(sb, "%6.6s: ", ifname); sbuf_printf(sb, "%7ju %7ju %4ju %4ju %4lu %5lu %10lu %9ju ", (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IBYTES), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IERRORS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS), /* rx_missed_errors */ 0UL, /* rx_fifo_errors */ 0UL, /* rx_length_errors + * rx_over_errors + * rx_crc_errors + * rx_frame_errors */ 0UL, /* rx_compressed */ (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS)); /* XXX-BZ rx only? 
*/ sbuf_printf(sb, "%8ju %7ju %4ju %4ju %4lu %5ju %7lu %10lu\n", (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OBYTES), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OERRORS), (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS), 0UL, /* tx_fifo_errors */ (uintmax_t )ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS), 0UL, /* tx_carrier_errors + * tx_aborted_errors + * tx_window_errors + * tx_heartbeat_errors*/ 0UL); /* tx_compressed */ } IFNET_RUNLOCK(); CURVNET_RESTORE(); return (0); } /* * Filler function for proc/sys/kernel/osrelease */ static int linprocfs_doosrelease(PFS_FILL_ARGS) { char osrelease[LINUX_MAX_UTSNAME]; linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s\n", osrelease); return (0); } /* * Filler function for proc/sys/kernel/ostype */ static int linprocfs_doostype(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); sbuf_printf(sb, "%s\n", osname); return (0); } /* * Filler function for proc/sys/kernel/version */ static int linprocfs_doosbuild(PFS_FILL_ARGS) { linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/sys/kernel/msgmax */ static int linprocfs_domsgmax(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", msginfo.msgmax); return (0); } /* * Filler function for proc/sys/kernel/msgmni */ static int linprocfs_domsgmni(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", msginfo.msgmni); return (0); } /* * Filler function for proc/sys/kernel/msgmnb */ static int linprocfs_domsgmnb(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", msginfo.msgmnb); return (0); } /* * Filler function for proc/sys/kernel/ngroups_max * * Note that in Linux it defaults to 65536, not 1023. */ static int linprocfs_dongroups_max(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", ngroups_max); return (0); } /* * Filler function for proc/sys/kernel/pid_max */ static int linprocfs_dopid_max(PFS_FILL_ARGS) { sbuf_printf(sb, "%i\n", PID_MAX); return (0); } /* * Filler function for proc/sys/kernel/sem */ static int linprocfs_dosem(PFS_FILL_ARGS) { sbuf_printf(sb, "%d %d %d %d\n", seminfo.semmsl, seminfo.semmns, seminfo.semopm, seminfo.semmni); return (0); } /* * Filler function for proc/sys/kernel/shmall */ static int linprocfs_doshmall(PFS_FILL_ARGS) { sbuf_printf(sb, "%lu\n", shminfo.shmall); return (0); } /* * Filler function for proc/sys/kernel/shmmax */ static int linprocfs_doshmmax(PFS_FILL_ARGS) { sbuf_printf(sb, "%lu\n", shminfo.shmmax); return (0); } /* * Filler function for proc/sys/kernel/shmmni */ static int linprocfs_doshmmni(PFS_FILL_ARGS) { sbuf_printf(sb, "%lu\n", shminfo.shmmni); return (0); } /* * Filler function for proc/sys/kernel/tainted */ static int linprocfs_dotainted(PFS_FILL_ARGS) { sbuf_printf(sb, "0\n"); return (0); } /* * Filler function for proc/sys/vm/min_free_kbytes * * This mirrors the approach in illumos to return zero for reads. Effectively, * it says, no memory is kept in reserve for "atomic allocations". This class * of allocation can be used at times when a thread cannot be suspended. 
*/ static int linprocfs_dominfree(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", 0); return (0); } /* * Filler function for proc/scsi/device_info */ static int linprocfs_doscsidevinfo(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/scsi/scsi */ static int linprocfs_doscsiscsi(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/devices */ static int linprocfs_dodevices(PFS_FILL_ARGS) { char *char_devices; sbuf_printf(sb, "Character devices:\n"); char_devices = linux_get_char_devices(); sbuf_printf(sb, "%s", char_devices); linux_free_get_char_devices(char_devices); sbuf_printf(sb, "\nBlock devices:\n"); return (0); } /* * Filler function for proc/cmdline */ static int linprocfs_docmdline(PFS_FILL_ARGS) { sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname); sbuf_printf(sb, " ro root=302\n"); return (0); } /* * Filler function for proc/filesystems */ static int linprocfs_dofilesystems(PFS_FILL_ARGS) { struct vfsconf *vfsp; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_flags & VFCF_SYNTHETIC) sbuf_printf(sb, "nodev"); sbuf_printf(sb, "\t%s\n", vfsp->vfc_name); } vfsconf_sunlock(); return(0); } /* * Filler function for proc/modules */ static int linprocfs_domodules(PFS_FILL_ARGS) { #if 0 struct linker_file *lf; TAILQ_FOREACH(lf, &linker_files, link) { sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename, (unsigned long)lf->size, lf->refs); } #endif return (0); } /* * Filler function for proc/pid/fd */ static int linprocfs_dofdescfs(PFS_FILL_ARGS) { if (p == curproc) sbuf_printf(sb, "/dev/fd"); else sbuf_printf(sb, "unknown"); return (0); } /* * Filler function for proc/pid/limits */ static const struct linux_rlimit_ident { const char *desc; const char *unit; unsigned int rlim_id; } linux_rlimits_ident[] = { { "Max cpu time", "seconds", RLIMIT_CPU }, { "Max file size", "bytes", RLIMIT_FSIZE }, { "Max data size", "bytes", RLIMIT_DATA }, { "Max stack size", "bytes", RLIMIT_STACK }, { "Max core file size", "bytes", RLIMIT_CORE }, { "Max resident set", "bytes", RLIMIT_RSS }, { "Max processes", "processes", RLIMIT_NPROC }, { "Max open files", "files", RLIMIT_NOFILE }, { "Max locked memory", "bytes", RLIMIT_MEMLOCK }, { "Max address space", "bytes", RLIMIT_AS }, { "Max file locks", "locks", LINUX_RLIMIT_LOCKS }, { "Max pending signals", "signals", LINUX_RLIMIT_SIGPENDING }, { "Max msgqueue size", "bytes", LINUX_RLIMIT_MSGQUEUE }, { "Max nice priority", "", LINUX_RLIMIT_NICE }, { "Max realtime priority", "", LINUX_RLIMIT_RTPRIO }, { "Max realtime timeout", "us", LINUX_RLIMIT_RTTIME }, { 0, 0, 0 } }; static int linprocfs_doproclimits(PFS_FILL_ARGS) { const struct linux_rlimit_ident *li; struct plimit *limp; struct rlimit rl; ssize_t size; int res, error; error = 0; PROC_LOCK(p); limp = lim_hold(p->p_limit); PROC_UNLOCK(p); size = sizeof(res); sbuf_printf(sb, "%-26s%-21s%-21s%-21s\n", "Limit", "Soft Limit", "Hard Limit", "Units"); for (li = linux_rlimits_ident; li->desc != NULL; ++li) { switch (li->rlim_id) { case LINUX_RLIMIT_LOCKS: /* FALLTHROUGH */ case LINUX_RLIMIT_RTTIME: rl.rlim_cur = RLIM_INFINITY; break; case LINUX_RLIMIT_SIGPENDING: error = kernel_sysctlbyname(td, "kern.sigqueue.max_pending_per_proc", &res, &size, 0, 0, 0, 0); if (error != 0) goto out; rl.rlim_cur = res; rl.rlim_max = res; break; case LINUX_RLIMIT_MSGQUEUE: error = kernel_sysctlbyname(td, "kern.ipc.msgmnb", &res, &size, 0, 0, 0, 0); if (error != 0) goto out; rl.rlim_cur = res; rl.rlim_max = res; break; case LINUX_RLIMIT_NICE: /* FALLTHROUGH */ case LINUX_RLIMIT_RTPRIO: rl.rlim_cur = 0; rl.rlim_max = 
0; break; default: rl = limp->pl_rlimit[li->rlim_id]; break; } if (rl.rlim_cur == RLIM_INFINITY) sbuf_printf(sb, "%-26s%-21s%-21s%-10s\n", li->desc, "unlimited", "unlimited", li->unit); else sbuf_printf(sb, "%-26s%-21llu%-21llu%-10s\n", li->desc, (unsigned long long)rl.rlim_cur, (unsigned long long)rl.rlim_max, li->unit); } out: lim_free(limp); return (error); } /* * The point of the following two functions is to work around * an assertion in Chromium; see kern/240991 for details. */ static int linprocfs_dotaskattr(PFS_ATTR_ARGS) { vap->va_nlink = 3; return (0); } /* * Filler function for proc//task/.dummy */ static int linprocfs_dotaskdummy(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/sys/kernel/random/uuid */ static int linprocfs_douuid(PFS_FILL_ARGS) { struct uuid uuid; kern_uuidgen(&uuid, 1); sbuf_printf_uuid(sb, &uuid); sbuf_printf(sb, "\n"); return(0); } /* * Filler function for proc/pid/auxv */ static int linprocfs_doauxv(PFS_FILL_ARGS) { struct sbuf *asb; off_t buflen, resid; int error; /* * Mimic linux behavior and pass only processes with usermode * address space as valid. Return zero silently otherwise. */ if (p->p_vmspace == &vmspace0) return (0); if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0 || uio->uio_resid < 0) return (EINVAL); asb = sbuf_new_auto(); if (asb == NULL) return (ENOMEM); error = proc_getauxv(td, p, asb); if (error == 0) error = sbuf_finish(asb); resid = sbuf_len(asb) - uio->uio_offset; if (resid > uio->uio_resid) buflen = uio->uio_resid; else buflen = resid; if (buflen > IOSIZE_MAX) return (EINVAL); if (buflen > maxphys) buflen = maxphys; if (resid <= 0) return (0); if (error == 0) error = uiomove(sbuf_data(asb) + uio->uio_offset, buflen, uio); sbuf_delete(asb); return (error); } /* * Constructor */ static int linprocfs_init(PFS_INIT_ARGS) { struct pfs_node *root; struct pfs_node *dir; struct pfs_node *sys; root = pi->pi_root; /* /proc/... */ pfs_create_file(root, "cmdline", &linprocfs_docmdline, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "cpuinfo", &linprocfs_docpuinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "devices", &linprocfs_dodevices, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "filesystems", &linprocfs_dofilesystems, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "loadavg", &linprocfs_doloadavg, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "meminfo", &linprocfs_domeminfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "modules", &linprocfs_domodules, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "mounts", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "mtab", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "partitions", &linprocfs_dopartitions, NULL, NULL, NULL, PFS_RD); pfs_create_link(root, "self", &procfs_docurproc, NULL, NULL, NULL, 0); pfs_create_file(root, "stat", &linprocfs_dostat, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "swaps", &linprocfs_doswaps, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "uptime", &linprocfs_douptime, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "version", &linprocfs_doversion, NULL, NULL, NULL, PFS_RD); /* /proc/bus/... */ dir = pfs_create_dir(root, "bus", NULL, NULL, NULL, 0); dir = pfs_create_dir(dir, "pci", NULL, NULL, NULL, 0); dir = pfs_create_dir(dir, "devices", NULL, NULL, NULL, 0); /* /proc/net/... */ dir = pfs_create_dir(root, "net", NULL, NULL, NULL, 0); pfs_create_file(dir, "dev", &linprocfs_donetdev, NULL, NULL, NULL, PFS_RD); /* /proc//... 
*/ dir = pfs_create_dir(root, "pid", NULL, NULL, NULL, PFS_PROCDEP); pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "cwd", &linprocfs_doproccwd, NULL, NULL, NULL, 0); pfs_create_file(dir, "environ", &linprocfs_doprocenviron, NULL, &procfs_candebug, NULL, PFS_RD); pfs_create_link(dir, "exe", &procfs_doprocfile, NULL, &procfs_notsystem, NULL, 0); pfs_create_file(dir, "maps", &linprocfs_doprocmaps, NULL, NULL, NULL, PFS_RD | PFS_AUTODRAIN); pfs_create_file(dir, "mem", &linprocfs_doprocmem, procfs_attr_rw, &procfs_candebug, NULL, PFS_RDWR | PFS_RAW); pfs_create_file(dir, "mountinfo", &linprocfs_doprocmountinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "mounts", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "root", &linprocfs_doprocroot, NULL, NULL, NULL, 0); pfs_create_file(dir, "stat", &linprocfs_doprocstat, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "statm", &linprocfs_doprocstatm, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "status", &linprocfs_doprocstatus, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "fd", &linprocfs_dofdescfs, NULL, NULL, NULL, 0); pfs_create_file(dir, "auxv", &linprocfs_doauxv, NULL, &procfs_candebug, NULL, PFS_RD|PFS_RAWRD); pfs_create_file(dir, "limits", &linprocfs_doproclimits, NULL, NULL, NULL, PFS_RD); /* /proc//task/... */ dir = pfs_create_dir(dir, "task", linprocfs_dotaskattr, NULL, NULL, 0); pfs_create_file(dir, ".dummy", &linprocfs_dotaskdummy, NULL, NULL, NULL, PFS_RD); /* /proc/scsi/... */ dir = pfs_create_dir(root, "scsi", NULL, NULL, NULL, 0); pfs_create_file(dir, "device_info", &linprocfs_doscsidevinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "scsi", &linprocfs_doscsiscsi, NULL, NULL, NULL, PFS_RD); /* /proc/sys/... */ sys = pfs_create_dir(root, "sys", NULL, NULL, NULL, 0); /* /proc/sys/kernel/... */ dir = pfs_create_dir(sys, "kernel", NULL, NULL, NULL, 0); pfs_create_file(dir, "osrelease", &linprocfs_doosrelease, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "ostype", &linprocfs_doostype, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "version", &linprocfs_doosbuild, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "msgmax", &linprocfs_domsgmax, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "msgmni", &linprocfs_domsgmni, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "msgmnb", &linprocfs_domsgmnb, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "ngroups_max", &linprocfs_dongroups_max, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "pid_max", &linprocfs_dopid_max, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "sem", &linprocfs_dosem, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "shmall", &linprocfs_doshmall, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "shmmax", &linprocfs_doshmmax, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "shmmni", &linprocfs_doshmmni, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "tainted", &linprocfs_dotainted, NULL, NULL, NULL, PFS_RD); /* /proc/sys/kernel/random/... */ dir = pfs_create_dir(dir, "random", NULL, NULL, NULL, 0); pfs_create_file(dir, "uuid", &linprocfs_douuid, NULL, NULL, NULL, PFS_RD); /* /proc/sys/vm/.... 
*/ dir = pfs_create_dir(sys, "vm", NULL, NULL, NULL, 0); pfs_create_file(dir, "min_free_kbytes", &linprocfs_dominfree, NULL, NULL, NULL, PFS_RD); return (0); } /* * Destructor */ static int linprocfs_uninit(PFS_INIT_ARGS) { /* nothing to do, pseudofs will GC */ return (0); } PSEUDOFS(linprocfs, 1, VFCF_JAIL); #if defined(__aarch64__) || defined(__amd64__) MODULE_DEPEND(linprocfs, linux_common, 1, 1, 1); #else MODULE_DEPEND(linprocfs, linux, 1, 1, 1); #endif MODULE_DEPEND(linprocfs, procfs, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvmsg, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvsem, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvshm, 1, 1, 1); diff --git a/sys/ddb/db_command.c b/sys/ddb/db_command.c index 21ff75f78e6a..fedec1dd33a4 100644 --- a/sys/ddb/db_command.c +++ b/sys/ddb/db_command.c @@ -1,920 +1,920 @@ /*- * SPDX-License-Identifier: MIT-CMU * * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Author: David B. Golub, Carnegie Mellon University * Date: 7/90 */ /* * Command dispatcher. 
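 *
 * As an illustrative sketch (not part of this change): new top-level
 * commands are typically added at compile time with the DB_COMMAND()
 * macro, which registers them in db_cmd_table.  The hypothetical example
 * below would add a "greet" command:
 *
 *	DB_COMMAND(greet, db_greet_cmd)
 *	{
 *		db_printf("hello from ddb\n");
 *	}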
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Exported global variables */ int db_cmd_loop_done; db_addr_t db_dot; db_addr_t db_last_addr; db_addr_t db_prev; db_addr_t db_next; static db_cmdfcn_t db_dump; static db_cmdfcn_t db_fncall; static db_cmdfcn_t db_gdb; static db_cmdfcn_t db_halt; static db_cmdfcn_t db_kill; static db_cmdfcn_t db_reset; static db_cmdfcn_t db_stack_trace; static db_cmdfcn_t db_stack_trace_active; static db_cmdfcn_t db_stack_trace_all; static db_cmdfcn_t db_watchdog; /* * 'show' commands */ static struct command db_show_active_cmds[] = { { "trace", db_stack_trace_active, 0, NULL }, }; struct command_table db_show_active_table = LIST_HEAD_INITIALIZER(db_show_active_table); static struct command db_show_all_cmds[] = { { "trace", db_stack_trace_all, 0, NULL }, }; struct command_table db_show_all_table = LIST_HEAD_INITIALIZER(db_show_all_table); static struct command db_show_cmds[] = { { "active", 0, 0, &db_show_active_table }, { "all", 0, 0, &db_show_all_table }, { "registers", db_show_regs, 0, NULL }, { "breaks", db_listbreak_cmd, 0, NULL }, { "threads", db_show_threads, 0, NULL }, }; struct command_table db_show_table = LIST_HEAD_INITIALIZER(db_show_table); static struct command db_cmds[] = { { "print", db_print_cmd, 0, NULL }, { "p", db_print_cmd, 0, NULL }, { "examine", db_examine_cmd, CS_SET_DOT, NULL }, { "x", db_examine_cmd, CS_SET_DOT, NULL }, { "search", db_search_cmd, CS_OWN|CS_SET_DOT, NULL }, { "set", db_set_cmd, CS_OWN, NULL }, { "write", db_write_cmd, CS_MORE|CS_SET_DOT, NULL }, { "w", db_write_cmd, CS_MORE|CS_SET_DOT, NULL }, { "delete", db_delete_cmd, 0, NULL }, { "d", db_delete_cmd, 0, NULL }, { "dump", db_dump, 0, NULL }, { "break", db_breakpoint_cmd, 0, NULL }, { "b", db_breakpoint_cmd, 0, NULL }, { "dwatch", db_deletewatch_cmd, 0, NULL }, { "watch", db_watchpoint_cmd, CS_MORE,NULL }, { "dhwatch", db_deletehwatch_cmd, 0, NULL }, { "hwatch", db_hwatchpoint_cmd, 0, NULL }, { "step", db_single_step_cmd, 0, NULL }, { "s", db_single_step_cmd, 0, NULL }, { "continue", db_continue_cmd, 0, NULL }, { "c", db_continue_cmd, 0, NULL }, { "until", db_trace_until_call_cmd,0, NULL }, { "next", db_trace_until_matching_cmd,0, NULL }, { "match", db_trace_until_matching_cmd,0, NULL }, { "trace", db_stack_trace, CS_OWN, NULL }, { "t", db_stack_trace, CS_OWN, NULL }, /* XXX alias for active trace */ { "acttrace", db_stack_trace_active, 0, NULL }, /* XXX alias for all trace */ { "alltrace", db_stack_trace_all, 0, NULL }, { "where", db_stack_trace, CS_OWN, NULL }, { "bt", db_stack_trace, CS_OWN, NULL }, { "call", db_fncall, CS_OWN, NULL }, { "show", 0, 0, &db_show_table }, { "ps", db_ps, 0, NULL }, { "gdb", db_gdb, 0, NULL }, { "halt", db_halt, 0, NULL }, { "reboot", db_reset, 0, NULL }, { "reset", db_reset, 0, NULL }, { "kill", db_kill, CS_OWN, NULL }, { "watchdog", db_watchdog, CS_OWN, NULL }, { "thread", db_set_thread, 0, NULL }, { "run", db_run_cmd, CS_OWN, NULL }, { "script", db_script_cmd, CS_OWN, NULL }, { "scripts", db_scripts_cmd, 0, NULL }, { "unscript", db_unscript_cmd, CS_OWN, NULL }, { "capture", db_capture_cmd, CS_OWN, NULL }, { "textdump", db_textdump_cmd, CS_OWN, NULL }, { "findstack", db_findstack_cmd, 0, NULL }, }; struct command_table db_cmd_table = LIST_HEAD_INITIALIZER(db_cmd_table); static struct command *db_last_command = NULL; /* * if 'ed' style: 'dot' is set at start of last 
item printed, * and '+' points to next line. * Otherwise: 'dot' points to next item, '..' points to last. */ static bool db_ed_style = true; /* * Utility routine - discard tokens through end-of-line. */ void db_skip_to_eol(void) { int t; do { t = db_read_token(); } while (t != tEOL); } /* * Results of command search. */ #define CMD_UNIQUE 0 #define CMD_FOUND 1 #define CMD_NONE 2 #define CMD_AMBIGUOUS 3 #define CMD_HELP 4 static void db_cmd_match(char *name, struct command *cmd, struct command **cmdp, int *resultp); static void db_cmd_list(struct command_table *table); static int db_cmd_search(char *name, struct command_table *table, struct command **cmdp); static void db_command(struct command **last_cmdp, struct command_table *cmd_table, int dopager); /* * Initialize the command lists from the static tables. */ void db_command_init(void) { #define N(a) (sizeof(a) / sizeof(a[0])) int i; for (i = 0; i < N(db_cmds); i++) db_command_register(&db_cmd_table, &db_cmds[i]); for (i = 0; i < N(db_show_cmds); i++) db_command_register(&db_show_table, &db_show_cmds[i]); for (i = 0; i < N(db_show_active_cmds); i++) db_command_register(&db_show_active_table, &db_show_active_cmds[i]); for (i = 0; i < N(db_show_all_cmds); i++) db_command_register(&db_show_all_table, &db_show_all_cmds[i]); #undef N } /* * Register a command. */ void db_command_register(struct command_table *list, struct command *cmd) { struct command *c, *last; last = NULL; LIST_FOREACH(c, list, next) { int n = strcmp(cmd->name, c->name); /* Check that the command is not already present. */ if (n == 0) { printf("%s: Warning, the command \"%s\" already exists;" " ignoring request\n", __func__, cmd->name); return; } if (n < 0) { /* NB: keep list sorted lexicographically */ LIST_INSERT_BEFORE(c, cmd, next); return; } last = c; } if (last == NULL) LIST_INSERT_HEAD(list, cmd, next); else LIST_INSERT_AFTER(last, cmd, next); } /* * Remove a command previously registered with db_command_register. */ void db_command_unregister(struct command_table *list, struct command *cmd) { struct command *c; LIST_FOREACH(c, list, next) { if (cmd == c) { LIST_REMOVE(cmd, next); return; } } /* NB: intentionally quiet */ } /* * Helper function to match a single command. */ static void db_cmd_match(char *name, struct command *cmd, struct command **cmdp, int *resultp) { char *lp, *rp; int c; lp = name; rp = cmd->name; while ((c = *lp) == *rp) { if (c == 0) { /* complete match */ *cmdp = cmd; *resultp = CMD_UNIQUE; return; } lp++; rp++; } if (c == 0) { /* end of name, not end of command - partial match */ if (*resultp == CMD_FOUND) { *resultp = CMD_AMBIGUOUS; /* but keep looking for a full match - this lets us match single letters */ } else { *cmdp = cmd; *resultp = CMD_FOUND; } } } /* * Search for command prefix. */ static int db_cmd_search(char *name, struct command_table *table, struct command **cmdp) { struct command *cmd; int result = CMD_NONE; LIST_FOREACH(cmd, table, next) { db_cmd_match(name,cmd,cmdp,&result); if (result == CMD_UNIQUE) break; } if (result == CMD_NONE) { /* check for 'help' */ if (name[0] == 'h' && name[1] == 'e' && name[2] == 'l' && name[3] == 'p') result = CMD_HELP; } return (result); } static void db_cmd_list(struct command_table *table) { struct command *cmd; int have_subcommands; have_subcommands = 0; LIST_FOREACH(cmd, table, next) { if (cmd->more != NULL) have_subcommands++; db_printf("%-16s", cmd->name); db_end_line(16); } if (have_subcommands > 0) { db_printf("\nThe following have subcommands; append \"help\" " "to list (e.g. 
\"show help\"):\n"); LIST_FOREACH(cmd, table, next) { if (cmd->more == NULL) continue; db_printf("%-16s", cmd->name); db_end_line(16); } } } static void db_command(struct command **last_cmdp, struct command_table *cmd_table, int dopager) { struct command *cmd = NULL; int t; char modif[TOK_STRING_SIZE]; db_expr_t addr, count; bool have_addr = false; int result; t = db_read_token(); if (t == tEOL) { /* empty line repeats last command, at 'next' */ cmd = *last_cmdp; addr = (db_expr_t)db_next; have_addr = false; count = 1; modif[0] = '\0'; } else if (t == tEXCL) { db_fncall((db_expr_t)0, (bool)false, (db_expr_t)0, (char *)0); return; } else if (t != tIDENT) { db_printf("Unrecognized input; use \"help\" " "to list available commands\n"); db_flush_lex(); return; } else { /* * Search for command */ while (cmd_table) { result = db_cmd_search(db_tok_string, cmd_table, &cmd); switch (result) { case CMD_NONE: db_printf("No such command; use \"help\" " "to list available commands\n"); db_flush_lex(); return; case CMD_AMBIGUOUS: db_printf("Ambiguous\n"); db_flush_lex(); return; case CMD_HELP: if (cmd_table == &db_cmd_table) { db_printf("This is ddb(4), the kernel debugger; " "see https://man.FreeBSD.org/ddb/4 for help.\n"); db_printf("Use \"bt\" for backtrace, \"dump\" for " "kernel core dump, \"reset\" to reboot.\n"); db_printf("Available commands:\n"); } db_cmd_list(cmd_table); db_flush_lex(); return; default: break; } if ((cmd_table = cmd->more) != NULL) { t = db_read_token(); if (t != tIDENT) { db_printf("Subcommand required; " "available subcommands:\n"); db_cmd_list(cmd_table); db_flush_lex(); return; } } } if ((cmd->flag & CS_OWN) == 0) { /* * Standard syntax: * command [/modifier] [addr] [,count] */ t = db_read_token(); if (t == tSLASH) { t = db_read_token(); if (t != tIDENT) { db_printf("Bad modifier\n"); db_flush_lex(); return; } db_strcpy(modif, db_tok_string); } else { db_unread_token(t); modif[0] = '\0'; } if (db_expression(&addr)) { db_dot = (db_addr_t) addr; db_last_addr = db_dot; have_addr = true; } else { addr = (db_expr_t) db_dot; have_addr = false; } t = db_read_token(); if (t == tCOMMA) { if (!db_expression(&count)) { db_printf("Count missing\n"); db_flush_lex(); return; } } else { db_unread_token(t); count = -1; } if ((cmd->flag & CS_MORE) == 0) { db_skip_to_eol(); } } } *last_cmdp = cmd; if (cmd != NULL) { /* * Execute the command. */ if (dopager) db_enable_pager(); else db_disable_pager(); (*cmd->fcn)(addr, have_addr, count, modif); if (dopager) db_disable_pager(); if (cmd->flag & CS_SET_DOT) { /* * If command changes dot, set dot to * previous address displayed (if 'ed' style). */ if (db_ed_style) { db_dot = db_prev; } else { db_dot = db_next; } } else { /* * If command does not change dot, * set 'next' location to be the same. */ db_next = db_dot; } } } /* * At least one non-optional command must be implemented using * DB_COMMAND() so that db_cmd_set gets created. Here is one. */ DB_COMMAND(panic, db_panic) { db_disable_pager(); panic("from debugger"); } void db_command_loop(void) { /* * Initialize 'prev' and 'next' to dot. */ db_prev = db_dot; db_next = db_dot; db_cmd_loop_done = 0; while (!db_cmd_loop_done) { if (db_print_position() != 0) db_printf("\n"); db_printf("db> "); (void) db_read_line(); db_command(&db_last_command, &db_cmd_table, /* dopager */ 1); } } /* * Execute a command on behalf of a script. The caller is responsible for * making sure that the command string is < DB_MAXLINE or it will be * truncated. 
* * XXXRW: Runs by injecting faked input into DDB input stream; it would be * nicer to use an alternative approach that didn't mess with the previous * command buffer. */ void db_command_script(const char *command) { db_prev = db_next = db_dot; db_inject_line(command); db_command(&db_last_command, &db_cmd_table, /* dopager */ 0); } void db_error(const char *s) { if (s) db_printf("%s", s); db_flush_lex(); kdb_reenter_silent(); } static void db_dump(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { int error; if (textdump_pending) { db_printf("textdump_pending set.\n" "run \"textdump unset\" first or \"textdump dump\" for a textdump.\n"); return; } error = doadump(false); if (error) { db_printf("Cannot dump: "); switch (error) { case EBUSY: db_printf("debugger got invoked while dumping.\n"); break; case ENXIO: db_printf("no dump device specified.\n"); break; default: db_printf("unknown error (error=%d).\n", error); break; } } } /* * Call random function: * !expr(arg,arg,arg) */ /* The generic implementation supports a maximum of 10 arguments. */ typedef db_expr_t __db_f(db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t, db_expr_t); static __inline int db_fncall_generic(db_expr_t addr, db_expr_t *rv, int nargs, db_expr_t args[]) { __db_f *f = (__db_f *)addr; if (nargs > 10) { db_printf("Too many arguments (max 10)\n"); return (0); } *rv = (*f)(args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9]); return (1); } static void db_fncall(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { db_expr_t fn_addr; db_expr_t args[DB_MAXARGS]; int nargs = 0; db_expr_t retval; int t; if (!db_expression(&fn_addr)) { db_printf("Bad function\n"); db_flush_lex(); return; } t = db_read_token(); if (t == tLPAREN) { if (db_expression(&args[0])) { nargs++; while ((t = db_read_token()) == tCOMMA) { if (nargs == DB_MAXARGS) { db_printf("Too many arguments (max %d)\n", DB_MAXARGS); db_flush_lex(); return; } if (!db_expression(&args[nargs])) { db_printf("Argument missing\n"); db_flush_lex(); return; } nargs++; } db_unread_token(t); } if (db_read_token() != tRPAREN) { db_printf("Mismatched parens\n"); db_flush_lex(); return; } } db_skip_to_eol(); db_disable_pager(); if (DB_CALL(fn_addr, &retval, nargs, args)) db_printf("= %#lr\n", (long)retval); } static void db_halt(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { cpu_halt(); } static void db_kill(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { db_expr_t old_radix, pid, sig; struct proc *p; #define DB_ERROR(f) do { db_printf f; db_flush_lex(); goto out; } while (0) /* * PIDs and signal numbers are typically represented in base * 10, so make that the default here. It can, of course, be * overridden by specifying a prefix. */ old_radix = db_radix; db_radix = 10; /* Retrieve arguments. */ if (!db_expression(&sig)) DB_ERROR(("Missing signal number\n")); if (!db_expression(&pid)) DB_ERROR(("Missing process ID\n")); db_skip_to_eol(); if (!_SIG_VALID(sig)) DB_ERROR(("Signal number out of range\n")); /* * Find the process in question. allproc_lock is not needed * since we're in DDB. */ /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) if (p->p_pid == pid) break; /* sx_sunlock(&allproc_lock); */ if (p == NULL) DB_ERROR(("Can't find process with pid %ld\n", (long) pid)); /* If it's already locked, bail; otherwise, do the deed. 
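 *
 * Illustrative usage (not part of this change) -- the signal number comes
 * first, then the pid, both decimal by default:
 *
 *	db> kill 15 1234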
*/ if (PROC_TRYLOCK(p) == 0) DB_ERROR(("Can't lock process with pid %ld\n", (long) pid)); else { pksignal(p, sig, NULL); PROC_UNLOCK(p); } out: db_radix = old_radix; #undef DB_ERROR } /* * Reboot. In case there is an additional argument, take it as delay in * seconds. Default to 15s if we cannot parse it and make sure we will * never wait longer than 1 week. Some code is similar to * kern_shutdown.c:shutdown_panic(). */ #ifndef DB_RESET_MAXDELAY #define DB_RESET_MAXDELAY (3600 * 24 * 7) #endif static void db_reset(db_expr_t addr, bool have_addr, db_expr_t count __unused, char *modif __unused) { int delay, loop; if (have_addr) { delay = (int)db_hex2dec(addr); /* If we parse to fail, use 15s. */ if (delay == -1) delay = 15; /* Cap at one week. */ if ((uintmax_t)delay > (uintmax_t)DB_RESET_MAXDELAY) delay = DB_RESET_MAXDELAY; db_printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", delay); for (loop = delay * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) return; } } cpu_reset(); } static void db_watchdog(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { db_expr_t old_radix, tout; int err, i; old_radix = db_radix; db_radix = 10; err = db_expression(&tout); db_skip_to_eol(); db_radix = old_radix; /* If no argument is provided the watchdog will just be disabled. */ if (err == 0) { db_printf("No argument provided, disabling watchdog\n"); tout = 0; } else if ((tout & WD_INTERVAL) == WD_TO_NEVER) { db_error("Out of range watchdog interval\n"); return; } EVENTHANDLER_INVOKE(watchdog_list, tout, &i); } static void db_gdb(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { if (kdb_dbbe_select("gdb") != 0) { db_printf("The remote GDB backend could not be selected.\n"); return; } /* * Mark that we are done in the debugger. kdb_trap() * should re-enter with the new backend. */ db_cmd_loop_done = 1; db_printf("(ctrl-c will return control to ddb)\n"); } static void db_stack_trace(db_expr_t tid, bool hastid, db_expr_t count, char *modif) { struct thread *td; db_expr_t radix; pid_t pid; int t; /* * We parse our own arguments. We don't like the default radix. 
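 *
 * Illustrative usage (not part of this change) -- an optional thread id
 * (or pid), in decimal, optionally followed by a frame count:
 *
 *	db> trace 100043
 *	db> trace 971,4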
*/ radix = db_radix; db_radix = 10; hastid = db_expression(&tid); t = db_read_token(); if (t == tCOMMA) { if (!db_expression(&count)) { db_printf("Count missing\n"); db_flush_lex(); db_radix = radix; return; } } else { db_unread_token(t); count = -1; } db_skip_to_eol(); db_radix = radix; if (hastid) { td = kdb_thr_lookup((lwpid_t)tid); if (td == NULL) td = kdb_thr_from_pid((pid_t)tid); if (td == NULL) { db_printf("Thread %d not found\n", (int)tid); return; } } else td = kdb_thread; if (td->td_proc != NULL) pid = td->td_proc->p_pid; else pid = -1; db_printf("Tracing pid %d tid %ld td %p\n", pid, (long)td->td_tid, td); if (td->td_proc != NULL && (td->td_proc->p_flag & P_INMEM) == 0) db_printf("--- swapped out\n"); else db_trace_thread(td, count); } static void _db_stack_trace_all(bool active_only) { struct thread *td; jmp_buf jb; void *prev_jb; for (td = kdb_thr_first(); td != NULL; td = kdb_thr_next(td)) { prev_jb = kdb_jmpbuf(jb); if (setjmp(jb) == 0) { - if (td->td_state == TDS_RUNNING) + if (TD_IS_RUNNING(td)) db_printf("\nTracing command %s pid %d" " tid %ld td %p (CPU %d)\n", td->td_proc->p_comm, td->td_proc->p_pid, (long)td->td_tid, td, td->td_oncpu); else if (active_only) continue; else db_printf("\nTracing command %s pid %d" " tid %ld td %p\n", td->td_proc->p_comm, td->td_proc->p_pid, (long)td->td_tid, td); if (td->td_proc->p_flag & P_INMEM) db_trace_thread(td, -1); else db_printf("--- swapped out\n"); if (db_pager_quit) { kdb_jmpbuf(prev_jb); return; } } kdb_jmpbuf(prev_jb); } } static void db_stack_trace_active(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { _db_stack_trace_all(true); } static void db_stack_trace_all(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { _db_stack_trace_all(false); } /* * Take the parsed expression value from the command line that was parsed * as a hexadecimal value and convert it as if the expression was parsed * as a decimal value. Returns -1 if the expression was not a valid * decimal value. */ db_expr_t db_hex2dec(db_expr_t expr) { uintptr_t x, y; db_expr_t val; y = 1; val = 0; x = expr; while (x != 0) { if (x % 16 > 9) return (-1); val += (x % 16) * (y); x >>= 4; y *= 10; } return (val); } diff --git a/sys/ddb/db_ps.c b/sys/ddb/db_ps.c index 44b803299ee9..a5245528ca83 100644 --- a/sys/ddb/db_ps.c +++ b/sys/ddb/db_ps.c @@ -1,533 +1,533 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #define PRINT_NONE 0 #define PRINT_ARGS 1 static void dumpthread(volatile struct proc *p, volatile struct thread *td, int all); static void db_ps_proc(struct proc *p); static int ps_mode; /* * At least one non-optional show-command must be implemented using * DB_SHOW_ALL_COMMAND() so that db_show_all_cmd_set gets created. * Here is one. */ DB_SHOW_ALL_COMMAND(procs, db_procs_cmd) { db_ps(addr, have_addr, count, modif); } static void dump_args(volatile struct proc *p) { char *args; int i, len; if (p->p_args == NULL) return; args = p->p_args->ar_args; len = (int)p->p_args->ar_length; for (i = 0; i < len; i++) { if (args[i] == '\0') db_printf(" "); else db_printf("%c", args[i]); } } /* * Layout: * - column counts * - header * - single-threaded process * - multi-threaded process * - thread in a MT process * * 1 2 3 4 5 6 7 * 1234567890123456789012345678901234567890123456789012345678901234567890 * pid ppid pgrp uid state wmesg wchan cmd * * (threaded) * * * For machines with 64-bit pointers, we expand the wchan field 8 more * characters. */ void db_ps(db_expr_t addr, bool hasaddr, db_expr_t count, char *modif) { struct proc *p; int i; ps_mode = modif[0] == 'a' ? PRINT_ARGS : PRINT_NONE; #ifdef __LP64__ db_printf(" pid ppid pgrp uid state wmesg wchan cmd\n"); #else db_printf(" pid ppid pgrp uid state wmesg wchan cmd\n"); #endif if (!LIST_EMPTY(&allproc)) p = LIST_FIRST(&allproc); else p = &proc0; for (; p != NULL && !db_pager_quit; p = LIST_NEXT(p, p_list)) db_ps_proc(p); /* * Processes such as zombies not in allproc. */ for (i = 0; i <= pidhash && !db_pager_quit; i++) { LIST_FOREACH(p, &pidhashtbl[i], p_hash) { if (p->p_list.le_prev == NULL) db_ps_proc(p); } } } static void db_ps_proc(struct proc *p) { volatile struct proc *pp; volatile struct thread *td; struct ucred *cred; struct pgrp *pgrp; char state[9]; int rflag, sflag, dflag, lflag, wflag; pp = p->p_pptr; if (pp == NULL) pp = p; cred = p->p_ucred; pgrp = p->p_pgrp; db_printf("%5d %5d %5d %5d ", p->p_pid, pp->p_pid, pgrp != NULL ? pgrp->pg_id : 0, cred != NULL ? cred->cr_ruid : 0); /* Determine our primary process state. */ switch (p->p_state) { case PRS_NORMAL: if (P_SHOULDSTOP(p)) state[0] = 'T'; else { /* * One of D, L, R, S, W. For a * multithreaded process we will use * the state of the thread with the * highest precedence. The * precendence order from high to low * is R, L, D, S, W. If no thread is * in a sane state we use '?' for our * primary state. 
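 * For example, a process with one thread on a run queue and several
 * threads in interruptible sleep is summarized as 'R' rather than 'S'.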
*/ rflag = sflag = dflag = lflag = wflag = 0; FOREACH_THREAD_IN_PROC(p, td) { - if (td->td_state == TDS_RUNNING || - td->td_state == TDS_RUNQ || - td->td_state == TDS_CAN_RUN) + if (TD_GET_STATE(td) == TDS_RUNNING || + TD_GET_STATE(td) == TDS_RUNQ || + TD_GET_STATE(td) == TDS_CAN_RUN) rflag++; if (TD_ON_LOCK(td)) lflag++; if (TD_IS_SLEEPING(td)) { if (!(td->td_flags & TDF_SINTR)) dflag++; else sflag++; } if (TD_AWAITING_INTR(td)) wflag++; } if (rflag) state[0] = 'R'; else if (lflag) state[0] = 'L'; else if (dflag) state[0] = 'D'; else if (sflag) state[0] = 'S'; else if (wflag) state[0] = 'W'; else state[0] = '?'; } break; case PRS_NEW: state[0] = 'N'; break; case PRS_ZOMBIE: state[0] = 'Z'; break; default: state[0] = 'U'; break; } state[1] = '\0'; /* Additional process state flags. */ if (!(p->p_flag & P_INMEM)) strlcat(state, "W", sizeof(state)); if (p->p_flag & P_TRACED) strlcat(state, "X", sizeof(state)); if (p->p_flag & P_WEXIT && p->p_state != PRS_ZOMBIE) strlcat(state, "E", sizeof(state)); if (p->p_flag & P_PPWAIT) strlcat(state, "V", sizeof(state)); if (p->p_flag & P_SYSTEM || p->p_lock > 0) strlcat(state, "L", sizeof(state)); if (p->p_pgrp != NULL && p->p_session != NULL && SESS_LEADER(p)) strlcat(state, "s", sizeof(state)); /* Cheated here and didn't compare pgid's. */ if (p->p_flag & P_CONTROLT) strlcat(state, "+", sizeof(state)); if (cred != NULL && jailed(cred)) strlcat(state, "J", sizeof(state)); db_printf(" %-6.6s ", state); if (p->p_flag & P_HADTHREADS) { #ifdef __LP64__ db_printf(" (threaded) "); #else db_printf(" (threaded) "); #endif if (p->p_flag & P_SYSTEM) db_printf("["); db_printf("%s", p->p_comm); if (p->p_flag & P_SYSTEM) db_printf("]"); if (ps_mode == PRINT_ARGS) { db_printf(" "); dump_args(p); } db_printf("\n"); } FOREACH_THREAD_IN_PROC(p, td) { dumpthread(p, td, p->p_flag & P_HADTHREADS); if (db_pager_quit) break; } } static void dumpthread(volatile struct proc *p, volatile struct thread *td, int all) { char state[9], wprefix; const char *wmesg; const void *wchan; if (all) { db_printf("%6d ", td->td_tid); - switch (td->td_state) { + switch (TD_GET_STATE(td)) { case TDS_RUNNING: snprintf(state, sizeof(state), "Run"); break; case TDS_RUNQ: snprintf(state, sizeof(state), "RunQ"); break; case TDS_CAN_RUN: snprintf(state, sizeof(state), "CanRun"); break; case TDS_INACTIVE: snprintf(state, sizeof(state), "Inactv"); break; case TDS_INHIBITED: state[0] = '\0'; if (TD_ON_LOCK(td)) strlcat(state, "L", sizeof(state)); if (TD_IS_SLEEPING(td)) { if (td->td_flags & TDF_SINTR) strlcat(state, "S", sizeof(state)); else strlcat(state, "D", sizeof(state)); } if (TD_IS_SWAPPED(td)) strlcat(state, "W", sizeof(state)); if (TD_AWAITING_INTR(td)) strlcat(state, "I", sizeof(state)); if (TD_IS_SUSPENDED(td)) strlcat(state, "s", sizeof(state)); if (state[0] != '\0') break; default: snprintf(state, sizeof(state), "???"); } db_printf(" %-6.6s ", state); } wprefix = ' '; if (TD_ON_LOCK(td)) { wprefix = '*'; wmesg = td->td_lockname; wchan = td->td_blocked; } else if (TD_ON_SLEEPQ(td)) { wmesg = td->td_wmesg; wchan = td->td_wchan; } else if (TD_IS_RUNNING(td)) { snprintf(state, sizeof(state), "CPU %d", td->td_oncpu); wmesg = state; wchan = NULL; } else { wmesg = ""; wchan = NULL; } db_printf("%c%-7.7s ", wprefix, wmesg); if (wchan == NULL) #ifdef __LP64__ db_printf("%18s ", ""); #else db_printf("%10s ", ""); #endif else db_printf("%p ", wchan); if (p->p_flag & P_SYSTEM) db_printf("["); if (td->td_name[0] != '\0') db_printf("%s", td->td_name); else db_printf("%s", td->td_proc->p_comm); if 
(p->p_flag & P_SYSTEM) db_printf("]"); if (ps_mode == PRINT_ARGS && all == 0) { db_printf(" "); dump_args(p); } db_printf("\n"); } DB_SHOW_COMMAND(thread, db_show_thread) { struct thread *td; struct lock_object *lock; u_int delta; bool comma; /* Determine which thread to examine. */ if (have_addr) td = db_lookup_thread(addr, false); else td = kdb_thread; lock = (struct lock_object *)td->td_lock; db_printf("Thread %d at %p:\n", td->td_tid, td); db_printf(" proc (pid %d): %p\n", td->td_proc->p_pid, td->td_proc); if (td->td_name[0] != '\0') db_printf(" name: %s\n", td->td_name); db_printf(" pcb: %p\n", td->td_pcb); db_printf(" stack: %p-%p\n", (void *)td->td_kstack, (void *)(td->td_kstack + td->td_kstack_pages * PAGE_SIZE - 1)); db_printf(" flags: %#x ", td->td_flags); db_printf(" pflags: %#x\n", td->td_pflags); db_printf(" state: "); - switch (td->td_state) { + switch (TD_GET_STATE(td)) { case TDS_INACTIVE: db_printf("INACTIVE\n"); break; case TDS_CAN_RUN: db_printf("CAN RUN\n"); break; case TDS_RUNQ: db_printf("RUNQ\n"); break; case TDS_RUNNING: db_printf("RUNNING (CPU %d)\n", td->td_oncpu); break; case TDS_INHIBITED: db_printf("INHIBITED: {"); comma = false; if (TD_IS_SLEEPING(td)) { db_printf("SLEEPING"); comma = true; } if (TD_IS_SUSPENDED(td)) { if (comma) db_printf(", "); db_printf("SUSPENDED"); comma = true; } if (TD_IS_SWAPPED(td)) { if (comma) db_printf(", "); db_printf("SWAPPED"); comma = true; } if (TD_ON_LOCK(td)) { if (comma) db_printf(", "); db_printf("LOCK"); comma = true; } if (TD_AWAITING_INTR(td)) { if (comma) db_printf(", "); db_printf("IWAIT"); } db_printf("}\n"); break; default: - db_printf("??? (%#x)\n", td->td_state); + db_printf("??? (%#x)\n", TD_GET_STATE(td)); break; } if (TD_ON_LOCK(td)) db_printf(" lock: %s turnstile: %p\n", td->td_lockname, td->td_blocked); if (TD_ON_SLEEPQ(td)) db_printf( " wmesg: %s wchan: %p sleeptimo %lx. %jx (curr %lx. %jx)\n", td->td_wmesg, td->td_wchan, (long)sbttobt(td->td_sleeptimo).sec, (uintmax_t)sbttobt(td->td_sleeptimo).frac, (long)sbttobt(sbinuptime()).sec, (uintmax_t)sbttobt(sbinuptime()).frac); db_printf(" priority: %d\n", td->td_priority); db_printf(" container lock: %s (%p)\n", lock->lo_name, lock); if (td->td_swvoltick != 0) { delta = ticks - td->td_swvoltick; db_printf(" last voluntary switch: %u.%03u s ago\n", delta / hz, (delta % hz) * 1000 / hz); } if (td->td_swinvoltick != 0) { delta = ticks - td->td_swinvoltick; db_printf(" last involuntary switch: %u.%03u s ago\n", delta / hz, (delta % hz) * 1000 / hz); } } DB_SHOW_COMMAND(proc, db_show_proc) { struct thread *td; struct proc *p; int i; /* Determine which process to examine. */ if (have_addr) p = db_lookup_proc(addr); else p = kdb_thread->td_proc; db_printf("Process %d (%s) at %p:\n", p->p_pid, p->p_comm, p); db_printf(" state: "); switch (p->p_state) { case PRS_NEW: db_printf("NEW\n"); break; case PRS_NORMAL: db_printf("NORMAL\n"); break; case PRS_ZOMBIE: db_printf("ZOMBIE\n"); break; default: db_printf("??? 
(%#x)\n", p->p_state); } if (p->p_ucred != NULL) { db_printf(" uid: %d gids: ", p->p_ucred->cr_uid); for (i = 0; i < p->p_ucred->cr_ngroups; i++) { db_printf("%d", p->p_ucred->cr_groups[i]); if (i < (p->p_ucred->cr_ngroups - 1)) db_printf(", "); } db_printf("\n"); } if (p->p_pptr != NULL) db_printf(" parent: pid %d at %p\n", p->p_pptr->p_pid, p->p_pptr); if (p->p_leader != NULL && p->p_leader != p) db_printf(" leader: pid %d at %p\n", p->p_leader->p_pid, p->p_leader); if (p->p_sysent != NULL) db_printf(" ABI: %s\n", p->p_sysent->sv_name); db_printf(" flag: %#x ", p->p_flag); db_printf(" flag2: %#x\n", p->p_flag2); if (p->p_args != NULL) { db_printf(" arguments: "); dump_args(p); db_printf("\n"); } db_printf(" reaper: %p reapsubtree: %d\n", p->p_reaper, p->p_reapsubtree); db_printf(" sigparent: %d\n", p->p_sigparent); db_printf(" vmspace: %p\n", p->p_vmspace); db_printf(" (map %p)\n", (p->p_vmspace != NULL) ? &p->p_vmspace->vm_map : 0); db_printf(" (map.pmap %p)\n", (p->p_vmspace != NULL) ? &p->p_vmspace->vm_map.pmap : 0); db_printf(" (pmap %p)\n", (p->p_vmspace != NULL) ? &p->p_vmspace->vm_pmap : 0); db_printf(" threads: %d\n", p->p_numthreads); FOREACH_THREAD_IN_PROC(p, td) { dumpthread(p, td, 1); if (db_pager_quit) break; } } void db_findstack_cmd(db_expr_t addr, bool have_addr, db_expr_t dummy3 __unused, char *dummy4 __unused) { struct thread *td; vm_offset_t saddr; if (have_addr) saddr = addr; else { db_printf("Usage: findstack
\n"); return; } for (td = kdb_thr_first(); td != NULL; td = kdb_thr_next(td)) { if (kstack_contains(td, saddr, 1)) { db_printf("Thread %p\n", td); return; } } } diff --git a/sys/gdb/gdb_main.c b/sys/gdb/gdb_main.c index 6e0c9f21f947..a9e935ebfbb5 100644 --- a/sys/gdb/gdb_main.c +++ b/sys/gdb/gdb_main.c @@ -1,882 +1,882 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_debug, OID_AUTO, gdb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GDB settings"); static dbbe_init_f gdb_init; static dbbe_trap_f gdb_trap; KDB_BACKEND(gdb, gdb_init, NULL, NULL, gdb_trap); static struct gdb_dbgport null_gdb_dbgport; DATA_SET(gdb_dbgport_set, null_gdb_dbgport); SET_DECLARE(gdb_dbgport_set, struct gdb_dbgport); struct gdb_dbgport *gdb_cur = NULL; int gdb_listening = 0; bool gdb_ackmode = true; static unsigned char gdb_bindata[64]; #ifdef DDB bool gdb_return_to_ddb = false; #endif static int gdb_init(void) { struct gdb_dbgport *dp, **iter; int cur_pri, pri; gdb_cur = NULL; cur_pri = -1; SET_FOREACH(iter, gdb_dbgport_set) { dp = *iter; pri = (dp->gdb_probe != NULL) ? dp->gdb_probe() : -1; dp->gdb_active = (pri >= 0) ? 0 : -1; if (pri > cur_pri) { cur_pri = pri; gdb_cur = dp; } } if (gdb_cur != NULL) { printf("GDB: debug ports:"); SET_FOREACH(iter, gdb_dbgport_set) { dp = *iter; if (dp->gdb_active == 0) printf(" %s", dp->gdb_name); } printf("\n"); } else printf("GDB: no debug ports present\n"); if (gdb_cur != NULL) { gdb_cur->gdb_init(); printf("GDB: current port: %s\n", gdb_cur->gdb_name); } if (gdb_cur != NULL) { cur_pri = (boothowto & RB_GDB) ? 
2 : 0; gdb_consinit(); } else cur_pri = -1; return (cur_pri); } static void gdb_do_mem_search(void) { size_t patlen; intmax_t addr, size; const unsigned char *found; if (gdb_rx_varhex(&addr) || gdb_rx_char() != ';' || gdb_rx_varhex(&size) || gdb_rx_char() != ';' || gdb_rx_bindata(gdb_bindata, sizeof(gdb_bindata), &patlen)) { gdb_tx_err(EINVAL); return; } if (gdb_search_mem((char *)(uintptr_t)addr, size, gdb_bindata, patlen, &found)) { if (found == 0ULL) gdb_tx_begin('0'); else { gdb_tx_begin('1'); gdb_tx_char(','); gdb_tx_hex((intmax_t)(uintptr_t)found, 8); } gdb_tx_end(); } else gdb_tx_err(EIO); } static void gdb_do_threadinfo(struct thread **thr_iter) { static struct thread * const done_sentinel = (void *)(uintptr_t)1; static const size_t tidsz_hex = sizeof(lwpid_t) * 2; size_t tds_sent; if (*thr_iter == NULL) { gdb_tx_err(ENXIO); return; } if (*thr_iter == done_sentinel) { gdb_tx_begin('l'); *thr_iter = NULL; goto sendit; } gdb_tx_begin('m'); for (tds_sent = 0; *thr_iter != NULL && gdb_txbuf_has_capacity(tidsz_hex + 1); *thr_iter = kdb_thr_next(*thr_iter), tds_sent++) { if (tds_sent > 0) gdb_tx_char(','); gdb_tx_varhex((*thr_iter)->td_tid); } /* * Can't send EOF and "some" in same packet, so set a sentinel to send * EOF when GDB asks us next. */ if (*thr_iter == NULL && tds_sent > 0) *thr_iter = done_sentinel; sendit: gdb_tx_end(); } #define BIT(n) (1ull << (n)) enum { GDB_MULTIPROCESS, GDB_SWBREAK, GDB_HWBREAK, GDB_QRELOCINSN, GDB_FORK_EVENTS, GDB_VFORK_EVENTS, GDB_EXEC_EVENTS, GDB_VCONT_SUPPORTED, GDB_QTHREADEVENTS, GDB_NO_RESUMED, }; static const char * const gdb_feature_names[] = { [GDB_MULTIPROCESS] = "multiprocess", [GDB_SWBREAK] = "swbreak", [GDB_HWBREAK] = "hwbreak", [GDB_QRELOCINSN] = "qRelocInsn", [GDB_FORK_EVENTS] = "fork-events", [GDB_VFORK_EVENTS] = "vfork-events", [GDB_EXEC_EVENTS] = "exec-events", [GDB_VCONT_SUPPORTED] = "vContSupported", [GDB_QTHREADEVENTS] = "QThreadEvents", [GDB_NO_RESUMED] = "no-resumed", }; static void gdb_do_qsupported(uint32_t *feat) { char *tok, *delim, ok; size_t i, toklen; /* Parse supported host features */ *feat = 0; switch (gdb_rx_char()) { case ':': break; case EOF: goto nofeatures; default: goto error; } while (gdb_rxsz > 0) { tok = gdb_rxp; delim = strchrnul(gdb_rxp, ';'); toklen = (delim - tok); gdb_rxp += toklen; gdb_rxsz -= toklen; if (*delim != '\0') { *delim = '\0'; gdb_rxp += 1; gdb_rxsz -= 1; } if (toklen < 2) goto error; ok = tok[toklen - 1]; if (ok != '-' && ok != '+') { /* * GDB only has one KV-pair feature, and we don't * support it, so ignore and move on. */ if (strchr(tok, '=') != NULL) continue; /* Not a KV-pair, and not a +/- flag? Malformed. */ goto error; } if (ok != '+') continue; tok[toklen - 1] = '\0'; for (i = 0; i < nitems(gdb_feature_names); i++) if (strcmp(gdb_feature_names[i], tok) == 0) break; if (i == nitems(gdb_feature_names)) { /* Unknown GDB feature. */ continue; } *feat |= BIT(i); } nofeatures: /* Send a supported feature list back */ gdb_tx_begin(0); gdb_tx_str("PacketSize"); gdb_tx_char('='); /* * We don't buffer framing bytes, but we do need to retain a byte for a * trailing nul. */ gdb_tx_varhex(GDB_BUFSZ + strlen("$#nn") - 1); gdb_tx_str(";qXfer:threads:read+"); /* * If the debugport is a reliable transport, request No Ack mode from * the server. The server may or may not choose to enter No Ack mode. 
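 *
 * Illustrative exchange (not part of this change): the host sends
 * "QStartNoAckMode", we reply "OK", and from then on neither side
 * transmits the '+'/'-' acknowledgement characters.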
* https://sourceware.org/gdb/onlinedocs/gdb/Packet-Acknowledgment.html */ if (gdb_cur->gdb_dbfeatures & GDB_DBGP_FEAT_RELIABLE) gdb_tx_str(";QStartNoAckMode+"); /* * Future consideration: * - vCont * - multiprocess */ gdb_tx_end(); return; error: *feat = 0; gdb_tx_err(EINVAL); } /* * A qXfer_context provides a vaguely generic way to generate a multi-packet * response on the fly, making some assumptions about the size of sbuf writes * vs actual packet length constraints. A non-byzantine gdb host should allow * hundreds of bytes per packet or more. * * Upper layers are considered responsible for escaping the four forbidden * characters '# $ } *'. */ struct qXfer_context { struct sbuf sb; size_t last_offset; bool flushed; bool lastmessage; char xfer_buf[GDB_BUFSZ]; }; static int qXfer_drain(void *v, const char *buf, int len) { struct qXfer_context *qx; if (len < 0) return (-EINVAL); qx = v; if (qx->flushed) { /* * Overflow. We lost some message. Maybe the packet size is * ridiculously small. */ printf("%s: Overflow in qXfer detected.\n", __func__); return (-ENOBUFS); } qx->last_offset += len; qx->flushed = true; if (qx->lastmessage) gdb_tx_begin('l'); else gdb_tx_begin('m'); memcpy(gdb_txp, buf, len); gdb_txp += len; gdb_tx_end(); return (len); } static int init_qXfer_ctx(struct qXfer_context *qx, uintmax_t len) { /* Protocol (max) length field includes framing overhead. */ if (len < sizeof("$m#nn")) return (ENOSPC); len -= 4; len = ummin(len, GDB_BUFSZ - 1); qx->last_offset = 0; qx->flushed = false; qx->lastmessage = false; sbuf_new(&qx->sb, qx->xfer_buf, len, SBUF_FIXEDLEN); sbuf_set_drain(&qx->sb, qXfer_drain, qx); return (0); } /* * Squashes special XML and GDB characters down to _. Sorry. */ static void qXfer_escape_xmlattr_str(char *dst, size_t dstlen, const char *src) { static const char *forbidden = "#$}*"; size_t i; char c; for (i = 0; i < dstlen - 1 && *src != 0; src++, i++) { c = *src; /* XML attr filter */ if (c < 32) c = '_'; /* We assume attributes will be "" quoted. */ if (c == '<' || c == '&' || c == '"') c = '_'; /* GDB escape. */ if (strchr(forbidden, c) != NULL) { /* * It would be nice to escape these properly, but to do * it correctly we need to escape them in the transmit * layer, potentially doubling our buffer requirements. * For now, avoid breaking the protocol by squashing * them to underscore. */ #if 0 *dst++ = '}'; c ^= 0x20; #endif c = '_'; } *dst++ = c; } if (*src != 0) printf("XXX%s: overflow; API misuse\n", __func__); *dst = 0; } /* * Dynamically generate qXfer:threads document, one packet at a time. * * The format is loosely described[0], although it does not seem that the * mentioned on that page is required. * * [0]: https://sourceware.org/gdb/current/onlinedocs/gdb/Thread-List-Format.html */ static void do_qXfer_threads_read(void) { /* Kludgy context */ static struct { struct qXfer_context qXfer; /* Kludgy state machine */ struct thread *iter; enum { XML_START_THREAD, /* ' ...' */ XML_END_THREAD, /* '' */ XML_SENT_END_THREADS, /* '' */ } next_step; } ctx; static char td_name_escape[MAXCOMLEN * 2 + 1]; const char *name_src; uintmax_t offset, len; int error; /* Annex part must be empty. */ if (gdb_rx_char() != ':') goto misformed_request; if (gdb_rx_varhex(&offset) != 0 || gdb_rx_char() != ',' || gdb_rx_varhex(&len) != 0) goto misformed_request; /* * Validate resume xfers. 
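 *
 * Illustrative sequence (not part of this change): the debugger first
 * sends "qXfer:threads:read::0,<len>", then repeats the request with the
 * offset advanced by however much it has received ('m' replies) until we
 * finish with an 'l' (last) packet.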
*/ if (offset != 0) { if (offset != ctx.qXfer.last_offset) { printf("%s: Resumed offset %ju != expected %zu\n", __func__, offset, ctx.qXfer.last_offset); error = ESPIPE; goto request_error; } ctx.qXfer.flushed = false; } if (offset == 0) { ctx.iter = kdb_thr_first(); ctx.next_step = XML_START_THREAD; error = init_qXfer_ctx(&ctx.qXfer, len); if (error != 0) goto request_error; sbuf_cat(&ctx.qXfer.sb, ""); } while (!ctx.qXfer.flushed && ctx.iter != NULL) { switch (ctx.next_step) { case XML_START_THREAD: ctx.next_step = XML_THREAD_ID; sbuf_cat(&ctx.qXfer.sb, "td_tid); continue; case XML_THREAD_CORE: ctx.next_step = XML_THREAD_NAME; if (ctx.iter->td_oncpu != NOCPU) { sbuf_printf(&ctx.qXfer.sb, " core=\"%d\"", ctx.iter->td_oncpu); } continue; case XML_THREAD_NAME: ctx.next_step = XML_THREAD_EXTRA; if (ctx.iter->td_name[0] != 0) name_src = ctx.iter->td_name; else if (ctx.iter->td_proc != NULL && ctx.iter->td_proc->p_comm[0] != 0) name_src = ctx.iter->td_proc->p_comm; else continue; qXfer_escape_xmlattr_str(td_name_escape, sizeof(td_name_escape), name_src); sbuf_printf(&ctx.qXfer.sb, " name=\"%s\"", td_name_escape); continue; case XML_THREAD_EXTRA: ctx.next_step = XML_END_THREAD; sbuf_putc(&ctx.qXfer.sb, '>'); - if (ctx.iter->td_state == TDS_RUNNING) + if (TD_GET_STATE(ctx.iter) == TDS_RUNNING) sbuf_cat(&ctx.qXfer.sb, "Running"); - else if (ctx.iter->td_state == TDS_RUNQ) + else if (TD_GET_STATE(ctx.iter) == TDS_RUNQ) sbuf_cat(&ctx.qXfer.sb, "RunQ"); - else if (ctx.iter->td_state == TDS_CAN_RUN) + else if (TD_GET_STATE(ctx.iter) == TDS_CAN_RUN) sbuf_cat(&ctx.qXfer.sb, "CanRun"); else if (TD_ON_LOCK(ctx.iter)) sbuf_cat(&ctx.qXfer.sb, "Blocked"); else if (TD_IS_SLEEPING(ctx.iter)) sbuf_cat(&ctx.qXfer.sb, "Sleeping"); else if (TD_IS_SWAPPED(ctx.iter)) sbuf_cat(&ctx.qXfer.sb, "Swapped"); else if (TD_AWAITING_INTR(ctx.iter)) sbuf_cat(&ctx.qXfer.sb, "IthreadWait"); else if (TD_IS_SUSPENDED(ctx.iter)) sbuf_cat(&ctx.qXfer.sb, "Suspended"); else sbuf_cat(&ctx.qXfer.sb, "???"); continue; case XML_END_THREAD: ctx.next_step = XML_START_THREAD; sbuf_cat(&ctx.qXfer.sb, ""); ctx.iter = kdb_thr_next(ctx.iter); continue; /* * This one isn't part of the looping state machine, * but GCC complains if you leave an enum value out of the * select. */ case XML_SENT_END_THREADS: /* NOTREACHED */ break; } } if (ctx.qXfer.flushed) return; if (ctx.next_step != XML_SENT_END_THREADS) { ctx.next_step = XML_SENT_END_THREADS; sbuf_cat(&ctx.qXfer.sb, ""); } if (ctx.qXfer.flushed) return; ctx.qXfer.lastmessage = true; sbuf_finish(&ctx.qXfer.sb); sbuf_delete(&ctx.qXfer.sb); ctx.qXfer.last_offset = 0; return; misformed_request: /* * GDB "General-Query-Packets.html" qXfer-read anchor specifically * documents an E00 code for malformed requests or invalid annex. * Non-zero codes indicate invalid offset or "error reading the data." */ error = 0; request_error: gdb_tx_err(error); return; } /* * A set of standardized transfers from "special data areas." * * We've already matched on "qXfer:" and advanced the rx packet buffer past * that bit. Parse out the rest of the packet and generate an appropriate * response. 
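 *
 * For illustration (not part of this change), the only transfer handled
 * here has the form
 *
 *	qXfer:threads:read::<offset>,<length>
 *
 * with an empty annex; anything else gets the empty "unsupported" reply.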
*/ static void do_qXfer(void) { if (!gdb_rx_equal("threads:")) goto unrecognized; if (!gdb_rx_equal("read:")) goto unrecognized; do_qXfer_threads_read(); return; unrecognized: gdb_tx_empty(); return; } static void gdb_handle_detach(void) { kdb_cpu_clear_singlestep(); gdb_listening = 0; if (gdb_cur->gdb_dbfeatures & GDB_DBGP_FEAT_WANTTERM) gdb_cur->gdb_term(); #ifdef DDB if (!gdb_return_to_ddb) return; gdb_return_to_ddb = false; if (kdb_dbbe_select("ddb") != 0) printf("The ddb backend could not be selected.\n"); #endif } static int gdb_trap(int type, int code) { jmp_buf jb; struct thread *thr_iter; void *prev_jb; uint32_t host_features; prev_jb = kdb_jmpbuf(jb); if (setjmp(jb) != 0) { printf("%s bailing, hopefully back to ddb!\n", __func__); gdb_listening = 0; (void)kdb_jmpbuf(prev_jb); return (1); } gdb_listening = 0; gdb_ackmode = true; /* * Send a T packet. We currently do not support watchpoints (the * awatch, rwatch or watch elements). */ gdb_tx_begin('T'); gdb_tx_hex(gdb_cpu_signal(type, code), 2); gdb_tx_varhex(GDB_REG_PC); gdb_tx_char(':'); gdb_tx_reg(GDB_REG_PC); gdb_tx_char(';'); gdb_tx_str("thread:"); gdb_tx_varhex((long)kdb_thread->td_tid); gdb_tx_char(';'); gdb_tx_end(); /* XXX check error condition. */ thr_iter = NULL; while (gdb_rx_begin() == 0) { /* printf("GDB: got '%s'\n", gdb_rxp); */ switch (gdb_rx_char()) { case '?': /* Last signal. */ gdb_tx_begin('T'); gdb_tx_hex(gdb_cpu_signal(type, code), 2); gdb_tx_str("thread:"); gdb_tx_varhex((long)kdb_thread->td_tid); gdb_tx_char(';'); gdb_tx_end(); break; case 'c': { /* Continue. */ uintmax_t addr; register_t pc; if (!gdb_rx_varhex(&addr)) { pc = addr; gdb_cpu_setreg(GDB_REG_PC, &pc); } kdb_cpu_clear_singlestep(); gdb_listening = 1; return (1); } case 'C': { /* Continue with signal. */ uintmax_t addr, sig; register_t pc; if (!gdb_rx_varhex(&sig) && gdb_rx_char() == ';' && !gdb_rx_varhex(&addr)) { pc = addr; gdb_cpu_setreg(GDB_REG_PC, &pc); } kdb_cpu_clear_singlestep(); gdb_listening = 1; return (1); } case 'D': { /* Detach */ gdb_tx_ok(); gdb_handle_detach(); return (1); } case 'g': { /* Read registers. */ size_t r; gdb_tx_begin(0); for (r = 0; r < GDB_NREGS; r++) gdb_tx_reg(r); gdb_tx_end(); break; } case 'G': { /* Write registers. */ char *val; bool success; size_t r; for (success = true, r = 0; r < GDB_NREGS; r++) { val = gdb_rxp; if (!gdb_rx_mem(val, gdb_cpu_regsz(r))) { gdb_tx_err(EINVAL); success = false; break; } gdb_cpu_setreg(r, val); } if (success) gdb_tx_ok(); break; } case 'H': { /* Set thread. */ intmax_t tid; struct thread *thr; /* Ignore 'g' (general) or 'c' (continue) flag. */ (void) gdb_rx_char(); if (gdb_rx_varhex(&tid)) { gdb_tx_err(EINVAL); break; } if (tid > 0) { thr = kdb_thr_lookup(tid); if (thr == NULL) { gdb_tx_err(ENOENT); break; } kdb_thr_select(thr); } gdb_tx_ok(); break; } case 'k': /* Kill request. */ gdb_handle_detach(); return (1); case 'm': { /* Read memory. */ uintmax_t addr, size; if (gdb_rx_varhex(&addr) || gdb_rx_char() != ',' || gdb_rx_varhex(&size)) { gdb_tx_err(EINVAL); break; } gdb_tx_begin(0); if (gdb_tx_mem((char *)(uintptr_t)addr, size)) gdb_tx_end(); else gdb_tx_err(EIO); break; } case 'M': { /* Write memory. */ uintmax_t addr, size; if (gdb_rx_varhex(&addr) || gdb_rx_char() != ',' || gdb_rx_varhex(&size) || gdb_rx_char() != ':') { gdb_tx_err(EINVAL); break; } if (gdb_rx_mem((char *)(uintptr_t)addr, size) == 0) gdb_tx_err(EIO); else gdb_tx_ok(); break; } case 'p': { /* Read register. 
*/ uintmax_t reg; if (gdb_rx_varhex(&reg)) { gdb_tx_err(EINVAL); break; } gdb_tx_begin(0); gdb_tx_reg(reg); gdb_tx_end(); break; } case 'P': { /* Write register. */ char *val; uintmax_t reg; val = gdb_rxp; if (gdb_rx_varhex(&reg) || gdb_rx_char() != '=' || !gdb_rx_mem(val, gdb_cpu_regsz(reg))) { gdb_tx_err(EINVAL); break; } gdb_cpu_setreg(reg, val); gdb_tx_ok(); break; } case 'q': /* General query. */ if (gdb_rx_equal("C")) { gdb_tx_begin('Q'); gdb_tx_char('C'); gdb_tx_varhex((long)kdb_thread->td_tid); gdb_tx_end(); } else if (gdb_rx_equal("Supported")) { gdb_do_qsupported(&host_features); } else if (gdb_rx_equal("fThreadInfo")) { thr_iter = kdb_thr_first(); gdb_do_threadinfo(&thr_iter); } else if (gdb_rx_equal("sThreadInfo")) { gdb_do_threadinfo(&thr_iter); } else if (gdb_rx_equal("Xfer:")) { do_qXfer(); } else if (gdb_rx_equal("Search:memory:")) { gdb_do_mem_search(); #ifdef __powerpc__ } else if (gdb_rx_equal("Offsets")) { gdb_cpu_do_offsets(); #endif } else if (!gdb_cpu_query()) gdb_tx_empty(); break; case 'Q': if (gdb_rx_equal("StartNoAckMode")) { if ((gdb_cur->gdb_dbfeatures & GDB_DBGP_FEAT_RELIABLE) == 0) { /* * Shouldn't happen if we didn't * advertise support. Reject. */ gdb_tx_empty(); break; } gdb_ackmode = false; gdb_tx_ok(); } else gdb_tx_empty(); break; case 's': { /* Step. */ uintmax_t addr; register_t pc; if (!gdb_rx_varhex(&addr)) { pc = addr; gdb_cpu_setreg(GDB_REG_PC, &pc); } kdb_cpu_set_singlestep(); gdb_listening = 1; return (1); } case 'S': { /* Step with signal. */ uintmax_t addr, sig; register_t pc; if (!gdb_rx_varhex(&sig) && gdb_rx_char() == ';' && !gdb_rx_varhex(&addr)) { pc = addr; gdb_cpu_setreg(GDB_REG_PC, &pc); } kdb_cpu_set_singlestep(); gdb_listening = 1; return (1); } case 'T': { /* Thread alive. */ intmax_t tid; if (gdb_rx_varhex(&tid)) { gdb_tx_err(EINVAL); break; } if (kdb_thr_lookup(tid) != NULL) gdb_tx_ok(); else gdb_tx_err(ENOENT); break; } case EOF: /* Empty command. Treat as unknown command. */ /* FALLTHROUGH */ default: /* Unknown command. Send empty response. */ gdb_tx_empty(); break; } } (void)kdb_jmpbuf(prev_jb); return (0); } diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 5eb8186c23ca..2d6a4f636240 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,894 +1,894 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; struct proc *initproc; int linux_alloc_current_noop(struct thread *td __unused, int flags __unused) { return (0); } int (*lkpi_alloc_current)(struct thread *, int) = linux_alloc_current_noop; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef VERBOSE_SYSINIT /* * We'll use the defined value of VERBOSE_SYSINIT from the kernel config to * dictate the default VERBOSE_SYSINIT behavior. Significant values for this * option and associated tunable are: * - 0, 'compiled in but silent by default' * - 1, 'compiled in but verbose by default' (default) */ int verbose_sysinit = VERBOSE_SYSINIT; TUNABLE_INT("debug.verbose_sysinit", &verbose_sysinit); #endif #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. 
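The sysinit_set linker set collects every SYSINIT() entry built into the kernel, and sysinit_add() below merges in the entries that loaded modules bring along. As a hedged illustration, this is roughly how a subsystem registers a startup hook; the mymod names are invented, while SI_SUB_DRIVERS and SI_ORDER_MIDDLE are ordinary ordering constants from <sys/kernel.h>.

/* Illustrative registration of a startup hook (hypothetical module). */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>

static void
mymod_setup(void *arg __unused)
{
	printf("mymod: initialized at SI_SUB_DRIVERS/SI_ORDER_MIDDLE\n");
}

/* Runs once mi_startup() reaches the SI_SUB_DRIVERS pass. */
SYSINIT(mymod_setup, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, mymod_setup, NULL);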
*/ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit **sipp; /* system initialization*/ struct sysinit **xipp; /* interior loop of sort*/ struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif TSENTER(); if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last && verbose_sysinit != 0) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... 
", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } TSEXIT(); /* Here so we don't overlap with start_init. */ mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_FOURTH, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_FOURTH, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_FIFTH, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* * The two following SYSINIT's are proc0 specific glue code. I am not * convinced that they can not be safely combined, but their order of * operation has been maintained as the same as the original init_main.c * for right now. */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; struct uidinfo tmpuinfo; struct loginclass tmplc = { .lc_name = "", }; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. 
*/ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; td->td_tid = THREAD0_TID; tidhash_add(td); - td->td_state = TDS_RUNNING; + TD_SET_STATE(td, TDS_RUNNING); td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); td->td_domain.dr_policy = td->td_cpuset->cs_domain; prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; p->p_treeflag |= P_TREE_REAPER; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ /* A hack to prevent uifind from tripping over NULL pointers. */ curthread->td_ucred = newcred; tmpuinfo.ui_uid = 1; newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo; newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_loginclass = &tmplc; newcred->cr_loginclass = loginclass_find("default"); /* End hack. creds get properly set later with thread_cow_get_proc */ curthread->td_ucred = NULL; newcred->cr_prison = &prison0; newcred->cr_users++; /* avoid assertion failure */ proc_set_cred_init(p, newcred); newcred->cr_users--; crfree(newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_pd = pdinit(NULL, false); p->p_fd = fdinit(NULL, false, NULL); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. 
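The change above replaces a direct store to td->td_state with TD_SET_STATE(), matching the TD_GET_STATE() reads introduced elsewhere in this patch. The sketch below shows one plausible shape for such accessors; it is an assumption for illustration only, since the authoritative definitions live in sys/sys/proc.h and may add assertions or instrumentation.

/*
 * ASSUMPTION: one plausible shape for the accessors, shown for
 * illustration only.  The real macros are defined in sys/sys/proc.h and
 * may differ, for example by adding KASSERTs or tracing at the single
 * choke point a setter macro provides.
 */
#define	TD_GET_STATE(td)		((td)->td_state)
#define	TD_SET_STATE(td, state)		((td)->td_state = (state))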
*/ pageablemem = ptoa((vm_paddr_t)vm_free_count()); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; refcount_init(&vmspace0.vm_refcnt, 1); pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_DIRECT_INVOKE(process_init, p); EVENTHANDLER_DIRECT_INVOKE(thread_init, td); #ifdef KDTRACE_HOOKS kdtrace_proc_ctor(p); kdtrace_thread_ctor(td); #endif EVENTHANDLER_DIRECT_INVOKE(process_ctor, p); EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; PROC_STATUNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. 
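start_init() below walks the colon-separated init_path string with strsep(9), trying each candidate binary in turn. A minimal userland equivalent of that walk follows; the paths shown are just the defaults quoted above.

/* Userland sketch of the colon-separated path walk done by start_init(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	char *free_path, *tmp, *path;

	/* strsep() modifies its argument, so work on a disposable copy. */
	free_path = tmp = strdup("/sbin/init:/sbin/oinit:/rescue/init");
	if (tmp == NULL)
		return (1);
	while ((path = strsep(&tmp, ":")) != NULL)
		printf("trying %s\n", path);
	free(free_path);
	return (0);
}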
*/ static void start_init(void *dummy) { struct image_args args; int error; char *var, *path; char *free_init_path, *tmp_init_path; struct thread *td; struct proc *p; struct vmspace *oldvmspace; TSENTER(); /* Here so we don't overlap with mi_startup. */ td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* For Multicons, report which console is primary to both */ if (boothowto & RB_MULTIPLE) { if (boothowto & RB_SERIAL) printf("Dual Console: Serial Primary, Video Secondary\n"); else printf("Dual Console: Video Primary, Serial Secondary\n"); } if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } free_init_path = tmp_init_path = strdup(init_path, M_TEMP); while ((path = strsep(&tmp_init_path, ":")) != NULL) { if (bootverbose) printf("start_init: trying %s\n", path); memset(&args, 0, sizeof(args)); error = exec_alloc_args(&args); if (error != 0) panic("%s: Can't allocate space for init arguments %d", __func__, error); error = exec_args_add_fname(&args, path, UIO_SYSSPACE); if (error != 0) panic("%s: Can't add fname %d", __func__, error); error = exec_args_add_arg(&args, path, UIO_SYSSPACE); if (error != 0) panic("%s: Can't add argv[0] %d", __func__, error); if (boothowto & RB_SINGLE) error = exec_args_add_arg(&args, "-s", UIO_SYSSPACE); if (error != 0) panic("%s: Can't add argv[0] %d", __func__, error); /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ KASSERT((td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); oldvmspace = td->td_proc->p_vmspace; error = kern_execve(td, &args, NULL, oldvmspace); KASSERT(error != 0, ("kern_execve returned success, not EJUSTRETURN")); if (error == EJUSTRETURN) { exec_cleanup(td, oldvmspace); free(free_init_path, M_TEMP); TSEXIT(); return; } if (error != ENOENT) printf("exec %s: error %d\n", path, error); } free(free_init_path, M_TEMP); printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in its own address space. We do this * early to reserve pid 1. Note special case - do not make it * runnable yet, init execution is started when userspace can be served. */ static void create_init(const void *udata __unused) { struct fork_req fr; struct ucred *newcred, *oldcred; struct thread *td; int error; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); crcowfree(td); td->td_realucred = crcowget(initproc->p_ucred); td->td_ucred = td->td_realucred; PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. 
*/ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); /* * DDB(4). */ #ifdef DDB static void db_show_print_syinit(struct sysinit *sip, bool ddb) { const char *sname, *funcname; c_db_sym_t sym; db_expr_t offset; #define xprint(...) \ if (ddb) \ db_printf(__VA_ARGS__); \ else \ printf(__VA_ARGS__) if (sip == NULL) { xprint("%s: no sysinit * given\n", __func__); return; } sym = db_search_symbol((vm_offset_t)sip, DB_STGY_ANY, &offset); db_symbol_values(sym, &sname, NULL); sym = db_search_symbol((vm_offset_t)sip->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &funcname, NULL); xprint("%s(%p)\n", (sname != NULL) ? sname : "", sip); xprint(" %#08x %#08x\n", sip->subsystem, sip->order); xprint(" %p(%s)(%p)\n", sip->func, (funcname != NULL) ? funcname : "", sip->udata); #undef xprint } DB_SHOW_COMMAND(sysinit, db_show_sysinit) { struct sysinit **sipp; db_printf("SYSINIT vs Name(Ptr)\n"); db_printf(" Subsystem Order\n"); db_printf(" Function(Name)(Arg)\n"); for (sipp = sysinit; sipp < sysinit_end; sipp++) { db_show_print_syinit(*sipp, true); if (db_pager_quit) break; } } #endif /* DDB */ diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index 0e11af2123e2..af7c52c6b176 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -1,1669 +1,1669 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif /* * Describe an interrupt thread. There is one of these per interrupt event. */ struct intr_thread { struct intr_event *it_event; struct thread *it_thread; /* Kernel thread. */ int it_flags; /* (j) IT_* flags. */ int it_need; /* Needs service. 
*/ }; /* Interrupt thread flags kept in it_flags */ #define IT_DEAD 0x000001 /* Thread is waiting to exit. */ #define IT_WAIT 0x000002 /* Thread is waiting for completion. */ struct intr_entropy { struct thread *td; uintptr_t event; }; struct intr_event *clk_intr_event; struct intr_event *tty_intr_event; void *vm_ih; struct proc *intrproc; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); static int intr_storm_threshold = 0; SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); static int intr_epoch_batch = 1000; SYSCTL_INT(_hw, OID_AUTO, intr_epoch_batch, CTLFLAG_RWTUN, &intr_epoch_batch, 0, "Maximum interrupt handler executions without re-entering epoch(9)"); static TAILQ_HEAD(, intr_event) event_list = TAILQ_HEAD_INITIALIZER(event_list); static struct mtx event_lock; MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF); static void intr_event_update(struct intr_event *ie); static int intr_event_schedule_thread(struct intr_event *ie); static struct intr_thread *ithread_create(const char *name); static void ithread_destroy(struct intr_thread *ithread); static void ithread_execute_handlers(struct proc *p, struct intr_event *ie); static void ithread_loop(void *); static void ithread_update(struct intr_thread *ithd); static void start_softintr(void *); /* Map an interrupt type to an ithread priority. */ u_char intr_priority(enum intr_type flags) { u_char pri; flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV); switch (flags) { case INTR_TYPE_TTY: pri = PI_TTY; break; case INTR_TYPE_BIO: pri = PI_DISK; break; case INTR_TYPE_NET: pri = PI_NET; break; case INTR_TYPE_CAM: pri = PI_DISK; break; case INTR_TYPE_AV: pri = PI_AV; break; case INTR_TYPE_CLK: pri = PI_REALTIME; break; case INTR_TYPE_MISC: pri = PI_DULL; /* don't care */ break; default: /* We didn't specify an interrupt level. */ panic("intr_priority: no interrupt type in flags"); } return pri; } /* * Update an ithread based on the associated intr_event. */ static void ithread_update(struct intr_thread *ithd) { struct intr_event *ie; struct thread *td; u_char pri; ie = ithd->it_event; td = ithd->it_thread; mtx_assert(&ie->ie_lock, MA_OWNED); /* Determine the overall priority of this event. */ if (CK_SLIST_EMPTY(&ie->ie_handlers)) pri = PRI_MAX_ITHD; else pri = CK_SLIST_FIRST(&ie->ie_handlers)->ih_pri; /* Update name and priority. */ strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif thread_lock(td); sched_prio(td, pri); thread_unlock(td); } /* * Regenerate the full name of an interrupt event and update its priority. */ static void intr_event_update(struct intr_event *ie) { struct intr_handler *ih; char *last; int missed, space, flags; /* Start off with no entropy and just the name of the event. */ mtx_assert(&ie->ie_lock, MA_OWNED); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); flags = 0; missed = 0; space = 1; /* Run through all the handlers updating values. 
*/ CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 < sizeof(ie->ie_fullname)) { strcat(ie->ie_fullname, " "); strcat(ie->ie_fullname, ih->ih_name); space = 0; } else missed++; flags |= ih->ih_flags; } ie->ie_hflags = flags; /* * If there is only one handler and its name is too long, just copy in * as much of the end of the name (includes the unit number) as will * fit. Otherwise, we have multiple handlers and not all of the names * will fit. Add +'s to indicate missing names. If we run out of room * and still have +'s to add, change the last character from a + to a *. */ if (missed == 1 && space == 1) { ih = CK_SLIST_FIRST(&ie->ie_handlers); missed = strlen(ie->ie_fullname) + strlen(ih->ih_name) + 2 - sizeof(ie->ie_fullname); strcat(ie->ie_fullname, (missed == 0) ? " " : "-"); strcat(ie->ie_fullname, &ih->ih_name[missed]); missed = 0; } last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2]; while (missed-- > 0) { if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) { if (*last == '+') { *last = '*'; break; } else *last = '+'; } else if (space) { strcat(ie->ie_fullname, " +"); space = 0; } else strcat(ie->ie_fullname, "+"); } /* * If this event has an ithread, update it's priority and * name. */ if (ie->ie_thread != NULL) ithread_update(ie->ie_thread); CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname); } int intr_event_create(struct intr_event **event, void *source, int flags, int irq, void (*pre_ithread)(void *), void (*post_ithread)(void *), void (*post_filter)(void *), int (*assign_cpu)(void *, int), const char *fmt, ...) { struct intr_event *ie; va_list ap; /* The only valid flag during creation is IE_SOFT. */ if ((flags & ~IE_SOFT) != 0) return (EINVAL); ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO); ie->ie_source = source; ie->ie_pre_ithread = pre_ithread; ie->ie_post_ithread = post_ithread; ie->ie_post_filter = post_filter; ie->ie_assign_cpu = assign_cpu; ie->ie_flags = flags; ie->ie_irq = irq; ie->ie_cpu = NOCPU; CK_SLIST_INIT(&ie->ie_handlers); mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF); va_start(ap, fmt); vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap); va_end(ap); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); mtx_lock(&event_lock); TAILQ_INSERT_TAIL(&event_list, ie, ie_list); mtx_unlock(&event_lock); if (event != NULL) *event = ie; CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name); return (0); } /* * Bind an interrupt event to the specified CPU. Note that not all * platforms support binding an interrupt to a CPU. For those * platforms this request will fail. Using a cpu id of NOCPU unbinds * the interrupt event. */ static int _intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread) { lwpid_t id; int error; /* Need a CPU to bind to. */ if (cpu != NOCPU && CPU_ABSENT(cpu)) return (EINVAL); if (ie->ie_assign_cpu == NULL) return (EOPNOTSUPP); error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR); if (error) return (error); /* * If we have any ithreads try to set their mask first to verify * permissions, etc. 
*/ if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_setithread(id, cpu); if (error) return (error); } else mtx_unlock(&ie->ie_lock); } if (bindirq) error = ie->ie_assign_cpu(ie->ie_source, cpu); if (error) { if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { cpu = ie->ie_cpu; id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); (void)cpuset_setithread(id, cpu); } else mtx_unlock(&ie->ie_lock); } return (error); } if (bindirq) { mtx_lock(&ie->ie_lock); ie->ie_cpu = cpu; mtx_unlock(&ie->ie_lock); } return (error); } /* * Bind an interrupt event to the specified CPU. For supported platforms, any * associated ithreads as well as the primary interrupt context will be bound * to the specificed CPU. */ int intr_event_bind(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, true)); } /* * Bind an interrupt event to the specified CPU, but do not bind associated * ithreads. */ int intr_event_bind_irqonly(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, false)); } /* * Bind an interrupt event's ithread to the specified CPU. */ int intr_event_bind_ithread(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, false, true)); } /* * Bind an interrupt event's ithread to the specified cpuset. */ int intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs) { lwpid_t id; mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); return (cpuset_setthread(id, cs)); } else { mtx_unlock(&ie->ie_lock); } return (ENODEV); } static struct intr_event * intr_lookup(int irq) { struct intr_event *ie; mtx_lock(&event_lock); TAILQ_FOREACH(ie, &event_list, ie_list) if (ie->ie_irq == irq && (ie->ie_flags & IE_SOFT) == 0 && CK_SLIST_FIRST(&ie->ie_handlers) != NULL) break; mtx_unlock(&event_lock); return (ie); } int intr_setaffinity(int irq, int mode, void *m) { struct intr_event *ie; cpuset_t *mask; int cpu, n; mask = m; cpu = NOCPU; /* * If we're setting all cpus we can unbind. Otherwise make sure * only one cpu is in the set. 
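intr_setaffinity() below treats the full root set as a request to unbind and otherwise requires exactly one CPU in the mask. A small sketch of that "exactly one bit set" scan over a cpuset_t, compilable as a FreeBSD userland program; single_cpu() is an invented helper name, and -1 stands in for the kernel's NOCPU.

/* Sketch of the single-CPU validation performed by intr_setaffinity(). */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <stdio.h>

/* Returns the single CPU id in `mask`, or -1 if zero or several are set. */
static int
single_cpu(const cpuset_t *mask)
{
	int cpu, n;

	cpu = -1;
	for (n = 0; n < CPU_SETSIZE; n++) {
		if (!CPU_ISSET(n, mask))
			continue;
		if (cpu != -1)
			return (-1);	/* more than one CPU set */
		cpu = n;
	}
	return (cpu);
}

int
main(void)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
	CPU_SET(2, &mask);
	printf("bound cpu: %d\n", single_cpu(&mask));

	CPU_SET(3, &mask);
	printf("bound cpu: %d\n", single_cpu(&mask));	/* -1: ambiguous */
	return (0);
}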
*/ if (CPU_CMP(cpuset_root, mask)) { for (n = 0; n < CPU_SETSIZE; n++) { if (!CPU_ISSET(n, mask)) continue; if (cpu != NOCPU) return (EINVAL); cpu = n; } } ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); switch (mode) { case CPU_WHICH_IRQ: return (intr_event_bind(ie, cpu)); case CPU_WHICH_INTRHANDLER: return (intr_event_bind_irqonly(ie, cpu)); case CPU_WHICH_ITHREAD: return (intr_event_bind_ithread(ie, cpu)); default: return (EINVAL); } } int intr_getaffinity(int irq, int mode, void *m) { struct intr_event *ie; struct thread *td; struct proc *p; cpuset_t *mask; lwpid_t id; int error; mask = m; ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); error = 0; CPU_ZERO(mask); switch (mode) { case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: mtx_lock(&ie->ie_lock); if (ie->ie_cpu == NOCPU) CPU_COPY(cpuset_root, mask); else CPU_SET(ie->ie_cpu, mask); mtx_unlock(&ie->ie_lock); break; case CPU_WHICH_ITHREAD: mtx_lock(&ie->ie_lock); if (ie->ie_thread == NULL) { mtx_unlock(&ie->ie_lock); CPU_COPY(cpuset_root, mask); } else { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL); if (error != 0) return (error); CPU_COPY(&td->td_cpuset->cs_mask, mask); PROC_UNLOCK(p); } default: return (EINVAL); } return (0); } int intr_event_destroy(struct intr_event *ie) { mtx_lock(&event_lock); mtx_lock(&ie->ie_lock); if (!CK_SLIST_EMPTY(&ie->ie_handlers)) { mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); return (EBUSY); } TAILQ_REMOVE(&event_list, ie, ie_list); #ifndef notyet if (ie->ie_thread != NULL) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); mtx_destroy(&ie->ie_lock); free(ie, M_ITHREAD); return (0); } static struct intr_thread * ithread_create(const char *name) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ithd, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } static void ithread_destroy(struct intr_thread *ithread) { struct thread *td; CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else thread_unlock(td); } int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_handler **prevptr; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; if (flags & INTR_TYPE_NET) ih->ih_flags |= IH_NET; /* We can only have one exclusive handler in a event. 
*/ mtx_lock(&ie->ie_lock); if (!CK_SLIST_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (CK_SLIST_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* Create a thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn"); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } /* Add the new handler to the event in priority order. */ CK_SLIST_FOREACH_PREVPTR(temp_ih, prevptr, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } CK_SLIST_INSERT_PREVPTR(prevptr, temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } /* * Append a description preceded by a ':' to the name of the specified * interrupt handler. */ int intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr) { struct intr_handler *ih; size_t space; char *start; mtx_lock(&ie->ie_lock); #ifdef INVARIANTS CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih == cookie) break; } if (ih == NULL) { mtx_unlock(&ie->ie_lock); panic("handler %p not found in interrupt event %p", cookie, ie); } #endif ih = cookie; /* * Look for an existing description by checking for an * existing ":". This assumes device names do not include * colons. If one is found, prepare to insert the new * description at that point. If one is not found, find the * end of the name to use as the insertion point. */ start = strchr(ih->ih_name, ':'); if (start == NULL) start = strchr(ih->ih_name, 0); /* * See if there is enough remaining room in the string for the * description + ":". The "- 1" leaves room for the trailing * '\0'. The "+ 1" accounts for the colon. */ space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1; if (strlen(descr) + 1 > space) { mtx_unlock(&ie->ie_lock); return (ENOSPC); } /* Append a colon followed by the description. */ *start = ':'; strcpy(start + 1, descr); intr_event_update(ie); mtx_unlock(&ie->ie_lock); return (0); } /* * Return the ie_source field from the intr_event an intr_handler is * associated with. */ void * intr_handler_source(void *cookie) { struct intr_handler *ih; struct intr_event *ie; ih = (struct intr_handler *)cookie; if (ih == NULL) return (NULL); ie = ih->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", ih->ih_name)); return (ie->ie_source); } /* * If intr_event_handle() is running in the ISR context at the time of the call, * then wait for it to complete. */ static void intr_event_barrier(struct intr_event *ie) { int phase; mtx_assert(&ie->ie_lock, MA_OWNED); phase = ie->ie_phase; /* * Switch phase to direct future interrupts to the other active counter. * Make sure that any preceding stores are visible before the switch. */ KASSERT(ie->ie_active[!phase] == 0, ("idle phase has activity")); atomic_store_rel_int(&ie->ie_phase, !phase); /* * This code cooperates with wait-free iteration of ie_handlers * in intr_event_handle. * Make sure that the removal and the phase update are not reordered * with the active count check. 
* Note that no combination of acquire and release fences can provide * that guarantee as Store->Load sequences can always be reordered. */ atomic_thread_fence_seq_cst(); /* * Now wait on the inactive phase. * The acquire fence is needed so that that all post-barrier accesses * are after the check. */ while (ie->ie_active[phase] > 0) cpu_spinwait(); atomic_thread_fence_acq(); } static void intr_handler_barrier(struct intr_handler *handler) { struct intr_event *ie; ie = handler->ih_event; mtx_assert(&ie->ie_lock, MA_OWNED); KASSERT((handler->ih_flags & IH_DEAD) == 0, ("update for a removed handler")); if (ie->ie_thread == NULL) { intr_event_barrier(ie); return; } if ((handler->ih_flags & IH_CHANGED) == 0) { handler->ih_flags |= IH_CHANGED; intr_event_schedule_thread(ie); } while ((handler->ih_flags & IH_CHANGED) != 0) msleep(handler, &ie->ie_lock, 0, "ih_barr", 0); } /* * Sleep until an ithread finishes executing an interrupt handler. * * XXX Doesn't currently handle interrupt filters or fast interrupt * handlers. This is intended for compatibility with linux drivers * only. Do not use in BSD code. */ void _intr_drain(int irq) { struct intr_event *ie; struct intr_thread *ithd; struct thread *td; ie = intr_lookup(irq); if (ie == NULL) return; if (ie->ie_thread == NULL) return; ithd = ie->ie_thread; td = ithd->it_thread; /* * We set the flag and wait for it to be cleared to avoid * long delays with potentially busy interrupt handlers * were we to only sample TD_AWAITING_INTR() every tick. */ thread_lock(td); if (!TD_AWAITING_INTR(td)) { ithd->it_flags |= IT_WAIT; while (ithd->it_flags & IT_WAIT) { thread_unlock(td); pause("idrain", 1); thread_lock(td); } } thread_unlock(td); return; } int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; struct intr_handler *ih; struct intr_handler **prevptr; #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); CK_SLIST_FOREACH_PREVPTR(ih, prevptr, &ie->ie_handlers, ih_next) { if (ih == handler) break; } if (ih == NULL) { panic("interrupt handler \"%s\" not found in " "interrupt event \"%s\"", handler->ih_name, ie->ie_name); } /* * If there is no ithread, then directly remove the handler. Note that * intr_event_handle() iterates ie_handlers in a lock-less fashion, so * care needs to be taken to keep ie_handlers consistent and to free * the removed handler only when ie_handlers is quiescent. */ if (ie->ie_thread == NULL) { CK_SLIST_REMOVE_PREVPTR(prevptr, ih, ih_next); intr_event_barrier(ie); intr_event_update(ie); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* * Let the interrupt thread do the job. * The interrupt source is disabled when the interrupt thread is * running, so it does not have to worry about interaction with * intr_event_handle(). */ KASSERT((handler->ih_flags & IH_DEAD) == 0, ("duplicate handle remove")); handler->ih_flags |= IH_DEAD; intr_event_schedule_thread(ie); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. 
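intr_event_barrier() above depends on Store->Load ordering: the phase flip must be globally visible before the old phase's active counter is re-checked, and only a full seq_cst fence can forbid that reordering. The userland sketch below restates the pattern with C11 <stdatomic.h>; it is a simplified illustration of the idea, not the kernel code, and the function names are invented.

/*
 * Userland sketch of the two-phase drain used by intr_event_barrier()
 * and intr_event_handle().  Reader side: bump active[phase], full fence,
 * use the shared state.  Writer side: update state, flip phase, full
 * fence, wait for the old phase's count to drain.
 */
#include <stdatomic.h>

static _Atomic int phase;
static _Atomic int active[2];
static _Atomic int shared_state;	/* stands in for the handler list */

void
reader_enter_and_use(void)
{
	int ph = atomic_load_explicit(&phase, memory_order_relaxed);

	atomic_fetch_add_explicit(&active[ph], 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	(void)atomic_load_explicit(&shared_state, memory_order_relaxed);
	atomic_fetch_sub_explicit(&active[ph], 1, memory_order_release);
}

void
writer_update_and_wait(int new_value)
{
	int old = atomic_load_explicit(&phase, memory_order_relaxed);

	atomic_store_explicit(&shared_state, new_value, memory_order_relaxed);
	atomic_store_explicit(&phase, !old, memory_order_release);
	/*
	 * Store->Load ordering between the phase store above and the
	 * active[old] loads below needs a full barrier; acquire/release
	 * alone cannot forbid that reordering.
	 */
	atomic_thread_fence(memory_order_seq_cst);
	while (atomic_load_explicit(&active[old], memory_order_relaxed) > 0)
		;	/* spin until readers of the old phase drain */
	atomic_thread_fence(memory_order_acquire);
}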
*/ dead = 1; CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih->ih_handler != NULL) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } int intr_event_suspend_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); handler->ih_flags |= IH_SUSP; intr_handler_barrier(handler); mtx_unlock(&ie->ie_lock); return (0); } int intr_event_resume_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); /* * intr_handler_barrier() acts not only as a barrier, * it also allows to check for any pending interrupts. */ mtx_lock(&ie->ie_lock); handler->ih_flags &= ~IH_SUSP; intr_handler_barrier(handler); mtx_unlock(&ie->ie_lock); return (0); } static int intr_event_schedule_thread(struct intr_event *ie) { struct intr_entropy entropy; struct intr_thread *it; struct thread *td; struct thread *ctd; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || CK_SLIST_EMPTY(&ie->ie_handlers) || ie->ie_thread == NULL) return (EINVAL); ctd = curthread; it = ie->ie_thread; td = it->it_thread; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_hflags & IH_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), RANDOM_INTERRUPT); } KASSERT(td->td_proc != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. * * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. */ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, td->td_proc->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", - __func__, td->td_proc->p_pid, td->td_name, it->it_need, td->td_state); + __func__, td->td_proc->p_pid, td->td_name, it->it_need, TD_GET_STATE(td)); thread_unlock(td); } return (0); } /* * Allow interrupt event binding for software interrupt handlers -- a no-op, * since interrupts are generated in software rather than being directed by * a PIC. */ static int swi_assign_cpu(void *arg, int cpu) { return (0); } /* * Add a software interrupt handler to a specified event. If a given event * is not specified, then a new event is created. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep) { struct intr_event *ie; int error = 0; if (flags & INTR_ENTROPY) return (EINVAL); ie = (eventp != NULL) ? 
*eventp : NULL; if (ie != NULL) { if (!(ie->ie_flags & IE_SOFT)) return (EINVAL); } else { error = intr_event_create(&ie, NULL, IE_SOFT, 0, NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri); if (error) return (error); if (eventp != NULL) *eventp = ie; } if (handler != NULL) { error = intr_event_add_handler(ie, name, NULL, handler, arg, PI_SWI(pri), flags, cookiep); } return (error); } /* * Schedule a software interrupt thread. */ void swi_sched(void *cookie, int flags) { struct intr_handler *ih = (struct intr_handler *)cookie; struct intr_event *ie = ih->ih_event; struct intr_entropy entropy; int error __unused; CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name, ih->ih_need); if ((flags & SWI_FROMNMI) == 0) { entropy.event = (uintptr_t)ih; entropy.td = curthread; random_harvest_queue(&entropy, sizeof(entropy), RANDOM_SWI); } /* * Set ih_need for this handler so that if the ithread is already * running it will execute this handler on the next pass. Otherwise, * it will execute it the next time it runs. */ ih->ih_need = 1; if (flags & SWI_DELAY) return; if (flags & SWI_FROMNMI) { #if defined(SMP) && (defined(__i386__) || defined(__amd64__)) KASSERT(ie == clk_intr_event, ("SWI_FROMNMI used not with clk_intr_event")); ipi_self_from_nmi(IPI_SWI); #endif } else { VM_CNT_INC(v_soft); error = intr_event_schedule_thread(ie); KASSERT(error == 0, ("stray software interrupt")); } } /* * Remove a software interrupt handler. Currently this code does not * remove the associated interrupt event if it becomes empty. Calling code * may do so manually via intr_event_destroy(), but that's not really * an optimal interface. */ int swi_remove(void *cookie) { return (intr_event_remove_handler(cookie)); } static void intr_event_execute_handlers(struct proc *p, struct intr_event *ie) { struct intr_handler *ih, *ihn, *ihp; ihp = NULL; CK_SLIST_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) { /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. */ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); if (ihp == NULL) CK_SLIST_REMOVE_HEAD(&ie->ie_handlers, ih_next); else CK_SLIST_REMOVE_AFTER(ihp, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); continue; } /* * Now that we know that the current element won't be removed * update the previous element. */ ihp = ih; if ((ih->ih_flags & IH_CHANGED) != 0) { mtx_lock(&ie->ie_lock); ih->ih_flags &= ~IH_CHANGED; wakeup(ih); mtx_unlock(&ie->ie_lock); } /* Skip filter only handlers */ if (ih->ih_handler == NULL) continue; /* Skip suspended handlers */ if ((ih->ih_flags & IH_SUSP) != 0) continue; /* * For software interrupt threads, we only execute * handlers that have their need flag set. Hardware * interrupt threads always invoke all of their handlers. * * ih_need can only be 0 or 1. Failed cmpset below * means that there is no request to execute handlers, * so a retry of the cmpset is not needed. */ if ((ie->ie_flags & IE_SOFT) != 0 && atomic_cmpset_int(&ih->ih_need, 1, 0) == 0) continue; /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } } static void ithread_execute_handlers(struct proc *p, struct intr_event *ie) { /* Interrupt handlers should not sleep. 
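swi_add() and swi_sched() above are the two halves of a software interrupt: registration and trigger. A hedged sketch of typical use from a kernel module follows; the mymod names are invented and SWI_CLOCK is used only as an example of a priority value from <sys/interrupt.h>.

/* Illustrative swi_add()/swi_sched() usage from a hypothetical module. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

static void *mymod_swi_cookie;

static void
mymod_swi_handler(void *arg __unused)
{
	/* Runs in an ithread, so it has a full thread context. */
	printf("mymod: soft interrupt serviced\n");
}

static int
mymod_swi_attach(void)
{
	/* NULL event pointer: let swi_add() create a fresh "swi%d:" event. */
	return (swi_add(NULL, "mymod", mymod_swi_handler, NULL, SWI_CLOCK,
	    INTR_MPSAFE, &mymod_swi_cookie));
}

static void
mymod_kick(void)
{
	/* Mark the handler as needing service and wake its ithread. */
	swi_sched(mymod_swi_cookie, 0);
}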
*/ if (!(ie->ie_flags & IE_SOFT)) THREAD_NO_SLEEPING(); intr_event_execute_handlers(p, ie); if (!(ie->ie_flags & IE_SOFT)) THREAD_SLEEPING_OK(); /* * Interrupt storm handling: * * If this interrupt source is currently storming, then throttle * it to only fire the handler once per clock tick. * * If this interrupt source is not currently storming, but the * number of back to back interrupts exceeds the storm threshold, * then enter storming mode. */ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold && !(ie->ie_flags & IE_SOFT)) { /* Report the message only once every second. */ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) { printf( "interrupt storm detected on \"%s\"; throttling interrupt source\n", ie->ie_name); } pause("istorm", 1); } else ie->ie_count++; /* * Now that all the handlers have had a chance to run, reenable * the interrupt source. */ if (ie->ie_post_ithread != NULL) ie->ie_post_ithread(ie->ie_source); } /* * This is the main code for interrupt threads. */ static void ithread_loop(void *arg) { struct epoch_tracker et; struct intr_thread *ithd; struct intr_event *ie; struct thread *td; struct proc *p; int wake, epoch_count; bool needs_epoch; td = curthread; p = td->td_proc; ithd = (struct intr_thread *)arg; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. */ needs_epoch = (atomic_load_int(&ie->ie_hflags) & IH_NET) != 0; if (needs_epoch) { epoch_count = 0; NET_EPOCH_ENTER(et); } while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) { ithread_execute_handlers(p, ie); if (needs_epoch && ++epoch_count >= intr_epoch_batch) { NET_EPOCH_EXIT(et); epoch_count = 0; NET_EPOCH_ENTER(et); } } if (needs_epoch) NET_EPOCH_EXIT(et); WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT); } else { if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); } if (wake) { wakeup(ithd); wake = 0; } } } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (i.e. i386) pass a frame to some. * handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_handler *ih; struct trapframe *oldframe; struct thread *td; int phase; int ret; bool filter, thread; td = curthread; #ifdef KSTACK_USAGE_PROF intr_prof_stack_use(td, frame); #endif /* An interrupt with no event or handlers is a stray interrupt. 
*/ if (ie == NULL || CK_SLIST_EMPTY(&ie->ie_handlers)) return (EINVAL); /* * Execute fast interrupt handlers directly. * To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ td->td_intr_nesting_level++; filter = false; thread = false; ret = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; phase = ie->ie_phase; atomic_add_int(&ie->ie_active[phase], 1); /* * This fence is required to ensure that no later loads are * re-ordered before the ie_active store. */ atomic_thread_fence_seq_cst(); CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if ((ih->ih_flags & IH_SUSP) != 0) continue; if ((ie->ie_flags & IE_SOFT) != 0 && ih->ih_need == 0) continue; if (ih->ih_filter == NULL) { thread = true; continue; } CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__, ih->ih_filter, ih->ih_argument == NULL ? frame : ih->ih_argument, ih->ih_name); if (ih->ih_argument == NULL) ret = ih->ih_filter(frame); else ret = ih->ih_filter(ih->ih_argument); KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); filter = filter || ret == FILTER_HANDLED; /* * Wrapper handler special handling: * * in some particular cases (like pccard and pccbb), * the _real_ device handler is wrapped in a couple of * functions - a filter wrapper and an ithread wrapper. * In this case (and just in this case), the filter wrapper * could ask the system to schedule the ithread and mask * the interrupt source if the wrapped handler is composed * of just an ithread handler. * * TODO: write a generic wrapper to avoid people rolling * their own. */ if (!thread) { if (ret == FILTER_SCHEDULE_THREAD) thread = true; } } atomic_add_rel_int(&ie->ie_active[phase], -1); td->td_intr_frame = oldframe; if (thread) { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } else { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } /* Schedule the ithread if needed. */ if (thread) { int error __unused; error = intr_event_schedule_thread(ie); KASSERT(error == 0, ("bad stray interrupt")); } critical_exit(); td->td_intr_nesting_level--; #ifdef notyet /* The interrupt is not aknowledged by any filter and has no ithread. 
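intr_event_handle() above asserts that every filter returns FILTER_STRAY, or some combination of FILTER_HANDLED and FILTER_SCHEDULE_THREAD. Below is a sketch of a driver filter that honors that contract; the mydev softc and its register helpers are hypothetical.

/* Sketch of a filter honoring the return-value contract (names invented). */
#include <sys/param.h>
#include <sys/bus.h>

struct mydev_softc {
	int	(*read_status)(struct mydev_softc *);
	void	(*mask_intr)(struct mydev_softc *);
};

static int
mydev_filter(void *arg)
{
	struct mydev_softc *sc = arg;

	if (!sc->read_status(sc))
		return (FILTER_STRAY);		/* not our interrupt */

	/*
	 * Quiesce the source so it cannot re-fire while the ithread
	 * handler runs, then ask for that handler to be scheduled.
	 * A filter that finishes all the work itself would return
	 * FILTER_HANDLED instead.
	 */
	sc->mask_intr(sc);
	return (FILTER_SCHEDULE_THREAD);
}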
*/ if (!thread && !filter) return (EINVAL); #endif return (0); } #ifdef DDB /* * Dump details about an interrupt handler */ static void db_dump_intrhand(struct intr_handler *ih) { int comma; db_printf("\t%-10s ", ih->ih_name); switch (ih->ih_pri) { case PI_REALTIME: db_printf("CLK "); break; case PI_AV: db_printf("AV "); break; case PI_TTY: db_printf("TTY "); break; case PI_NET: db_printf("NET "); break; case PI_DISK: db_printf("DISK"); break; case PI_DULL: db_printf("DULL"); break; default: if (ih->ih_pri >= PI_SOFT) db_printf("SWI "); else db_printf("%4u", ih->ih_pri); break; } db_printf(" "); if (ih->ih_filter != NULL) { db_printf("[F]"); db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC); } if (ih->ih_handler != NULL) { if (ih->ih_filter != NULL) db_printf(","); db_printf("[H]"); db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC); } db_printf("(%p)", ih->ih_argument); if (ih->ih_need || (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD | IH_MPSAFE)) != 0) { db_printf(" {"); comma = 0; if (ih->ih_flags & IH_EXCLUSIVE) { if (comma) db_printf(", "); db_printf("EXCL"); comma = 1; } if (ih->ih_flags & IH_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ih->ih_flags & IH_DEAD) { if (comma) db_printf(", "); db_printf("DEAD"); comma = 1; } if (ih->ih_flags & IH_MPSAFE) { if (comma) db_printf(", "); db_printf("MPSAFE"); comma = 1; } if (ih->ih_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); } /* * Dump details about a event. */ void db_dump_intr_event(struct intr_event *ie, int handlers) { struct intr_handler *ih; struct intr_thread *it; int comma; db_printf("%s ", ie->ie_fullname); it = ie->ie_thread; if (it != NULL) db_printf("(pid %d)", it->it_thread->td_proc->p_pid); else db_printf("(no thread)"); if ((ie->ie_flags & (IE_SOFT | IE_ADDING_THREAD)) != 0 || (it != NULL && it->it_need)) { db_printf(" {"); comma = 0; if (ie->ie_flags & IE_SOFT) { db_printf("SOFT"); comma = 1; } if (ie->ie_flags & IE_ADDING_THREAD) { if (comma) db_printf(", "); db_printf("ADDING_THREAD"); comma = 1; } if (it != NULL && it->it_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); if (handlers) CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) db_dump_intrhand(ih); } /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(intr, db_show_intr) { struct intr_event *ie; int all, verbose; verbose = strchr(modif, 'v') != NULL; all = strchr(modif, 'a') != NULL; TAILQ_FOREACH(ie, &event_list, ie_list) { if (!all && CK_SLIST_EMPTY(&ie->ie_handlers)) continue; db_dump_intr_event(ie, verbose); if (db_pager_quit) break; } } #endif /* DDB */ /* * Start standard software interrupt threads */ static void start_softintr(void *dummy) { if (swi_add(&clk_intr_event, "clk", NULL, NULL, SWI_CLOCK, INTR_MPSAFE, NULL)) panic("died while creating clk swi ithread"); if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih)) panic("died while creating vm swi ithread"); } SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL); /* * Sysctls used by systat and others: hw.intrnames and hw.intrcnt. * The data for this machine dependent, and the declarations are in machine * dependent code. The layout of intrnames and intrcnt however is machine * independent. * * We do not know the length of intrcnt and intrnames at compile time, so * calculate things at run time. 
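 *
 * A userland consumer such as systat reads these MIBs in two steps:
 * query the size, then fetch the opaque buffer. A minimal sketch with
 * error handling omitted:
 *
 *	size_t len;
 *	sysctlbyname("hw.intrcnt", NULL, &len, NULL, 0);
 *	u_long *cnt = malloc(len);
 *	sysctlbyname("hw.intrcnt", cnt, &len, NULL, 0);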
*/ static int sysctl_intrnames(SYSCTL_HANDLER_ARGS) { return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_intrnames, "", "Interrupt Names"); static int sysctl_intrcnt(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 uint32_t *intrcnt32; unsigned i; int error; if (req->flags & SCTL_MASK32) { if (!req->oldptr) return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req)); intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT); if (intrcnt32 == NULL) return (ENOMEM); for (i = 0; i < sintrcnt / sizeof (u_long); i++) intrcnt32[i] = intrcnt[i]; error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req); free(intrcnt32, M_TEMP); return (error); } #endif return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_intrcnt, "", "Interrupt Counts"); #ifdef DDB /* * DDB command to dump the interrupt statistics. */ DB_SHOW_COMMAND(intrcnt, db_show_intrcnt) { u_long *i; char *cp; u_int j; cp = intrnames; j = 0; for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit; i++, j++) { if (*cp == '\0') break; if (*i != 0) db_printf("%s\t%lu\n", cp, *i); cp += strlen(cp) + 1; } } #endif diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index 170e9598835e..a107c7cced95 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -1,2487 +1,2487 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2000-2001 Robert N. M. Watson. * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 */ /* * System calls related to processes and protection */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef REGRESSION FEATURE(regression, "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)"); #endif #include #include static MALLOC_DEFINE(M_CRED, "cred", "credentials"); SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "BSD security policy"); static void crfree_final(struct ucred *cr); static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups); #ifndef _SYS_SYSPROTO_H_ struct getpid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getpid(struct thread *td, struct getpid_args *uap) { struct proc *p = td->td_proc; td->td_retval[0] = p->p_pid; #if defined(COMPAT_43) if (SV_PROC_FLAG(p, SV_AOUT)) td->td_retval[1] = kern_getppid(td); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getppid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getppid(struct thread *td, struct getppid_args *uap) { td->td_retval[0] = kern_getppid(td); return (0); } int kern_getppid(struct thread *td) { struct proc *p = td->td_proc; return (p->p_oppid); } /* * Get process group ID; note that POSIX getpgrp takes no parameter. */ #ifndef _SYS_SYSPROTO_H_ struct getpgrp_args { int dummy; }; #endif int sys_getpgrp(struct thread *td, struct getpgrp_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* Get an arbitrary pid's process group id */ #ifndef _SYS_SYSPROTO_H_ struct getpgid_args { pid_t pid; }; #endif int sys_getpgid(struct thread *td, struct getpgid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* * Get an arbitrary pid's session id. */ #ifndef _SYS_SYSPROTO_H_ struct getsid_args { pid_t pid; }; #endif int sys_getsid(struct thread *td, struct getsid_args *uap) { return (kern_getsid(td, uap->pid)); } int kern_getsid(struct thread *td, pid_t pid) { struct proc *p; int error; if (pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_session->s_sid; PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct getuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getuid(struct thread *td, struct getuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_ruid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_uid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct geteuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_geteuid(struct thread *td, struct geteuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_uid; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getgid(struct thread *td, struct getgid_args *uap) { td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_groups[0]; #endif return (0); } /* * Get effective group ID. The "egid" is groups[0], and could be obtained * via getgroups. 
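 * A rough sketch of that getgroups() route (which already glosses over
 * the fact that the kernel's runtime group limit may exceed
 * NGROUPS_MAX, so the buffer size below is an assumption):
 *
 *	gid_t gids[NGROUPS_MAX + 1];
 *	int n = getgroups(NGROUPS_MAX + 1, gids);
 *	gid_t egid = (n > 0) ? gids[0] : (gid_t)-1;
 *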
This syscall exists because it is somewhat painful to do * correctly in a library function. */ #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getegid(struct thread *td, struct getegid_args *uap) { td->td_retval[0] = td->td_ucred->cr_groups[0]; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif int sys_getgroups(struct thread *td, struct getgroups_args *uap) { struct ucred *cred; u_int ngrp; int error; cred = td->td_ucred; ngrp = cred->cr_ngroups; if (uap->gidsetsize == 0) { error = 0; goto out; } if (uap->gidsetsize < ngrp) return (EINVAL); error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t)); out: td->td_retval[0] = ngrp; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setsid_args { int dummy; }; #endif /* ARGSUSED */ int sys_setsid(struct thread *td, struct setsid_args *uap) { struct pgrp *pgrp; int error; struct proc *p = td->td_proc; struct pgrp *newpgrp; struct session *newsess; error = 0; pgrp = NULL; newpgrp = uma_zalloc(pgrp_zone, M_WAITOK); newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); error = EPERM; } else { (void)enterpgrp(p, p->p_pid, newpgrp, newsess); td->td_retval[0] = p->p_pid; newpgrp = NULL; newsess = NULL; } sx_xunlock(&proctree_lock); uma_zfree(pgrp_zone, newpgrp); free(newsess, M_SESSION); return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pid must be caller or child of caller (ESRCH) * if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ #ifndef _SYS_SYSPROTO_H_ struct setpgid_args { int pid; /* target process id */ int pgid; /* target pgrp id */ }; #endif /* ARGSUSED */ int sys_setpgid(struct thread *td, struct setpgid_args *uap) { struct proc *curp = td->td_proc; struct proc *targp; /* target process */ struct pgrp *pgrp; /* target pgrp */ int error; struct pgrp *newpgrp; if (uap->pgid < 0) return (EINVAL); error = 0; newpgrp = uma_zalloc(pgrp_zone, M_WAITOK); sx_xlock(&proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; goto done; } if (!inferior(targp)) { PROC_UNLOCK(targp); error = ESRCH; goto done; } if ((error = p_cansee(td, targp))) { PROC_UNLOCK(targp); goto done; } if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) { PROC_UNLOCK(targp); error = EPERM; goto done; } if (targp->p_flag & P_EXEC) { PROC_UNLOCK(targp); error = EACCES; goto done; } PROC_UNLOCK(targp); } else targp = curp; if (SESS_LEADER(targp)) { error = EPERM; goto done; } if (uap->pgid == 0) uap->pgid = targp->p_pid; if ((pgrp = pgfind(uap->pgid)) == NULL) { if (uap->pgid == targp->p_pid) { error = enterpgrp(targp, uap->pgid, newpgrp, NULL); if (error == 0) newpgrp = NULL; } else error = EPERM; } else { if (pgrp == targp->p_pgrp) { PGRP_UNLOCK(pgrp); goto done; } if (pgrp->pg_id != targp->p_pid && pgrp->pg_session != curp->p_session) { PGRP_UNLOCK(pgrp); error = EPERM; goto done; } PGRP_UNLOCK(pgrp); error = enterthispgrp(targp, pgrp); } done: sx_xunlock(&proctree_lock); KASSERT((error == 0) || (newpgrp != NULL), ("setpgid failed and newpgrp is NULL")); uma_zfree(pgrp_zone, newpgrp); return (error); } /* * Use the clause in B.4.2.2 
that allows setuid/setgid to be 4.2/4.3BSD * compatible. It says that setting the uid/gid to euid/egid is a special * case of "appropriate privilege". Once the rules are expanded out, this * basically means that setuid(nnn) sets all three id's, in all permitted * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) * does not set the saved id - this is dangerous for traditional BSD * programs. For this reason, we *really* do not want to set * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. */ #define POSIX_APPENDIX_B_4_2_2 #ifndef _SYS_SYSPROTO_H_ struct setuid_args { uid_t uid; }; #endif /* ARGSUSED */ int sys_setuid(struct thread *td, struct setuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t uid; struct uidinfo *uip; int error; uid = uap->uid; AUDIT_ARG_UID(uid); newcred = crget(); uip = uifind(uid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setuid(oldcred, uid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setuid(geteuid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setuid(xx)" sets all * three id's (assuming you have privs). * * Notes on the logic. We do things in three steps. * 1: We determine if the euid is going to change, and do EPERM * right away. We unconditionally change the euid later if this * test is satisfied, simplifying that part of the logic. * 2: We determine if the real and/or saved uids are going to * change. Determined by compile options. * 3: Change euid last. (after tests in #2 for "appropriate privs") */ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ #ifdef _POSIX_SAVED_IDS uid != oldcred->cr_svuid && /* allow setuid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETUID)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or uid == euid) * If so, we are changing the real uid and/or saved uid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ uid == oldcred->cr_uid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETUID) == 0) #endif { /* * Set the real uid and transfer proc count to new user. */ if (uid != oldcred->cr_ruid) { change_ruid(newcred, uip); setsugid(p); } /* * Set saved uid * * XXX always set saved uid even if not _POSIX_SAVED_IDS, as * the security of seteuid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (uid != oldcred->cr_svuid) { change_svuid(newcred, uid); setsugid(p); } } /* * In all permitted cases, we are changing the euid. 
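 * Net effect with the default POSIX_APPENDIX_B_4_2_2 behaviour: a
 * permitted setuid(uid) leaves the real, effective and saved uids all
 * equal to uid, which is what the classic privilege drop relies on
 * (pw below is assumed to come from getpwnam()):
 *
 *	if (setuid(pw->pw_uid) != 0)
 *		err(1, "setuid");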
*/ if (uid != oldcred->cr_uid) { change_euid(newcred, uip); setsugid(p); } proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(uip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(uip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct seteuid_args { uid_t euid; }; #endif /* ARGSUSED */ int sys_seteuid(struct thread *td, struct seteuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid; struct uidinfo *euip; int error; euid = uap->euid; AUDIT_ARG_EUID(euid); newcred = crget(); euip = uifind(euid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_seteuid(oldcred, euid); if (error) goto fail; #endif if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID)) != 0) goto fail; /* * Everything's okay, do it. */ if (oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgid_args { gid_t gid; }; #endif /* ARGSUSED */ int sys_setgid(struct thread *td, struct setgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t gid; int error; gid = uap->gid; AUDIT_ARG_GID(gid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgid(oldcred, gid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setgid(getegid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setgid(xx)" sets all * three id's (assuming you have privs). * * For notes on the logic here, see setuid() above. */ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ #ifdef _POSIX_SAVED_IDS gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or gid == egid) * If so, we are changing the real uid and saved gid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ gid == oldcred->cr_groups[0] || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0) #endif { /* * Set real gid */ if (oldcred->cr_rgid != gid) { change_rgid(newcred, gid); setsugid(p); } /* * Set saved gid * * XXX always set saved gid even if not _POSIX_SAVED_IDS, as * the security of setegid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } /* * In all cases permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes. 
*/ if (oldcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setegid_args { gid_t egid; }; #endif /* ARGSUSED */ int sys_setegid(struct thread *td, struct setegid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid; int error; egid = uap->egid; AUDIT_ARG_EGID(egid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setegid(oldcred, egid); if (error) goto fail; #endif if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0) goto fail; if (oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif /* ARGSUSED */ int sys_setgroups(struct thread *td, struct setgroups_args *uap) { gid_t smallgroups[XU_NGROUPS]; gid_t *groups; u_int gidsetsize; int error; gidsetsize = uap->gidsetsize; if (gidsetsize > ngroups_max + 1) return (EINVAL); if (gidsetsize > XU_NGROUPS) groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK); else groups = smallgroups; error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t)); if (error == 0) error = kern_setgroups(td, gidsetsize, groups); if (gidsetsize > XU_NGROUPS) free(groups, M_TEMP); return (error); } int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int error; MPASS(ngrp <= ngroups_max + 1); AUDIT_ARG_GROUPSET(groups, ngrp); newcred = crget(); crextend(newcred, ngrp); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgroups(oldcred, ngrp, groups); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS); if (error) goto fail; if (ngrp == 0) { /* * setgroups(0, NULL) is a legitimate way of clearing the * groups vector on non-BSD systems (which generally do not * have the egid in the groups[0]). We risk security holes * when running non-BSD software if we do not do the same. 
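 * Portable daemons rely on this when shedding supplementary groups
 * before dropping the uid; the ordering below is illustrative and the
 * error checks are omitted:
 *
 *	setgroups(0, NULL);
 *	setgid(gid);
 *	setuid(uid);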
*/ newcred->cr_ngroups = 1; } else { crsetgroups_locked(newcred, ngrp, groups); } setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setreuid_args { uid_t ruid; uid_t euid; }; #endif /* ARGSUSED */ int sys_setreuid(struct thread *td, struct setreuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setreuid(oldcred, ruid, euid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid) || (euid != (uid_t)-1 && euid != oldcred->cr_uid && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && newcred->cr_svuid != newcred->cr_uid) { change_svuid(newcred, newcred->cr_uid); setsugid(p); } proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setregid_args { gid_t rgid; gid_t egid; }; #endif /* ARGSUSED */ int sys_setregid(struct thread *td, struct setregid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid; int error; egid = uap->egid; rgid = uap->rgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setregid(oldcred, rgid, egid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && newcred->cr_svgid != newcred->cr_groups[0]) { change_svgid(newcred, newcred->cr_groups[0]); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } /* * setresuid(ruid, euid, suid) is like setreuid except control over the saved * uid is explicit. 
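 * Because the saved uid is set explicitly, an irrevocable privilege
 * drop can be spelled in a single call (uid is assumed to be the
 * target, unprivileged uid):
 *
 *	if (setresuid(uid, uid, uid) != 0)
 *		err(1, "setresuid");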
*/ #ifndef _SYS_SYSPROTO_H_ struct setresuid_args { uid_t ruid; uid_t euid; uid_t suid; }; #endif /* ARGSUSED */ int sys_setresuid(struct thread *td, struct setresuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid, suid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; suid = uap->suid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); AUDIT_ARG_SUID(suid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresuid(oldcred, ruid, euid, suid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid && ruid != oldcred->cr_uid) || (euid != (uid_t)-1 && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid && euid != oldcred->cr_uid) || (suid != (uid_t)-1 && suid != oldcred->cr_ruid && suid != oldcred->cr_svuid && suid != oldcred->cr_uid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { change_svuid(newcred, suid); setsugid(p); } proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } /* * setresgid(rgid, egid, sgid) is like setregid except control over the saved * gid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresgid_args { gid_t rgid; gid_t egid; gid_t sgid; }; #endif /* ARGSUSED */ int sys_setresgid(struct thread *td, struct setresgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid, sgid; int error; egid = uap->egid; rgid = uap->rgid; sgid = uap->sgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); AUDIT_ARG_SGID(sgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && rgid != oldcred->cr_groups[0]) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && egid != oldcred->cr_groups[0]) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && sgid != oldcred->cr_groups[0])) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { change_svgid(newcred, sgid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getresuid_args { uid_t *ruid; uid_t *euid; uid_t *suid; }; #endif /* ARGSUSED */ int sys_getresuid(struct thread *td, struct getresuid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->ruid) error1 = copyout(&cred->cr_ruid, uap->ruid, sizeof(cred->cr_ruid)); if (uap->euid) error2 = 
copyout(&cred->cr_uid, uap->euid, sizeof(cred->cr_uid)); if (uap->suid) error3 = copyout(&cred->cr_svuid, uap->suid, sizeof(cred->cr_svuid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct getresgid_args { gid_t *rgid; gid_t *egid; gid_t *sgid; }; #endif /* ARGSUSED */ int sys_getresgid(struct thread *td, struct getresgid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->rgid) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) error2 = copyout(&cred->cr_groups[0], uap->egid, sizeof(cred->cr_groups[0])); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct issetugid_args { int dummy; }; #endif /* ARGSUSED */ int sys_issetugid(struct thread *td, struct issetugid_args *uap) { struct proc *p = td->td_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use P_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; return (0); } int sys___setugid(struct thread *td, struct __setugid_args *uap) { #ifdef REGRESSION struct proc *p; p = td->td_proc; switch (uap->flag) { case 0: PROC_LOCK(p); p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); return (0); case 1: PROC_LOCK(p); p->p_flag |= P_SUGID; PROC_UNLOCK(p); return (0); default: return (EINVAL); } #else /* !REGRESSION */ return (ENOSYS); #endif /* REGRESSION */ } /* * Check if gid is a member of the group set. */ int groupmember(gid_t gid, struct ucred *cred) { int l; int h; int m; if (cred->cr_groups[0] == gid) return(1); /* * If gid was not our primary group, perform a binary search * of the supplemental groups. This is possible because we * sort the groups in crsetgroups(). */ l = 1; h = cred->cr_ngroups; while (l < h) { m = l + ((h - l) / 2); if (cred->cr_groups[m] < gid) l = m + 1; else h = m; } if ((l < cred->cr_ngroups) && (cred->cr_groups[l] == gid)) return (1); return (0); } /* * Test the active securelevel against a given level. securelevel_gt() * implements (securelevel > level). securelevel_ge() implements * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * * Due to care taken when setting the securelevel, we know that no jail will * be less secure that its parent (or the physical system), so it is sufficient * to test the current jail only. * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. */ int securelevel_gt(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0); } /* * 'see_other_uids' determines whether or not visibility of processes * and sockets with credentials holding different real uids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. 
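 *
 * For example, an administrator can hide other users' processes from
 * unprivileged ps(1)-style queries by clearing the knob, e.g. from a
 * userland sketch:
 *
 *	int off = 0;
 *	sysctlbyname("security.bsd.see_other_uids", NULL, NULL,
 *	    &off, sizeof(off));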
*/ static int see_other_uids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, &see_other_uids, 0, "Unprivileged processes may see subjects/objects with different real uid"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_other_uids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_canseeotheruids(struct ucred *u1, struct ucred *u2) { if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { if (priv_check_cred(u1, PRIV_SEEOTHERUIDS) != 0) return (ESRCH); } return (0); } /* * 'see_other_gids' determines whether or not visibility of processes * and sockets with credentials holding different real gids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_gids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, &see_other_gids, 0, "Unprivileged processes may see subjects/objects with different real gid"); /* * Determine if u1 can "see" the subject specified by u2, according to the * 'see_other_gids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_canseeothergids(struct ucred *u1, struct ucred *u2) { int i, match; if (!see_other_gids) { match = 0; for (i = 0; i < u1->cr_ngroups; i++) { if (groupmember(u1->cr_groups[i], u2)) match = 1; if (match) break; } if (!match) { if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) != 0) return (ESRCH); } } return (0); } /* * 'see_jail_proc' determines whether or not visibility of processes and * sockets with credentials holding different jail ids is possible using a * variety of system MIBs. * * XXX: data declarations should be together near the beginning of the file. */ static int see_jail_proc = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_jail_proc, CTLFLAG_RW, &see_jail_proc, 0, "Unprivileged processes may see subjects/objects with different jail ids"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_jail_proc' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_canseejailproc(struct ucred *u1, struct ucred *u2) { if (u1->cr_uid == 0) return (0); return (!see_jail_proc && u1->cr_prison != u2->cr_prison ? ESRCH : 0); } /*- * Determine if u1 "can see" the subject specified by u2. * Returns: 0 for permitted, an errno value otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_cansee(struct ucred *u1, struct ucred *u2) { int error; if ((error = prison_check(u1, u2))) return (error); #ifdef MAC if ((error = mac_cred_check_visible(u1, u2))) return (error); #endif if ((error = cr_canseeotheruids(u1, u2))) return (error); if ((error = cr_canseeothergids(u1, u2))) return (error); if ((error = cr_canseejailproc(u1, u2))) return (error); return (0); } /*- * Determine if td "can see" the subject specified by p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect p->p_ucred must be held. td really * should be curthread. 
* References: td and p must be valid for the lifetime of the call */ int p_cansee(struct thread *td, struct proc *p) { /* Wrap cr_cansee() for all functionality. */ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); return (cr_cansee(td->td_ucred, p->p_ucred)); } /* * 'conservative_signals' prevents the delivery of a broad class of * signals by unprivileged processes to processes that have changed their * credentials since the last invocation of execve(). This can prevent * the leakage of cached information or retained privileges as a result * of a common class of signal-related vulnerabilities. However, this * may interfere with some applications that expect to be able to * deliver these signals to peer processes after having given up * privilege. */ static int conservative_signals = 1; SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW, &conservative_signals, 0, "Unprivileged processes prevented from " "sending certain signals to processes whose credentials have changed"); /*- * Determine whether cred may deliver the specified signal to proc. * Returns: 0 for permitted, an errno value otherwise. * Locks: A lock must be held for proc. * References: cred and proc must be valid for the lifetime of the call. */ int cr_cansignal(struct ucred *cred, struct proc *proc, int signum) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); /* * Jail semantics limit the scope of signalling to proc in the * same jail as cred, if cred is in jail. */ error = prison_check(cred, proc->p_ucred); if (error) return (error); #ifdef MAC if ((error = mac_proc_check_signal(cred, proc, signum))) return (error); #endif if ((error = cr_canseeotheruids(cred, proc->p_ucred))) return (error); if ((error = cr_canseeothergids(cred, proc->p_ucred))) return (error); /* * UNIX signal semantics depend on the status of the P_SUGID * bit on the target process. If the bit is set, then additional * restrictions are placed on the set of available signals. */ if (conservative_signals && (proc->p_flag & P_SUGID)) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: /* * Generally, permit job and terminal control * signals. */ break; default: /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID); if (error) return (error); } } /* * Generally, the target credential's ruid or svuid must match the * subject credential's ruid or euid. */ if (cred->cr_ruid != proc->p_ucred->cr_ruid && cred->cr_ruid != proc->p_ucred->cr_svuid && cred->cr_uid != proc->p_ucred->cr_ruid && cred->cr_uid != proc->p_ucred->cr_svuid) { error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED); if (error) return (error); } return (0); } /*- * Determine whether td may deliver the specified signal to p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must be * held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansignal(struct thread *td, struct proc *p, int signum) { KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); /* * UNIX signalling semantics require that processes in the same * session always be able to deliver SIGCONT to one another, * overriding the remaining protections. 
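 * The canonical example is shell job control: a shell must be able to
 * continue a stopped job in its session even if that job has changed
 * credentials since exec, roughly (the job's pgid is assumed known):
 *
 *	kill(-pgid, SIGCONT);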
*/ /* XXX: This will require an additional lock of some sort. */ if (signum == SIGCONT && td->td_proc->p_session == p->p_session) return (0); /* * Some compat layers use SIGTHR and higher signals for * communication between different kernel threads of the same * process, so that they expect that it's always possible to * deliver them, even for suid applications where cr_cansignal() can * deny such ability for security consideration. It should be * pretty safe to do since the only way to create two processes * with the same p_leader is via rfork(2). */ if (td->td_proc->p_leader != NULL && signum >= SIGTHR && signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader) return (0); return (cr_cansignal(td->td_ucred, p, signum)); } /*- * Determine whether td may reschedule p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansched(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_sched(td->td_ucred, p))) return (error); #endif if ((error = cr_canseeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_canseeothergids(td->td_ucred, p->p_ucred))) return (error); if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid && td->td_ucred->cr_uid != p->p_ucred->cr_ruid) { error = priv_check(td, PRIV_SCHED_DIFFCRED); if (error) return (error); } return (0); } /* * Handle getting or setting the prison's unprivileged_proc_debug * value. */ static int sysctl_unprivileged_proc_debug(SYSCTL_HANDLER_ARGS) { int error, val; val = prison_allow(req->td->td_ucred, PR_ALLOW_UNPRIV_DEBUG); error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && val != 1) return (EINVAL); prison_set_allow(req->td->td_ucred, PR_ALLOW_UNPRIV_DEBUG, val); return (0); } /* * The 'unprivileged_proc_debug' flag may be used to disable a variety of * unprivileged inter-process debugging services, including some procfs * functionality, ptrace(), and ktrace(). In the past, inter-process * debugging has been involved in a variety of security problems, and sites * not requiring the service might choose to disable it when hardening * systems. */ SYSCTL_PROC(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_SECURE | CTLFLAG_MPSAFE, 0, 0, sysctl_unprivileged_proc_debug, "I", "Unprivileged processes may use process debugging facilities"); /*- * Determine whether td may debug p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. 
* References: td and p must be valid for the lifetime of the call */ int p_candebug(struct thread *td, struct proc *p) { int credentialchanged, error, grpsubset, i, uidsubset; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = priv_check(td, PRIV_DEBUG_UNPRIV))) return (error); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_debug(td->td_ucred, p))) return (error); #endif if ((error = cr_canseeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_canseeothergids(td->td_ucred, p->p_ucred))) return (error); /* * Is p's group set a subset of td's effective group set? This * includes p's egid, group access list, rgid, and svgid. */ grpsubset = 1; for (i = 0; i < p->p_ucred->cr_ngroups; i++) { if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { grpsubset = 0; break; } } grpsubset = grpsubset && groupmember(p->p_ucred->cr_rgid, td->td_ucred) && groupmember(p->p_ucred->cr_svgid, td->td_ucred); /* * Are the uids present in p's credential equal to td's * effective uid? This includes p's euid, svuid, and ruid. */ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && td->td_ucred->cr_uid == p->p_ucred->cr_svuid && td->td_ucred->cr_uid == p->p_ucred->cr_ruid); /* * Has the credential of the process changed since the last exec()? */ credentialchanged = (p->p_flag & P_SUGID); /* * If p's gids aren't a subset, or the uids aren't a subset, * or the credential has changed, require appropriate privilege * for td to debug p. */ if (!grpsubset || !uidsubset) { error = priv_check(td, PRIV_DEBUG_DIFFCRED); if (error) return (error); } if (credentialchanged) { error = priv_check(td, PRIV_DEBUG_SUGID); if (error) return (error); } /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); } /* * Can't trace a process that's currently exec'ing. * * XXX: Note, this is not a security policy decision, it's a * basic correctness/functionality decision. Therefore, this check * should be moved to the caller's of p_candebug(). */ if ((p->p_flag & P_INEXEC) != 0) return (EBUSY); /* Denied explicitely */ if ((p->p_flag2 & P2_NOTRACE) != 0) { error = priv_check(td, PRIV_DEBUG_DENIED); if (error != 0) return (error); } return (0); } /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseesocket(struct ucred *cred, struct socket *so) { int error; error = prison_check(cred, so->so_cred); if (error) return (ENOENT); #ifdef MAC error = mac_socket_check_visible(cred, so); if (error) return (error); #endif if (cr_canseeotheruids(cred, so->so_cred)) return (ENOENT); if (cr_canseeothergids(cred, so->so_cred)) return (ENOENT); return (0); } /*- * Determine whether td can wait for the exit of p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. 
* References: td and p must be valid for the lifetime of the call */ int p_canwait(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_wait(td->td_ucred, p))) return (error); #endif #if 0 /* XXXMAC: This could have odd effects on some shells. */ if ((error = cr_canseeotheruids(td->td_ucred, p->p_ucred))) return (error); #endif return (0); } /* * Credential management. * * struct ucred objects are rarely allocated but gain and lose references all * the time (e.g., on struct file alloc/dealloc) turning refcount updates into * a significant source of cache-line ping ponging. Common cases are worked * around by modifying thread-local counter instead if the cred to operate on * matches td_realucred. * * The counter is split into 2 parts: * - cr_users -- total count of all struct proc and struct thread objects * which have given cred in p_ucred and td_ucred respectively * - cr_ref -- the actual ref count, only valid if cr_users == 0 * * If users == 0 then cr_ref behaves similarly to refcount(9), in particular if * the count reaches 0 the object is freeable. * If users > 0 and curthread->td_realucred == cred, then updates are performed * against td_ucredref. * In other cases updates are performed against cr_ref. * * Changing td_realucred into something else decrements cr_users and transfers * accumulated updates. */ struct ucred * crcowget(struct ucred *cr) { mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users++; cr->cr_ref++; mtx_unlock(&cr->cr_mtx); return (cr); } static struct ucred * crunuse(struct thread *td) { struct ucred *cr, *crold; MPASS(td->td_realucred == td->td_ucred); cr = td->td_realucred; mtx_lock(&cr->cr_mtx); cr->cr_ref += td->td_ucredref; td->td_ucredref = 0; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users--; if (cr->cr_users == 0) { KASSERT(cr->cr_ref > 0, ("%s: ref %d not > 0 on cred %p", __func__, cr->cr_ref, cr)); crold = cr; } else { cr->cr_ref--; crold = NULL; } mtx_unlock(&cr->cr_mtx); td->td_realucred = NULL; return (crold); } static void crunusebatch(struct ucred *cr, int users, int ref) { KASSERT(users > 0, ("%s: passed users %d not > 0 ; cred %p", __func__, users, cr)); mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users >= users, ("%s: users %d not > %d on cred %p", __func__, cr->cr_users, users, cr)); cr->cr_users -= users; cr->cr_ref += ref; cr->cr_ref -= users; if (cr->cr_users > 0) { mtx_unlock(&cr->cr_mtx); return; } KASSERT(cr->cr_ref >= 0, ("%s: ref %d not >= 0 on cred %p", __func__, cr->cr_ref, cr)); if (cr->cr_ref > 0) { mtx_unlock(&cr->cr_mtx); return; } crfree_final(cr); } void crcowfree(struct thread *td) { struct ucred *cr; cr = crunuse(td); if (cr != NULL) crfree(cr); } struct ucred * crcowsync(void) { struct thread *td; struct proc *p; struct ucred *crnew, *crold; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(td->td_realucred == td->td_ucred); if (td->td_realucred == p->p_ucred) return (NULL); crnew = crcowget(p->p_ucred); crold = crunuse(td); td->td_realucred = crnew; td->td_ucred = td->td_realucred; return (crold); } /* * Batching. 
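 * The intended pattern is to zero-initialize a struct credbatch, feed
 * it the (inactive) threads being reaped with credbatch_add(), and
 * settle the accumulated user/ref counts once with credbatch_final().
 * An illustrative sketch, not a copy of an in-tree caller:
 *
 *	struct credbatch crb = { 0 };
 *	credbatch_add(&crb, td1);
 *	credbatch_add(&crb, td2);
 *	credbatch_final(&crb);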
*/ void credbatch_add(struct credbatch *crb, struct thread *td) { struct ucred *cr; MPASS(td->td_realucred != NULL); MPASS(td->td_realucred == td->td_ucred); - MPASS(td->td_state == TDS_INACTIVE); + MPASS(TD_GET_STATE(td) == TDS_INACTIVE); cr = td->td_realucred; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); if (crb->cred != cr) { if (crb->users > 0) { MPASS(crb->cred != NULL); crunusebatch(crb->cred, crb->users, crb->ref); crb->users = 0; crb->ref = 0; } } crb->cred = cr; crb->users++; crb->ref += td->td_ucredref; td->td_ucredref = 0; td->td_realucred = NULL; } void credbatch_final(struct credbatch *crb) { MPASS(crb->cred != NULL); MPASS(crb->users > 0); crunusebatch(crb->cred, crb->users, crb->ref); } /* * Allocate a zeroed cred structure. */ struct ucred * crget(void) { struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); mtx_init(&cr->cr_mtx, "cred", NULL, MTX_DEF); cr->cr_ref = 1; #ifdef AUDIT audit_cred_init(cr); #endif #ifdef MAC mac_cred_init(cr); #endif cr->cr_groups = cr->cr_smallgroups; cr->cr_agroups = sizeof(cr->cr_smallgroups) / sizeof(cr->cr_smallgroups[0]); return (cr); } /* * Claim another reference to a ucred structure. */ struct ucred * crhold(struct ucred *cr) { struct thread *td; td = curthread; if (__predict_true(td->td_realucred == cr)) { KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); td->td_ucredref++; return (cr); } mtx_lock(&cr->cr_mtx); cr->cr_ref++; mtx_unlock(&cr->cr_mtx); return (cr); } /* * Free a cred structure. Throws away space when ref count gets to 0. */ void crfree(struct ucred *cr) { struct thread *td; td = curthread; if (__predict_true(td->td_realucred == cr)) { KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); td->td_ucredref--; return; } mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users >= 0, ("%s: users %d not >= 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_ref--; if (cr->cr_users > 0) { mtx_unlock(&cr->cr_mtx); return; } KASSERT(cr->cr_ref >= 0, ("%s: ref %d not >= 0 on cred %p", __func__, cr->cr_ref, cr)); if (cr->cr_ref > 0) { mtx_unlock(&cr->cr_mtx); return; } crfree_final(cr); } static void crfree_final(struct ucred *cr) { KASSERT(cr->cr_users == 0, ("%s: users %d not == 0 on cred %p", __func__, cr->cr_users, cr)); KASSERT(cr->cr_ref == 0, ("%s: ref %d not == 0 on cred %p", __func__, cr->cr_ref, cr)); /* * Some callers of crget(), such as nfs_statfs(), allocate a temporary * credential, but don't allocate a uidinfo structure. */ if (cr->cr_uidinfo != NULL) uifree(cr->cr_uidinfo); if (cr->cr_ruidinfo != NULL) uifree(cr->cr_ruidinfo); if (cr->cr_prison != NULL) prison_free(cr->cr_prison); if (cr->cr_loginclass != NULL) loginclass_free(cr->cr_loginclass); #ifdef AUDIT audit_cred_destroy(cr); #endif #ifdef MAC mac_cred_destroy(cr); #endif mtx_destroy(&cr->cr_mtx); if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); free(cr, M_CRED); } /* * Copy a ucred's contents from a template. Does not block. 
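 * The destination is expected to be a fresh, unshared credential, so
 * callers normally pair it with crget(); this is exactly what crdup()
 * below does:
 *
 *	newcr = crget();
 *	crcopy(newcr, oldcr);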
*/ void crcopy(struct ucred *dest, struct ucred *src) { KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred")); bcopy(&src->cr_startcopy, &dest->cr_startcopy, (unsigned)((caddr_t)&src->cr_endcopy - (caddr_t)&src->cr_startcopy)); crsetgroups(dest, src->cr_ngroups, src->cr_groups); uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); prison_hold(dest->cr_prison); loginclass_hold(dest->cr_loginclass); #ifdef AUDIT audit_cred_copy(src, dest); #endif #ifdef MAC mac_cred_copy(src, dest); #endif } /* * Dup cred struct to a new held one. */ struct ucred * crdup(struct ucred *cr) { struct ucred *newcr; newcr = crget(); crcopy(newcr, cr); return (newcr); } /* * Fill in a struct xucred based on a struct ucred. */ void cru2x(struct ucred *cr, struct xucred *xcr) { int ngroups; bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = cr->cr_uid; ngroups = MIN(cr->cr_ngroups, XU_NGROUPS); xcr->cr_ngroups = ngroups; bcopy(cr->cr_groups, xcr->cr_groups, ngroups * sizeof(*cr->cr_groups)); } void cru2xt(struct thread *td, struct xucred *xcr) { cru2x(td->td_ucred, xcr); xcr->cr_pid = td->td_proc->p_pid; } /* * Set initial process credentials. * Callers are responsible for providing the reference for provided credentials. */ void proc_set_cred_init(struct proc *p, struct ucred *newcred) { p->p_ucred = crcowget(newcred); } /* * Change process credentials. * Callers are responsible for providing the reference for passed credentials * and for freeing old ones. * * Process has to be locked except when it does not have credentials (as it * should not be visible just yet) or when newcred is NULL (as this can be * only used when the process is about to be freed, at which point it should * not be visible anymore). */ void proc_set_cred(struct proc *p, struct ucred *newcred) { struct ucred *cr; cr = p->p_ucred; MPASS(cr != NULL); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(newcred->cr_users == 0, ("%s: users %d not 0 on cred %p", __func__, newcred->cr_users, newcred)); mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users--; mtx_unlock(&cr->cr_mtx); p->p_ucred = newcred; newcred->cr_users = 1; PROC_UPDATE_COW(p); } void proc_unset_cred(struct proc *p) { struct ucred *cr; MPASS(p->p_state == PRS_ZOMBIE || p->p_state == PRS_NEW); cr = p->p_ucred; p->p_ucred = NULL; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); mtx_lock(&cr->cr_mtx); cr->cr_users--; if (cr->cr_users == 0) KASSERT(cr->cr_ref > 0, ("%s: ref %d not > 0 on cred %p", __func__, cr->cr_ref, cr)); mtx_unlock(&cr->cr_mtx); crfree(cr); } struct ucred * crcopysafe(struct proc *p, struct ucred *cr) { struct ucred *oldcred; int groups; PROC_LOCK_ASSERT(p, MA_OWNED); oldcred = p->p_ucred; while (cr->cr_agroups < oldcred->cr_agroups) { groups = oldcred->cr_agroups; PROC_UNLOCK(p); crextend(cr, groups); PROC_LOCK(p); oldcred = p->p_ucred; } crcopy(cr, oldcred); return (oldcred); } /* * Extend the passed in credential to hold n items. */ void crextend(struct ucred *cr, int n) { int cnt; /* Truncate? */ if (n <= cr->cr_agroups) return; /* * We extend by 2 each time since we're using a power of two * allocator until we need enough groups to fill a page. * Once we're allocating multiple pages, only allocate as many * as we actually need. The case of processes needing a * non-power of two number of pages seems more likely than * a real world process that adds thousands of groups one at a * time. 
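 * As an illustration (the initial size depends on MINALLOCSIZE), the
 * array doubles until it reaches PAGE_SIZE/sizeof(gid_t) entries and is
 * then rounded up to whole-page multiples; e.g. with 4 KB pages and
 * 4-byte gids, a request for 1500 groups rounds up to 2048 entries:
 *
 *	cnt = roundup2(1500, 4096 / sizeof(gid_t));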
*/ if ( n < PAGE_SIZE / sizeof(gid_t) ) { if (cr->cr_agroups == 0) cnt = MAX(1, MINALLOCSIZE / sizeof(gid_t)); else cnt = cr->cr_agroups * 2; while (cnt < n) cnt *= 2; } else cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t)); /* Free the old array. */ if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO); cr->cr_agroups = cnt; } /* * Copy groups in to a credential, preserving any necessary invariants. * Currently this includes the sorting of all supplemental gids. * crextend() must have been called before hand to ensure sufficient * space is available. */ static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups) { int i; int j; gid_t g; KASSERT(cr->cr_agroups >= ngrp, ("cr_ngroups is too small")); bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t)); cr->cr_ngroups = ngrp; /* * Sort all groups except cr_groups[0] to allow groupmember to * perform a binary search. * * XXX: If large numbers of groups become common this should * be replaced with shell sort like linux uses or possibly * heap sort. */ for (i = 2; i < ngrp; i++) { g = cr->cr_groups[i]; for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--) cr->cr_groups[j + 1] = cr->cr_groups[j]; cr->cr_groups[j + 1] = g; } } /* * Copy groups in to a credential after expanding it if required. * Truncate the list to (ngroups_max + 1) if it is too large. */ void crsetgroups(struct ucred *cr, int ngrp, gid_t *groups) { if (ngrp > ngroups_max + 1) ngrp = ngroups_max + 1; crextend(cr, ngrp); crsetgroups_locked(cr, ngrp, groups); } /* * Get login name, if available. */ #ifndef _SYS_SYSPROTO_H_ struct getlogin_args { char *namebuf; u_int namelen; }; #endif /* ARGSUSED */ int sys_getlogin(struct thread *td, struct getlogin_args *uap) { char login[MAXLOGNAME]; struct proc *p = td->td_proc; size_t len; if (uap->namelen > MAXLOGNAME) uap->namelen = MAXLOGNAME; PROC_LOCK(p); SESS_LOCK(p->p_session); len = strlcpy(login, p->p_session->s_login, uap->namelen) + 1; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (len > uap->namelen) return (ERANGE); return (copyout(login, uap->namebuf, len)); } /* * Set login name. */ #ifndef _SYS_SYSPROTO_H_ struct setlogin_args { char *namebuf; }; #endif /* ARGSUSED */ int sys_setlogin(struct thread *td, struct setlogin_args *uap) { struct proc *p = td->td_proc; int error; char logintmp[MAXLOGNAME]; CTASSERT(sizeof(p->p_session->s_login) >= sizeof(logintmp)); error = priv_check(td, PRIV_PROC_SETLOGIN); if (error) return (error); error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL); if (error != 0) { if (error == ENAMETOOLONG) error = EINVAL; return (error); } AUDIT_ARG_LOGIN(logintmp); PROC_LOCK(p); SESS_LOCK(p->p_session); strcpy(p->p_session->s_login, logintmp); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); return (0); } void setsugid(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_SUGID; } /*- * Change a process's effective uid. * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_euid(struct ucred *newcred, struct uidinfo *euip) { newcred->cr_uid = euip->ui_uid; uihold(euip); uifree(newcred->cr_uidinfo); newcred->cr_uidinfo = euip; } /*- * Change a process's effective gid. * Side effects: newcred->cr_gid will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. 
*/ void change_egid(struct ucred *newcred, gid_t egid) { newcred->cr_groups[0] = egid; } /*- * Change a process's real uid. * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo * will be updated, and the old and new cr_ruidinfo proc * counts will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_ruid(struct ucred *newcred, struct uidinfo *ruip) { (void)chgproccnt(newcred->cr_ruidinfo, -1, 0); newcred->cr_ruid = ruip->ui_uid; uihold(ruip); uifree(newcred->cr_ruidinfo); newcred->cr_ruidinfo = ruip; (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); } /*- * Change a process's real gid. * Side effects: newcred->cr_rgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_rgid(struct ucred *newcred, gid_t rgid) { newcred->cr_rgid = rgid; } /*- * Change a process's saved uid. * Side effects: newcred->cr_svuid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svuid(struct ucred *newcred, uid_t svuid) { newcred->cr_svuid = svuid; } /*- * Change a process's saved gid. * Side effects: newcred->cr_svgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svgid(struct ucred *newcred, gid_t svgid) { newcred->cr_svgid = svgid; } diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 4df1c72d50f7..7d179fe69844 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -1,1366 +1,1366 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #include #endif #ifdef RACCT FEATURE(racct, "Resource Accounting"); /* * Do not block processes that have their %cpu usage <= pcpu_threshold. 
*/ static int pcpu_threshold = 1; #ifdef RACCT_DEFAULT_TO_DISABLED bool __read_frequently racct_enable = false; #else bool __read_frequently racct_enable = true; #endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Resource Accounting"); SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); /* * How many seconds it takes to use the scheduler %cpu calculations. When a * process starts, we compute its %cpu usage by dividing its runtime by the * process wall clock time. After RACCT_PCPU_SECS pass, we use the value * provided by the scheduler. */ #define RACCT_PCPU_SECS 3 struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; static void racct_sub_racct(struct racct *dest, const struct racct *src); static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); SDT_PROVIDER_DEFINE(racct); SDT_PROBE_DEFINE3(racct, , rusage, add, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__buf, "struct proc *", "const struct buf *", "int"); SDT_PROBE_DEFINE3(racct, , rusage, add__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE1(racct, , racct, create, "struct racct *"); SDT_PROBE_DEFINE1(racct, , racct, destroy, "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join__failure, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, leave, "struct racct *", "struct racct *"); int racct_types[] = { [RACCT_CPU] = RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_CORE] = RACCT_DENIABLE, [RACCT_RSS] = RACCT_RECLAIMABLE, [RACCT_MEMLOCK] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NPROC] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NOFILE] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_VMEM] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NPTS] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SWAP] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NTHR] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_MSGQQUEUED] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_MSGQSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NMSGQ] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEMOP] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NSHM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | 
RACCT_SLOPPY, [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS, [RACCT_READBPS] = RACCT_DECAYING, [RACCT_WRITEBPS] = RACCT_DECAYING, [RACCT_READIOPS] = RACCT_DECAYING, [RACCT_WRITEIOPS] = RACCT_DECAYING }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; #ifdef SCHED_4BSD /* * Contains intermediate values for %cpu calculations to avoid using floating * point in the kernel. * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to * zero so the calculations are more straightforward. */ fixpt_t ccpu_exp[] = { [0] = FSCALE * 1, [1] = FSCALE * 0.95122942450071400909, [2] = FSCALE * 0.90483741803595957316, [3] = FSCALE * 0.86070797642505780722, [4] = FSCALE * 0.81873075307798185866, [5] = FSCALE * 0.77880078307140486824, [6] = FSCALE * 0.74081822068171786606, [7] = FSCALE * 0.70468808971871343435, [8] = FSCALE * 0.67032004603563930074, [9] = FSCALE * 0.63762815162177329314, [10] = FSCALE * 0.60653065971263342360, [11] = FSCALE * 0.57694981038048669531, [12] = FSCALE * 0.54881163609402643262, [13] = FSCALE * 0.52204577676101604789, [14] = FSCALE * 0.49658530379140951470, [15] = FSCALE * 0.47236655274101470713, [16] = FSCALE * 0.44932896411722159143, [17] = FSCALE * 0.42741493194872666992, [18] = FSCALE * 0.40656965974059911188, [19] = FSCALE * 0.38674102345450120691, [20] = FSCALE * 0.36787944117144232159, [21] = FSCALE * 0.34993774911115535467, [22] = FSCALE * 0.33287108369807955328, [23] = FSCALE * 0.31663676937905321821, [24] = FSCALE * 0.30119421191220209664, [25] = FSCALE * 0.28650479686019010032, [26] = FSCALE * 0.27253179303401260312, [27] = FSCALE * 0.25924026064589150757, [28] = FSCALE * 0.24659696394160647693, [29] = FSCALE * 0.23457028809379765313, [30] = FSCALE * 0.22313016014842982893, [31] = FSCALE * 0.21224797382674305771, [32] = FSCALE * 0.20189651799465540848, [33] = FSCALE * 0.19204990862075411423, [34] = FSCALE * 0.18268352405273465022, [35] = FSCALE * 0.17377394345044512668, [36] = FSCALE * 0.16529888822158653829, [37] = FSCALE * 0.15723716631362761621, [38] = FSCALE * 0.14956861922263505264, [39] = FSCALE * 0.14227407158651357185, [40] = FSCALE * 0.13533528323661269189, [41] = FSCALE * 0.12873490358780421886, [42] = FSCALE * 0.12245642825298191021, [43] = FSCALE * 0.11648415777349695786, [44] = FSCALE * 0.11080315836233388333, [45] = FSCALE * 0.10539922456186433678, [46] = FSCALE * 0.10025884372280373372, [47] = FSCALE * 0.09536916221554961888, [48] = FSCALE * 0.09071795328941250337, [49] = FSCALE * 0.08629358649937051097, [50] = FSCALE * 0.08208499862389879516, [51] = FSCALE * 0.07808166600115315231, [52] = FSCALE * 0.07427357821433388042, [53] = FSCALE * 0.07065121306042958674, [54] = FSCALE * 0.06720551273974976512, [55] = FSCALE * 0.06392786120670757270, [56] = FSCALE * 0.06081006262521796499, [57] = FSCALE * 0.05784432087483846296, [58] = FSCALE * 0.05502322005640722902, [59] = FSCALE * 0.05233970594843239308, [60] = FSCALE * 0.04978706836786394297, [61] = FSCALE * 0.04735892439114092119, [62] = FSCALE * 0.04504920239355780606, [63] = FSCALE * 0.04285212686704017991, [64] = FSCALE * 0.04076220397836621516, [65] = FSCALE * 0.03877420783172200988, [66] = FSCALE * 0.03688316740124000544, [67] = FSCALE * 0.03508435410084502588, [68] = FSCALE * 0.03337326996032607948, [69] = FSCALE * 0.03174563637806794323, [70] = FSCALE * 0.03019738342231850073, [71] = FSCALE * 0.02872463965423942912, [72] = FSCALE * 0.02732372244729256080, 
[73] = FSCALE * 0.02599112877875534358, [74] = FSCALE * 0.02472352647033939120, [75] = FSCALE * 0.02351774585600910823, [76] = FSCALE * 0.02237077185616559577, [77] = FSCALE * 0.02127973643837716938, [78] = FSCALE * 0.02024191144580438847, [79] = FSCALE * 0.01925470177538692429, [80] = FSCALE * 0.01831563888873418029, [81] = FSCALE * 0.01742237463949351138, [82] = FSCALE * 0.01657267540176124754, [83] = FSCALE * 0.01576441648485449082, [84] = FSCALE * 0.01499557682047770621, [85] = FSCALE * 0.01426423390899925527, [86] = FSCALE * 0.01356855901220093175, [87] = FSCALE * 0.01290681258047986886, [88] = FSCALE * 0.01227733990306844117, [89] = FSCALE * 0.01167856697039544521, [90] = FSCALE * 0.01110899653824230649, [91] = FSCALE * 0.01056720438385265337, [92] = FSCALE * 0.01005183574463358164, [93] = FSCALE * 0.00956160193054350793, [94] = FSCALE * 0.00909527710169581709, [95] = FSCALE * 0.00865169520312063417, [96] = FSCALE * 0.00822974704902002884, [97] = FSCALE * 0.00782837754922577143, [98] = FSCALE * 0.00744658307092434051, [99] = FSCALE * 0.00708340892905212004, [100] = FSCALE * 0.00673794699908546709, [101] = FSCALE * 0.00640933344625638184, [102] = FSCALE * 0.00609674656551563610, [103] = FSCALE * 0.00579940472684214321, [104] = FSCALE * 0.00551656442076077241, [105] = FSCALE * 0.00524751839918138427, [106] = FSCALE * 0.00499159390691021621, [107] = FSCALE * 0.00474815099941147558, [108] = FSCALE * 0.00451658094261266798, [109] = FSCALE * 0.00429630469075234057, [110] = FSCALE * 0.00408677143846406699, }; #endif #define CCPU_EXP_MAX 110 /* * This function is analogical to the getpcpu() function in the ps(1) command. * They should both calculate in the same way so that the racct %cpu * calculations are consistent with the values showed by the ps(1) tool. * The calculations are more complex in the 4BSD scheduler because of the value * of the ccpu variable. In ULE it is defined to be zero which saves us some * work. */ static uint64_t racct_getpcpu(struct proc *p, u_int pcpu) { u_int swtime; #ifdef SCHED_4BSD fixpt_t pctcpu, pctcpu_next; #endif #ifdef SMP struct pcpu *pc; int found; #endif fixpt_t p_pctcpu; struct thread *td; ASSERT_RACCT_ENABLED(); /* * If the process is swapped out, we count its %cpu usage as zero. * This behaviour is consistent with the userland ps(1) tool. */ if ((p->p_flag & P_INMEM) == 0) return (0); swtime = (ticks - p->p_swtick) / hz; /* * For short-lived processes, the sched_pctcpu() returns small * values even for cpu intensive processes. Therefore we use * our own estimate in this case. */ if (swtime < RACCT_PCPU_SECS) return (pcpu); p_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td == PCPU_GET(idlethread)) continue; #ifdef SMP found = 0; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (td == pc->pc_idlethread) { found = 1; break; } } if (found) continue; #endif thread_lock(td); #ifdef SCHED_4BSD pctcpu = sched_pctcpu(td); /* Count also the yet unfinished second. */ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; pctcpu_next += sched_pctcpu_delta(td); p_pctcpu += max(pctcpu, pctcpu_next); #else /* * In ULE the %cpu statistics are updated on every * sched_pctcpu() call. So special calculations to * account for the latest (unfinished) second are * not needed. 
*/ p_pctcpu += sched_pctcpu(td); #endif thread_unlock(td); } #ifdef SCHED_4BSD if (swtime <= CCPU_EXP_MAX) return ((100 * (uint64_t)p_pctcpu * 1000000) / (FSCALE - ccpu_exp[swtime])); #endif return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); } static void racct_add_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); dest->r_resources[i] += src->r_resources[i]; } } static void racct_sub_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); KASSERT(src->r_resources[i] <= dest->r_resources[i], ("%s: resource %d propagation meltdown: src > dest", __func__, i)); } if (RACCT_CAN_DROP(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) dest->r_resources[i] = 0; } } } void racct_create(struct racct **racctp) { if (!racct_enable) return; SDT_PROBE1(racct, , racct, create, racctp); KASSERT(*racctp == NULL, ("racct already allocated")); *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); } static void racct_destroy_locked(struct racct **racctp) { struct racct *racct; int i; ASSERT_RACCT_ENABLED(); SDT_PROBE1(racct, , racct, destroy, racctp); RACCT_LOCK_ASSERT(); KASSERT(racctp != NULL, ("NULL racctp")); KASSERT(*racctp != NULL, ("NULL racct")); racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " "%ju allocated for resource %d\n", racct->r_resources[i], i)); } uma_zfree(racct_zone, racct); *racctp = NULL; } void racct_destroy(struct racct **racct) { if (!racct_enable) return; RACCT_LOCK(); racct_destroy_locked(racct); RACCT_UNLOCK(); } /* * Increase consumption of 'resource' by 'amount' for 'racct', * but not its parents. Differently from other cases, 'amount' here * may be less than zero. */ static void racct_adjust_resource(struct racct *racct, int resource, int64_t amount) { ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(racct != NULL, ("NULL racct")); racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } /* * There are some cases where the racct %cpu resource would grow * beyond 100% per core. For example in racct_proc_exit() we add * the process %cpu usage to the ucred racct containers. If too * many processes terminated in a short time span, the ucred %cpu * resource could grow too much. Also, the 4BSD scheduler sometimes * returns for a thread more than 100% cpu usage. So we set a sane * boundary here to 100% * the maxumum number of CPUs. 
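A note on units, with a hypothetical sketch: racct_getpcpu() above reports usage in millionths of a percent, via the 100 * 1000000 scaling of the FSCALE-based fixpt_t value, which is why the clamp just below caps RACCT_PCTCPU at 100 * 1000000 times MAXCPU, i.e. 100% per possible CPU. The sketch assumes FSHIFT = 11 (FSCALE = 2048) and an eight-CPU limit; both are assumptions, not values taken from this change.

// Hypothetical sketch: converting a scheduler fixpt_t value into the
// millionths-of-a-percent units used for RACCT_PCTCPU, then clamping.
#include <stdio.h>
#include <stdint.h>

#define DEMO_FSHIFT	11
#define DEMO_FSCALE	(1 << DEMO_FSHIFT)	// assumed FSCALE
#define DEMO_MAXCPU	8			// assumed CPU limit

int
main(void)
{
	uint64_t pctcpu = DEMO_FSCALE / 4;	// scheduler says 25% of one CPU
	int64_t racct_pcpu;

	// Same scaling as the tail of racct_getpcpu(): percent times 1e6.
	racct_pcpu = (100 * pctcpu * 1000000) / DEMO_FSCALE;
	printf("25%% of a CPU -> %jd micro-percent\n", (intmax_t)racct_pcpu);

	// Same bound as the clamp below: at most 100% per CPU.
	if (racct_pcpu > 100 * 1000000 * (int64_t)DEMO_MAXCPU)
		racct_pcpu = 100 * 1000000 * (int64_t)DEMO_MAXCPU;
	return (0);
}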
*/ if ((resource == RACCT_PCTCPU) && (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; } static int racct_add_locked(struct proc *p, int resource, uint64_t amount, int force) { #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef RCTL error = rctl_enforce(p, resource, amount); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); return (error); } #endif racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); return (0); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. */ int racct_add(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, add, p, resource, amount); RACCT_LOCK(); error = racct_add_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Doesn't check for limits and never fails. */ void racct_add_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); RACCT_LOCK(); racct_add_locked(p, resource, amount, 1); RACCT_UNLOCK(); } static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); } /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); RACCT_LOCK(); racct_add_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Account for disk IO resource consumption. Checks for limits, * but never fails, due to disk limits being undeniable. */ void racct_add_buf(struct proc *p, const struct buf *bp, int is_write) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write); RACCT_LOCK(); if (is_write) { racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1); } else { racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_READIOPS, 1, 1); } RACCT_UNLOCK(); } static int racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) { int64_t old_amount, decayed_amount, diff_proc, diff_cred; #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (resource == RACCT_PCTCPU) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. 
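Worked example for the split computed just below, assuming RACCT_DECAY_FACTOR = 0.3 * FSCALE as defined earlier in this file: if the previous per-process RACCT_PCTCPU was 40e6 and the new value is 20e6, diff_proc is -20e6, but the copies held in the per-credential containers have meanwhile been decayed to roughly 12e6, so diff_cred comes out near +8e6. A hypothetical sketch of the arithmetic, with FSCALE assumed to be 2048:

// Hypothetical numeric sketch of the diff_proc and diff_cred split below.
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int64_t fscale = 2048, decay = 0.3 * 2048;	// assumed values
	int64_t old_amount = 40 * 1000000, amount = 20 * 1000000;
	int64_t diff_proc = amount - old_amount;
	int64_t diff_cred = amount - old_amount * decay / fscale;

	// Prints -20000000 and roughly +8000000.
	printf("diff_proc=%jd diff_cred=%jd\n",
	    (intmax_t)diff_proc, (intmax_t)diff_cred);
	return (0);
}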
*/ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, resource)); #endif #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, set__failure, p, resource, amount); return (error); } } #endif racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); } /* * Set allocation of 'resource' to 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. * * Note that decreasing the allocation always returns 0, * even if it's above the limit. */ int racct_set_unlocked(struct proc *p, int resource, uint64_t amount) { int error; ASSERT_RACCT_ENABLED(); PROC_LOCK(p); error = racct_set(p, resource, amount); PROC_UNLOCK(p); return (error); } int racct_set(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, set__force, p, resource, amount); RACCT_LOCK(); error = racct_set_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, set, p, resource, amount); RACCT_LOCK(); racct_set_locked(p, resource, amount, 1); RACCT_UNLOCK(); } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * not matter. */ uint64_t racct_get_limit(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_limit(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * matter. */ uint64_t racct_get_available(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_available(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of the %cpu resource that process 'p' can add to its %cpu * utilization. Adding more than that would lead to the process being * throttled. */ static int64_t racct_pcpu_available(struct proc *p) { #ifdef RCTL uint64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK(); available = rctl_pcpu_available(p); RACCT_UNLOCK(); return (available); #else return (INT64_MAX); #endif } /* * Decrease allocation of 'resource' by 'amount' for process 'p'. */ void racct_sub(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. 
*/ PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(RACCT_CAN_DROP(resource), ("%s: called for non-droppable resource %d", __func__, resource)); RACCT_LOCK(); KASSERT(amount <= p->p_racct->r_resources[resource], ("%s: freeing %ju of resource %d, which is more " "than allocated %jd for %s (pid %d)", __func__, amount, resource, (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_adjust_resource(p->p_racct, resource, -amount); racct_sub_cred_locked(p->p_ucred, resource, amount); RACCT_UNLOCK(); } static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, -amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); } /* * Decrease allocation of 'resource' by 'amount' for credential 'cred'. */ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); #ifdef notyet KASSERT(RACCT_CAN_DROP(resource), ("%s: called for resource %d which can not drop", __func__, resource)); #endif RACCT_LOCK(); racct_sub_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Inherit resource usage information from the parent process. */ int racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; if (!racct_enable) return (0); /* * Create racct for the child process. */ racct_create(&child->p_racct); PROC_LOCK(parent); PROC_LOCK(child); RACCT_LOCK(); #ifdef RCTL error = rctl_proc_fork(parent, child); if (error != 0) goto out; #endif /* Init process cpu time. */ child->p_prev_runtime = 0; child->p_throttled = 0; /* * Inherit resource usage. */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, parent->p_racct->r_resources[i], 0); if (error != 0) goto out; } error = racct_add_locked(child, RACCT_NPROC, 1, 0); error += racct_add_locked(child, RACCT_NTHR, 1, 0); out: RACCT_UNLOCK(); PROC_UNLOCK(child); PROC_UNLOCK(parent); if (error != 0) racct_proc_exit(child); return (error); } /* * Called at the end of fork1(), to handle rules that require the process * to be fully initialized. */ void racct_proc_fork_done(struct proc *child) { if (!racct_enable) return; #ifdef RCTL PROC_LOCK(child); RACCT_LOCK(); rctl_enforce(child, RACCT_NPROC, 0); rctl_enforce(child, RACCT_NTHR, 0); RACCT_UNLOCK(); PROC_UNLOCK(child); #endif } void racct_proc_exit(struct proc *p) { struct timeval wallclock; uint64_t pct_estimate, pct, runtime; int i; if (!racct_enable) return; PROC_LOCK(p); /* * We don't need to calculate rux, proc_reap() has already done this. 
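The estimate derived just below is simply runtime divided by wall-clock time, expressed in the same millionths-of-a-percent units, with both times in microseconds; it is the value racct_getpcpu() falls back to for processes younger than RACCT_PCPU_SECS. A hypothetical sketch:

// Hypothetical sketch of the pct_estimate computation that follows:
// runtime and wallclock in microseconds, result in millionths of a percent.
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t runtime = 750000;		// 0.75 s of CPU time
	uint64_t wallclock = 3000000;		// 3 s since the process started
	uint64_t pct_estimate;

	pct_estimate = (1000000 * runtime * 100) / wallclock;
	printf("%ju micro-percent (= 25%%)\n", (uintmax_t)pct_estimate);
	return (0);
}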
*/ runtime = cputick2usec(p->p_rux.rux_runtime); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0, ("process reaped with %ju allocated for RSS\n", p->p_racct->r_resources[RACCT_RSS])); for (i = 0; i <= RACCT_MAX; i++) { if (p->p_racct->r_resources[i] == 0) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; racct_set_locked(p, i, 0, 0); } #ifdef RCTL rctl_racct_release(p->p_racct); #endif racct_destroy_locked(&p->p_racct); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * Called after credentials change, to move resource utilisation * between raccts. */ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred) { struct uidinfo *olduip, *newuip; struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_OWNED); newuip = newcred->cr_ruidinfo; olduip = oldcred->cr_ruidinfo; newlc = newcred->cr_loginclass; oldlc = oldcred->cr_loginclass; newpr = newcred->cr_prison; oldpr = oldcred->cr_prison; RACCT_LOCK(); if (newuip != olduip) { racct_sub_racct(olduip->ui_racct, p->p_racct); racct_add_racct(newuip->ui_racct, p->p_racct); } if (newlc != oldlc) { racct_sub_racct(oldlc->lc_racct, p->p_racct); racct_add_racct(newlc->lc_racct, p->p_racct); } if (newpr != oldpr) { for (pr = oldpr; pr != NULL; pr = pr->pr_parent) racct_sub_racct(pr->pr_prison_racct->prr_racct, p->p_racct); for (pr = newpr; pr != NULL; pr = pr->pr_parent) racct_add_racct(pr->pr_prison_racct->prr_racct, p->p_racct); } RACCT_UNLOCK(); } void racct_move(struct racct *dest, struct racct *src) { ASSERT_RACCT_ENABLED(); RACCT_LOCK(); racct_add_racct(dest, src); racct_sub_racct(src, src); RACCT_UNLOCK(); } void racct_proc_throttled(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK(p); while (p->p_throttled != 0) { msleep(p->p_racct, &p->p_mtx, 0, "racct", p->p_throttled < 0 ? 0 : p->p_throttled); if (p->p_throttled > 0) p->p_throttled = 0; } PROC_UNLOCK(p); } /* * Make the process sleep in userret() for 'timeout' ticks. Setting * timeout to -1 makes it sleep until woken up by racct_proc_wakeup(). */ void racct_proc_throttle(struct proc *p, int timeout) { struct thread *td; #ifdef SMP int cpuid; #endif KASSERT(timeout != 0, ("timeout %d", timeout)); ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. */ if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) return; if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout)) return; p->p_throttled = timeout; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_flags |= TDF_ASTPENDING; - switch (td->td_state) { + switch (TD_GET_STATE(td)) { case TDS_RUNQ: /* * If the thread is on the scheduler run-queue, we can * not just remove it from there. So we set the flag * TDF_NEEDRESCHED for the thread, so that once it is * running, it is taken off the cpu as soon as possible. 
*/ td->td_flags |= TDF_NEEDRESCHED; break; case TDS_RUNNING: /* * If the thread is running, we request a context * switch for it by setting the TDF_NEEDRESCHED flag. */ td->td_flags |= TDF_NEEDRESCHED; #ifdef SMP cpuid = td->td_oncpu; if ((cpuid != NOCPU) && (td != curthread)) ipi_cpu(cpuid, IPI_AST); #endif break; default: break; } thread_unlock(td); } } static void racct_proc_wakeup(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled != 0) { p->p_throttled = 0; wakeup(p->p_racct); } } static void racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) { int64_t r_old, r_new; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); #ifdef RCTL rctl_throttle_decay(racct, RACCT_READBPS); rctl_throttle_decay(racct, RACCT_WRITEBPS); rctl_throttle_decay(racct, RACCT_READIOPS); rctl_throttle_decay(racct, RACCT_WRITEIOPS); #endif r_old = racct->r_resources[RACCT_PCTCPU]; /* If there is nothing to decay, just exit. */ if (r_old <= 0) return; r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; racct->r_resources[RACCT_PCTCPU] = r_new; } static void racct_decay_pre(void) { RACCT_LOCK(); } static void racct_decay_post(void) { RACCT_UNLOCK(); } static void racct_decay(void) { ASSERT_RACCT_ENABLED(); ui_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); prison_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); } static void racctd(void) { struct thread *td; struct proc *p; struct timeval wallclock; uint64_t pct, pct_estimate, runtime; ASSERT_RACCT_ENABLED(); for (;;) { racct_decay(); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { if (p->p_state == PRS_ZOMBIE) racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); continue; } microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) ruxagg(p, td); runtime = cputick2usec(p->p_rux.rux_runtime); PROC_STATUNLOCK(p); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif p->p_prev_runtime = runtime; if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); RACCT_LOCK(); #ifdef RCTL rctl_throttle_decay(p->p_racct, RACCT_READBPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); rctl_throttle_decay(p->p_racct, RACCT_READIOPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); #endif racct_set_locked(p, RACCT_PCTCPU, pct, 1); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec, 0); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits * for %cpu resource only after ucred racct containers have been * properly filled. 
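Since racct_decay_callback() above multiplies the accumulated RACCT_PCTCPU by RACCT_DECAY_FACTOR / FSCALE (nominally 0.3) on every pass, a container that keeps receiving a steady per-pass contribution A settles near A / (1 - 0.3), roughly 1.43 A, instead of growing without bound. A hypothetical sketch of that geometric behaviour, with invented numbers:

// Hypothetical sketch: a steady per-pass contribution combined with a 0.3
// decay factor converges to contribution / (1 - 0.3).
#include <stdio.h>

int
main(void)
{
	double acct = 0.0, contribution = 50.0, decay = 0.3;

	for (int pass = 1; pass <= 20; pass++) {
		acct = acct * decay + contribution;
		if (pass % 5 == 0)
			printf("pass %2d: %.2f\n", pass, acct);
	}
	printf("limit: %.2f\n", contribution / (1.0 - decay));	// about 71.43
	return (0);
}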
*/ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } if (racct_pcpu_available(p) <= 0) { if (p->p_racct->r_resources[RACCT_PCTCPU] > pcpu_threshold) racct_proc_throttle(p, -1); } else if (p->p_throttled == -1) { racct_proc_wakeup(p); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pause("-", hz); } } static struct kproc_desc racctd_kp = { "racctd", racctd, NULL }; static void racctd_init(void) { if (!racct_enable) return; kproc_start(&racctd_kp); } SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void racct_init(void) { if (!racct_enable) return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * XXX: Move this somewhere. */ prison0.pr_prison_racct = prison_racct_find("0"); } SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); #endif /* !RACCT */ diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 4c0491ab6e85..dcca67326264 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -1,685 +1,685 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef EPOCH_TRACE #include #endif #include static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL); int hogticks; static const char pause_wchan[MAXCPU]; static struct callout loadav_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE, "Fixed-point scale factor used for calculating load average values"); static void loadav(void *arg); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , preempt); static void sleepinit(void *unused) { hogticks = (hz / 10) * 2; /* Default only. */ init_sleepqueues(); } /* * vmem tries to lock the sleepq mutexes when free'ing kva, so make sure * it is available. */ SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, NULL); /* * General sleep call. Suspends the current thread until a wakeup is * performed on the specified identifier. The thread will then be made * runnable with the specified priority. Sleeps at most sbt units of time * (0 means no timeout). If pri includes the PCATCH flag, let signals * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal becomes pending, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The lock argument is unlocked before the caller is suspended, and * re-locked before _sleep() returns. If priority includes the PDROP * flag the lock is not re-locked before returning. 
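For reference, a minimal sketch of how kernel code typically drives this interface: the predicate is re-checked in a loop around msleep(), and the mutex protecting it is passed as the interlock so a wakeup cannot slip in between the check and the sleep. The demo_softc naming below is hypothetical, not part of this file.

// Hypothetical consumer of the sleep/wakeup interface defined below.
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct demo_softc {
	struct mtx	sc_mtx;
	int		sc_ready;
};

static int
demo_wait(struct demo_softc *sc)
{
	int error = 0;

	mtx_lock(&sc->sc_mtx);
	while (sc->sc_ready == 0 && error == 0) {
		// Drops sc_mtx while asleep, reacquires it before returning.
		error = msleep(&sc->sc_ready, &sc->sc_mtx, PCATCH, "demowt", 0);
	}
	if (error == 0)
		sc->sc_ready = 0;
	mtx_unlock(&sc->sc_mtx);
	return (error);
}

static void
demo_post(struct demo_softc *sc)
{
	mtx_lock(&sc->sc_mtx);
	sc->sc_ready = 1;
	wakeup(&sc->sc_ready);		// or wakeup_one() if any waiter will do
	mtx_unlock(&sc->sc_mtx);
}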
*/ int _sleep(const void *ident, struct lock_object *lock, int priority, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct lock_class *class; uintptr_t lock_state; int catch, pri, rval, sleepq_flags; WITNESS_SAVE_DECL(lock_witness); td = curthread; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, wmesg); #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL, ("sleeping without a lock")); KASSERT(ident != NULL, ("_sleep: NULL ident")); KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running")); if (priority & PDROP) KASSERT(lock != NULL && lock != &Giant.lock_object, ("PDROP requires a non-Giant lock")); if (lock != NULL) class = LOCK_CLASS(lock); else class = NULL; if (SCHEDULER_STOPPED_TD(td)) { if (lock != NULL && priority & PDROP) class->lc_unlock(lock); return (0); } catch = priority & PCATCH; pri = priority & PRIMASK; KASSERT(!TD_ON_SLEEPQ(td), ("recursive sleep")); if ((uintptr_t)ident >= (uintptr_t)&pause_wchan[0] && (uintptr_t)ident <= (uintptr_t)&pause_wchan[MAXCPU - 1]) sleepq_flags = SLEEPQ_PAUSE; else sleepq_flags = SLEEPQ_SLEEP; if (catch) sleepq_flags |= SLEEPQ_INTERRUPTIBLE; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(class->lc_flags & LC_SLEEPABLE)) { WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); } else /* GCC needs to follow the Yellow Brick Road */ lock_state = -1; /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, * and a wakeup or a SIGCONT (or both) could occur while we were * stopped without resuming us. Thus, we must be ready for sleep * when cursig() is called. If the wakeup happens while we're * stopped, then td will no longer be on a sleep queue upon * return from cursig(). 
*/ sleepq_add(ident, lock, wmesg, sleepq_flags, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); sleepq_lock(ident); } if (sbt != 0 && catch) rval = sleepq_timedwait_sig(ident, pri); else if (sbt != 0) rval = sleepq_timedwait(ident, pri); else if (catch) rval = sleepq_wait_sig(ident, pri); else { sleepq_wait(ident, pri); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } int msleep_spin_sbt(const void *ident, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; int rval; WITNESS_SAVE_DECL(mtx); td = curthread; KASSERT(mtx != NULL, ("sleeping without a mutex")); KASSERT(ident != NULL, ("msleep_spin_sbt: NULL ident")); KASSERT(TD_IS_RUNNING(td), ("msleep_spin_sbt: curthread not running")); if (SCHEDULER_STOPPED_TD(td)) return (0); sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident); DROP_GIANT(); mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->lock_object, mtx); mtx_unlock_spin(mtx); /* * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); /* * Can't call ktrace with any spin locks held so it can lock the * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold * any spin lock. Thus, we have to drop the sleepq spin lock while * we handle those requests. This is safe since we have placed our * thread on the sleep queue already. */ #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { sleepq_release(ident); ktrcsw(1, 0, wmesg); sleepq_lock(ident); } #endif #ifdef WITNESS sleepq_release(ident); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"", wmesg); sleepq_lock(ident); #endif if (sbt != 0) rval = sleepq_timedwait(ident, 0); else { sleepq_wait(ident, 0); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); mtx_lock_spin(mtx); WITNESS_RESTORE(&mtx->lock_object, mtx); return (rval); } /* * pause_sbt() delays the calling thread by the given signed binary * time. During cold bootup, pause_sbt() uses the DELAY() function * instead of the _sleep() function to do the waiting. The "sbt" * argument must be greater than or equal to zero. A "sbt" value of * zero is equivalent to a "sbt" value of one tick. */ int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { KASSERT(sbt >= 0, ("pause_sbt: timeout must be >= 0")); /* silently convert invalid timeouts */ if (sbt == 0) sbt = tick_sbt; if ((cold && curthread == &thread0) || kdb_active || SCHEDULER_STOPPED()) { /* * We delay one second at a time to avoid overflowing the * system specific DELAY() function(s): */ while (sbt >= SBT_1S) { DELAY(1000000); sbt -= SBT_1S; } /* Do the delay remainder, if any */ sbt = howmany(sbt, SBT_1US); if (sbt > 0) DELAY(sbt); return (EWOULDBLOCK); } return (_sleep(&pause_wchan[curcpu], NULL, (flags & C_CATCH) ? PCATCH : 0, wmesg, sbt, pr, flags)); } /* * Make all threads sleeping on the specified identifier runnable. 
*/ void wakeup(const void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) { KASSERT(ident != &proc0, ("wakeup and wakeup_swapper and proc0")); kick_proc0(); } } /* * Make a thread sleeping on the specified identifier runnable. * May wake more than one thread if a target thread is currently * swapped out. */ void wakeup_one(const void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) kick_proc0(); } void wakeup_any(const void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_UNFAIR, 0, 0); sleepq_release(ident); if (wakeup_swapper) kick_proc0(); } /* * Signal sleeping waiters after the counter has reached zero. */ void _blockcount_wakeup(blockcount_t *bc, u_int old) { KASSERT(_BLOCKCOUNT_WAITERS(old), ("%s: no waiters on %p", __func__, bc)); if (atomic_cmpset_int(&bc->__count, _BLOCKCOUNT_WAITERS_FLAG, 0)) wakeup(bc); } /* * Wait for a wakeup or a signal. This does not guarantee that the count is * still zero on return. Callers wanting a precise answer should use * blockcount_wait() with an interlock. * * If there is no work to wait for, return 0. If the sleep was interrupted by a * signal, return EINTR or ERESTART, and return EAGAIN otherwise. */ int _blockcount_sleep(blockcount_t *bc, struct lock_object *lock, const char *wmesg, int prio) { void *wchan; uintptr_t lock_state; u_int old; int ret; bool catch, drop; KASSERT(lock != &Giant.lock_object, ("%s: cannot use Giant as the interlock", __func__)); catch = (prio & PCATCH) != 0; drop = (prio & PDROP) != 0; prio &= PRIMASK; /* * Synchronize with the fence in blockcount_release(). If we end up * waiting, the sleepqueue lock acquisition will provide the required * side effects. * * If there is no work to wait for, but waiters are present, try to put * ourselves to sleep to avoid jumping ahead. */ if (atomic_load_acq_int(&bc->__count) == 0) { if (lock != NULL && drop) LOCK_CLASS(lock)->lc_unlock(lock); return (0); } lock_state = 0; wchan = bc; sleepq_lock(wchan); DROP_GIANT(); if (lock != NULL) lock_state = LOCK_CLASS(lock)->lc_unlock(lock); old = blockcount_read(bc); ret = 0; do { if (_BLOCKCOUNT_COUNT(old) == 0) { sleepq_release(wchan); goto out; } if (_BLOCKCOUNT_WAITERS(old)) break; } while (!atomic_fcmpset_int(&bc->__count, &old, old | _BLOCKCOUNT_WAITERS_FLAG)); sleepq_add(wchan, NULL, wmesg, catch ? SLEEPQ_INTERRUPTIBLE : 0, 0); if (catch) ret = sleepq_wait_sig(wchan, prio); else sleepq_wait(wchan, prio); if (ret == 0) ret = EAGAIN; out: PICKUP_GIANT(); if (lock != NULL && !drop) LOCK_CLASS(lock)->lc_lock(lock, lock_state); return (ret); } static void kdb_switch(void) { thread_unlock(curthread); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } /* * The machine independent parts of context switching. * * The thread lock is required on entry and is no longer held on return. 
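A hypothetical userland sketch of the _blockcount_sleep() and _blockcount_wakeup() handshake above: the count and a waiters flag share one word, a would-be sleeper advertises itself with a compare-and-swap before blocking, and the releasing side clears the word only when the count reached zero with the flag set. Busy-waiting stands in for the real sleepqueue, so this is a simplification, not the kernel KPI.

// Hypothetical sketch of the blockcount waiters-flag handshake.
#include <stdatomic.h>

#define WAITERS_FLAG	0x80000000u
#define COUNT(v)	((v) & ~WAITERS_FLAG)

static _Atomic unsigned int bc;

static void
bc_acquire(unsigned int n)
{
	atomic_fetch_add(&bc, n);
}

static void
bc_release(void)
{
	unsigned int old, expected;

	old = atomic_fetch_sub(&bc, 1);
	if (COUNT(old) == 1 && (old & WAITERS_FLAG) != 0) {
		// Last unit gone with waiters present: clear the flag; the
		// real implementation calls wakeup() here.
		expected = WAITERS_FLAG;
		atomic_compare_exchange_strong(&bc, &expected, 0);
	}
}

static void
bc_wait(void)
{
	unsigned int old = atomic_load(&bc);

	for (;;) {
		if (COUNT(old) == 0)
			return;			// nothing to wait for
		if ((old & WAITERS_FLAG) != 0)
			break;			// flag already set
		if (atomic_compare_exchange_weak(&bc, &old,
		    old | WAITERS_FLAG))
			break;			// advertised ourselves
	}
	while (atomic_load(&bc) != 0)
		;				// spin instead of sleeping
}

int
main(void)
{
	bc_acquire(2);
	bc_release();
	bc_release();
	bc_wait();		// returns at once, the count is back to zero
	return (0);
}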
*/ void mi_switch(int flags) { uint64_t runtime, new_switchtime; struct thread *td; td = curthread; /* XXX */ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1 || KERNEL_PANICKED(), ("mi_switch: switch in a critical section")); KASSERT((flags & (SW_INVOL | SW_VOL)) != 0, ("mi_switch: switch must be voluntary or involuntary")); /* * Don't perform context switches from the debugger. */ if (kdb_active) kdb_switch(); if (SCHEDULER_STOPPED_TD(td)) return; if (flags & SW_VOL) { td->td_ru.ru_nvcsw++; td->td_swvoltick = ticks; } else { td->td_ru.ru_nivcsw++; td->td_swinvoltick = ticks; } #ifdef SCHED_STATS SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]); #endif /* * Compute the amount of time during which the current * thread was running, and add that to its total so far. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); td->td_generation++; /* bump preempt-detect counter */ VM_CNT_INC(v_swtch); PCPU_SET(switchticks, ticks); CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); #ifdef KDTRACE_HOOKS if (SDT_PROBES_ENABLED() && ((flags & SW_PREEMPT) != 0 || ((flags & SW_INVOL) != 0 && (flags & SW_TYPE_MASK) == SWT_NEEDRESCHED))) SDT_PROBE0(sched, , , preempt); #endif sched_switch(td, flags); CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } spinlock_exit(); } /* * Change thread state to be runnable, placing it on the run queue if * it is in memory. If it is swapped out, return true so our caller * will know to awaken the swapper. * * Requires the thread lock on entry, drops on exit. */ int setrunnable(struct thread *td, int srqflags) { int swapin; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td->td_proc->p_state != PRS_ZOMBIE, ("setrunnable: pid %d is a zombie", td->td_proc->p_pid)); swapin = 0; - switch (td->td_state) { + switch (TD_GET_STATE(td)) { case TDS_RUNNING: case TDS_RUNQ: break; case TDS_CAN_RUN: KASSERT((td->td_flags & TDF_INMEM) != 0, ("setrunnable: td %p not in mem, flags 0x%X inhibit 0x%X", td, td->td_flags, td->td_inhibitors)); /* unlocks thread lock according to flags */ sched_wakeup(td, srqflags); return (0); case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * arrange to swap in this process. */ if (td->td_inhibitors == TDI_SWAPPED && (td->td_flags & TDF_SWAPINREQ) == 0) { td->td_flags |= TDF_SWAPINREQ; swapin = 1; } break; default: - panic("setrunnable: state 0x%x", td->td_state); + panic("setrunnable: state 0x%x", TD_GET_STATE(td)); } if ((srqflags & (SRQ_HOLD | SRQ_HOLDTD)) == 0) thread_unlock(td); return (swapin); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. 
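The fixed-point update in loadav() below is the usual exponentially weighted moving average, load = load * cexp + n * (1 - cexp), where cexp holds exp(-5/60), exp(-5/300) and exp(-5/900) scaled by FSCALE for the 1, 5 and 15 minute windows. A hypothetical userland sketch, assuming FSHIFT = 11 (FSCALE = 2048):

// Hypothetical sketch of the loadav() fixed-point EWMA.
#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define DEMO_FSHIFT	11
#define DEMO_FSCALE	(1 << DEMO_FSHIFT)	// assumed FSCALE

int
main(void)
{
	// exp(-5/60), exp(-5/300), exp(-5/900), scaled like cexp[].
	uint64_t cexp[3] = {
		(uint64_t)(exp(-1.0 / 12.0) * DEMO_FSCALE),
		(uint64_t)(exp(-1.0 / 60.0) * DEMO_FSCALE),
		(uint64_t)(exp(-1.0 / 180.0) * DEMO_FSCALE),
	};
	uint64_t ldavg[3] = { 0, 0, 0 };
	int nrun = 2;			// pretend 2 runnable threads, forever

	for (int tick = 0; tick < 1000; tick++) {
		for (int i = 0; i < 3; i++) {
			ldavg[i] = (cexp[i] * ldavg[i] +
			    nrun * DEMO_FSCALE * (DEMO_FSCALE - cexp[i])) >>
			    DEMO_FSHIFT;
		}
	}
	// All three averages converge toward nrun.
	for (int i = 0; i < 3; i++)
		printf("ldavg[%d] = %.2f\n", i,
		    (double)ldavg[i] / DEMO_FSCALE);
	return (0);
}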
*/ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; nrun = sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset_sbt(&loadav_callout, SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US, loadav, NULL, C_DIRECT_EXEC | C_PREL(32)); } /* ARGSUSED */ static void synch_setup(void *dummy) { callout_init(&loadav_callout, 1); /* Kick off timeout driven events by calling first time. */ loadav(NULL); } int should_yield(void) { return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks); } void maybe_yield(void) { if (should_yield()) kern_yield(PRI_USER); } void kern_yield(int prio) { struct thread *td; td = curthread; DROP_GIANT(); thread_lock(td); if (prio == PRI_USER) prio = td->td_user_pri; if (prio >= 0) sched_prio(td, prio); mi_switch(SW_VOL | SWT_RELINQUISH); PICKUP_GIANT(); } /* * General purpose yield system call. */ int sys_yield(struct thread *td, struct yield_args *uap) { thread_lock(td); if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(SW_VOL | SWT_RELINQUISH); td->td_retval[0] = 0; return (0); } diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c index 3561895d9fff..ea569576e7c9 100644 --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -1,1708 +1,1708 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
*/ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include /* * Asserts below verify the stability of struct thread and struct proc * layout, as exposed by KBI to modules. On head, the KBI is allowed * to drift, change to the structures must be accompanied by the * assert update. * * On the stable branches after KBI freeze, conditions must not be * violated. Typically new fields are moved to the end of the * structures. */ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0xfc, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0x104, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x4a0, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb8, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0xc4, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x3c0, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x3d8, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8, "struct proc KBI p_emuldata"); #endif #ifdef __i386__ _Static_assert(offsetof(struct thread, td_flags) == 0x98, "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa0, "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x300, "struct thread KBI td_frame"); _Static_assert(offsetof(struct thread, td_emuldata) == 0x344, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x6c, "struct proc KBI p_flag"); _Static_assert(offsetof(struct proc, p_pid) == 0x78, "struct proc KBI p_pid"); _Static_assert(offsetof(struct proc, p_filemon) == 0x26c, "struct proc KBI p_filemon"); _Static_assert(offsetof(struct proc, p_comm) == 0x280, "struct proc KBI p_comm"); _Static_assert(offsetof(struct proc, p_emuldata) == 0x30c, "struct proc KBI p_emuldata"); #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. 
*/ static uma_zone_t thread_zone; struct thread_domain_data { struct thread *tdd_zombies; int tdd_reapticks; } __aligned(CACHE_LINE_SIZE); static struct thread_domain_data thread_domain_data[MAXMEMDOM]; static struct task thread_reap_task; static struct callout thread_reap_callout; static void thread_zombie(struct thread *); static void thread_reap(void); static void thread_reap_all(void); static void thread_reap_task_cb(void *, int); static void thread_reap_callout_cb(void *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); static void thread_free_batched(struct thread *td); static __exclusive_cache_line struct mtx tid_lock; static bitstr_t *tid_bitmap; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); static int maxthread; SYSCTL_INT(_kern, OID_AUTO, maxthread, CTLFLAG_RDTUN, &maxthread, 0, "Maximum number of threads"); static __exclusive_cache_line int nthreads; static LIST_HEAD(tidhashhead, thread) *tidhashtbl; static u_long tidhash; static u_long tidhashlock; static struct rwlock *tidhashtbl_lock; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) #define TIDHASHLOCK(tid) (&tidhashtbl_lock[(tid) & tidhashlock]) EVENTHANDLER_LIST_DEFINE(thread_ctor); EVENTHANDLER_LIST_DEFINE(thread_dtor); EVENTHANDLER_LIST_DEFINE(thread_init); EVENTHANDLER_LIST_DEFINE(thread_fini); static bool thread_count_inc_try(void) { int nthreads_new; nthreads_new = atomic_fetchadd_int(&nthreads, 1) + 1; if (nthreads_new >= maxthread - 100) { if (priv_check_cred(curthread->td_ucred, PRIV_MAXPROC) != 0 || nthreads_new >= maxthread) { atomic_subtract_int(&nthreads, 1); return (false); } } return (true); } static bool thread_count_inc(void) { static struct timeval lastfail; static int curfail; thread_reap(); if (thread_count_inc_try()) { return (true); } thread_reap_all(); if (thread_count_inc_try()) { return (true); } if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxthread limit exceeded by uid %u " "(pid %d); consider increasing kern.maxthread\n", curthread->td_ucred->cr_ruid, curproc->p_pid); } return (false); } static void thread_count_sub(int n) { atomic_subtract_int(&nthreads, n); } static void thread_count_dec(void) { thread_count_sub(1); } static lwpid_t tid_alloc(void) { static lwpid_t trytid; lwpid_t tid; mtx_lock(&tid_lock); /* * It is an invariant that the bitmap is big enough to hold maxthread * IDs. If we got to this point there has to be at least one free. */ if (trytid >= maxthread) trytid = 0; bit_ffc_at(tid_bitmap, trytid, maxthread, &tid); if (tid == -1) { KASSERT(trytid != 0, ("unexpectedly ran out of IDs")); trytid = 0; bit_ffc_at(tid_bitmap, trytid, maxthread, &tid); KASSERT(tid != -1, ("unexpectedly ran out of IDs")); } bit_set(tid_bitmap, tid); trytid = tid + 1; mtx_unlock(&tid_lock); return (tid + NO_PID); } static void tid_free_locked(lwpid_t rtid) { lwpid_t tid; mtx_assert(&tid_lock, MA_OWNED); KASSERT(rtid >= NO_PID, ("%s: invalid tid %d\n", __func__, rtid)); tid = rtid - NO_PID; KASSERT(bit_test(tid_bitmap, tid) != 0, ("thread ID %d not allocated\n", rtid)); bit_clear(tid_bitmap, tid); } static void tid_free(lwpid_t rtid) { mtx_lock(&tid_lock); tid_free_locked(rtid); mtx_unlock(&tid_lock); } static void tid_free_batch(lwpid_t *batch, int n) { int i; mtx_lock(&tid_lock); for (i = 0; i < n; i++) { tid_free_locked(batch[i]); } mtx_unlock(&tid_lock); } /* * Batching for thread reapping. 
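The tidbatch structure defined next amortizes tid_lock: instead of taking the mutex once per reaped thread, IDs are staged in a small on-stack array and returned in groups of up to 16. A hypothetical userland sketch of the same pattern, with a pthread mutex standing in for tid_lock:

// Hypothetical sketch of the batching pattern used by struct tidbatch below.
#include <pthread.h>
#include <stdio.h>

#define BATCH_SIZE 16

static pthread_mutex_t id_lock = PTHREAD_MUTEX_INITIALIZER;

static void
id_free_batch(const int *ids, int n)
{
	pthread_mutex_lock(&id_lock);
	for (int i = 0; i < n; i++)
		printf("freeing id %d\n", ids[i]);	// stand-in for bit_clear()
	pthread_mutex_unlock(&id_lock);
}

struct idbatch {
	int	tab[BATCH_SIZE];
	int	n;
};

static void
idbatch_add(struct idbatch *b, int id)
{
	b->tab[b->n++] = id;
	if (b->n == BATCH_SIZE) {		// flush only when full
		id_free_batch(b->tab, b->n);
		b->n = 0;
	}
}

static void
idbatch_final(struct idbatch *b)
{
	if (b->n != 0)				// flush the remainder
		id_free_batch(b->tab, b->n);
}

int
main(void)
{
	struct idbatch b = { .n = 0 };

	for (int id = 100; id < 140; id++)	// 40 frees, 3 lock acquisitions
		idbatch_add(&b, id);
	idbatch_final(&b);
	return (0);
}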
*/ struct tidbatch { lwpid_t tab[16]; int n; }; static void tidbatch_prep(struct tidbatch *tb) { tb->n = 0; } static void tidbatch_add(struct tidbatch *tb, struct thread *td) { KASSERT(tb->n < nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); tb->tab[tb->n] = td->td_tid; tb->n++; } static void tidbatch_process(struct tidbatch *tb) { KASSERT(tb->n <= nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); if (tb->n == nitems(tb->tab)) { tid_free_batch(tb->tab, tb->n); tb->n = 0; } } static void tidbatch_final(struct tidbatch *tb) { KASSERT(tb->n <= nitems(tb->tab), ("%s: count too high %d", __func__, tb->n)); if (tb->n != 0) { tid_free_batch(tb->tab, tb->n); } } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; - td->td_state = TDS_INACTIVE; + TD_SET_STATE(td, TDS_INACTIVE); td->td_lastcpu = td->td_oncpu = NOCPU; /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; #ifdef AUDIT audit_thread_alloc(td); #endif #ifdef KDTRACE_HOOKS kdtrace_thread_ctor(td); #endif umtx_thread_alloc(td); MPASS(td->td_sel == NULL); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ - switch (td->td_state) { + switch (TD_GET_STATE(td)) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif #ifdef KDTRACE_HOOKS kdtrace_thread_dtor(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); td_softdep_cleanup(td); MPASS(td->td_su == NULL); seltdfini(td); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_allocdomain = vm_phys_domain(vtophys(td)); td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_DIRECT_INVOKE(thread_init, td); umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_DIRECT_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); MPASS(td->td_sel == NULL); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. 
* proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } extern int max_threads_per_proc; /* * Initialize global thread allocation resources. */ void threadinit(void) { u_long i; lwpid_t tid0; uint32_t flags; /* * Place an upper limit on threads which can be allocated. * * Note that other factors may make the de facto limit much lower. * * Platform limits are somewhat arbitrary but deemed "more than good * enough" for the foreseable future. */ if (maxthread == 0) { #ifdef _LP64 maxthread = MIN(maxproc * max_threads_per_proc, 1000000); #else maxthread = MIN(maxproc * max_threads_per_proc, 100000); #endif } mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); tid_bitmap = bit_alloc(maxthread, M_TIDHASH, M_WAITOK); /* * Handle thread0. */ thread_count_inc(); tid0 = tid_alloc(); if (tid0 != THREAD0_TID) panic("tid0 %d != %d\n", tid0, THREAD0_TID); flags = UMA_ZONE_NOFREE; #ifdef __aarch64__ /* * Force thread structures to be allocated from the direct map. * Otherwise, superpage promotions and demotions may temporarily * invalidate thread structure mappings. For most dynamically allocated * structures this is not a problem, but translation faults cannot be * handled without accessing curthread. */ flags |= UMA_ZONE_CONTIG; #endif thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 32 - 1, flags); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); tidhashlock = (tidhash + 1) / 64; if (tidhashlock > 0) tidhashlock--; tidhashtbl_lock = malloc(sizeof(*tidhashtbl_lock) * (tidhashlock + 1), M_TIDHASH, M_WAITOK | M_ZERO); for (i = 0; i < tidhashlock + 1; i++) rw_init(&tidhashtbl_lock[i], "tidhash"); TASK_INIT(&thread_reap_task, 0, thread_reap_task_cb, NULL); callout_init(&thread_reap_callout, 1); callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL); } /* * Place an unused thread on the zombie list. */ void thread_zombie(struct thread *td) { struct thread_domain_data *tdd; struct thread *ztd; tdd = &thread_domain_data[td->td_allocdomain]; ztd = atomic_load_ptr(&tdd->tdd_zombies); for (;;) { td->td_zombie = ztd; if (atomic_fcmpset_rel_ptr((uintptr_t *)&tdd->tdd_zombies, (uintptr_t *)&ztd, (uintptr_t)td)) break; continue; } } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombies from passed domain. */ static void thread_reap_domain(struct thread_domain_data *tdd) { struct thread *itd, *ntd; struct tidbatch tidbatch; struct credbatch credbatch; int tdcount; struct plimit *lim; int limcount; /* * Reading upfront is pessimal if followed by concurrent atomic_swap, * but most of the time the list is empty. */ if (tdd->tdd_zombies == NULL) return; itd = (struct thread *)atomic_swap_ptr((uintptr_t *)&tdd->tdd_zombies, (uintptr_t)NULL); if (itd == NULL) return; /* * Multiple CPUs can get here, the race is fine as ticks is only * advisory. 
*/ tdd->tdd_reapticks = ticks; tidbatch_prep(&tidbatch); credbatch_prep(&credbatch); tdcount = 0; lim = NULL; limcount = 0; while (itd != NULL) { ntd = itd->td_zombie; EVENTHANDLER_DIRECT_INVOKE(thread_dtor, itd); tidbatch_add(&tidbatch, itd); credbatch_add(&credbatch, itd); MPASS(itd->td_limit != NULL); if (lim != itd->td_limit) { if (limcount != 0) { lim_freen(lim, limcount); limcount = 0; } } lim = itd->td_limit; limcount++; thread_free_batched(itd); tidbatch_process(&tidbatch); credbatch_process(&credbatch); tdcount++; if (tdcount == 32) { thread_count_sub(tdcount); tdcount = 0; } itd = ntd; } tidbatch_final(&tidbatch); credbatch_final(&credbatch); if (tdcount != 0) { thread_count_sub(tdcount); } MPASS(limcount != 0); lim_freen(lim, limcount); } /* * Reap zombies from all domains. */ static void thread_reap_all(void) { struct thread_domain_data *tdd; int i, domain; domain = PCPU_GET(domain); for (i = 0; i < vm_ndomains; i++) { tdd = &thread_domain_data[(i + domain) % vm_ndomains]; thread_reap_domain(tdd); } } /* * Reap zombies from local domain. */ static void thread_reap(void) { struct thread_domain_data *tdd; int domain; domain = PCPU_GET(domain); tdd = &thread_domain_data[domain]; thread_reap_domain(tdd); } static void thread_reap_task_cb(void *arg __unused, int pending __unused) { thread_reap_all(); } static void thread_reap_callout_cb(void *arg __unused) { struct thread_domain_data *tdd; int i, cticks, lticks; bool wantreap; wantreap = false; cticks = atomic_load_int(&ticks); for (i = 0; i < vm_ndomains; i++) { tdd = &thread_domain_data[i]; lticks = tdd->tdd_reapticks; if (tdd->tdd_zombies != NULL && (u_int)(cticks - lticks) > 5 * hz) { wantreap = true; break; } } if (wantreap) taskqueue_enqueue(taskqueue_thread, &thread_reap_task); callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL); } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; lwpid_t tid; if (!thread_count_inc()) { return (NULL); } tid = tid_alloc(); td = uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); tid_free(tid); thread_count_dec(); return (NULL); } td->td_tid = tid; cpu_thread_alloc(td); EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ static void thread_free_batched(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); callout_drain(&td->td_slpcallout); /* * Freeing handled by the caller. 
*/ td->td_tid = -1; uma_zfree(thread_zone, td); } void thread_free(struct thread *td) { lwpid_t tid; EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td); tid = td->td_tid; thread_free_batched(td); tid_free(tid); thread_count_dec(); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_realucred = crcowget(p->p_ucred); newtd->td_ucred = newtd->td_realucred; newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { MPASS(td->td_realucred == td->td_ucred); newtd->td_realucred = crcowget(td->td_realucred); newtd->td_ucred = newtd->td_realucred; newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_realucred != NULL) crcowfree(td); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldlimit = NULL; PROC_LOCK(p); oldcred = crcowsync(); if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); SDT_PROBE0(proc, , , lwp__exit); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); MPASS(td->td_realucred == td->td_ucred); /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. 
*/ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) { PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT, NULL); } else if (PMC_SYSTEM_SAMPLING_ACTIVE()) PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); VM_CNT_INC(v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg_locked(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); - td->td_state = TDS_INACTIVE; + TD_SET_STATE(td, TDS_INACTIVE); #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ - td->td_state = TDS_INACTIVE; + TD_SET_STATE(td, TDS_INACTIVE); td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); #ifdef EPOCH_TRACE SLIST_INIT(&td->td_epochs); #endif sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef EPOCH_TRACE MPASS(SLIST_EMPTY(&td->td_epochs)); #endif TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; /* * Since the thread lock is dropped by the scheduler we have * to retry to check for races. 
*/ restart: switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) { wakeup_swapper |= thread_unsuspend_one(td2, p, true); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { wakeup_swapper |= sleepq_abort(td2, EINTR); return (wakeup_swapper); } break; case SINGLE_BOUNDARY: case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) { wakeup_swapper |= thread_unsuspend_one(td2, p, false); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { wakeup_swapper |= sleepq_abort(td2, ERESTART); return (wakeup_swapper); } break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) { wakeup_swapper |= thread_unsuspend_one(td2, p, false); thread_lock(td2); goto restart; } if (TD_CAN_ABORT(td2)) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); return (wakeup_swapper); } } break; default: break; } thread_unlock(td2); return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? 
*/ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); thread_unlock(td2); #endif } else thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. 
* * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * It is safe to access p->p_singlethread unlocked * because it can only be set to our address by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND); PROC_LOCK(p); } return (0); } /* * Check for possible stops and suspensions while executing a * casueword or similar transiently failing operation. * * The sleep argument controls whether the function can handle a stop * request itself or it should return ERESTART and the request is * proceed at the kernel/user boundary in ast. * * Typically, when retrying due to casueword(9) failure (rv == 1), we * should handle the stop requests there, with exception of cases when * the thread owns a kernel resource, for instance busied the umtx * key, or when functions return immediately if thread_check_susp() * returned non-zero. 
On the other hand, retrying the whole lock * operation, we better not stop there but delegate the handling to * ast. * * If the request is for thread termination P_SINGLE_EXIT, we cannot * handle it at all, and simply return EINTR. */ int thread_check_susp(struct thread *td, bool sleep) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) error = sleep ? thread_suspend_check(0) : ERESTART; PROC_UNLOCK(p); return (error); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td, 0)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } else thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. 
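 */

/*
 * Sketch of the full single-threading bracket as a caller such as
 * execve() or exit1() uses it; the wrapper function and its error
 * handling are hypothetical, but the locking protocol (proc lock held
 * across both calls) follows the assertions in thread_single() above
 * and thread_single_end() below.
 */
static int
single_thread_bracket_sketch(struct proc *p)
{

	PROC_LOCK(p);
	if (thread_single(p, SINGLE_BOUNDARY) != 0) {
		/* Another thread is already single-threading; back out. */
		PROC_UNLOCK(p);
		return (ERESTART);
	}
	/*
	 * Every other thread is now held at the kernel/user boundary, so
	 * process-wide state can be torn down or replaced.
	 */
	thread_single_end(p, SINGLE_BOUNDARY);
	PROC_UNLOCK(p);
	return (0);
}

/*
 * The caller must pass the same mode to thread_single_end() that it gave
 * to thread_single(); the KASSERTs below cross-check the mode against the
 * P_SINGLE_BOUNDARY and P_TOTAL_STOP flags left by thread_single().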
*/ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } else thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } /* * Locate a thread by number and return with proc lock held. * * thread exit establishes proc -> tidhash lock ordering, but lookup * takes tidhash first and needs to return locked proc. * * The problem is worked around by relying on type-safety of both * structures and doing the work in 2 steps: * - tidhash-locked lookup which saves both thread and proc pointers * - proc-locked verification that the found thread still matches */ static bool tdfind_hash(lwpid_t tid, pid_t pid, struct proc **pp, struct thread **tdp) { #define RUN_THRESH 16 struct proc *p; struct thread *td; int run; bool locked; run = 0; rw_rlock(TIDHASHLOCK(tid)); locked = true; LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid != tid) { run++; continue; } p = td->td_proc; if (pid != -1 && p->p_pid != pid) { td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(TIDHASHLOCK(tid))) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(TIDHASHLOCK(tid)); locked = false; break; } } break; } if (locked) rw_runlock(TIDHASHLOCK(tid)); if (td == NULL) return (false); *pp = p; *tdp = td; return (true); } struct thread * tdfind(lwpid_t tid, pid_t pid) { struct proc *p; struct thread *td; td = curthread; if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) return (NULL); PROC_LOCK(td->td_proc); return (td); } for (;;) { if (!tdfind_hash(tid, pid, &p, &td)) return (NULL); PROC_LOCK(p); if (td->td_tid != tid) { PROC_UNLOCK(p); continue; } if (td->td_proc != p) { PROC_UNLOCK(p); continue; } if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); return (NULL); } return (td); } } void tidhash_add(struct thread *td) { rw_wlock(TIDHASHLOCK(td->td_tid)); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(TIDHASHLOCK(td->td_tid)); } void tidhash_remove(struct thread *td) { rw_wlock(TIDHASHLOCK(td->td_tid)); LIST_REMOVE(td, td_hash); rw_wunlock(TIDHASHLOCK(td->td_tid)); } diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 7b44e12ef330..7ce9bd81704f 100644 --- a/sys/kern/sched_4bsd.c +++ 
b/sys/kern/sched_4bsd.c @@ -1,1797 +1,1797 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int __read_mostly dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #ifdef SMP #define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus) #else #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #endif #define NICE_WEIGHT 1 /* Priorities per nice level. */ #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) /* * The schedulable entity that runs a context. * This is an extension to the thread structure and is tailored to * the requirements of this scheduler. * All fields are protected by the scheduler lock. */ struct td_sched { fixpt_t ts_pctcpu; /* %cpu during p_swtime. */ u_int ts_estcpu; /* Estimated cpu utilization. */ int ts_cpticks; /* Ticks of cpu time. */ int ts_slptime; /* Seconds !RUNNING. */ int ts_slice; /* Remaining part of time slice. 
*/ int ts_flags; struct runq *ts_runq; /* runq the thread is currently on */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in td_flags */ #define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */ #define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* flags kept in ts_flags */ #define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */ #define SKE_RUNQ_PCPU(ts) \ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); static struct mtx sched_lock; static int realstathz = 127; /* stathz is sometimes 0 and run off of hz. */ static int sched_tdcnt; /* Total runnable threads in the system. */ static int sched_slice = 12; /* Thread run time before rescheduling. */ static void setup_runqs(void); static void schedcpu(void); static void schedcpu_thread(void); static void sched_priority(struct thread *td, u_char prio); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct thread *td); static void resetpriority(struct thread *td); static void resetpriority_thread(struct thread *td); #ifdef SMP static int sched_pickcpu(struct thread *td); static int forward_wakeup(int cpunum); static void kick_other_cpu(int pri, int cpuid); #endif static struct kproc_desc sched_kp = { "schedcpu", schedcpu_thread, NULL }; SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start, &sched_kp); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); /* * Global run queue. 
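 */

/*
 * Worked example of what the defaults declared above mean in wall-clock
 * terms; the function is illustrative only and mirrors the conversion
 * done by the kern.sched.quantum sysctl handler further down.
 */
static int
sched_quantum_usec_sketch(void)
{
	int period, quantum;

	period = 1000000 / realstathz;	/* one stathz tick: ~7874 us at 127 Hz */
	quantum = period * sched_slice;	/* 12 ticks -> ~94 ms, i.e. roughly 100 ms */
	return (quantum);
}

/*
 * sched_initticks() later recomputes sched_slice as realstathz / 10 once
 * the real clock rates are known, so the slice stays at about a tenth of a
 * second regardless of the statclock frequency.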
*/ static struct runq runq; #ifdef SMP /* * Per-CPU run queues */ static struct runq runq_pcpu[MAXCPU]; long runq_length[MAXCPU]; static cpuset_t idle_cpus_mask; #endif struct pcpuidlestat { u_int idlecalls; u_int oldidlecalls; }; DPCPU_DEFINE_STATIC(struct pcpuidlestat, idlestat); static void setup_runqs(void) { #ifdef SMP int i; for (i = 0; i < MAXCPU; ++i) runq_init(&runq_pcpu[i]); #endif runq_init(&runq); } static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); #ifdef SMP /* Enable forwarding of wakeups to all other cpus */ static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Kernel SMP"); static int runq_fuzz = 1; SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, ""); static int forward_wakeup_enabled = 1; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW, &forward_wakeup_enabled, 0, "Forwarding of wakeup to idle CPUs"); static int forward_wakeups_requested = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD, &forward_wakeups_requested, 0, "Requests for Forwarding of wakeup to idle CPUs"); static int forward_wakeups_delivered = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD, &forward_wakeups_delivered, 0, "Completed Forwarding of wakeup to idle CPUs"); static int forward_wakeup_use_mask = 1; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW, &forward_wakeup_use_mask, 0, "Use the mask of idle cpus"); static int forward_wakeup_use_loop = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW, &forward_wakeup_use_loop, 0, "Use a loop to find idle cpus"); #endif #if 0 static int sched_followon = 0; SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW, &sched_followon, 0, "allow threads to share a quantum"); #endif SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); static __inline void sched_load_add(void) { sched_tdcnt++; KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt); } static __inline void sched_load_rem(void) { sched_tdcnt--; 
KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt); } /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * This function is called when a thread is about to be put on run queue * because it has been made runnable or its priority has been adjusted. It * determines if the new thread should preempt the current thread. If so, * it sets td_owepreempt to request a preemption. */ int maybe_preempt(struct thread *td) { #ifdef PREEMPTION struct thread *ctd; int cpri, pri; /* * The new thread should not preempt the current thread if any of the * following conditions are true: * * - The kernel is in the throes of crashing (panicstr). * - The current thread has a higher (numerically lower) or * equivalent priority. Note that this prevents curthread from * trying to preempt to itself. * - The current thread has an inhibitor set or is in the process of * exiting. In this case, the current thread is about to switch * out anyways, so there's no point in preempting. If we did, * the current thread would not be properly resumed as well, so * just avoid that whole landmine. * - If the new thread's priority is not a realtime priority and * the current thread's priority is not an idle priority and * FULL_PREEMPTION is disabled. * * If all of these conditions are false, but the current thread is in * a nested critical section, then we have to defer the preemption * until we exit the critical section. Otherwise, switch immediately * to the new thread. */ ctd = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("maybe_preempt: trying to run inhibited thread")); pri = td->td_priority; cpri = ctd->td_priority; if (KERNEL_PANICKED() || pri >= cpri /* || dumping */ || TD_IS_INHIBITED(ctd)) return (0); #ifndef FULL_PREEMPTION if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE) return (0); #endif CTR0(KTR_PROC, "maybe_preempt: scheduling preemption"); ctd->td_owepreempt = 1; return (1); #else return (0); #endif } /* * Constants for digital decay and forget: * 90% of (ts_estcpu) usage in 5 * loadav time * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates ts_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * ts_estcpu *= decay; * will compute * ts_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. 
* For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "Decay factor used for updating %CPU"); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void) { fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct td_sched *ts; int awake; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { awake = 0; ts = td_get_sched(td); thread_lock(td); /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* * The td_sched slptimes are not touched in wakeup * because the thread may not HAVE everything in * memory? XXX I think this is out of date. */ if (TD_ON_RUNQ(td)) { awake = 1; td->td_flags &= ~TDF_DIDRUN; } else if (TD_IS_RUNNING(td)) { awake = 1; /* Do not clear TDF_DIDRUN */ } else if (td->td_flags & TDF_DIDRUN) { awake = 1; td->td_flags &= ~TDF_DIDRUN; } /* * ts_pctcpu is only for ps and ttyinfo(). */ ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT; /* * If the td_sched has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) ts->ts_pctcpu += (realstathz == 100) ? ((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ts->ts_pctcpu += ((FSCALE - ccpu) * (ts->ts_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ts->ts_cpticks = 0; } /* * If there are ANY running threads in this process, * then don't count it as sleeping. * XXX: this is broken. 
*/ if (awake) { if (ts->ts_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(td); } ts->ts_slptime = 0; } else ts->ts_slptime++; if (ts->ts_slptime > 1) { thread_unlock(td); continue; } ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu); resetpriority(td); resetpriority_thread(td); thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } /* * Main loop for a kthread that executes schedcpu once a second. */ static void schedcpu_thread(void) { for (;;) { schedcpu(); pause("-", hz); } } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at * least six times the loadfactor will decay ts_estcpu to zero. */ static void updatepri(struct thread *td) { struct td_sched *ts; fixpt_t loadfac; unsigned int newcpu; ts = td_get_sched(td); loadfac = loadfactor(averunnable.ldavg[0]); if (ts->ts_slptime > 5 * loadfac) ts->ts_estcpu = 0; else { newcpu = ts->ts_estcpu; ts->ts_slptime--; /* was incremented in schedcpu() */ while (newcpu && --ts->ts_slptime) newcpu = decay_cpu(loadfac, newcpu); ts->ts_estcpu = newcpu; } } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct thread *td) { u_int newpriority; if (td->td_pri_class != PRI_TIMESHARE) return; newpriority = PUSER + td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); sched_user_prio(td, newpriority); } /* * Update the thread's priority when the associated process's user * priority changes. */ static void resetpriority_thread(struct thread *td) { /* Only change threads with a time sharing user priority. */ if (td->td_priority < PRI_MIN_TIMESHARE || td->td_priority > PRI_MAX_TIMESHARE) return; /* XXX the whole needresched thing is broken, but not silly. */ maybe_resched(td); sched_prio(td, td->td_user_pri); } /* ARGSUSED */ static void sched_setup(void *dummy) { setup_runqs(); /* Account for thread0. */ sched_load_add(); } /* * This routine determines time constants after stathz and hz are setup. */ static void sched_initticks(void *dummy) { realstathz = stathz ? stathz : hz; sched_slice = realstathz / 10; /* ~100ms */ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); } /* External interfaces start here */ /* * Very early in the boot some setup of scheduler-specific * parts of proc0 and of some scheduler resources needs to be done. * Called from: * proc0_init() */ void schedinit(void) { /* * Set up the scheduler specific parts of thread0. */ thread0.td_lock = &sched_lock; td_get_sched(&thread0)->ts_slice = sched_slice; mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN); } int sched_runnable(void) { #ifdef SMP return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]); #else return runq_check(&runq); #endif } int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * We adjust the priority of the current process. The priority of a * process gets worse as it accumulates CPU time. The cpu usage * estimator (ts_estcpu) is increased here. 
resetpriority() will * compute a different priority each time ts_estcpu increases by * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached). The * cpu usage estimator ramps up quite quickly when the process is * running (linearly), and decays away exponentially, at a rate which * is proportionally slower when the system is busy. The basic * principle is that the system will 90% forget that the process used * a lot of CPU time in 5 * loadav seconds. This causes the system to * favor processes which haven't run much recently, and to round-robin * among other processes. */ static void sched_clock_tick(struct thread *td) { struct pcpuidlestat *stat; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); ts->ts_cpticks++; ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1); if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(td); resetpriority_thread(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) { ts->ts_slice = sched_slice; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } stat = DPCPU_PTR(idlestat); stat->oldidlecalls = stat->idlecalls; stat->idlecalls = 0; } void sched_clock(struct thread *td, int cnt) { for ( ; cnt > 0; cnt--) sched_clock_tick(td); } /* * Charge child's scheduling CPU usage to parent. */ void sched_exit(struct proc *p, struct thread *td) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit", "prio:%d", td->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); } void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit", "prio:%d", child->td_priority); thread_lock(td); td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu + td_get_sched(child)->ts_estcpu); thread_unlock(td); thread_lock(child); if ((child->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); thread_unlock(child); } void sched_fork(struct thread *td, struct thread *childtd) { sched_fork_thread(td, childtd); } void sched_fork_thread(struct thread *td, struct thread *childtd) { struct td_sched *ts, *tsc; childtd->td_oncpu = NOCPU; childtd->td_lastcpu = NOCPU; childtd->td_lock = &sched_lock; childtd->td_cpuset = cpuset_ref(td->td_cpuset); childtd->td_domain.dr_policy = td->td_cpuset->cs_domain; childtd->td_priority = childtd->td_base_pri; ts = td_get_sched(childtd); bzero(ts, sizeof(*ts)); tsc = td_get_sched(td); ts->ts_estcpu = tsc->ts_estcpu; ts->ts_flags |= (tsc->ts_flags & TSF_AFFINITY); ts->ts_slice = 1; } void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); resetpriority(td); resetpriority_thread(td); thread_unlock(td); } } void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } /* * Adjust the priority of a thread. 
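 */

/*
 * Worked form of the calculation resetpriority() performs above; the
 * function and its arguments are illustrative, the formula and the
 * clamping are the same.
 */
static u_char
timeshare_prio_sketch(u_int estcpu, int nice)
{
	u_int newpriority;

	newpriority = PUSER + estcpu / INVERSE_ESTCPU_WEIGHT +
	    NICE_WEIGHT * (nice - PRIO_MIN);
	/* Keep the result inside the timeshare band. */
	newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
	    PRI_MAX_TIMESHARE);
	return (newpriority);
}

/*
 * With NICE_WEIGHT at 1 and the uniprocessor INVERSE_ESTCPU_WEIGHT of 8,
 * roughly every eight statclock ticks of accumulated ts_estcpu, or one
 * step of nice, costs the thread one priority level within the timeshare
 * band, which is how CPU hogs gradually lose out to interactive threads.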
*/ static void sched_priority(struct thread *td, u_char prio) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio > td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; td->td_priority = prio; if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) { sched_rem(td); sched_add(td, SRQ_BORING | SRQ_HOLDTD); } } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regulary priority is less * important than prio the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_prio(td, base_pri); } else sched_lend_prio(td, prio); } void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't ever * lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } void sched_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } /* * Like the above but first check if there is anything to do. 
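 */

/*
 * Simplified sketch of the priority lend/unlend protocol built on
 * sched_lend_prio() and sched_unlend_prio() above.  The two wrapper
 * functions and the way the best remaining waiter priority is obtained
 * are hypothetical; in the kernel the turnstile code performs this
 * bookkeeping when a thread blocks on or releases a contested lock.
 */
static void
lend_prio_sketch(struct thread *owner, u_char waiter_prio)
{

	thread_lock(owner);
	if (waiter_prio < owner->td_priority)
		sched_lend_prio(owner, waiter_prio);
	thread_unlock(owner);
}

static void
unlend_prio_sketch(struct thread *owner, u_char best_waiter_prio)
{

	/*
	 * best_waiter_prio is the most important priority still waiting on
	 * locks the owner holds (or PRI_MAX if none); sched_unlend_prio()
	 * keeps the boost only while it is still needed.
	 */
	thread_lock(owner);
	sched_unlend_prio(owner, best_waiter_prio);
	thread_unlock(owner);
}

/*
 * Lower numeric values are more important, so lending happens only when
 * the waiter outranks the lock owner, and TDF_BORROWING marks the owner
 * until sched_unlend_prio() decides the borrowed priority can be dropped.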
*/ void sched_lend_user_prio_cond(struct thread *td, u_char prio) { if (td->td_lend_user_pri != prio) goto lend; if (td->td_user_pri != min(prio, td->td_base_user_pri)) goto lend; if (td->td_priority != td->td_user_pri) goto lend; return; lend: thread_lock(td); sched_lend_user_prio(td, prio); thread_unlock(td); } void sched_sleep(struct thread *td, int pri) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; td_get_sched(td)->ts_slptime = 0; if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); if (TD_IS_SUSPENDED(td) || pri >= PSOCK) td->td_flags |= TDF_CANSWAP; } void sched_switch(struct thread *td, int flags) { struct thread *newtd; struct mtx *tmtx; struct td_sched *ts; struct proc *p; int preempted; tmtx = &sched_lock; ts = td_get_sched(td); p = td->td_proc; THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lastcpu = td->td_oncpu; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); td->td_owepreempt = 0; td->td_oncpu = NOCPU; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. We never put the idle * threads on the run queue, however. */ if (td->td_flags & TDF_IDLETD) { TD_SET_CAN_RUN(td); #ifdef SMP CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask); #endif } else { if (TD_IS_RUNNING(td)) { /* Put us back on the run queue. */ sched_add(td, preempted ? SRQ_HOLDTD|SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_HOLDTD|SRQ_OURSELF|SRQ_YIELDING); } } /* * Switch to the sched lock to fix things up and pick * a new thread. Block the td_lock in order to avoid * breaking the critical path. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); tmtx = thread_lock_block(td); mtx_unlock_spin(tmtx); } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); newtd = choosethread(); MPASS(newtd->td_lock == &sched_lock); #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); /* I feel sleepy */ lock_profile_release_lock(&sched_lock.lock_object); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif cpu_switch(td, newtd, tmtx); lock_profile_obtain_lock_success(&sched_lock.lock_object, 0, 0, __FILE__, __LINE__); /* * Where am I? What year is it? * We are in the same thread that went to sleep above, * but any amount of time may have passed. All our context * will still be available as will local variables. * PCPU values however may have changed as we may have * changed CPU so don't trust cached values of them. * New threads will go to fork_exit() instead of here * so if you change things here you may need to change * things there too. * * If the thread above was exiting it will never wake * up again here, so either it has saved everything it * needed to, or the thread_wait() or wait() will * need to reap it. 
*/ SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { td->td_lock = &sched_lock; SDT_PROBE0(sched, , , remain__cpu); } KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); #ifdef SMP if (td->td_flags & TDF_IDLETD) CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask); #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); spinlock_enter(); mtx_unlock_spin(&sched_lock); } void sched_wakeup(struct thread *td, int srqflags) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; if (ts->ts_slptime > 1) { updatepri(td); resetpriority(td); } td->td_slptick = 0; ts->ts_slptime = 0; ts->ts_slice = sched_slice; sched_add(td, srqflags); } #ifdef SMP static int forward_wakeup(int cpunum) { struct pcpu *pc; cpuset_t dontuse, map, map2; u_int id, me; int iscpuset; mtx_assert(&sched_lock, MA_OWNED); CTR0(KTR_RUNQ, "forward_wakeup()"); if ((!forward_wakeup_enabled) || (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0)) return (0); if (!smp_started || KERNEL_PANICKED()) return (0); forward_wakeups_requested++; /* * Check the idle mask we received against what we calculated * before in the old version. */ me = PCPU_GET(cpuid); /* Don't bother if we should be doing it ourself. */ if (CPU_ISSET(me, &idle_cpus_mask) && (cpunum == NOCPU || me == cpunum)) return (0); CPU_SETOF(me, &dontuse); CPU_OR(&dontuse, &stopped_cpus); CPU_OR(&dontuse, &hlt_cpus_mask); CPU_ZERO(&map2); if (forward_wakeup_use_loop) { STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { id = pc->pc_cpuid; if (!CPU_ISSET(id, &dontuse) && pc->pc_curthread == pc->pc_idlethread) { CPU_SET(id, &map2); } } } if (forward_wakeup_use_mask) { map = idle_cpus_mask; CPU_ANDNOT(&map, &dontuse); /* If they are both on, compare and use loop if different. */ if (forward_wakeup_use_loop) { if (CPU_CMP(&map, &map2)) { printf("map != map2, loop method preferred\n"); map = map2; } } } else { map = map2; } /* If we only allow a specific CPU, then mask off all the others. */ if (cpunum != NOCPU) { KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum.")); iscpuset = CPU_ISSET(cpunum, &map); if (iscpuset == 0) CPU_ZERO(&map); else CPU_SETOF(cpunum, &map); } if (!CPU_EMPTY(&map)) { forward_wakeups_delivered++; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { id = pc->pc_cpuid; if (!CPU_ISSET(id, &map)) continue; if (cpu_idle_wakeup(pc->pc_cpuid)) CPU_CLR(id, &map); } if (!CPU_EMPTY(&map)) ipi_selected(map, IPI_AST); return (1); } if (cpunum == NOCPU) printf("forward_wakeup: Idle processor not found\n"); return (0); } static void kick_other_cpu(int pri, int cpuid) { struct pcpu *pcpu; int cpri; pcpu = pcpu_find(cpuid); if (CPU_ISSET(cpuid, &idle_cpus_mask)) { forward_wakeups_delivered++; if (!cpu_idle_wakeup(cpuid)) ipi_cpu(cpuid, IPI_AST); return; } cpri = pcpu->pc_curthread->td_priority; if (pri >= cpri) return; #if defined(IPI_PREEMPTION) && defined(PREEMPTION) #if !defined(FULL_PREEMPTION) if (pri <= PRI_MAX_ITHD) #endif /* ! 
FULL_PREEMPTION */ { ipi_cpu(cpuid, IPI_PREEMPT); return; } #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */ pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED; ipi_cpu(cpuid, IPI_AST); return; } #endif /* SMP */ #ifdef SMP static int sched_pickcpu(struct thread *td) { int best, cpu; mtx_assert(&sched_lock, MA_OWNED); if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu)) best = td->td_lastcpu; else best = NOCPU; CPU_FOREACH(cpu) { if (!THREAD_CAN_SCHED(td, cpu)) continue; if (best == NOCPU) best = cpu; else if (runq_length[cpu] < runq_length[best]) best = cpu; } KASSERT(best != NOCPU, ("no valid CPUs")); return (best); } #endif void sched_add(struct thread *td, int flags) #ifdef SMP { cpuset_t tidlemsk; struct td_sched *ts; u_int cpu, cpuid; int forwarded = 0; int single_cpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); if ((flags & SRQ_HOLD) != 0) td->td_lock = &sched_lock; else thread_lock_set(td, &sched_lock); } TD_SET_RUNQ(td); /* * If SMP is started and the thread is pinned or otherwise limited to * a specific set of CPUs, queue the thread to a per-CPU run queue. * Otherwise, queue the thread to the global run queue. * * If SMP has not yet been started we must use the global run queue * as per-CPU state may not be initialized yet and we may crash if we * try to access the per-CPU run queues. */ if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND || ts->ts_flags & TSF_AFFINITY)) { if (td->td_pinned != 0) cpu = td->td_lastcpu; else if (td->td_flags & TDF_BOUND) { /* Find CPU from bound runq. 
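 * Illustrative aside (hypothetical toy_* names): the CPU number is recovered
 * just below by plain pointer subtraction against the base of the per-CPU
 * run-queue array, e.g.:
 *
 *        #include <stddef.h>
 *        #define TOY_MAXCPU      8
 *        struct toy_runq { int dummy; };
 *        static struct toy_runq toy_runq_pcpu[TOY_MAXCPU];
 *        static size_t
 *        toy_cpu_of(const struct toy_runq *rq)
 *        {
 *                // &toy_runq_pcpu[3] yields 3, i.e. the owning CPU's index.
 *                return ((size_t)(rq - &toy_runq_pcpu[0]));
 *        }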
*/ KASSERT(SKE_RUNQ_PCPU(ts), ("sched_add: bound td_sched not on cpu runq")); cpu = ts->ts_runq - &runq_pcpu[0]; } else /* Find a valid CPU for our cpuset */ cpu = sched_pickcpu(td); ts->ts_runq = &runq_pcpu[cpu]; single_cpu = 1; CTR3(KTR_RUNQ, "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu); } else { CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td); cpu = NOCPU; ts->ts_runq = &runq; } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_add(); runq_add(ts->ts_runq, td, flags); if (cpu != NOCPU) runq_length[cpu]++; cpuid = PCPU_GET(cpuid); if (single_cpu && cpu != cpuid) { kick_other_cpu(td->td_priority, cpu); } else { if (!single_cpu) { tidlemsk = idle_cpus_mask; CPU_ANDNOT(&tidlemsk, &hlt_cpus_mask); CPU_CLR(cpuid, &tidlemsk); if (!CPU_ISSET(cpuid, &idle_cpus_mask) && ((flags & SRQ_INTR) == 0) && !CPU_EMPTY(&tidlemsk)) forwarded = forward_wakeup(cpu); } if (!forwarded) { if (!maybe_preempt(td)) maybe_resched(td); } } if ((flags & SRQ_HOLDTD) == 0) thread_unlock(td); } #else /* SMP */ { struct td_sched *ts; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); if ((flags & SRQ_HOLD) != 0) td->td_lock = &sched_lock; else thread_lock_set(td, &sched_lock); } TD_SET_RUNQ(td); CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); ts->ts_runq = &runq; if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_add(); runq_add(ts->ts_runq, td, flags); if (!maybe_preempt(td)) maybe_resched(td); if ((flags & SRQ_HOLDTD) == 0) thread_unlock(td); } #endif /* SMP */ void sched_rem(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); KASSERT(td->td_flags & TDF_INMEM, ("sched_rem: thread swapped out")); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); mtx_assert(&sched_lock, MA_OWNED); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); #ifdef SMP if (ts->ts_runq != &runq) runq_length[ts->ts_runq - runq_pcpu]--; #endif runq_remove(ts->ts_runq, td); TD_SET_CAN_RUN(td); } /* * Select threads to run. Note that running threads still consume a * slot. 
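 * Rough sketch of the selection rule used by sched_choose() below (lower
 * numeric priority is more important; toy_pick() and the -1 "idle" marker are
 * inventions for illustration only):
 *
 *        #include <stdbool.h>
 *        static int
 *        toy_pick(int g_prio, bool g_empty, int p_prio, bool p_empty)
 *        {
 *                if (g_empty)                    // nothing global: local or idle
 *                        return (p_empty ? -1 : p_prio);
 *                if (!p_empty && p_prio < g_prio) // strictly better local thread
 *                        return (p_prio);
 *                return (g_prio);                // otherwise prefer the global queue
 *        }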
*/ struct thread * sched_choose(void) { struct thread *td; struct runq *rq; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP struct thread *tdcpu; rq = &runq; td = runq_choose_fuzz(&runq, runq_fuzz); tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]); if (td == NULL || (tdcpu != NULL && tdcpu->td_priority < td->td_priority)) { CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu, PCPU_GET(cpuid)); td = tdcpu; rq = &runq_pcpu[PCPU_GET(cpuid)]; } else { CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td); } #else rq = &runq; td = runq_choose(&runq); #endif if (td) { #ifdef SMP if (td == tdcpu) runq_length[PCPU_GET(cpuid)]--; #endif runq_remove(rq, td); td->td_flags |= TDF_DIDRUN; KASSERT(td->td_flags & TDF_INMEM, ("sched_choose: thread swapped out")); return (td); } return (PCPU_GET(idlethread)); } void sched_preempt(struct thread *td) { SDT_PROBE2(sched, , , surrender, td, td->td_proc); if (td->td_critnest > 1) { td->td_owepreempt = 1; } else { thread_lock(td); mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT); } } void sched_userret_slowpath(struct thread *td) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; thread_unlock(td); } void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); td->td_flags |= TDF_BOUND; #ifdef SMP ts->ts_runq = &runq_pcpu[cpu]; if (PCPU_GET(cpuid) == cpu) return; mi_switch(SW_VOL); thread_lock(td); #endif } void sched_unbind(struct thread* td) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); td->td_flags &= ~TDF_BOUND; } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_flags & TDF_BOUND); } void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH); } int sched_load(void) { return (sched_tdcnt); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } fixpt_t sched_pctcpu(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); return (ts->ts_pctcpu); } #ifdef RACCT /* * Calculates the contribution to the thread cpu usage for the latest * (unfinished) second. */ fixpt_t sched_pctcpu_delta(struct thread *td) { struct td_sched *ts; fixpt_t delta; int realstathz; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); delta = 0; realstathz = stathz ? stathz : hz; if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) delta = (realstathz == 100) ? ((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else delta = ((FSCALE - ccpu) * (ts->ts_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif } return (delta); } #endif u_int sched_estcpu(struct thread *td) { return (td_get_sched(td)->ts_estcpu); } /* * The actual idle process. */ void sched_idletd(void *dummy) { struct pcpuidlestat *stat; THREAD_NO_SLEEPING(); stat = DPCPU_PTR(idlestat); for (;;) { mtx_assert(&Giant, MA_NOTOWNED); while (sched_runnable() == 0) { cpu_idle(stat->idlecalls + stat->oldidlecalls > 64); stat->idlecalls++; } mtx_lock_spin(&sched_lock); mi_switch(SW_VOL | SWT_IDLE); } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { /* * Correct spinlock nesting. 
The idle thread context that we are * borrowing was created so that it would start out with a single * spin lock (sched_lock) held in fork_trampoline(). Since we've * explicitly acquired locks in this function, the nesting count * is now 2 rather than 1. Since we are nested, calling * spinlock_exit() will simply adjust the counts without allowing * spin lock using code to interrupt us. */ if (td == NULL) { mtx_lock_spin(&sched_lock); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } else { lock_profile_release_lock(&sched_lock.lock_object); MPASS(td->td_lock == &sched_lock); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; } mtx_assert(&sched_lock, MA_OWNED); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); cpu_throw(td, choosethread()); /* doesn't return */ } void sched_fork_exit(struct thread *td) { /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with sched_lock held but not recursed. */ td->td_oncpu = PCPU_GET(cpuid); sched_lock.mtx_lock = (uintptr_t)td; lock_profile_obtain_lock_success(&sched_lock.lock_object, 0, 0, __FILE__, __LINE__); THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; int cpu; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Set the TSF_AFFINITY flag if there is at least one CPU this * thread can't run on. */ ts = td_get_sched(td); ts->ts_flags &= ~TSF_AFFINITY; CPU_FOREACH(cpu) { if (!THREAD_CAN_SCHED(td, cpu)) { ts->ts_flags |= TSF_AFFINITY; break; } } /* * If this thread can run on all CPUs, nothing else to do. */ if (!(ts->ts_flags & TSF_AFFINITY)) return; /* Pinned threads and bound threads should be left alone. */ if (td->td_pinned != 0 || td->td_flags & TDF_BOUND) return; - switch (td->td_state) { + switch (TD_GET_STATE(td)) { case TDS_RUNQ: /* * If we are on a per-CPU runqueue that is in the set, * then nothing needs to be done. */ if (ts->ts_runq != &runq && THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu)) return; /* Put this thread on a valid per-CPU runqueue. */ sched_rem(td); sched_add(td, SRQ_HOLDTD | SRQ_BORING); break; case TDS_RUNNING: /* * See if our current CPU is in the set. If not, force a * context switch. */ if (THREAD_CAN_SCHED(td, td->td_oncpu)) return; td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(cpu, IPI_AST); break; default: break; } #endif } diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c index 2c8f2909f2b3..d966a796c867 100644 --- a/sys/kern/subr_turnstile.c +++ b/sys/kern/subr_turnstile.c @@ -1,1329 +1,1329 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of turnstiles used to hold queue of threads blocked on * non-sleepable locks. Sleepable locks use condition variables to * implement their queues. Turnstiles differ from a sleep queue in that * turnstile queue's are assigned to a lock held by an owning thread. Thus, * when one thread is enqueued onto a turnstile, it can lend its priority * to the owning thread. * * We wish to avoid bloating locks with an embedded turnstile and we do not * want to use back-pointers in the locks for the same reason. Thus, we * use a similar approach to that of Solaris 7 as described in Solaris * Internals by Jim Mauro and Richard McDougall. Turnstiles are looked up * in a hash table based on the address of the lock. Each entry in the * hash table is a linked-lists of turnstiles and is called a turnstile * chain. Each chain contains a spin mutex that protects all of the * turnstiles in the chain. * * Each time a thread is created, a turnstile is allocated from a UMA zone * and attached to that thread. When a thread blocks on a lock, if it is the * first thread to block, it lends its turnstile to the lock. If the lock * already has a turnstile, then it gives its turnstile to the lock's * turnstile's free list. When a thread is woken up, it takes a turnstile from * the free list if there are any other waiters. If it is the only thread * blocked on the lock, then it reclaims the turnstile associated with the lock * and removes it from the hash table. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_turnstile_profiling.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #include #endif /* * Constants for the hash table of turnstile chains. TC_SHIFT is a magic * number chosen because the sleep queue's use the same value for the * shift. Basically, we ignore the lower 8 bits of the address. * TC_TABLESIZE must be a power of two for TC_MASK to work properly. */ #define TC_TABLESIZE 128 /* Must be power of 2. 
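 * For illustration only, the TC_HASH() macro defined below boils down to the
 * following self-contained sketch (invented toy_* names, mirroring the
 * 128-entry table and 8-bit shift used here):
 *
 *        #include <stdint.h>
 *        #define TOY_TABLESIZE   128             // must stay a power of 2
 *        #define TOY_MASK        (TOY_TABLESIZE - 1)
 *        #define TOY_SHIFT       8
 *        static unsigned
 *        toy_tc_hash(const void *lock)
 *        {
 *                // Drop the low 8 bits of the address, then mask into the table.
 *                return ((unsigned)(((uintptr_t)lock >> TOY_SHIFT) & TOY_MASK));
 *        }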
*/ #define TC_MASK (TC_TABLESIZE - 1) #define TC_SHIFT 8 #define TC_HASH(lock) (((uintptr_t)(lock) >> TC_SHIFT) & TC_MASK) #define TC_LOOKUP(lock) &turnstile_chains[TC_HASH(lock)] /* * There are three different lists of turnstiles as follows. The list * connected by ts_link entries is a per-thread list of all the turnstiles * attached to locks that we own. This is used to fixup our priority when * a lock is released. The other two lists use the ts_hash entries. The * first of these two is the turnstile chain list that a turnstile is on * when it is attached to a lock. The second list to use ts_hash is the * free list hung off of a turnstile that is attached to a lock. * * Each turnstile contains three lists of threads. The two ts_blocked lists * are linked list of threads blocked on the turnstile's lock. One list is * for exclusive waiters, and the other is for shared waiters. The * ts_pending list is a linked list of threads previously awakened by * turnstile_signal() or turnstile_wait() that are waiting to be put on * the run queue. * * Locking key: * c - turnstile chain lock * q - td_contested lock */ struct turnstile { struct mtx ts_lock; /* Spin lock for self. */ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */ struct threadqueue ts_pending; /* (c) Pending threads. */ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */ LIST_ENTRY(turnstile) ts_link; /* (q) Contested locks. */ LIST_HEAD(, turnstile) ts_free; /* (c) Free turnstiles. */ struct lock_object *ts_lockobj; /* (c) Lock we reference. */ struct thread *ts_owner; /* (c + q) Who owns the lock. */ }; struct turnstile_chain { LIST_HEAD(, turnstile) tc_turnstiles; /* List of turnstiles. */ struct mtx tc_lock; /* Spin lock for this chain. */ #ifdef TURNSTILE_PROFILING u_int tc_depth; /* Length of tc_queues. */ u_int tc_max_depth; /* Max length of tc_queues. */ #endif }; #ifdef TURNSTILE_PROFILING u_int turnstile_max_depth; static SYSCTL_NODE(_debug, OID_AUTO, turnstile, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "turnstile profiling"); static SYSCTL_NODE(_debug_turnstile, OID_AUTO, chains, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "turnstile chain stats"); SYSCTL_UINT(_debug_turnstile, OID_AUTO, max_depth, CTLFLAG_RD, &turnstile_max_depth, 0, "maximum depth achieved of a single chain"); #endif static struct mtx td_contested_lock; static struct turnstile_chain turnstile_chains[TC_TABLESIZE]; static uma_zone_t turnstile_zone; /* * Prototypes for non-exported routines. 
*/ static void init_turnstile0(void *dummy); #ifdef TURNSTILE_PROFILING static void init_turnstile_profiling(void *arg); #endif static void propagate_priority(struct thread *td); static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td); static struct thread *turnstile_first_waiter(struct turnstile *ts); static void turnstile_setowner(struct turnstile *ts, struct thread *owner); #ifdef INVARIANTS static void turnstile_dtor(void *mem, int size, void *arg); #endif static int turnstile_init(void *mem, int size, int flags); static void turnstile_fini(void *mem, int size); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , sleep); SDT_PROBE_DEFINE2(sched, , , wakeup, "struct thread *", "struct proc *"); static inline void propagate_unlock_ts(struct turnstile *top, struct turnstile *ts) { if (ts != top) mtx_unlock_spin(&ts->ts_lock); } static inline void propagate_unlock_td(struct turnstile *top, struct thread *td) { if (td->td_lock != &top->ts_lock) thread_unlock(td); } /* * Walks the chain of turnstiles and their owners to propagate the priority * of the thread being blocked to all the threads holding locks that have to * release their locks before this thread can run again. */ static void propagate_priority(struct thread *td) { struct turnstile *ts, *top; int pri; THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; top = ts = td->td_blocked; THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* * The original turnstile lock is held across the entire * operation. We only ever lock down the chain so the lock * order is constant. */ for (;;) { td = ts->ts_owner; if (td == NULL) { /* * This might be a read lock with no owner. There's * not much we can do, so just bail. */ propagate_unlock_ts(top, ts); return; } /* * Wait for the thread lock to be stable and then only * acquire if it is not the turnstile lock. */ thread_lock_block_wait(td); if (td->td_lock != &ts->ts_lock) { thread_lock_flags(td, MTX_DUPOK); propagate_unlock_ts(top, ts); } MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); /* * If the thread is asleep, then we are probably about * to deadlock. To make debugging this easier, show * backtrace of misbehaving thread and panic to not * leave the kernel deadlocked. */ if (TD_IS_SLEEPING(td)) { printf( "Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n", td->td_tid, td->td_proc->p_pid); kdb_backtrace_thread(td); panic("sleeping thread"); } /* * If this thread already has higher priority than the * thread that is being blocked, we are finished. */ if (td->td_priority <= pri) { propagate_unlock_td(top, td); return; } /* * Bump this thread's priority. */ sched_lend_prio(td, pri); /* * If lock holder is actually running or on the run queue * then we are done. */ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) { MPASS(td->td_blocked == NULL); propagate_unlock_td(top, td); return; } #ifndef SMP /* * For UP, we check to see if td is curthread (this shouldn't * ever happen however as it would mean we are in a deadlock.) */ KASSERT(td != curthread, ("Deadlock detected")); #endif /* * If we aren't blocked on a lock, we should be. */ KASSERT(TD_ON_LOCK(td), ( "thread %d(%s):%d holds %s but isn't blocked on a lock\n", - td->td_tid, td->td_name, td->td_state, + td->td_tid, td->td_name, TD_GET_STATE(td), ts->ts_lockobj->lo_name)); /* * Pick up the lock that td is blocked on. */ ts = td->td_blocked; MPASS(ts != NULL); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); /* Resort td on the list if needed. 
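 * Illustrative sketch of the "needs resorting" test that
 * turnstile_adjust_thread() applies (hypothetical toy_* types; the real queue
 * is a TAILQ ordered by td_priority, smaller value first):
 *
 *        #include <stdbool.h>
 *        #include <stddef.h>
 *        struct toy_waiter { int prio; };
 *        static bool
 *        toy_misplaced(const struct toy_waiter *prev, const struct toy_waiter *me,
 *            const struct toy_waiter *next)
 *        {
 *                // Out of place if we now sort before our predecessor or
 *                // after our successor.
 *                return ((prev != NULL && me->prio < prev->prio) ||
 *                    (next != NULL && me->prio > next->prio));
 *        }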
*/ if (!turnstile_adjust_thread(ts, td)) { propagate_unlock_ts(top, ts); return; } /* The thread lock is released as ts lock above. */ } } /* * Adjust the thread's position on a turnstile after its priority has been * changed. */ static int turnstile_adjust_thread(struct turnstile *ts, struct thread *td) { struct thread *td1, *td2; int queue; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_LOCK(td)); /* * This thread may not be blocked on this turnstile anymore * but instead might already be woken up on another CPU * that is waiting on the thread lock in turnstile_unpend() to * finish waking this thread up. We can detect this case * by checking to see if this thread has been given a * turnstile by either turnstile_signal() or * turnstile_broadcast(). In this case, treat the thread as * if it was already running. */ if (td->td_turnstile != NULL) return (0); /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ THREAD_LOCKPTR_BLOCKED_ASSERT(td, &ts->ts_lock); td1 = TAILQ_PREV(td, threadqueue, td_lockq); td2 = TAILQ_NEXT(td, td_lockq); if ((td1 != NULL && td->td_priority < td1->td_priority) || (td2 != NULL && td->td_priority > td2->td_priority)) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ queue = td->td_tsqueue; MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE); mtx_lock_spin(&td_contested_lock); TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq); TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) { MPASS(td1->td_proc->p_magic == P_MAGIC); if (td1->td_priority > td->td_priority) break; } if (td1 == NULL) TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); else TAILQ_INSERT_BEFORE(td1, td, td_lockq); mtx_unlock_spin(&td_contested_lock); if (td1 == NULL) CTR3(KTR_LOCK, "turnstile_adjust_thread: td %d put at tail on [%p] %s", td->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name); else CTR4(KTR_LOCK, "turnstile_adjust_thread: td %d moved before %d on [%p] %s", td->td_tid, td1->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name); } return (1); } /* * Early initialization of turnstiles. This is not done via a SYSINIT() * since this needs to be initialized very early when mutexes are first * initialized. 
*/ void init_turnstiles(void) { int i; for (i = 0; i < TC_TABLESIZE; i++) { LIST_INIT(&turnstile_chains[i].tc_turnstiles); mtx_init(&turnstile_chains[i].tc_lock, "turnstile chain", NULL, MTX_SPIN); } mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN); LIST_INIT(&thread0.td_contested); thread0.td_turnstile = NULL; } #ifdef TURNSTILE_PROFILING static void init_turnstile_profiling(void *arg) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < TC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_turnstile_chains), OID_AUTO, chain_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "turnstile chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &turnstile_chains[i].tc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &turnstile_chains[i].tc_max_depth, 0, NULL); } } SYSINIT(turnstile_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile_profiling, NULL); #endif static void init_turnstile0(void *dummy) { turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile), NULL, #ifdef INVARIANTS turnstile_dtor, #else NULL, #endif turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); thread0.td_turnstile = turnstile_alloc(); } SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL); /* * Update a thread on the turnstile list after it's priority has been changed. * The old priority is passed in as an argument. */ void turnstile_adjust(struct thread *td, u_char oldpri) { struct turnstile *ts; MPASS(TD_ON_LOCK(td)); /* * Pick up the lock that td is blocked on. */ ts = td->td_blocked; MPASS(ts != NULL); THREAD_LOCKPTR_BLOCKED_ASSERT(td, &ts->ts_lock); mtx_assert(&ts->ts_lock, MA_OWNED); /* Resort the turnstile on the list. */ if (!turnstile_adjust_thread(ts, td)) return; /* * If our priority was lowered and we are at the head of the * turnstile, then propagate our new priority up the chain. * Note that we currently don't try to revoke lent priorities * when our priority goes up. */ MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE || td->td_tsqueue == TS_SHARED_QUEUE); if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) && td->td_priority < oldpri) { propagate_priority(td); } } /* * Set the owner of the lock this turnstile is attached to. */ static void turnstile_setowner(struct turnstile *ts, struct thread *owner) { mtx_assert(&td_contested_lock, MA_OWNED); MPASS(ts->ts_owner == NULL); /* A shared lock might not have an owner. */ if (owner == NULL) return; MPASS(owner->td_proc->p_magic == P_MAGIC); ts->ts_owner = owner; LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link); } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void turnstile_dtor(void *mem, int size, void *arg) { struct turnstile *ts; ts = mem; MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE])); MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); MPASS(TAILQ_EMPTY(&ts->ts_pending)); } #endif /* * UMA zone item initializer. */ static int turnstile_init(void *mem, int size, int flags) { struct turnstile *ts; bzero(mem, size); ts = mem; TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]); TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]); TAILQ_INIT(&ts->ts_pending); LIST_INIT(&ts->ts_free); mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN); return (0); } static void turnstile_fini(void *mem, int size) { struct turnstile *ts; ts = mem; mtx_destroy(&ts->ts_lock); } /* * Get a turnstile for a new thread. 
*/ struct turnstile * turnstile_alloc(void) { return (uma_zalloc(turnstile_zone, M_WAITOK)); } /* * Free a turnstile when a thread is destroyed. */ void turnstile_free(struct turnstile *ts) { uma_zfree(turnstile_zone, ts); } /* * Lock the turnstile chain associated with the specified lock. */ void turnstile_chain_lock(struct lock_object *lock) { struct turnstile_chain *tc; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); } struct turnstile * turnstile_trywait(struct lock_object *lock) { struct turnstile_chain *tc; struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) { mtx_lock_spin(&ts->ts_lock); return (ts); } ts = curthread->td_turnstile; MPASS(ts != NULL); mtx_lock_spin(&ts->ts_lock); KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer")); ts->ts_lockobj = lock; return (ts); } bool turnstile_lock(struct turnstile *ts, struct lock_object **lockp, struct thread **tdp) { struct turnstile_chain *tc; struct lock_object *lock; if ((lock = ts->ts_lockobj) == NULL) return (false); tc = TC_LOOKUP(lock); mtx_lock_spin(&tc->tc_lock); mtx_lock_spin(&ts->ts_lock); if (__predict_false(lock != ts->ts_lockobj)) { mtx_unlock_spin(&tc->tc_lock); mtx_unlock_spin(&ts->ts_lock); return (false); } *lockp = lock; *tdp = ts->ts_owner; return (true); } void turnstile_unlock(struct turnstile *ts, struct lock_object *lock) { struct turnstile_chain *tc; mtx_assert(&ts->ts_lock, MA_OWNED); mtx_unlock_spin(&ts->ts_lock); if (ts == curthread->td_turnstile) ts->ts_lockobj = NULL; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } void turnstile_assert(struct turnstile *ts) { MPASS(ts->ts_lockobj == NULL); } void turnstile_cancel(struct turnstile *ts) { struct turnstile_chain *tc; struct lock_object *lock; mtx_assert(&ts->ts_lock, MA_OWNED); mtx_unlock_spin(&ts->ts_lock); lock = ts->ts_lockobj; if (ts == curthread->td_turnstile) ts->ts_lockobj = NULL; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } /* * Look up the turnstile for a lock in the hash table locking the associated * turnstile chain along the way. If no turnstile is found in the hash * table, NULL is returned. */ struct turnstile * turnstile_lookup(struct lock_object *lock) { struct turnstile_chain *tc; struct turnstile *ts; tc = TC_LOOKUP(lock); mtx_assert(&tc->tc_lock, MA_OWNED); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) { mtx_lock_spin(&ts->ts_lock); return (ts); } return (NULL); } /* * Unlock the turnstile chain associated with a given lock. */ void turnstile_chain_unlock(struct lock_object *lock) { struct turnstile_chain *tc; tc = TC_LOOKUP(lock); mtx_unlock_spin(&tc->tc_lock); } /* * Return a pointer to the thread waiting on this turnstile with the * most important priority or NULL if the turnstile has no waiters. */ static struct thread * turnstile_first_waiter(struct turnstile *ts) { struct thread *std, *xtd; std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]); xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]); if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority)) return (std); return (xtd); } /* * Take ownership of a turnstile and adjust the priority of the new * owner appropriately. 
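 * As a hedged illustration (toy_* name invented), the adjustment made in
 * turnstile_claim() below amounts to running the new owner at the more
 * important (numerically smaller) of its own priority and that of the most
 * important waiter:
 *
 *        static int
 *        toy_claimed_prio(int owner_prio, int top_waiter_prio)
 *        {
 *                return (top_waiter_prio < owner_prio ? top_waiter_prio : owner_prio);
 *        }
 *        // toy_claimed_prio(120, 80) == 80; toy_claimed_prio(80, 120) == 80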
*/ void turnstile_claim(struct turnstile *ts) { struct thread *td, *owner; struct turnstile_chain *tc; mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts != curthread->td_turnstile); owner = curthread; mtx_lock_spin(&td_contested_lock); turnstile_setowner(ts, owner); mtx_unlock_spin(&td_contested_lock); td = turnstile_first_waiter(ts); MPASS(td != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); THREAD_LOCKPTR_BLOCKED_ASSERT(td, &ts->ts_lock); /* * Update the priority of the new owner if needed. */ thread_lock(owner); if (td->td_priority < owner->td_priority) sched_lend_prio(owner, td->td_priority); thread_unlock(owner); tc = TC_LOOKUP(ts->ts_lockobj); mtx_unlock_spin(&ts->ts_lock); mtx_unlock_spin(&tc->tc_lock); } /* * Block the current thread on the turnstile associated with 'lock'. This * function will context switch and not return until this thread has been * woken back up. This function must be called with the appropriate * turnstile chain locked and will return with it unlocked. */ void turnstile_wait(struct turnstile *ts, struct thread *owner, int queue) { struct turnstile_chain *tc; struct thread *td, *td1; struct lock_object *lock; td = curthread; mtx_assert(&ts->ts_lock, MA_OWNED); if (owner) MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * If the lock does not already have a turnstile, use this thread's * turnstile. Otherwise insert the current thread into the * turnstile already in use by this lock. */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); if (ts == td->td_turnstile) { #ifdef TURNSTILE_PROFILING tc->tc_depth++; if (tc->tc_depth > tc->tc_max_depth) { tc->tc_max_depth = tc->tc_depth; if (tc->tc_max_depth > turnstile_max_depth) turnstile_max_depth = tc->tc_max_depth; } #endif LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash); KASSERT(TAILQ_EMPTY(&ts->ts_pending), ("thread's turnstile has pending threads")); KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]), ("thread's turnstile has exclusive waiters")); KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]), ("thread's turnstile has shared waiters")); KASSERT(LIST_EMPTY(&ts->ts_free), ("thread's turnstile has a non-empty free list")); MPASS(ts->ts_lockobj != NULL); mtx_lock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); turnstile_setowner(ts, owner); mtx_unlock_spin(&td_contested_lock); } else { TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) if (td1->td_priority > td->td_priority) break; mtx_lock_spin(&td_contested_lock); if (td1 != NULL) TAILQ_INSERT_BEFORE(td1, td, td_lockq); else TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq); MPASS(owner == ts->ts_owner); mtx_unlock_spin(&td_contested_lock); MPASS(td->td_turnstile != NULL); LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash); } thread_lock(td); thread_lock_set(td, &ts->ts_lock); td->td_turnstile = NULL; /* Save who we are blocked on and switch.
*/ lock = ts->ts_lockobj; td->td_tsqueue = queue; td->td_blocked = ts; td->td_lockname = lock->lo_name; td->td_blktick = ticks; TD_SET_LOCK(td); mtx_unlock_spin(&tc->tc_lock); propagate_priority(td); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); SDT_PROBE0(sched, , , sleep); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); mi_switch(SW_VOL | SWT_TURNSTILE); if (LOCK_LOG_TEST(lock, 0)) CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s", __func__, td->td_tid, lock, lock->lo_name); } /* * Pick the highest priority thread on this turnstile and put it on the * pending list. This must be called with the turnstile chain locked. */ int turnstile_signal(struct turnstile *ts, int queue) { struct turnstile_chain *tc __unused; struct thread *td; int empty; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * Pick the highest priority thread blocked on this lock and * move it to the pending list. */ td = TAILQ_FIRST(&ts->ts_blocked[queue]); MPASS(td->td_proc->p_magic == P_MAGIC); mtx_lock_spin(&td_contested_lock); TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq); mtx_unlock_spin(&td_contested_lock); TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq); /* * If the turnstile is now empty, remove it from its chain and * give it to the about-to-be-woken thread. Otherwise take a * turnstile from the free list and give it to the thread. */ empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]); if (empty) { tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(LIST_EMPTY(&ts->ts_free)); #ifdef TURNSTILE_PROFILING tc->tc_depth--; #endif } else ts = LIST_FIRST(&ts->ts_free); MPASS(ts != NULL); LIST_REMOVE(ts, ts_hash); td->td_turnstile = ts; return (empty); } /* * Put all blocked threads on the pending list. This must be called with * the turnstile chain locked. */ void turnstile_broadcast(struct turnstile *ts, int queue) { struct turnstile_chain *tc __unused; struct turnstile *ts1; struct thread *td; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(curthread->td_proc->p_magic == P_MAGIC); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); /* * We must have the chain locked so that we can remove the empty * turnstile from the hash queue. */ tc = TC_LOOKUP(ts->ts_lockobj); mtx_assert(&tc->tc_lock, MA_OWNED); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); /* * Transfer the blocked list to the pending list. */ mtx_lock_spin(&td_contested_lock); TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq); mtx_unlock_spin(&td_contested_lock); /* * Give a turnstile to each thread. The last thread gets * this turnstile if the turnstile is empty. 
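 * Worked example (toy model, not kernel code): with N threads blocked the
 * lock's turnstile carries N - 1 spares on ts_free, because every blocker
 * after the first donated its own turnstile there, so each woken thread can
 * leave owning a turnstile again; the lock's own turnstile covers the final
 * waiter once both wait queues are empty:
 *
 *        static int
 *        toy_spare_turnstiles(int nblocked_total)
 *        {
 *                // Each blocker after the first donated its turnstile to ts_free.
 *                return (nblocked_total > 0 ? nblocked_total - 1 : 0);
 *        }
 *        // 3 blocked threads leave 2 spares plus the lock's own turnstile.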
*/ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) { if (LIST_EMPTY(&ts->ts_free)) { MPASS(TAILQ_NEXT(td, td_lockq) == NULL); ts1 = ts; #ifdef TURNSTILE_PROFILING tc->tc_depth--; #endif } else ts1 = LIST_FIRST(&ts->ts_free); MPASS(ts1 != NULL); LIST_REMOVE(ts1, ts_hash); td->td_turnstile = ts1; } } static u_char turnstile_calc_unlend_prio_locked(struct thread *td) { struct turnstile *nts; u_char cp, pri; THREAD_LOCK_ASSERT(td, MA_OWNED); mtx_assert(&td_contested_lock, MA_OWNED); pri = PRI_MAX; LIST_FOREACH(nts, &td->td_contested, ts_link) { cp = turnstile_first_waiter(nts)->td_priority; if (cp < pri) pri = cp; } return (pri); } /* * Wakeup all threads on the pending list and adjust the priority of the * current thread appropriately. This must be called with the turnstile * chain locked. */ void turnstile_unpend(struct turnstile *ts) { TAILQ_HEAD( ,thread) pending_threads; struct thread *td; u_char pri; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL); MPASS(!TAILQ_EMPTY(&ts->ts_pending)); /* * Move the list of pending threads out of the turnstile and * into a local variable. */ TAILQ_INIT(&pending_threads); TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq); #ifdef INVARIANTS if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) && TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])) ts->ts_lockobj = NULL; #endif /* * Adjust the priority of curthread based on other contested * locks it owns. Don't lower the priority below the base * priority however. */ td = curthread; thread_lock(td); mtx_lock_spin(&td_contested_lock); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will * not be blocking on the turnstile until it is claimed by a new * owner. There might not be a current owner if this is a shared * lock. */ if (ts->ts_owner != NULL) { ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); } pri = turnstile_calc_unlend_prio_locked(td); mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); thread_unlock(td); /* * Wake up all the pending threads. If a thread is not blocked * on a lock, then it is currently executing on another CPU in * turnstile_wait() or sitting on a run queue waiting to resume * in turnstile_wait(). Set a flag to force it to try to acquire * the lock again instead of blocking. */ while (!TAILQ_EMPTY(&pending_threads)) { td = TAILQ_FIRST(&pending_threads); TAILQ_REMOVE(&pending_threads, td, td_lockq); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); thread_lock_block_wait(td); THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock); MPASS(td->td_proc->p_magic == P_MAGIC); MPASS(TD_ON_LOCK(td)); TD_CLR_LOCK(td); MPASS(TD_CAN_RUN(td)); td->td_blocked = NULL; td->td_lockname = NULL; td->td_blktick = 0; #ifdef INVARIANTS td->td_tsqueue = 0xff; #endif sched_add(td, SRQ_HOLD | SRQ_BORING); } mtx_unlock_spin(&ts->ts_lock); } /* * Give up ownership of a turnstile. This must be called with the * turnstile chain locked. */ void turnstile_disown(struct turnstile *ts) { struct thread *td; u_char pri; MPASS(ts != NULL); mtx_assert(&ts->ts_lock, MA_OWNED); MPASS(ts->ts_owner == curthread); MPASS(TAILQ_EMPTY(&ts->ts_pending)); MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) || !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE])); /* * Remove the turnstile from this thread's list of contested locks * since this thread doesn't own it anymore. New threads will * not be blocking on the turnstile until it is claimed by a new * owner. 
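 * Hedged sketch (invented toy_* names) of the fallback priority recomputed
 * just below via turnstile_calc_unlend_prio_locked(): keep the most important
 * (smallest) first-waiter priority across the locks this thread still owns,
 * or the "no boost" ceiling when nothing is contested:
 *
 *        #define TOY_PRI_MAX     255     // assumed stand-in for PRI_MAX
 *        static int
 *        toy_unlend_prio(const int *waiter_prio, int nlocks)
 *        {
 *                int i, pri = TOY_PRI_MAX;
 *
 *                for (i = 0; i < nlocks; i++)
 *                        if (waiter_prio[i] < pri)
 *                                pri = waiter_prio[i];
 *                return (pri);
 *        }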
*/ mtx_lock_spin(&td_contested_lock); ts->ts_owner = NULL; LIST_REMOVE(ts, ts_link); mtx_unlock_spin(&td_contested_lock); /* * Adjust the priority of curthread based on other contested * locks it owns. Don't lower the priority below the base * priority however. */ td = curthread; thread_lock(td); mtx_unlock_spin(&ts->ts_lock); mtx_lock_spin(&td_contested_lock); pri = turnstile_calc_unlend_prio_locked(td); mtx_unlock_spin(&td_contested_lock); sched_unlend_prio(td, pri); thread_unlock(td); } /* * Return the first thread in a turnstile. */ struct thread * turnstile_head(struct turnstile *ts, int queue) { #ifdef INVARIANTS MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_FIRST(&ts->ts_blocked[queue])); } /* * Returns true if a sub-queue of a turnstile is empty. */ int turnstile_empty(struct turnstile *ts, int queue) { #ifdef INVARIANTS MPASS(ts != NULL); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); mtx_assert(&ts->ts_lock, MA_OWNED); #endif return (TAILQ_EMPTY(&ts->ts_blocked[queue])); } #ifdef DDB static void print_thread(struct thread *td, const char *prefix) { db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid, td->td_proc->p_pid, td->td_name); } static void print_queue(struct threadqueue *queue, const char *header, const char *prefix) { struct thread *td; db_printf("%s:\n", header); if (TAILQ_EMPTY(queue)) { db_printf("%sempty\n", prefix); return; } TAILQ_FOREACH(td, queue, td_lockq) { print_thread(td, prefix); } } DB_SHOW_COMMAND(turnstile, db_show_turnstile) { struct turnstile_chain *tc; struct turnstile *ts; struct lock_object *lock; int i; if (!have_addr) return; /* * First, see if there is an active turnstile for the lock indicated * by the address. */ lock = (struct lock_object *)addr; tc = TC_LOOKUP(lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) goto found; /* * Second, see if there is an active turnstile at the address * indicated. */ for (i = 0; i < TC_TABLESIZE; i++) LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) { if (ts == (struct turnstile *)addr) goto found; } db_printf("Unable to locate a turnstile via %p\n", (void *)addr); return; found: lock = ts->ts_lockobj; db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); if (ts->ts_owner) print_thread(ts->ts_owner, "Lock Owner: "); else db_printf("Lock Owner: none\n"); print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t"); print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters", "\t"); print_queue(&ts->ts_pending, "Pending Threads", "\t"); } /* * Show all the threads a particular thread is waiting on based on * non-spin locks. */ static void print_lockchain(struct thread *td, const char *prefix) { struct lock_object *lock; struct lock_class *class; struct turnstile *ts; struct thread *owner; /* * Follow the chain. We keep walking as long as the thread is * blocked on a lock that has an owner. 
*/ while (!db_pager_quit) { if (td == (void *)LK_KERNPROC) { db_printf("%sdisowned (LK_KERNPROC)\n", prefix); return; } db_printf("%sthread %d (pid %d, %s) is ", prefix, td->td_tid, td->td_proc->p_pid, td->td_name); - switch (td->td_state) { + switch (TD_GET_STATE(td)) { case TDS_INACTIVE: db_printf("inactive\n"); return; case TDS_CAN_RUN: db_printf("runnable\n"); return; case TDS_RUNQ: db_printf("on a run queue\n"); return; case TDS_RUNNING: db_printf("running on CPU %d\n", td->td_oncpu); return; case TDS_INHIBITED: if (TD_ON_LOCK(td)) { ts = td->td_blocked; lock = ts->ts_lockobj; class = LOCK_CLASS(lock); db_printf("blocked on lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); if (ts->ts_owner == NULL) return; td = ts->ts_owner; break; } else if (TD_ON_SLEEPQ(td)) { if (!lockmgr_chain(td, &owner) && !sx_chain(td, &owner)) { db_printf("sleeping on %p \"%s\"\n", td->td_wchan, td->td_wmesg); return; } if (owner == NULL) return; td = owner; break; } db_printf("inhibited: %s\n", KTDSTATE(td)); return; default: - db_printf("??? (%#x)\n", td->td_state); + db_printf("??? (%#x)\n", TD_GET_STATE(td)); return; } } } DB_SHOW_COMMAND(lockchain, db_show_lockchain) { struct thread *td; /* Figure out which thread to start with. */ if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; print_lockchain(td, ""); } DB_SHOW_ALIAS(sleepchain, db_show_lockchain); DB_SHOW_ALL_COMMAND(chains, db_show_allchains) { struct thread *td; struct proc *p; int i; i = 1; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { if ((TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) || (TD_IS_INHIBITED(td) && TD_ON_SLEEPQ(td))) { db_printf("chain %d:\n", i++); print_lockchain(td, " "); } if (db_pager_quit) return; } } } DB_SHOW_ALIAS(allchains, db_show_allchains) static void print_waiters(struct turnstile *ts, int indent); static void print_waiter(struct thread *td, int indent) { struct turnstile *ts; int i; if (db_pager_quit) return; for (i = 0; i < indent; i++) db_printf(" "); print_thread(td, "thread "); LIST_FOREACH(ts, &td->td_contested, ts_link) print_waiters(ts, indent + 1); } static void print_waiters(struct turnstile *ts, int indent) { struct lock_object *lock; struct lock_class *class; struct thread *td; int i; if (db_pager_quit) return; lock = ts->ts_lockobj; class = LOCK_CLASS(lock); for (i = 0; i < indent; i++) db_printf(" "); db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq) print_waiter(td, indent + 1); TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq) print_waiter(td, indent + 1); TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) print_waiter(td, indent + 1); } DB_SHOW_COMMAND(locktree, db_show_locktree) { struct lock_object *lock; struct lock_class *class; struct turnstile_chain *tc; struct turnstile *ts; if (!have_addr) return; lock = (struct lock_object *)addr; tc = TC_LOOKUP(lock); LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash) if (ts->ts_lockobj == lock) break; if (ts == NULL) { class = LOCK_CLASS(lock); db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name); } else print_waiters(ts, 0); } #endif diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 99257878c2e0..0073fee2a42e 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -1,1270 +1,1282 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #ifdef _KERNEL #include #endif #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #include #endif #include #include #include #include #include /* Machine-dependent proc substruct. */ #ifdef _KERNEL #include #endif /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. 
*/ struct mtx pg_mtx; /* Mutex to protect members */ int pg_flags; /* (m) PGRP_ flags */ }; #define PGRP_ORPHANED 0x00000001 /* Group is orphaned */ /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * kx- only accessed by curthread and by debugger * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * s - see sleepq_switch(), sleeping_on_old_rtc(), and sleep(9) * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kcov_info; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct socket; struct syscall_args; struct td_sched; struct thread; struct trapframe; struct turnstile; struct vm_map; struct vm_map_entry; struct epoch_tracker; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. 
*/ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ union { TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ struct thread *td_zombie; /* Zombie list linkage */ }; TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct domainset_ref td_domain; /* (a) NUMA policy */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ u_char td_allocdomain; /* (b) NUMA domain backing this struct thread. */ /* Cleared during fork1() */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_pflags2; /* (k) Private thread (TDP2_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ const void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ short td_locks; /* (k) Debug: count of non-spin locks */ short td_rw_rlocks; /* (k) Count of rwlock read locks. */ short td_sx_slocks; /* (k) Count of sx shared locks. */ short td_lk_slocks; /* (k) Count of lockmgr shared locks. */ short td_stopsched; /* (k) Scheduler stopped. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_realucred; /* (k) Reference to credentials. */ struct ucred *td_ucred; /* (k) Used credentials, temporarily switchable. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. 
*/ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ siginfo_t td_si; /* (c) For debugger or core file */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ struct vnode *td_vp_reserved;/* (k) Prealloated vnode. */ u_int td_no_sleeping; /* (k) Sleeping disabled count. */ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ int td_errno; /* (k) Error from last syscall. */ size_t td_vslock_sz; /* (k) amount of vslock-ed space */ struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */ u_int td_ucredref; /* (k) references on td_realucred */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ u_char td_pre_epoch_prio; /* (k) User pri on entry to epoch */ uintptr_t td_rb_list; /* (k) Robust list head. */ uintptr_t td_rbp_list; /* (k) Robust priv list head. */ uintptr_t td_rb_inact; /* (k) Current in-action mutex loc. */ struct syscall_args td_sa; /* (kx) Syscall parameters. Copied on fork for child tracing. */ void *td_sigblock_ptr; /* (k) uptr for fast sigblock. */ uint32_t td_sigblock_val; /* (k) fast sigblock value read at td_sigblock_ptr on kern entry */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or create_thread() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum td_states { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ + /* Note: td_state must be accessed using TD_{GET,SET}_STATE(). */ union { register_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval u_int td_cowgen; /* (k) Generation of COW pointers. */ /* LP64 hole */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. 
*/ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ /* LP64 hole */ void *td_emuldata; /* Emulator state data */ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ void *td_lkpi_task; /* LinuxKPI task struct pointer */ int td_pmcpend; #ifdef EPOCH_TRACE SLIST_HEAD(, epoch_tracker) td_epochs; #endif }; struct thread0_storage { struct thread t0st_thread; uint64_t t0st_sched[10]; }; struct mtx *thread_lock_block(struct thread *); void thread_lock_block_wait(struct thread *); void thread_lock_set(struct thread *, struct mtx *); void thread_lock_unblock(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ mtx_assert((td)->td_lock, (type)) #define THREAD_LOCK_BLOCKED_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m; \ __m = (td)->td_lock; \ KASSERT(__m == (lock), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define THREAD_LOCKPTR_BLOCKED_ASSERT(td, lock) \ do { \ struct mtx *__m; \ __m = (td)->td_lock; \ KASSERT(__m == (lock) || __m == &blocked_lock, \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) do { \ KASSERT(SCHEDULER_STOPPED_TD(td) || (td)->td_locks > 0, \ ("thread %p owns no locks", (td))); \ (td)->td_locks--; \ } while (0) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define THREAD_LOCKPTR_BLOCKED_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_UNUSED80 0x00000080 /* unused. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_SERESTART 0x00080000 /* ERESTART on stop attempts. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. 
*/ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ #define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */ #define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */ #define TDB_STEP 0x00002000 /* (x86) PSL_T set for PT_STEP */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_SIGFASTBLOCK 0x00000100 /* Fast sigblock active */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. 
*/ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_FORKING 0x20000000 /* Thread is being created through fork() */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ #define TDP_SIGFASTPENDING 0x80000000 /* Pending signal due to sigfastblock */ #define TDP2_SBPAGES 0x00000001 /* Owns sbusy on some pages */ #define TDP2_COMPAT32RB 0x00000002 /* compat32 ABI for robust lists */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) -#define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) -#define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) -#define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) -#define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) +#ifdef _KERNEL +#define TD_GET_STATE(td) atomic_load_int(&(td)->td_state) +#else +#define TD_GET_STATE(td) ((td)->td_state) +#endif +#define TD_IS_RUNNING(td) (TD_GET_STATE(td) == TDS_RUNNING) +#define TD_ON_RUNQ(td) (TD_GET_STATE(td) == TDS_RUNQ) +#define TD_CAN_RUN(td) (TD_GET_STATE(td) == TDS_CAN_RUN) +#define TD_IS_INHIBITED(td) (TD_GET_STATE(td) == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #define TD_CAN_ABORT(td) (TD_ON_SLEEPQ((td)) && \ ((td)->td_flags & TDF_SINTR) != 0) #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \ ((td)->td_inhibitors & TDI_IWAIT) != 0 ? 
"iwait" : "yielding") -#define TD_SET_INHIB(td, inhib) do { \ - (td)->td_state = TDS_INHIBITED; \ - (td)->td_inhibitors |= (inhib); \ +#define TD_SET_INHIB(td, inhib) do { \ + TD_SET_STATE(td, TDS_INHIBITED); \ + (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ - (td)->td_state = TDS_CAN_RUN; \ + TD_SET_STATE(td, TDS_CAN_RUN); \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) -#define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING -#define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ -#define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN +#ifdef _KERNEL +#define TD_SET_STATE(td, state) atomic_store_int(&(td)->td_state, state) +#else +#define TD_SET_STATE(td, state) (td)->td_state = state +#endif +#define TD_SET_RUNNING(td) TD_SET_STATE(td, TDS_RUNNING) +#define TD_SET_RUNQ(td) TD_SET_STATE(td, TDS_RUNQ) +#define TD_SET_CAN_RUN(td) TD_SET_STATE(td, TDS_CAN_RUN) + #define TD_SBDRY_INTR(td) \ (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0) #define TD_SBDRY_ERRNO(td) \ (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART) /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pwddesc *p_pd; /* (b) Cwd, chroot, jail, umask */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum p_states { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. 
*/ #define p_siglist p_sigqueue.sq_signals pid_t p_oppid; /* (c + e) Real parent pid. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_vmspace struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ u_int p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_int p_ptevents; /* (c + e) ptrace() event mask. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ int p_pdeathsig; /* (c) Signal from parent on exit. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ uint32_t p_fctl0; /* (x) ABI feature control, ELF note */ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. */ uint16_t p_elf_machine; /* (x) ELF machine type */ uint64_t p_elf_flags; /* (x) ELF flags */ /* End area that is copied on creation. */ #define p_endcopy p_xexit u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. 
*/ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ /* * An orphan is the child that has been re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. */ #define NOCPU_OLD (255) #define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) #define PROC_STATLOCK(p) mtx_lock_spin(&(p)->p_statmtx) #define PROC_STATUNLOCK(p) mtx_unlock_spin(&(p)->p_statmtx) #define PROC_STATLOCK_ASSERT(p, type) mtx_assert(&(p)->p_statmtx, (type)) #define PROC_ITIMLOCK(p) mtx_lock_spin(&(p)->p_itimmtx) #define PROC_ITIMUNLOCK(p) mtx_unlock_spin(&(p)->p_itimmtx) #define PROC_ITIMLOCK_ASSERT(p, type) mtx_assert(&(p)->p_itimmtx, (type)) #define PROC_PROFLOCK(p) mtx_lock_spin(&(p)->p_profmtx) #define PROC_PROFUNLOCK(p) mtx_unlock_spin(&(p)->p_profmtx) #define PROC_PROFLOCK_ASSERT(p, type) mtx_assert(&(p)->p_profmtx, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00000001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00000002 /* Has a controlling terminal. */ #define P_KPROC 0x00000004 /* Kernel process. */ #define P_UNUSED3 0x00000008 /* --available-- */ #define P_PPWAIT 0x00000010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00000020 /* Has started profiling. */ #define P_STOPPROF 0x00000040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00000080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00000100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00000200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00000400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00000800 /* Debugged process being traced. */ #define P_WAITED 0x00001000 /* Someone is waiting for us. */ #define P_WEXIT 0x00002000 /* Working on exiting. */ #define P_EXEC 0x00004000 /* Process called exec. */ #define P_WKILLED 0x00008000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x00010000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x00020000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x00040000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x00080000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x00100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x00200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x00400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x00800000 /* Process is using HWPMCs */ #define P_JAILED 0x01000000 /* Process is in jail. */ #define P_TOTAL_STOP 0x02000000 /* Stopped in stop_all_proc. */ #define P_INEXEC 0x04000000 /* Process is in execve(). */ #define P_STATCHILD 0x08000000 /* Child process stopped or exited. 
*/ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOPTRACE on exec(2). */ #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_PTRACE_FSTP 0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */ #define P2_TRAPCAP 0x00000020 /* SIGTRAP on ENOTCAPABLE */ #define P2_ASLR_ENABLE 0x00000040 /* Force enable ASLR. */ #define P2_ASLR_DISABLE 0x00000080 /* Force disable ASLR. */ #define P2_ASLR_IGNSTART 0x00000100 /* Enable ASLR to consume sbrk area. */ #define P2_PROTMAX_ENABLE 0x00000200 /* Force enable implied PROT_MAX. */ #define P2_PROTMAX_DISABLE 0x00000400 /* Force disable implied PROT_MAX. */ #define P2_STKGAP_DISABLE 0x00000800 /* Disable stack gap for MAP_STACK */ #define P2_STKGAP_DISABLE_EXEC 0x00001000 /* Stack gap disabled after exec */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ #define P_TREE_REAPER 0x00000004 /* Reaper of subtree */ #define P_TREE_GRPEXITED 0x00000008 /* exit1() done with job ctl */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_NONE 0 /* Unspecified switch. */ #define SWT_PREEMPT 1 /* Switching due to preemption. */ #define SWT_OWEPREEMPT 2 /* Switching due to owepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ #define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */ #define SWT_IWAIT 9 /* Waiting for interrupts. */ #define SWT_SUSPEND 10 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 11 /* Remote processor preempted. */ #define SWT_REMOTEWAKEIDLE 12 /* Remote processor preempted idle. */ #define SWT_COUNT 13 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). 
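 *
 * A minimal usage sketch (illustrative only, not taken from this change; the
 * exec path is an assumed example): callers hold the proc lock, pick one of
 * the modes below, and pair thread_single() with thread_single_end():
 *
 *	PROC_LOCK(p);
 *	if (thread_single(p, SINGLE_BOUNDARY) != 0) {
 *		PROC_UNLOCK(p);
 *		return (ERESTART);
 *	}
 *	... work that requires the process to be single-threaded ...
 *	thread_single_end(p, SINGLE_BOUNDARY);
 *	PROC_UNLOCK(p);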
*/ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #define SINGLE_ALLPROC 3 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 #define THREAD0_TID NO_PID extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* * Non-zero p_lock ensures that: * - exit1() is not performed until p_lock reaches zero; * - the process' threads stack are not swapped out if they are currently * not (P_INMEM). * * PHOLD() asserts that the process (except the current process) is * not exiting, increments p_lock and swaps threads stacks into memory, * if needed. * _PHOLD() is same as PHOLD(), it takes the process locked. * _PHOLD_LITE() also takes the process locked, but comparing with * _PHOLD(), it only guarantees that exit1() is not executed, * faultin() is not called. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define _PHOLD_LITE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process %p not held", p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process %p held", p)); \ } while (0) #define PROC_UPDATE_COW(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_cowgen++; \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) ((td)->td_flags & TDF_CANSWAP) /* Control whether or not it is safe for curthread to sleep. 
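 *
 * A hedged usage sketch (illustrative only): bracket a region that must not
 * sleep with the macros defined below; the MPASS checks catch unbalanced
 * nesting under INVARIANTS:
 *
 *	THREAD_NO_SLEEPING();
 *	... code that must not acquire sleepable locks or call msleep() ...
 *	THREAD_SLEEPING_OK();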
*/ #define THREAD_NO_SLEEPING() do { \ curthread->td_no_sleeping++; \ MPASS(curthread->td_no_sleeping > 0); \ } while (0) #define THREAD_SLEEPING_OK() do { \ MPASS(curthread->td_no_sleeping > 0); \ curthread->td_no_sleeping--; \ } while (0) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) #define PIDHASHLOCK(pid) (&pidhashtbl_lock[((pid) & pidhashlock)]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern struct sx *pidhashtbl_lock; extern u_long pidhash; extern u_long pidhashlock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct mtx procid_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. */ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; extern struct uma_zone *pgrp_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_any(pid_t); /* Find (zombie) process by id. */ struct proc *pfind_any_locked(pid_t pid); /* Find process by id, locked. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ void pidhash_slockall(void); /* Shared lock all pid hash lists. */ void pidhash_sunlockall(void); /* Shared unlock all pid hash lists. */ struct fork_req { int fr_flags; int fr_pages; int *fr_pidp; struct proc **fr_procp; int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; int fr_flags2; #define FR2_DROPSIG_CAUGHT 0x00000001 /* Drop caught non-DFL signals */ #define FR2_SHARE_PATHS 0x00000002 /* Invert sense of RFFDG for paths */ }; /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. 
*/ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansee(struct ucred *u1, struct ucred *u2); int cr_canseesocket(struct ucred *cred, struct socket *so); int cr_canseeothergids(struct ucred *u1, struct ucred *u2); int cr_canseeotheruids(struct ucred *u1, struct ucred *u2); int cr_canseejailproc(struct ucred *u1, struct ucred *u2); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); int fork1(struct thread *, struct fork_req *); void fork_rfppwait(struct thread *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry, int *resident_count, bool *super); void kern_yield(int); void kick_proc0(void); void killjobc(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); int proc_iterate(int (*cb)(struct proc *, void *), void *cbarg); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent, bool set_oppid); void proc_add_orphan(struct proc *child, struct proc *parent); void proc_set_traced(struct proc *p, bool stop); void proc_wkilled(struct proc *p); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); void proc_clear_orphan(struct proc *p); void reaper_abandon_children(struct proc *p, bool exiting); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); int setrunnable(struct thread *, int); void setsugid(struct proc *p); int should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. 
*/ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); bool cpu_exec_vmspace_reuse(struct proc *p, struct vm_map *map); int cpu_fetch_syscall_args(struct thread *td); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); int cpu_procctl(struct thread *td, int idtype, id_t id, int com, void *data); void cpu_set_syscall_retval(struct thread *, int); void cpu_set_upcall(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); int thread_check_susp(struct thread *td, bool sleep); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } static __inline int curthread_pflags2_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags2 & flags); td->td_pflags2 |= flags; return (save); } static __inline void curthread_pflags2_restore(int save) { curthread->td_pflags2 &= save; } static __inline bool kstack_contains(struct thread *td, vm_offset_t va, size_t len) { return (va >= td->td_kstack && va + len >= va && va + len <= td->td_kstack + td->td_kstack_pages * PAGE_SIZE); } static __inline __pure2 struct td_sched * td_get_sched(struct thread *td) { return ((struct td_sched *)&td[1]); } extern void (*softdep_ast_cleanup)(struct thread *); static __inline void td_softdep_cleanup(struct thread *td) { if (td->td_su != NULL && softdep_ast_cleanup != NULL) softdep_ast_cleanup(td); } #define PROC_ID_PID 0 #define PROC_ID_GROUP 1 #define PROC_ID_SESSION 2 #define PROC_ID_REAP 3 void proc_id_set(int type, pid_t id); void 
proc_id_set_cond(int type, pid_t id); void proc_id_clear(int type, pid_t id); EVENTHANDLER_LIST_DECLARE(process_ctor); EVENTHANDLER_LIST_DECLARE(process_dtor); EVENTHANDLER_LIST_DECLARE(process_init); EVENTHANDLER_LIST_DECLARE(process_fini); EVENTHANDLER_LIST_DECLARE(process_exit); EVENTHANDLER_LIST_DECLARE(process_fork); EVENTHANDLER_LIST_DECLARE(process_exec); EVENTHANDLER_LIST_DECLARE(thread_ctor); EVENTHANDLER_LIST_DECLARE(thread_dtor); EVENTHANDLER_LIST_DECLARE(thread_init); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c index 44d3ac999c5a..f9e2b0c66cc8 100644 --- a/sys/vm/vm_meter.c +++ b/sys/vm/vm_meter.c @@ -1,573 +1,573 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct vmmeter __read_mostly vm_cnt = { .v_swtch = EARLY_COUNTER, .v_trap = EARLY_COUNTER, .v_syscall = EARLY_COUNTER, .v_intr = EARLY_COUNTER, .v_soft = EARLY_COUNTER, .v_vm_faults = EARLY_COUNTER, .v_io_faults = EARLY_COUNTER, .v_cow_faults = EARLY_COUNTER, .v_cow_optim = EARLY_COUNTER, .v_zfod = EARLY_COUNTER, .v_ozfod = EARLY_COUNTER, .v_swapin = EARLY_COUNTER, .v_swapout = EARLY_COUNTER, .v_swappgsin = EARLY_COUNTER, .v_swappgsout = EARLY_COUNTER, .v_vnodein = EARLY_COUNTER, .v_vnodeout = EARLY_COUNTER, .v_vnodepgsin = EARLY_COUNTER, .v_vnodepgsout = EARLY_COUNTER, .v_intrans = EARLY_COUNTER, .v_reactivated = EARLY_COUNTER, .v_pdwakeups = EARLY_COUNTER, .v_pdpages = EARLY_COUNTER, .v_pdshortfalls = EARLY_COUNTER, .v_dfree = EARLY_COUNTER, .v_pfree = EARLY_COUNTER, .v_tfree = EARLY_COUNTER, .v_forks = EARLY_COUNTER, .v_vforks = EARLY_COUNTER, .v_rforks = EARLY_COUNTER, .v_kthreads = EARLY_COUNTER, .v_forkpages = EARLY_COUNTER, .v_vforkpages = EARLY_COUNTER, .v_rforkpages = EARLY_COUNTER, .v_kthreadpages = EARLY_COUNTER, .v_wire_count = EARLY_COUNTER, }; u_long __exclusive_cache_line vm_user_wire_count; static void vmcounter_startup(void) { counter_u64_t *cnt = (counter_u64_t *)&vm_cnt; COUNTER_ARRAY_ALLOC(cnt, VM_METER_NCOUNTERS, M_WAITOK); } SYSINIT(counter, SI_SUB_KMEM, SI_ORDER_FIRST, vmcounter_startup, NULL); SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold"); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages"); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock"); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive"); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel"); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point"); static int sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 u_int32_t la[4]; if (req->flags & SCTL_MASK32) { la[0] = averunnable.ldavg[0]; la[1] = averunnable.ldavg[1]; la[2] = averunnable.ldavg[2]; la[3] = averunnable.fscale; return SYSCTL_OUT(req, la, sizeof(la)); } else #endif return SYSCTL_OUT(req, &averunnable, sizeof(averunnable)); } SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg", "Machine loadaverage history"); /* * This function aims to determine if the object is mapped, * specifically, if it is referenced by a vm_map_entry. Because * objects occasionally acquire transient references that do not * represent a mapping, the method used here is inexact. However, it * has very low overhead and is good enough for the advisory * vm.vmtotal sysctl. 
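 *
 * (The test below is simply ref_count > shadow_count: a mapped object
 * normally carries at least one reference beyond those taken by its shadow
 * objects, while a transient reference, e.g. one taken while paging, can
 * briefly satisfy the test as well; that is the inexactness noted above.)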
*/ static bool is_object_active(vm_object_t obj) { return (obj->ref_count > obj->shadow_count); } #if defined(COMPAT_FREEBSD11) struct vmtotal11 { int16_t t_rq; int16_t t_dw; int16_t t_pw; int16_t t_sl; int16_t t_sw; int32_t t_vm; int32_t t_avm; int32_t t_rm; int32_t t_arm; int32_t t_vmshr; int32_t t_avmshr; int32_t t_rmshr; int32_t t_armshr; int32_t t_free; }; #endif static int vmtotal(SYSCTL_HANDLER_ARGS) { struct vmtotal total; #if defined(COMPAT_FREEBSD11) struct vmtotal11 total11; #endif vm_object_t object; struct proc *p; struct thread *td; if (req->oldptr == NULL) { #if defined(COMPAT_FREEBSD11) if (curproc->p_osrel < P_OSREL_VMTOTAL64) return (SYSCTL_OUT(req, NULL, sizeof(total11))); #endif return (SYSCTL_OUT(req, NULL, sizeof(total))); } bzero(&total, sizeof(total)); /* * Calculate process statistics. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if ((p->p_flag & P_SYSTEM) != 0) continue; PROC_LOCK(p); if (p->p_state != PRS_NEW) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); - switch (td->td_state) { + switch (TD_GET_STATE(td)) { case TDS_INHIBITED: if (TD_IS_SWAPPED(td)) total.t_sw++; else if (TD_IS_SLEEPING(td)) { if (td->td_priority <= PZERO) total.t_dw++; else total.t_sl++; } break; case TDS_CAN_RUN: total.t_sw++; break; case TDS_RUNQ: case TDS_RUNNING: total.t_rq++; break; default: break; } thread_unlock(td); } } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* * Calculate object memory usage statistics. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { /* * Perform unsynchronized reads on the object. In * this case, the lack of synchronization should not * impair the accuracy of the reported statistics. */ if ((object->flags & OBJ_FICTITIOUS) != 0) { /* * Devices, like /dev/mem, will badly skew our totals. */ continue; } if (object->ref_count == 0) { /* * Also skip unreferenced objects, including * vnodes representing mounted file systems. */ continue; } if (object->ref_count == 1 && (object->flags & OBJ_ANON) == 0) { /* * Also skip otherwise unreferenced swap * objects backing tmpfs vnodes, and POSIX or * SysV shared memory. 
*/ continue; } total.t_vm += object->size; total.t_rm += object->resident_page_count; if (is_object_active(object)) { total.t_avm += object->size; total.t_arm += object->resident_page_count; } if (object->shadow_count > 1) { /* shared object */ total.t_vmshr += object->size; total.t_rmshr += object->resident_page_count; if (is_object_active(object)) { total.t_avmshr += object->size; total.t_armshr += object->resident_page_count; } } } mtx_unlock(&vm_object_list_mtx); total.t_pw = vm_wait_count(); total.t_free = vm_free_count(); #if defined(COMPAT_FREEBSD11) /* sysctl(8) allocates twice as much memory as reported by sysctl(3) */ if (curproc->p_osrel < P_OSREL_VMTOTAL64 && (req->oldlen == sizeof(total11) || req->oldlen == 2 * sizeof(total11))) { bzero(&total11, sizeof(total11)); total11.t_rq = total.t_rq; total11.t_dw = total.t_dw; total11.t_pw = total.t_pw; total11.t_sl = total.t_sl; total11.t_sw = total.t_sw; total11.t_vm = total.t_vm; /* truncate */ total11.t_avm = total.t_avm; /* truncate */ total11.t_rm = total.t_rm; /* truncate */ total11.t_arm = total.t_arm; /* truncate */ total11.t_vmshr = total.t_vmshr; /* truncate */ total11.t_avmshr = total.t_avmshr; /* truncate */ total11.t_rmshr = total.t_rmshr; /* truncate */ total11.t_armshr = total.t_armshr; /* truncate */ total11.t_free = total.t_free; /* truncate */ return (SYSCTL_OUT(req, &total11, sizeof(total11))); } #endif return (SYSCTL_OUT(req, &total, sizeof(total))); } SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, vmtotal, "S,vmtotal", "System virtual memory statistics"); SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "VM meter stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "VM meter sys stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "VM meter vm stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "VM meter misc stats"); static int sysctl_handle_vmstat(SYSCTL_HANDLER_ARGS) { uint64_t val; #ifdef COMPAT_FREEBSD11 uint32_t val32; #endif val = counter_u64_fetch(*(counter_u64_t *)arg1); #ifdef COMPAT_FREEBSD11 if (req->oldlen == sizeof(val32)) { val32 = val; /* truncate */ return (SYSCTL_OUT(req, &val32, sizeof(val32))); } #endif return (SYSCTL_OUT(req, &val, sizeof(val))); } #define VM_STATS(parent, var, descr) \ SYSCTL_OID(parent, OID_AUTO, var, CTLTYPE_U64 | CTLFLAG_MPSAFE | \ CTLFLAG_RD, &vm_cnt.var, 0, sysctl_handle_vmstat, "QU", descr) #define VM_STATS_VM(var, descr) VM_STATS(_vm_stats_vm, var, descr) #define VM_STATS_SYS(var, descr) VM_STATS(_vm_stats_sys, var, descr) VM_STATS_SYS(v_swtch, "Context switches"); VM_STATS_SYS(v_trap, "Traps"); VM_STATS_SYS(v_syscall, "System calls"); VM_STATS_SYS(v_intr, "Device interrupts"); VM_STATS_SYS(v_soft, "Software interrupts"); VM_STATS_VM(v_vm_faults, "Address memory faults"); VM_STATS_VM(v_io_faults, "Page faults requiring I/O"); VM_STATS_VM(v_cow_faults, "Copy-on-write faults"); VM_STATS_VM(v_cow_optim, "Optimized COW faults"); VM_STATS_VM(v_zfod, "Pages zero-filled on demand"); VM_STATS_VM(v_ozfod, "Optimized zero fill pages"); VM_STATS_VM(v_swapin, "Swap pager pageins"); VM_STATS_VM(v_swapout, "Swap pager pageouts"); VM_STATS_VM(v_swappgsin, "Swap pages swapped in"); VM_STATS_VM(v_swappgsout, "Swap pages swapped out"); VM_STATS_VM(v_vnodein, "Vnode pager pageins"); VM_STATS_VM(v_vnodeout, "Vnode pager pageouts"); VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in"); VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out"); 
VM_STATS_VM(v_intrans, "In transit page faults"); VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon"); VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups"); VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls"); VM_STATS_VM(v_dfree, "Pages freed by pagedaemon"); VM_STATS_VM(v_pfree, "Pages freed by exiting processes"); VM_STATS_VM(v_tfree, "Total pages freed"); VM_STATS_VM(v_forks, "Number of fork() calls"); VM_STATS_VM(v_vforks, "Number of vfork() calls"); VM_STATS_VM(v_rforks, "Number of rfork() calls"); VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel"); VM_STATS_VM(v_forkpages, "VM pages affected by fork()"); VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()"); VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()"); VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel"); static int sysctl_handle_vmstat_proc(SYSCTL_HANDLER_ARGS) { u_int (*fn)(void); uint32_t val; fn = arg1; val = fn(); return (SYSCTL_OUT(req, &val, sizeof(val))); } #define VM_STATS_PROC(var, descr, fn) \ SYSCTL_OID(_vm_stats_vm, OID_AUTO, var, CTLTYPE_U32 | CTLFLAG_MPSAFE | \ CTLFLAG_RD, fn, 0, sysctl_handle_vmstat_proc, "IU", descr) #define VM_STATS_UINT(var, descr) \ SYSCTL_UINT(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr) #define VM_STATS_ULONG(var, descr) \ SYSCTL_ULONG(_vm_stats_vm, OID_AUTO, var, CTLFLAG_RD, &vm_cnt.var, 0, descr) VM_STATS_UINT(v_page_size, "Page size in bytes"); VM_STATS_UINT(v_page_count, "Total number of pages in system"); VM_STATS_UINT(v_free_reserved, "Pages reserved for deadlock"); VM_STATS_UINT(v_free_target, "Pages desired free"); VM_STATS_UINT(v_free_min, "Minimum low-free-pages threshold"); VM_STATS_PROC(v_free_count, "Free pages", vm_free_count); VM_STATS_PROC(v_wire_count, "Wired pages", vm_wire_count); VM_STATS_PROC(v_active_count, "Active pages", vm_active_count); VM_STATS_UINT(v_inactive_target, "Desired inactive pages"); VM_STATS_PROC(v_inactive_count, "Inactive pages", vm_inactive_count); VM_STATS_PROC(v_laundry_count, "Pages eligible for laundering", vm_laundry_count); VM_STATS_UINT(v_pageout_free_min, "Min pages reserved for kernel"); VM_STATS_UINT(v_interrupt_free_min, "Reserved pages for interrupt code"); VM_STATS_UINT(v_free_severe, "Severe page depletion point"); SYSCTL_ULONG(_vm_stats_vm, OID_AUTO, v_user_wire_count, CTLFLAG_RD, &vm_user_wire_count, 0, "User-wired virtual memory"); #ifdef COMPAT_FREEBSD11 /* * Provide compatibility sysctls for the benefit of old utilities which exit * with an error if they cannot be found. 
 */
SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_count, CTLFLAG_RD,
    SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility");
SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tcached, CTLFLAG_RD,
    SYSCTL_NULL_UINT_PTR, 0, "Dummy for compatibility");
#endif

u_int
vm_free_count(void)
{
	u_int v;
	int i;

	v = 0;
	for (i = 0; i < vm_ndomains; i++)
		v += vm_dom[i].vmd_free_count;

	return (v);
}

static u_int
vm_pagequeue_count(int pq)
{
	u_int v;
	int i;

	v = 0;
	for (i = 0; i < vm_ndomains; i++)
		v += vm_dom[i].vmd_pagequeues[pq].pq_cnt;

	return (v);
}

u_int
vm_active_count(void)
{

	return (vm_pagequeue_count(PQ_ACTIVE));
}

u_int
vm_inactive_count(void)
{

	return (vm_pagequeue_count(PQ_INACTIVE));
}

u_int
vm_laundry_count(void)
{

	return (vm_pagequeue_count(PQ_LAUNDRY));
}

static int
sysctl_vm_pdpages(SYSCTL_HANDLER_ARGS)
{
	struct vm_pagequeue *pq;
	uint64_t ret;
	int dom, i;

	ret = counter_u64_fetch(vm_cnt.v_pdpages);
	for (dom = 0; dom < vm_ndomains; dom++)
		for (i = 0; i < PQ_COUNT; i++) {
			pq = &VM_DOMAIN(dom)->vmd_pagequeues[i];
			ret += pq->pq_pdpages;
		}
	return (SYSCTL_OUT(req, &ret, sizeof(ret)));
}
SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_vm_pdpages,
    "QU", "Pages analyzed by pagedaemon");

static void
vm_domain_stats_init(struct vm_domain *vmd, struct sysctl_oid *parent)
{
	struct sysctl_oid *oid;

	vmd->vmd_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
	    vmd->vmd_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
	    "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "free_count", CTLFLAG_RD, &vmd->vmd_free_count, 0,
	    "Free pages");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "active", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_cnt, 0,
	    "Active pages");
	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "actpdpgs", CTLFLAG_RD,
	    &vmd->vmd_pagequeues[PQ_ACTIVE].pq_pdpages, 0,
	    "Active pages scanned by the page daemon");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "inactive", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt, 0,
	    "Inactive pages");
	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "inactpdpgs", CTLFLAG_RD,
	    &vmd->vmd_pagequeues[PQ_INACTIVE].pq_pdpages, 0,
	    "Inactive pages scanned by the page daemon");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "laundry", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 0,
	    "laundry pages");
	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "laundpdpgs", CTLFLAG_RD,
	    &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_pdpages, 0,
	    "Laundry pages scanned by the page daemon");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "unswappable", CTLFLAG_RD,
	    &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt, 0,
	    "Unswappable pages");
	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "unswppdpgs", CTLFLAG_RD,
	    &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_pdpages, 0,
	    "Unswappable pages scanned by the page daemon");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "inactive_target", CTLFLAG_RD, &vmd->vmd_inactive_target, 0,
	    "Target inactive pages");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "free_target", CTLFLAG_RD, &vmd->vmd_free_target, 0,
	    "Target free pages");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "free_reserved", CTLFLAG_RD, &vmd->vmd_free_reserved, 0,
	    "Reserved free pages");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
	    "free_min", CTLFLAG_RD, &vmd->vmd_free_min, 0,
	    "Minimum free pages");
	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
"free_severe", CTLFLAG_RD, &vmd->vmd_free_severe, 0, "Severe free pages"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "inactive_pps", CTLFLAG_RD, &vmd->vmd_inactive_pps, 0, "inactive pages freed/second"); } static void vm_stats_init(void *arg __unused) { struct sysctl_oid *oid; int i; oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm), OID_AUTO, "domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); for (i = 0; i < vm_ndomains; i++) vm_domain_stats_init(VM_DOMAIN(i), oid); } SYSINIT(vmstats_init, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_stats_init, NULL);