Index: head/lib/libkvm/kvm_proc.c =================================================================== --- head/lib/libkvm/kvm_proc.c (revision 273265) +++ head/lib/libkvm/kvm_proc.c (revision 273266) @@ -1,714 +1,732 @@ /*- * Copyright (c) 1989, 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software developed by the Computer Systems * Engineering group at Lawrence Berkeley Laboratory under DARPA contract * BG 91-66 and contributed to Berkeley. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)kvm_proc.c 8.3 (Berkeley) 9/23/93"; #endif /* LIBC_SCCS and not lint */ #endif #include __FBSDID("$FreeBSD$"); /* * Proc traversal interface for kvm. ps and w are (probably) the exclusive * users of this code, so we've factored it out into a separate module. * Thus, we keep this grunge out of the other kvm applications (i.e., * most other applications are interested only in open/close/read/nlist). */ #include #define _WANT_UCRED /* make ucred.h give us 'struct ucred' */ #include #include #include #include #include #include #include #include #define _WANT_PRISON /* make jail.h give us 'struct prison' */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "kvm_private.h" #define KREAD(kd, addr, obj) \ (kvm_read(kd, addr, (char *)(obj), sizeof(*obj)) != sizeof(*obj)) static int ticks; static int hz; static uint64_t cpu_tick_frequency; /* * From sys/kern/kern_tc.c. Depends on cpu_tick_frequency, which is * read/initialized before this function is ever called. */ static uint64_t cputick2usec(uint64_t tick) { if (cpu_tick_frequency == 0) return (0); if (tick > 18446744073709551) /* floor(2^64 / 1000) */ return (tick / (cpu_tick_frequency / 1000000)); else if (tick > 18446744073709) /* floor(2^64 / 1000000) */ return ((tick * 1000) / (cpu_tick_frequency / 1000)); else return ((tick * 1000000) / cpu_tick_frequency); } /* * Read proc's from memory file into buffer bp, which has space to hold * at most maxcnt procs. */ static int kvm_proclist(kvm_t *kd, int what, int arg, struct proc *p, struct kinfo_proc *bp, int maxcnt) { int cnt = 0; struct kinfo_proc kinfo_proc, *kp; struct pgrp pgrp; struct session sess; struct cdev t_cdev; struct tty tty; struct vmspace vmspace; struct sigacts sigacts; #if 0 struct pstats pstats; #endif struct ucred ucred; struct prison pr; struct thread mtd; struct proc proc; struct proc pproc; struct sysentvec sysent; char svname[KI_EMULNAMELEN]; kp = &kinfo_proc; kp->ki_structsize = sizeof(kinfo_proc); /* * Loop on the processes. this is completely broken because we need to be * able to loop on the threads and merge the ones that are the same process some how. */ for (; cnt < maxcnt && p != NULL; p = LIST_NEXT(&proc, p_list)) { memset(kp, 0, sizeof *kp); if (KREAD(kd, (u_long)p, &proc)) { _kvm_err(kd, kd->program, "can't read proc at %p", p); return (-1); } if (proc.p_state == PRS_NEW) continue; if (proc.p_state != PRS_ZOMBIE) { if (KREAD(kd, (u_long)TAILQ_FIRST(&proc.p_threads), &mtd)) { _kvm_err(kd, kd->program, "can't read thread at %p", TAILQ_FIRST(&proc.p_threads)); return (-1); } } if (KREAD(kd, (u_long)proc.p_ucred, &ucred) == 0) { kp->ki_ruid = ucred.cr_ruid; kp->ki_svuid = ucred.cr_svuid; kp->ki_rgid = ucred.cr_rgid; kp->ki_svgid = ucred.cr_svgid; kp->ki_cr_flags = ucred.cr_flags; if (ucred.cr_ngroups > KI_NGROUPS) { kp->ki_ngroups = KI_NGROUPS; kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW; } else kp->ki_ngroups = ucred.cr_ngroups; kvm_read(kd, (u_long)ucred.cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_uid = ucred.cr_uid; if (ucred.cr_prison != NULL) { if (KREAD(kd, (u_long)ucred.cr_prison, &pr)) { _kvm_err(kd, kd->program, "can't read prison at %p", ucred.cr_prison); return (-1); } kp->ki_jid = pr.pr_id; } } switch(what & ~KERN_PROC_INC_THREAD) { case KERN_PROC_GID: if (kp->ki_groups[0] != (gid_t)arg) continue; break; case KERN_PROC_PID: if (proc.p_pid != (pid_t)arg) continue; break; case KERN_PROC_RGID: if (kp->ki_rgid != (gid_t)arg) continue; break; case KERN_PROC_UID: if (kp->ki_uid != (uid_t)arg) continue; break; case KERN_PROC_RUID: if (kp->ki_ruid != (uid_t)arg) continue; break; } /* * We're going to add another proc to the set. If this * will overflow the buffer, assume the reason is because * nprocs (or the proc list) is corrupt and declare an error. */ if (cnt >= maxcnt) { _kvm_err(kd, kd->program, "nprocs corrupt"); return (-1); } /* * gather kinfo_proc */ kp->ki_paddr = p; kp->ki_addr = 0; /* XXX uarea */ /* kp->ki_kstack = proc.p_thread.td_kstack; XXXKSE */ kp->ki_args = proc.p_args; kp->ki_tracep = proc.p_tracevp; kp->ki_textvp = proc.p_textvp; kp->ki_fd = proc.p_fd; kp->ki_vmspace = proc.p_vmspace; if (proc.p_sigacts != NULL) { if (KREAD(kd, (u_long)proc.p_sigacts, &sigacts)) { _kvm_err(kd, kd->program, "can't read sigacts at %p", proc.p_sigacts); return (-1); } kp->ki_sigignore = sigacts.ps_sigignore; kp->ki_sigcatch = sigacts.ps_sigcatch; } #if 0 if ((proc.p_flag & P_INMEM) && proc.p_stats != NULL) { if (KREAD(kd, (u_long)proc.p_stats, &pstats)) { _kvm_err(kd, kd->program, "can't read stats at %x", proc.p_stats); return (-1); } kp->ki_start = pstats.p_start; /* * XXX: The times here are probably zero and need * to be calculated from the raw data in p_rux and * p_crux. */ kp->ki_rusage = pstats.p_ru; kp->ki_childstime = pstats.p_cru.ru_stime; kp->ki_childutime = pstats.p_cru.ru_utime; /* Some callers want child-times in a single value */ timeradd(&kp->ki_childstime, &kp->ki_childutime, &kp->ki_childtime); } #endif if (proc.p_oppid) kp->ki_ppid = proc.p_oppid; else if (proc.p_pptr) { if (KREAD(kd, (u_long)proc.p_pptr, &pproc)) { _kvm_err(kd, kd->program, "can't read pproc at %p", proc.p_pptr); return (-1); } kp->ki_ppid = pproc.p_pid; } else kp->ki_ppid = 0; if (proc.p_pgrp == NULL) goto nopgrp; if (KREAD(kd, (u_long)proc.p_pgrp, &pgrp)) { _kvm_err(kd, kd->program, "can't read pgrp at %p", proc.p_pgrp); return (-1); } kp->ki_pgid = pgrp.pg_id; kp->ki_jobc = pgrp.pg_jobc; if (KREAD(kd, (u_long)pgrp.pg_session, &sess)) { _kvm_err(kd, kd->program, "can't read session at %p", pgrp.pg_session); return (-1); } kp->ki_sid = sess.s_sid; (void)memcpy(kp->ki_login, sess.s_login, sizeof(kp->ki_login)); kp->ki_kiflag = sess.s_ttyvp ? KI_CTTY : 0; if (sess.s_leader == p) kp->ki_kiflag |= KI_SLEADER; if ((proc.p_flag & P_CONTROLT) && sess.s_ttyp != NULL) { if (KREAD(kd, (u_long)sess.s_ttyp, &tty)) { _kvm_err(kd, kd->program, "can't read tty at %p", sess.s_ttyp); return (-1); } if (tty.t_dev != NULL) { if (KREAD(kd, (u_long)tty.t_dev, &t_cdev)) { _kvm_err(kd, kd->program, "can't read cdev at %p", tty.t_dev); return (-1); } #if 0 kp->ki_tdev = t_cdev.si_udev; #else kp->ki_tdev = NODEV; #endif } if (tty.t_pgrp != NULL) { if (KREAD(kd, (u_long)tty.t_pgrp, &pgrp)) { _kvm_err(kd, kd->program, "can't read tpgrp at %p", tty.t_pgrp); return (-1); } kp->ki_tpgid = pgrp.pg_id; } else kp->ki_tpgid = -1; if (tty.t_session != NULL) { if (KREAD(kd, (u_long)tty.t_session, &sess)) { _kvm_err(kd, kd->program, "can't read session at %p", tty.t_session); return (-1); } kp->ki_tsid = sess.s_sid; } } else { nopgrp: kp->ki_tdev = NODEV; } if ((proc.p_state != PRS_ZOMBIE) && mtd.td_wmesg) (void)kvm_read(kd, (u_long)mtd.td_wmesg, kp->ki_wmesg, WMESGLEN); (void)kvm_read(kd, (u_long)proc.p_vmspace, (char *)&vmspace, sizeof(vmspace)); kp->ki_size = vmspace.vm_map.size; /* * Approximate the kernel's method of calculating * this field. */ #define pmap_resident_count(pm) ((pm)->pm_stats.resident_count) kp->ki_rssize = pmap_resident_count(&vmspace.vm_pmap); kp->ki_swrss = vmspace.vm_swrss; kp->ki_tsize = vmspace.vm_tsize; kp->ki_dsize = vmspace.vm_dsize; kp->ki_ssize = vmspace.vm_ssize; switch (what & ~KERN_PROC_INC_THREAD) { case KERN_PROC_PGRP: if (kp->ki_pgid != (pid_t)arg) continue; break; case KERN_PROC_SESSION: if (kp->ki_sid != (pid_t)arg) continue; break; case KERN_PROC_TTY: if ((proc.p_flag & P_CONTROLT) == 0 || kp->ki_tdev != (dev_t)arg) continue; break; } if (proc.p_comm[0] != 0) strlcpy(kp->ki_comm, proc.p_comm, MAXCOMLEN); (void)kvm_read(kd, (u_long)proc.p_sysent, (char *)&sysent, sizeof(sysent)); (void)kvm_read(kd, (u_long)sysent.sv_name, (char *)&svname, sizeof(svname)); if (svname[0] != 0) strlcpy(kp->ki_emul, svname, KI_EMULNAMELEN); if ((proc.p_state != PRS_ZOMBIE) && (mtd.td_blocked != 0)) { kp->ki_kiflag |= KI_LOCKBLOCK; if (mtd.td_lockname) (void)kvm_read(kd, (u_long)mtd.td_lockname, kp->ki_lockname, LOCKNAMELEN); kp->ki_lockname[LOCKNAMELEN] = 0; } kp->ki_runtime = cputick2usec(proc.p_rux.rux_runtime); kp->ki_pid = proc.p_pid; kp->ki_siglist = proc.p_siglist; SIGSETOR(kp->ki_siglist, mtd.td_siglist); kp->ki_sigmask = mtd.td_sigmask; kp->ki_xstat = proc.p_xstat; kp->ki_acflag = proc.p_acflag; kp->ki_lock = proc.p_lock; if (proc.p_state != PRS_ZOMBIE) { kp->ki_swtime = (ticks - proc.p_swtick) / hz; kp->ki_flag = proc.p_flag; kp->ki_sflag = 0; kp->ki_nice = proc.p_nice; kp->ki_traceflag = proc.p_traceflag; if (proc.p_state == PRS_NORMAL) { if (TD_ON_RUNQ(&mtd) || TD_CAN_RUN(&mtd) || TD_IS_RUNNING(&mtd)) { kp->ki_stat = SRUN; } else if (mtd.td_state == TDS_INHIBITED) { if (P_SHOULDSTOP(&proc)) { kp->ki_stat = SSTOP; } else if ( TD_IS_SLEEPING(&mtd)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(&mtd)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } } else { kp->ki_stat = SIDL; } /* Stuff from the thread */ kp->ki_pri.pri_level = mtd.td_priority; kp->ki_pri.pri_native = mtd.td_base_pri; kp->ki_lastcpu = mtd.td_lastcpu; kp->ki_wchan = mtd.td_wchan; if (mtd.td_name[0] != 0) strlcpy(kp->ki_tdname, mtd.td_name, MAXCOMLEN); kp->ki_oncpu = mtd.td_oncpu; if (mtd.td_name[0] != '\0') strlcpy(kp->ki_tdname, mtd.td_name, sizeof(kp->ki_tdname)); kp->ki_pctcpu = 0; kp->ki_rqindex = 0; + + /* + * Note: legacy fields; wraps at NO_CPU_OLD or the + * old max CPU value as appropriate + */ + if (mtd.td_lastcpu == NOCPU) + kp->ki_lastcpu_old = NOCPU_OLD; + else if (mtd.td_lastcpu > MAXCPU_OLD) + kp->ki_lastcpu_old = MAXCPU_OLD; + else + kp->ki_lastcpu_old = mtd.td_lastcpu; + + if (mtd.td_oncpu == NOCPU) + kp->ki_oncpu_old = NOCPU_OLD; + else if (mtd.td_oncpu > MAXCPU_OLD) + kp->ki_oncpu_old = MAXCPU_OLD; + else + kp->ki_oncpu_old = mtd.td_oncpu; } else { kp->ki_stat = SZOMB; } bcopy(&kinfo_proc, bp, sizeof(kinfo_proc)); ++bp; ++cnt; } return (cnt); } /* * Build proc info array by reading in proc list from a crash dump. * Return number of procs read. maxcnt is the max we will read. */ static int kvm_deadprocs(kvm_t *kd, int what, int arg, u_long a_allproc, u_long a_zombproc, int maxcnt) { struct kinfo_proc *bp = kd->procbase; int acnt, zcnt; struct proc *p; if (KREAD(kd, a_allproc, &p)) { _kvm_err(kd, kd->program, "cannot read allproc"); return (-1); } acnt = kvm_proclist(kd, what, arg, p, bp, maxcnt); if (acnt < 0) return (acnt); if (KREAD(kd, a_zombproc, &p)) { _kvm_err(kd, kd->program, "cannot read zombproc"); return (-1); } zcnt = kvm_proclist(kd, what, arg, p, bp + acnt, maxcnt - acnt); if (zcnt < 0) zcnt = 0; return (acnt + zcnt); } struct kinfo_proc * kvm_getprocs(kvm_t *kd, int op, int arg, int *cnt) { int mib[4], st, nprocs; size_t size, osize; int temp_op; if (kd->procbase != 0) { free((void *)kd->procbase); /* * Clear this pointer in case this call fails. Otherwise, * kvm_close() will free it again. */ kd->procbase = 0; } if (ISALIVE(kd)) { size = 0; mib[0] = CTL_KERN; mib[1] = KERN_PROC; mib[2] = op; mib[3] = arg; temp_op = op & ~KERN_PROC_INC_THREAD; st = sysctl(mib, temp_op == KERN_PROC_ALL || temp_op == KERN_PROC_PROC ? 3 : 4, NULL, &size, NULL, 0); if (st == -1) { _kvm_syserr(kd, kd->program, "kvm_getprocs"); return (0); } /* * We can't continue with a size of 0 because we pass * it to realloc() (via _kvm_realloc()), and passing 0 * to realloc() results in undefined behavior. */ if (size == 0) { /* * XXX: We should probably return an invalid, * but non-NULL, pointer here so any client * program trying to dereference it will * crash. However, _kvm_freeprocs() calls * free() on kd->procbase if it isn't NULL, * and free()'ing a junk pointer isn't good. * Then again, _kvm_freeprocs() isn't used * anywhere . . . */ kd->procbase = _kvm_malloc(kd, 1); goto liveout; } do { size += size / 10; kd->procbase = (struct kinfo_proc *) _kvm_realloc(kd, kd->procbase, size); if (kd->procbase == 0) return (0); osize = size; st = sysctl(mib, temp_op == KERN_PROC_ALL || temp_op == KERN_PROC_PROC ? 3 : 4, kd->procbase, &size, NULL, 0); } while (st == -1 && errno == ENOMEM && size == osize); if (st == -1) { _kvm_syserr(kd, kd->program, "kvm_getprocs"); return (0); } /* * We have to check the size again because sysctl() * may "round up" oldlenp if oldp is NULL; hence it * might've told us that there was data to get when * there really isn't any. */ if (size > 0 && kd->procbase->ki_structsize != sizeof(struct kinfo_proc)) { _kvm_err(kd, kd->program, "kinfo_proc size mismatch (expected %zu, got %d)", sizeof(struct kinfo_proc), kd->procbase->ki_structsize); return (0); } liveout: nprocs = size == 0 ? 0 : size / kd->procbase->ki_structsize; } else { struct nlist nl[7], *p; nl[0].n_name = "_nprocs"; nl[1].n_name = "_allproc"; nl[2].n_name = "_zombproc"; nl[3].n_name = "_ticks"; nl[4].n_name = "_hz"; nl[5].n_name = "_cpu_tick_frequency"; nl[6].n_name = 0; if (kvm_nlist(kd, nl) != 0) { for (p = nl; p->n_type != 0; ++p) ; _kvm_err(kd, kd->program, "%s: no such symbol", p->n_name); return (0); } if (KREAD(kd, nl[0].n_value, &nprocs)) { _kvm_err(kd, kd->program, "can't read nprocs"); return (0); } if (KREAD(kd, nl[3].n_value, &ticks)) { _kvm_err(kd, kd->program, "can't read ticks"); return (0); } if (KREAD(kd, nl[4].n_value, &hz)) { _kvm_err(kd, kd->program, "can't read hz"); return (0); } if (KREAD(kd, nl[5].n_value, &cpu_tick_frequency)) { _kvm_err(kd, kd->program, "can't read cpu_tick_frequency"); return (0); } size = nprocs * sizeof(struct kinfo_proc); kd->procbase = (struct kinfo_proc *)_kvm_malloc(kd, size); if (kd->procbase == 0) return (0); nprocs = kvm_deadprocs(kd, op, arg, nl[1].n_value, nl[2].n_value, nprocs); if (nprocs <= 0) { _kvm_freeprocs(kd); nprocs = 0; } #ifdef notdef else { size = nprocs * sizeof(struct kinfo_proc); kd->procbase = realloc(kd->procbase, size); } #endif } *cnt = nprocs; return (kd->procbase); } void _kvm_freeprocs(kvm_t *kd) { if (kd->procbase) { free(kd->procbase); kd->procbase = 0; } } void * _kvm_realloc(kvm_t *kd, void *p, size_t n) { void *np = (void *)realloc(p, n); if (np == 0) { free(p); _kvm_err(kd, kd->program, "out of memory"); } return (np); } /* * Get the command args or environment. */ static char ** kvm_argv(kvm_t *kd, const struct kinfo_proc *kp, int env, int nchr) { int oid[4]; int i; size_t bufsz; static int buflen; static char *buf, *p; static char **bufp; static int argc; if (!ISALIVE(kd)) { _kvm_err(kd, kd->program, "cannot read user space from dead kernel"); return (0); } if (nchr == 0 || nchr > ARG_MAX) nchr = ARG_MAX; if (buflen == 0) { buf = malloc(nchr); if (buf == NULL) { _kvm_err(kd, kd->program, "cannot allocate memory"); return (0); } buflen = nchr; argc = 32; bufp = malloc(sizeof(char *) * argc); } else if (nchr > buflen) { p = realloc(buf, nchr); if (p != NULL) { buf = p; buflen = nchr; } } oid[0] = CTL_KERN; oid[1] = KERN_PROC; oid[2] = env ? KERN_PROC_ENV : KERN_PROC_ARGS; oid[3] = kp->ki_pid; bufsz = buflen; if (sysctl(oid, 4, buf, &bufsz, 0, 0) == -1) { /* * If the supplied buf is too short to hold the requested * value the sysctl returns with ENOMEM. The buf is filled * with the truncated value and the returned bufsz is equal * to the requested len. */ if (errno != ENOMEM || bufsz != (size_t)buflen) return (0); buf[bufsz - 1] = '\0'; errno = 0; } else if (bufsz == 0) { return (0); } i = 0; p = buf; do { bufp[i++] = p; p += strlen(p) + 1; if (i >= argc) { argc += argc; bufp = realloc(bufp, sizeof(char *) * argc); } } while (p < buf + bufsz); bufp[i++] = 0; return (bufp); } char ** kvm_getargv(kvm_t *kd, const struct kinfo_proc *kp, int nchr) { return (kvm_argv(kd, kp, 0, nchr)); } char ** kvm_getenvv(kvm_t *kd, const struct kinfo_proc *kp, int nchr) { return (kvm_argv(kd, kp, 1, nchr)); } Index: head/sys/compat/freebsd32/freebsd32.h =================================================================== --- head/sys/compat/freebsd32/freebsd32.h (revision 273265) +++ head/sys/compat/freebsd32/freebsd32.h (revision 273266) @@ -1,391 +1,393 @@ /*- * Copyright (c) 2001 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _COMPAT_FREEBSD32_FREEBSD32_H_ #define _COMPAT_FREEBSD32_FREEBSD32_H_ #include #include #include #define PTRIN(v) (void *)(uintptr_t) (v) #define PTROUT(v) (u_int32_t)(uintptr_t) (v) #define CP(src,dst,fld) do { (dst).fld = (src).fld; } while (0) #define PTRIN_CP(src,dst,fld) \ do { (dst).fld = PTRIN((src).fld); } while (0) #define PTROUT_CP(src,dst,fld) \ do { (dst).fld = PTROUT((src).fld); } while (0) /* * Being a newer port, 32-bit FreeBSD/MIPS uses 64-bit time_t. */ #ifdef __mips__ typedef int64_t time32_t; #else typedef int32_t time32_t; #endif struct timeval32 { time32_t tv_sec; int32_t tv_usec; }; #define TV_CP(src,dst,fld) do { \ CP((src).fld,(dst).fld,tv_sec); \ CP((src).fld,(dst).fld,tv_usec); \ } while (0) struct timespec32 { time32_t tv_sec; int32_t tv_nsec; }; #define TS_CP(src,dst,fld) do { \ CP((src).fld,(dst).fld,tv_sec); \ CP((src).fld,(dst).fld,tv_nsec); \ } while (0) struct itimerspec32 { struct timespec32 it_interval; struct timespec32 it_value; }; #define ITS_CP(src, dst) do { \ TS_CP((src), (dst), it_interval); \ TS_CP((src), (dst), it_value); \ } while (0) struct rusage32 { struct timeval32 ru_utime; struct timeval32 ru_stime; int32_t ru_maxrss; int32_t ru_ixrss; int32_t ru_idrss; int32_t ru_isrss; int32_t ru_minflt; int32_t ru_majflt; int32_t ru_nswap; int32_t ru_inblock; int32_t ru_oublock; int32_t ru_msgsnd; int32_t ru_msgrcv; int32_t ru_nsignals; int32_t ru_nvcsw; int32_t ru_nivcsw; }; struct wrusage32 { struct rusage32 wru_self; struct rusage32 wru_children; }; struct itimerval32 { struct timeval32 it_interval; struct timeval32 it_value; }; #define FREEBSD4_MNAMELEN (88 - 2 * sizeof(int32_t)) /* size of on/from name bufs */ /* 4.x version */ struct statfs32 { int32_t f_spare2; int32_t f_bsize; int32_t f_iosize; int32_t f_blocks; int32_t f_bfree; int32_t f_bavail; int32_t f_files; int32_t f_ffree; fsid_t f_fsid; uid_t f_owner; int32_t f_type; int32_t f_flags; int32_t f_syncwrites; int32_t f_asyncwrites; char f_fstypename[MFSNAMELEN]; char f_mntonname[FREEBSD4_MNAMELEN]; int32_t f_syncreads; int32_t f_asyncreads; int16_t f_spares1; char f_mntfromname[FREEBSD4_MNAMELEN]; int16_t f_spares2 __packed; int32_t f_spare[2]; }; struct kevent32 { u_int32_t ident; /* identifier for this event */ short filter; /* filter for event */ u_short flags; u_int fflags; int32_t data; u_int32_t udata; /* opaque user data identifier */ }; struct iovec32 { u_int32_t iov_base; int iov_len; }; struct msghdr32 { u_int32_t msg_name; socklen_t msg_namelen; u_int32_t msg_iov; int msg_iovlen; u_int32_t msg_control; socklen_t msg_controllen; int msg_flags; }; struct stat32 { dev_t st_dev; ino_t st_ino; mode_t st_mode; nlink_t st_nlink; uid_t st_uid; gid_t st_gid; dev_t st_rdev; struct timespec32 st_atim; struct timespec32 st_mtim; struct timespec32 st_ctim; off_t st_size; int64_t st_blocks; u_int32_t st_blksize; u_int32_t st_flags; u_int32_t st_gen; int32_t st_lspare; struct timespec32 st_birthtim; unsigned int :(8 / 2) * (16 - (int)sizeof(struct timespec32)); unsigned int :(8 / 2) * (16 - (int)sizeof(struct timespec32)); }; struct ostat32 { __uint16_t st_dev; ino_t st_ino; mode_t st_mode; nlink_t st_nlink; __uint16_t st_uid; __uint16_t st_gid; __uint16_t st_rdev; __int32_t st_size; struct timespec32 st_atim; struct timespec32 st_mtim; struct timespec32 st_ctim; __int32_t st_blksize; __int32_t st_blocks; u_int32_t st_flags; __uint32_t st_gen; }; struct jail32_v0 { u_int32_t version; uint32_t path; uint32_t hostname; u_int32_t ip_number; }; struct jail32 { uint32_t version; uint32_t path; uint32_t hostname; uint32_t jailname; uint32_t ip4s; uint32_t ip6s; uint32_t ip4; uint32_t ip6; }; struct sigaction32 { u_int32_t sa_u; int sa_flags; sigset_t sa_mask; }; struct thr_param32 { uint32_t start_func; uint32_t arg; uint32_t stack_base; uint32_t stack_size; uint32_t tls_base; uint32_t tls_size; uint32_t child_tid; uint32_t parent_tid; int32_t flags; uint32_t rtp; uint32_t spare[3]; }; struct i386_ldt_args32 { uint32_t start; uint32_t descs; uint32_t num; }; /* * Alternative layouts for */ struct prstatus32 { int pr_version; u_int pr_statussz; u_int pr_gregsetsz; u_int pr_fpregsetsz; int pr_osreldate; int pr_cursig; pid_t pr_pid; struct reg32 pr_reg; }; struct prpsinfo32 { int pr_version; u_int pr_psinfosz; char pr_fname[PRFNAMESZ+1]; char pr_psargs[PRARGSZ+1]; }; struct thrmisc32 { char pr_tname[MAXCOMLEN+1]; u_int _pad; }; struct mq_attr32 { int mq_flags; int mq_maxmsg; int mq_msgsize; int mq_curmsgs; int __reserved[4]; }; struct kinfo_proc32 { int ki_structsize; int ki_layout; uint32_t ki_args; uint32_t ki_paddr; uint32_t ki_addr; uint32_t ki_tracep; uint32_t ki_textvp; uint32_t ki_fd; uint32_t ki_vmspace; uint32_t ki_wchan; pid_t ki_pid; pid_t ki_ppid; pid_t ki_pgid; pid_t ki_tpgid; pid_t ki_sid; pid_t ki_tsid; short ki_jobc; short ki_spare_short1; dev_t ki_tdev; sigset_t ki_siglist; sigset_t ki_sigmask; sigset_t ki_sigignore; sigset_t ki_sigcatch; uid_t ki_uid; uid_t ki_ruid; uid_t ki_svuid; gid_t ki_rgid; gid_t ki_svgid; short ki_ngroups; short ki_spare_short2; gid_t ki_groups[KI_NGROUPS]; uint32_t ki_size; int32_t ki_rssize; int32_t ki_swrss; int32_t ki_tsize; int32_t ki_dsize; int32_t ki_ssize; u_short ki_xstat; u_short ki_acflag; fixpt_t ki_pctcpu; u_int ki_estcpu; u_int ki_slptime; u_int ki_swtime; u_int ki_cow; u_int64_t ki_runtime; struct timeval32 ki_start; struct timeval32 ki_childtime; int ki_flag; int ki_kiflag; int ki_traceflag; char ki_stat; signed char ki_nice; char ki_lock; char ki_rqindex; - u_char ki_oncpu; - u_char ki_lastcpu; + u_char ki_oncpu_old; + u_char ki_lastcpu_old; char ki_tdname[TDNAMLEN+1]; char ki_wmesg[WMESGLEN+1]; char ki_login[LOGNAMELEN+1]; char ki_lockname[LOCKNAMELEN+1]; char ki_comm[COMMLEN+1]; char ki_emul[KI_EMULNAMELEN+1]; char ki_loginclass[LOGINCLASSLEN+1]; char ki_sparestrings[50]; int ki_spareints[KI_NSPARE_INT]; + int ki_oncpu; + int ki_lastcpu; int ki_tracer; int ki_flag2; int ki_fibnum; u_int ki_cr_flags; int ki_jid; int ki_numthreads; lwpid_t ki_tid; struct priority ki_pri; struct rusage32 ki_rusage; struct rusage32 ki_rusage_ch; uint32_t ki_pcb; uint32_t ki_kstack; uint32_t ki_udata; uint32_t ki_tdaddr; uint32_t ki_spareptrs[KI_NSPARE_PTR]; /* spare room for growth */ int ki_sparelongs[KI_NSPARE_LONG]; int ki_sflag; int ki_tdflags; }; struct kinfo_sigtramp32 { uint32_t ksigtramp_start; uint32_t ksigtramp_end; uint32_t ksigtramp_spare[4]; }; struct kld32_file_stat_1 { int version; /* set to sizeof(struct kld_file_stat_1) */ char name[MAXPATHLEN]; int refs; int id; uint32_t address; /* load address */ uint32_t size; /* size in bytes */ }; struct kld32_file_stat { int version; /* set to sizeof(struct kld_file_stat) */ char name[MAXPATHLEN]; int refs; int id; uint32_t address; /* load address */ uint32_t size; /* size in bytes */ char pathname[MAXPATHLEN]; }; #endif /* !_COMPAT_FREEBSD32_FREEBSD32_H_ */ Index: head/sys/kern/kern_proc.c =================================================================== --- head/sys/kern/kern_proc.c (revision 273265) +++ head/sys/kern/kern_proc.c (revision 273266) @@ -1,2871 +1,2895 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #endif SDT_PROVIDER_DEFINE(proc); SDT_PROBE_DEFINE4(proc, kernel, ctor, entry, "struct proc *", "int", "void *", "int"); SDT_PROBE_DEFINE4(proc, kernel, ctor, return, "struct proc *", "int", "void *", "int"); SDT_PROBE_DEFINE4(proc, kernel, dtor, entry, "struct proc *", "int", "void *", "struct thread *"); SDT_PROBE_DEFINE3(proc, kernel, dtor, return, "struct proc *", "int", "void *"); SDT_PROBE_DEFINE3(proc, kernel, init, entry, "struct proc *", "int", "int"); SDT_PROBE_DEFINE3(proc, kernel, init, return, "struct proc *", "int", "int"); MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); static void doenterpgrp(struct proc *, struct pgrp *); static void orphanpg(struct pgrp *pg); static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp); static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread); static void pgadjustjobc(struct pgrp *pgrp, int entering); static void pgdelete(struct pgrp *); static int proc_ctor(void *mem, int size, void *arg, int flags); static void proc_dtor(void *mem, int size, void *arg); static int proc_init(void *mem, int size, int flags); static void proc_fini(void *mem, int size); static void pargs_free(struct pargs *pa); static struct proc *zpfind_locked(pid_t pid); /* * Other process lists */ struct pidhashhead *pidhashtbl; u_long pidhash; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; struct sx allproc_lock; struct sx proctree_lock; struct mtx ppeers_lock; uma_zone_t proc_zone; int kstack_pages = KSTACK_PAGES; SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "Kernel stack size in pages"); static int vmmap_skip_res_cnt = 0; SYSCTL_INT(_kern, OID_AUTO, proc_vmmap_skip_resident_count, CTLFLAG_RW, &vmmap_skip_res_cnt, 0, "Skip calculation of the pages resident count in kern.proc.vmmap"); CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #ifdef COMPAT_FREEBSD32 CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE); #endif /* * Initialize global process hashing structures. */ void procinit() { sx_init(&allproc_lock, "allproc"); sx_init(&proctree_lock, "proctree"); mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); LIST_INIT(&allproc); LIST_INIT(&zombproc); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), proc_ctor, proc_dtor, proc_init, proc_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uihashinit(); } /* * Prepare a proc for use. */ static int proc_ctor(void *mem, int size, void *arg, int flags) { struct proc *p; p = (struct proc *)mem; SDT_PROBE(proc, kernel, ctor , entry, p, size, arg, flags, 0); EVENTHANDLER_INVOKE(process_ctor, p); SDT_PROBE(proc, kernel, ctor , return, p, size, arg, flags, 0); return (0); } /* * Reclaim a proc after use. */ static void proc_dtor(void *mem, int size, void *arg) { struct proc *p; struct thread *td; /* INVARIANTS checks go here */ p = (struct proc *)mem; td = FIRST_THREAD_IN_PROC(p); SDT_PROBE(proc, kernel, dtor, entry, p, size, arg, td, 0); if (td != NULL) { #ifdef INVARIANTS KASSERT((p->p_numthreads == 1), ("bad number of threads in exiting process")); KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr")); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); } EVENTHANDLER_INVOKE(process_dtor, p); if (p->p_ksi != NULL) KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue")); SDT_PROBE(proc, kernel, dtor, return, p, size, arg, 0, 0); } /* * Initialize type-stable parts of a proc (when newly created). */ static int proc_init(void *mem, int size, int flags) { struct proc *p; p = (struct proc *)mem; SDT_PROBE(proc, kernel, init, entry, p, size, flags, 0, 0); p->p_sched = (struct p_sched *)&p[1]; bzero(&p->p_mtx, sizeof(struct mtx)); mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE); cv_init(&p->p_pwait, "ppwait"); cv_init(&p->p_dbgwait, "dbgwait"); TAILQ_INIT(&p->p_threads); /* all threads in proc */ EVENTHANDLER_INVOKE(process_init, p); p->p_stats = pstats_alloc(); SDT_PROBE(proc, kernel, init, return, p, size, flags, 0, 0); return (0); } /* * UMA should ensure that this function is never called. * Freeing a proc structure would violate type stability. */ static void proc_fini(void *mem, int size) { #ifdef notnow struct proc *p; p = (struct proc *)mem; EVENTHANDLER_INVOKE(process_fini, p); pstats_free(p->p_stats); thread_free(FIRST_THREAD_IN_PROC(p)); mtx_destroy(&p->p_mtx); if (p->p_ksi != NULL) ksiginfo_free(p->p_ksi); #else panic("proc reclaimed"); #endif } /* * Is p an inferior of the current process? */ int inferior(struct proc *p) { sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); for (; p != curproc; p = proc_realparent(p)) { if (p->p_pid == 0) return (0); } return (1); } struct proc * pfind_locked(pid_t pid) { struct proc *p; sx_assert(&allproc_lock, SX_LOCKED); LIST_FOREACH(p, PIDHASH(pid), p_hash) { if (p->p_pid == pid) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); p = NULL; } break; } } return (p); } /* * Locate a process by number; return only "live" processes -- i.e., neither * zombies nor newly born but incompletely initialized processes. By not * returning processes in the PRS_NEW state, we allow callers to avoid * testing for that condition to avoid dereferencing p_ucred, et al. */ struct proc * pfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); p = pfind_locked(pid); sx_sunlock(&allproc_lock); return (p); } static struct proc * pfind_tid_locked(pid_t tid) { struct proc *p; struct thread *td; sx_assert(&allproc_lock, SX_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) goto found; } PROC_UNLOCK(p); } found: return (p); } /* * Locate a process group by number. * The caller must hold proctree_lock. */ struct pgrp * pgfind(pgid) register pid_t pgid; { register struct pgrp *pgrp; sx_assert(&proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { PGRP_LOCK(pgrp); return (pgrp); } } return (NULL); } /* * Locate process and do additional manipulations, depending on flags. */ int pget(pid_t pid, int flags, struct proc **pp) { struct proc *p; int error; sx_slock(&allproc_lock); if (pid <= PID_MAX) { p = pfind_locked(pid); if (p == NULL && (flags & PGET_NOTWEXIT) == 0) p = zpfind_locked(pid); } else if ((flags & PGET_NOTID) == 0) { p = pfind_tid_locked(pid); } else { p = NULL; } sx_sunlock(&allproc_lock); if (p == NULL) return (ESRCH); if ((flags & PGET_CANSEE) != 0) { error = p_cansee(curthread, p); if (error != 0) goto errout; } if ((flags & PGET_CANDEBUG) != 0) { error = p_candebug(curthread, p); if (error != 0) goto errout; } if ((flags & PGET_ISCURRENT) != 0 && curproc != p) { error = EPERM; goto errout; } if ((flags & PGET_NOTWEXIT) != 0 && (p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto errout; } if ((flags & PGET_NOTINEXEC) != 0 && (p->p_flag & P_INEXEC) != 0) { /* * XXXRW: Not clear ESRCH is the right error during proc * execve(). */ error = ESRCH; goto errout; } if ((flags & PGET_HOLD) != 0) { _PHOLD(p); PROC_UNLOCK(p); } *pp = p; return (0); errout: PROC_UNLOCK(p); return (error); } /* * Create a new process group. * pgid must be equal to the pid of p. * Begin a new session if required. */ int enterpgrp(p, pgid, pgrp, sess) register struct proc *p; pid_t pgid; struct pgrp *pgrp; struct session *sess; { sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); KASSERT(pgfind(pgid) == NULL, ("enterpgrp: pgrp with pgid exists")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); if (sess != NULL) { /* * new session */ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF); PROC_LOCK(p); p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); PGRP_LOCK(pgrp); sess->s_leader = p; sess->s_sid = p->p_pid; refcount_init(&sess->s_count, 1); sess->s_ttyvp = NULL; sess->s_ttydp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { pgrp->pg_session = p->p_session; sess_hold(pgrp->pg_session); PGRP_LOCK(pgrp); } pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); /* * As we have an exclusive lock of proctree_lock, * this should not deadlock. */ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); PGRP_UNLOCK(pgrp); doenterpgrp(p, pgrp); return (0); } /* * Move p to an existing process group */ int enterthispgrp(p, pgrp) register struct proc *p; struct pgrp *pgrp; { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); KASSERT(pgrp->pg_session == p->p_session, ("%s: pgrp's session %p, p->p_session %p.\n", __func__, pgrp->pg_session, p->p_session)); KASSERT(pgrp != p->p_pgrp, ("%s: p belongs to pgrp.", __func__)); doenterpgrp(p, pgrp); return (0); } /* * Move p to a process group */ static void doenterpgrp(p, pgrp) struct proc *p; struct pgrp *pgrp; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED); savepgrp = p->p_pgrp; /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. */ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); PGRP_LOCK(pgrp); PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = pgrp; PROC_UNLOCK(p); LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); PGRP_UNLOCK(savepgrp); PGRP_UNLOCK(pgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); } /* * remove process from process group */ int leavepgrp(p) register struct proc *p; { struct pgrp *savepgrp; sx_assert(&proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; PGRP_LOCK(savepgrp); PROC_LOCK(p); LIST_REMOVE(p, p_pglist); p->p_pgrp = NULL; PROC_UNLOCK(p); PGRP_UNLOCK(savepgrp); if (LIST_EMPTY(&savepgrp->pg_members)) pgdelete(savepgrp); return (0); } /* * delete a process group */ static void pgdelete(pgrp) register struct pgrp *pgrp; { struct session *savesess; struct tty *tp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. */ funsetownlst(&pgrp->pg_sigiolst); PGRP_LOCK(pgrp); tp = pgrp->pg_session->s_ttyp; LIST_REMOVE(pgrp, pg_hash); savesess = pgrp->pg_session; PGRP_UNLOCK(pgrp); /* Remove the reference to the pgrp before deallocating it. */ if (tp != NULL) { tty_lock(tp); tty_rel_pgrp(tp, pgrp); } mtx_destroy(&pgrp->pg_mtx); free(pgrp, M_PGRP); sess_release(savesess); } static void pgadjustjobc(pgrp, entering) struct pgrp *pgrp; int entering; { PGRP_LOCK(pgrp); if (entering) pgrp->pg_jobc++; else { --pgrp->pg_jobc; if (pgrp->pg_jobc == 0) orphanpg(pgrp); } PGRP_UNLOCK(pgrp); } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. */ void fixjobc(p, pgrp, entering) register struct proc *p; register struct pgrp *pgrp; int entering; { register struct pgrp *hispgrp; register struct session *mysession; sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. */ mysession = pgrp->pg_session; if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && hispgrp->pg_session == mysession) pgadjustjobc(pgrp, entering); /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(p, &p->p_children, p_sibling) { hispgrp = p->p_pgrp; if (hispgrp == pgrp || hispgrp->pg_session != mysession) continue; PROC_LOCK(p); if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } PROC_UNLOCK(p); pgadjustjobc(hispgrp, entering); } } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. */ static void orphanpg(pg) struct pgrp *pg; { register struct proc *p; PGRP_LOCK_ASSERT(pg, MA_OWNED); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (P_SHOULDSTOP(p)) { PROC_UNLOCK(p); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); kern_psignal(p, SIGHUP); kern_psignal(p, SIGCONT); PROC_UNLOCK(p); } return; } PROC_UNLOCK(p); } } void sess_hold(struct session *s) { refcount_acquire(&s->s_count); } void sess_release(struct session *s) { if (refcount_release(&s->s_count)) { if (s->s_ttyp != NULL) { tty_lock(s->s_ttyp); tty_rel_sess(s->s_ttyp, s); } mtx_destroy(&s->s_mtx); free(s, M_SESSION); } } #ifdef DDB DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; register int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { printf("\tindx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, (void *)pgrp->pg_session, pgrp->pg_session->s_count, (void *)LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { printf("\t\tpid %ld addr %p pgrp %p\n", (long)p->p_pid, (void *)p, (void *)p->p_pgrp); } } } } } #endif /* DDB */ /* * Calculate the kinfo_proc members which contain process-wide * informations. * Must be called with the target process locked. */ static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); kp->ki_estcpu = 0; kp->ki_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); kp->ki_pctcpu += sched_pctcpu(td); kp->ki_estcpu += td->td_estcpu; thread_unlock(td); } } /* * Clear kinfo_proc and fill in any information that is common * to all threads in the process. * Must be called with the target process locked. */ static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp) { struct thread *td0; struct tty *tp; struct session *sp; struct ucred *cred; struct sigacts *ps; /* For proc_realparent. */ sx_assert(&proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); bzero(kp, sizeof(*kp)); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; kp->ki_addr =/* p->p_addr; */0; /* XXX */ kp->ki_args = p->p_args; kp->ki_textvp = p->p_textvp; #ifdef KTRACE kp->ki_tracep = p->p_tracevp; kp->ki_traceflag = p->p_traceflag; #endif kp->ki_fd = p->p_fd; kp->ki_vmspace = p->p_vmspace; kp->ki_flag = p->p_flag; kp->ki_flag2 = p->p_flag2; cred = p->p_ucred; if (cred) { kp->ki_uid = cred->cr_uid; kp->ki_ruid = cred->cr_ruid; kp->ki_svuid = cred->cr_svuid; kp->ki_cr_flags = 0; if (cred->cr_flags & CRED_FLAG_CAPMODE) kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE; /* XXX bde doesn't like KI_NGROUPS */ if (cred->cr_ngroups > KI_NGROUPS) { kp->ki_ngroups = KI_NGROUPS; kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW; } else kp->ki_ngroups = cred->cr_ngroups; bcopy(cred->cr_groups, kp->ki_groups, kp->ki_ngroups * sizeof(gid_t)); kp->ki_rgid = cred->cr_rgid; kp->ki_svgid = cred->cr_svgid; /* If jailed(cred), emulate the old P_JAILED flag. */ if (jailed(cred)) { kp->ki_flag |= P_JAILED; /* If inside the jail, use 0 as a jail ID. */ if (cred->cr_prison != curthread->td_ucred->cr_prison) kp->ki_jid = cred->cr_prison->pr_id; } strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name, sizeof(kp->ki_loginclass)); } ps = p->p_sigacts; if (ps) { mtx_lock(&ps->ps_mtx); kp->ki_sigignore = ps->ps_sigignore; kp->ki_sigcatch = ps->ps_sigcatch; mtx_unlock(&ps->ps_mtx); } if (p->p_state != PRS_NEW && p->p_state != PRS_ZOMBIE && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ FOREACH_THREAD_IN_PROC(p, td0) { if (!TD_IS_SWAPPED(td0)) kp->ki_rssize += td0->td_kstack_pages; } kp->ki_swrss = vm->vm_swrss; kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } else if (p->p_state == PRS_ZOMBIE) kp->ki_stat = SZOMB; if (kp->ki_flag & P_INMEM) kp->ki_sflag = PS_INMEM; else kp->ki_sflag = 0; /* Calculate legacy swtime as seconds since 'swtick'. */ kp->ki_swtime = (ticks - p->p_swtick) / hz; kp->ki_pid = p->p_pid; kp->ki_nice = p->p_nice; kp->ki_fibnum = p->p_fibnum; kp->ki_start = p->p_stats->p_start; timevaladd(&kp->ki_start, &boottime); PROC_SLOCK(p); rufetch(p, &kp->ki_rusage); kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime); calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime); PROC_SUNLOCK(p); calccru(p, &kp->ki_childutime, &kp->ki_childstime); /* Some callers want child times in a single value. */ kp->ki_childtime = kp->ki_childstime; timevaladd(&kp->ki_childtime, &kp->ki_childutime); FOREACH_THREAD_IN_PROC(p, td0) kp->ki_cow += td0->td_cow; tp = NULL; if (p->p_pgrp) { kp->ki_pgid = p->p_pgrp->pg_id; kp->ki_jobc = p->p_pgrp->pg_jobc; sp = p->p_pgrp->pg_session; if (sp != NULL) { kp->ki_sid = sp->s_sid; SESS_LOCK(sp); strlcpy(kp->ki_login, sp->s_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; /* XXX proctree_lock */ tp = sp->s_ttyp; SESS_UNLOCK(sp); } } if ((p->p_flag & P_CONTROLT) && tp != NULL) { kp->ki_tdev = tty_udev(tp); kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else kp->ki_tdev = NODEV; if (p->p_comm[0] != '\0') strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm)); if (p->p_sysent && p->p_sysent->sv_name != NULL && p->p_sysent->sv_name[0] != '\0') strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul)); kp->ki_siglist = p->p_siglist; kp->ki_xstat = p->p_xstat; kp->ki_acflag = p->p_acflag; kp->ki_lock = p->p_lock; if (p->p_pptr) { kp->ki_ppid = proc_realparent(p)->p_pid; if (p->p_flag & P_TRACED) kp->ki_tracer = p->p_pptr->p_pid; } } /* * Fill in information that is thread specific. Must be called with * target process locked. If 'preferthread' is set, overwrite certain * process-related fields that are maintained for both threads and * processes. */ static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread) { struct proc *p; p = td->td_proc; kp->ki_tdaddr = td; PROC_LOCK_ASSERT(p, MA_OWNED); if (preferthread) PROC_SLOCK(p); thread_lock(td); if (td->td_wmesg != NULL) strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg)); else bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg)); strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname)); if (TD_ON_LOCK(td)) { kp->ki_kiflag |= KI_LOCKBLOCK; strlcpy(kp->ki_lockname, td->td_lockname, sizeof(kp->ki_lockname)); } else { kp->ki_kiflag &= ~KI_LOCKBLOCK; bzero(kp->ki_lockname, sizeof(kp->ki_lockname)); } if (p->p_state == PRS_NORMAL) { /* approximate. */ if (TD_ON_RUNQ(td) || TD_CAN_RUN(td) || TD_IS_RUNNING(td)) { kp->ki_stat = SRUN; } else if (P_SHOULDSTOP(p)) { kp->ki_stat = SSTOP; } else if (TD_IS_SLEEPING(td)) { kp->ki_stat = SSLEEP; } else if (TD_ON_LOCK(td)) { kp->ki_stat = SLOCK; } else { kp->ki_stat = SWAIT; } } else if (p->p_state == PRS_ZOMBIE) { kp->ki_stat = SZOMB; } else { kp->ki_stat = SIDL; } /* Things in the thread */ kp->ki_wchan = td->td_wchan; kp->ki_pri.pri_level = td->td_priority; kp->ki_pri.pri_native = td->td_base_pri; + + /* + * Note: legacy fields; clamp at the old NOCPU value and/or + * the maximum u_char CPU value. + */ + if (td->td_lastcpu == NOCPU) + kp->ki_lastcpu_old = NOCPU_OLD; + else if (td->td_lastcpu > MAXCPU_OLD) + kp->ki_lastcpu_old = MAXCPU_OLD; + else + kp->ki_lastcpu_old = td->td_lastcpu; + + if (td->td_oncpu == NOCPU) + kp->ki_oncpu_old = NOCPU_OLD; + else if (td->td_oncpu > MAXCPU_OLD) + kp->ki_oncpu_old = MAXCPU_OLD; + else + kp->ki_oncpu_old = td->td_oncpu; + kp->ki_lastcpu = td->td_lastcpu; kp->ki_oncpu = td->td_oncpu; kp->ki_tdflags = td->td_flags; kp->ki_tid = td->td_tid; kp->ki_numthreads = p->p_numthreads; kp->ki_pcb = td->td_pcb; kp->ki_kstack = (void *)td->td_kstack; kp->ki_slptime = (ticks - td->td_slptick) / hz; kp->ki_pri.pri_class = td->td_pri_class; kp->ki_pri.pri_user = td->td_user_pri; if (preferthread) { rufetchtd(td, &kp->ki_rusage); kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime); kp->ki_pctcpu = sched_pctcpu(td); kp->ki_estcpu = td->td_estcpu; kp->ki_cow = td->td_cow; } /* We can't get this anymore but ps etc never used it anyway. */ kp->ki_rqindex = 0; if (preferthread) kp->ki_siglist = td->td_siglist; kp->ki_sigmask = td->td_sigmask; thread_unlock(td); if (preferthread) PROC_SUNLOCK(p); } /* * Fill in a kinfo_proc structure for the specified process. * Must be called with the target process locked. */ void fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp) { MPASS(FIRST_THREAD_IN_PROC(p) != NULL); fill_kinfo_proc_only(p, kp); fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0); fill_kinfo_aggregate(p, kp); } struct pstats * pstats_alloc(void) { return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK)); } /* * Copy parts of p_stats; zero the rest of p_stats (statistics). */ void pstats_fork(struct pstats *src, struct pstats *dst) { bzero(&dst->pstat_startzero, __rangeof(struct pstats, pstat_startzero, pstat_endzero)); bcopy(&src->pstat_startcopy, &dst->pstat_startcopy, __rangeof(struct pstats, pstat_startcopy, pstat_endcopy)); } void pstats_free(struct pstats *ps) { free(ps, M_SUBPROC); } static struct proc * zpfind_locked(pid_t pid) { struct proc *p; sx_assert(&allproc_lock, SX_LOCKED); LIST_FOREACH(p, &zombproc, p_list) { if (p->p_pid == pid) { PROC_LOCK(p); break; } } return (p); } /* * Locate a zombie process by number */ struct proc * zpfind(pid_t pid) { struct proc *p; sx_slock(&allproc_lock); p = zpfind_locked(pid); sx_sunlock(&allproc_lock); return (p); } #ifdef COMPAT_FREEBSD32 /* * This function is typically used to copy out the kernel address, so * it can be replaced by assignment of zero. */ static inline uint32_t ptr32_trim(void *ptr) { uintptr_t uptr; uptr = (uintptr_t)ptr; return ((uptr > UINT_MAX) ? 0 : uptr); } #define PTRTRIM_CP(src,dst,fld) \ do { (dst).fld = ptr32_trim((src).fld); } while (0) static void freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32) { int i; bzero(ki32, sizeof(struct kinfo_proc32)); ki32->ki_structsize = sizeof(struct kinfo_proc32); CP(*ki, *ki32, ki_layout); PTRTRIM_CP(*ki, *ki32, ki_args); PTRTRIM_CP(*ki, *ki32, ki_paddr); PTRTRIM_CP(*ki, *ki32, ki_addr); PTRTRIM_CP(*ki, *ki32, ki_tracep); PTRTRIM_CP(*ki, *ki32, ki_textvp); PTRTRIM_CP(*ki, *ki32, ki_fd); PTRTRIM_CP(*ki, *ki32, ki_vmspace); PTRTRIM_CP(*ki, *ki32, ki_wchan); CP(*ki, *ki32, ki_pid); CP(*ki, *ki32, ki_ppid); CP(*ki, *ki32, ki_pgid); CP(*ki, *ki32, ki_tpgid); CP(*ki, *ki32, ki_sid); CP(*ki, *ki32, ki_tsid); CP(*ki, *ki32, ki_jobc); CP(*ki, *ki32, ki_tdev); CP(*ki, *ki32, ki_siglist); CP(*ki, *ki32, ki_sigmask); CP(*ki, *ki32, ki_sigignore); CP(*ki, *ki32, ki_sigcatch); CP(*ki, *ki32, ki_uid); CP(*ki, *ki32, ki_ruid); CP(*ki, *ki32, ki_svuid); CP(*ki, *ki32, ki_rgid); CP(*ki, *ki32, ki_svgid); CP(*ki, *ki32, ki_ngroups); for (i = 0; i < KI_NGROUPS; i++) CP(*ki, *ki32, ki_groups[i]); CP(*ki, *ki32, ki_size); CP(*ki, *ki32, ki_rssize); CP(*ki, *ki32, ki_swrss); CP(*ki, *ki32, ki_tsize); CP(*ki, *ki32, ki_dsize); CP(*ki, *ki32, ki_ssize); CP(*ki, *ki32, ki_xstat); CP(*ki, *ki32, ki_acflag); CP(*ki, *ki32, ki_pctcpu); CP(*ki, *ki32, ki_estcpu); CP(*ki, *ki32, ki_slptime); CP(*ki, *ki32, ki_swtime); CP(*ki, *ki32, ki_cow); CP(*ki, *ki32, ki_runtime); TV_CP(*ki, *ki32, ki_start); TV_CP(*ki, *ki32, ki_childtime); CP(*ki, *ki32, ki_flag); CP(*ki, *ki32, ki_kiflag); CP(*ki, *ki32, ki_traceflag); CP(*ki, *ki32, ki_stat); CP(*ki, *ki32, ki_nice); CP(*ki, *ki32, ki_lock); CP(*ki, *ki32, ki_rqindex); CP(*ki, *ki32, ki_oncpu); CP(*ki, *ki32, ki_lastcpu); + + /* XXX TODO: wrap cpu value as appropriate */ + CP(*ki, *ki32, ki_oncpu_old); + CP(*ki, *ki32, ki_lastcpu_old); + bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1); bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1); bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1); bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1); bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1); bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1); bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1); CP(*ki, *ki32, ki_tracer); CP(*ki, *ki32, ki_flag2); CP(*ki, *ki32, ki_fibnum); CP(*ki, *ki32, ki_cr_flags); CP(*ki, *ki32, ki_jid); CP(*ki, *ki32, ki_numthreads); CP(*ki, *ki32, ki_tid); CP(*ki, *ki32, ki_pri); freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage); freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch); PTRTRIM_CP(*ki, *ki32, ki_pcb); PTRTRIM_CP(*ki, *ki32, ki_kstack); PTRTRIM_CP(*ki, *ki32, ki_udata); CP(*ki, *ki32, ki_sflag); CP(*ki, *ki32, ki_tdflags); } #endif int kern_proc_out(struct proc *p, struct sbuf *sb, int flags) { struct thread *td; struct kinfo_proc ki; #ifdef COMPAT_FREEBSD32 struct kinfo_proc32 ki32; #endif int error; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(FIRST_THREAD_IN_PROC(p) != NULL); error = 0; fill_kinfo_proc(p, &ki); if ((flags & KERN_PROC_NOTHREADS) != 0) { #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { freebsd32_kinfo_proc_out(&ki, &ki32); if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0) error = ENOMEM; } else #endif if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0) error = ENOMEM; } else { FOREACH_THREAD_IN_PROC(p, td) { fill_kinfo_thread(td, &ki, 1); #ifdef COMPAT_FREEBSD32 if ((flags & KERN_PROC_MASK32) != 0) { freebsd32_kinfo_proc_out(&ki, &ki32); if (sbuf_bcat(sb, &ki32, sizeof(ki32)) != 0) error = ENOMEM; } else #endif if (sbuf_bcat(sb, &ki, sizeof(ki)) != 0) error = ENOMEM; if (error != 0) break; } } PROC_UNLOCK(p); return (error); } static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags, int doingzomb) { struct sbuf sb; struct kinfo_proc ki; struct proc *np; int error, error2; pid_t pid; pid = p->p_pid; sbuf_new_for_sysctl(&sb, (char *)&ki, sizeof(ki), req); error = kern_proc_out(p, &sb, flags); error2 = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0) return (error); else if (error2 != 0) return (error2); if (doingzomb) np = zpfind(pid); else { if (pid == 0) return (0); np = pfind(pid); } if (np == NULL) return (ESRCH); if (np != p) { PROC_UNLOCK(np); return (ESRCH); } PROC_UNLOCK(np); return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int flags, doingzomb, oid_number; int error = 0; oid_number = oidp->oid_number; if (oid_number != KERN_PROC_ALL && (oid_number & KERN_PROC_INC_THREAD) == 0) flags = KERN_PROC_NOTHREADS; else { flags = 0; oid_number &= ~KERN_PROC_INC_THREAD; } #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) flags |= KERN_PROC_MASK32; #endif if (oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); sx_slock(&proctree_lock); error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error == 0) error = sysctl_out_proc(p, req, flags, 0); sx_sunlock(&proctree_lock); return (error); } switch (oid_number) { case KERN_PROC_ALL: if (namelen != 0) return (EINVAL); break; case KERN_PROC_PROC: if (namelen != 0 && namelen != 1) return (EINVAL); break; default: if (namelen != 1) return (EINVAL); break; } if (!req->oldptr) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sx_slock(&proctree_lock); sx_slock(&allproc_lock); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) p = LIST_FIRST(&allproc); else p = LIST_FIRST(&zombproc); for (; p != 0; p = LIST_NEXT(p, p_list)) { /* * Skip embryonic processes. */ PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } KASSERT(p->p_ucred != NULL, ("process credential is NULL for non-NEW proc")); /* * Show a user only appropriate processes. */ if (p_cansee(curthread, p)) { PROC_UNLOCK(p); continue; } /* * TODO - make more efficient (see notes below). * do by session. */ switch (oid_number) { case KERN_PROC_GID: if (p->p_ucred->cr_gid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RGID: if (p->p_ucred->cr_rgid != (gid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_SESSION: if (p->p_session == NULL || p->p_session->s_sid != (pid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL) { PROC_UNLOCK(p); continue; } /* XXX proctree_lock */ SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || tty_udev(p->p_session->s_ttyp) != (dev_t)name[0]) { SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); continue; } SESS_UNLOCK(p->p_session); break; case KERN_PROC_UID: if (p->p_ucred->cr_uid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_RUID: if (p->p_ucred->cr_ruid != (uid_t)name[0]) { PROC_UNLOCK(p); continue; } break; case KERN_PROC_PROC: break; default: break; } error = sysctl_out_proc(p, req, flags, doingzomb); if (error) { sx_sunlock(&allproc_lock); sx_sunlock(&proctree_lock); return (error); } } } sx_sunlock(&allproc_lock); sx_sunlock(&proctree_lock); return (0); } struct pargs * pargs_alloc(int len) { struct pargs *pa; pa = malloc(sizeof(struct pargs) + len, M_PARGS, M_WAITOK); refcount_init(&pa->ar_ref, 1); pa->ar_length = len; return (pa); } static void pargs_free(struct pargs *pa) { free(pa, M_PARGS); } void pargs_hold(struct pargs *pa) { if (pa == NULL) return; refcount_acquire(&pa->ar_ref); } void pargs_drop(struct pargs *pa) { if (pa == NULL) return; if (refcount_release(&pa->ar_ref)) pargs_free(pa); } static int proc_read_mem(struct thread *td, struct proc *p, vm_offset_t offset, void* buf, size_t len) { struct iovec iov; struct uio uio; iov.iov_base = (caddr_t)buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = (ssize_t)len; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; return (proc_rwmem(p, &uio)); } static int proc_read_string(struct thread *td, struct proc *p, const char *sptr, char *buf, size_t len) { size_t i; int error; error = proc_read_mem(td, p, (vm_offset_t)sptr, buf, len); /* * Reading the chunk may validly return EFAULT if the string is shorter * than the chunk and is aligned at the end of the page, assuming the * next page is not mapped. So if EFAULT is returned do a fallback to * one byte read loop. */ if (error == EFAULT) { for (i = 0; i < len; i++, buf++, sptr++) { error = proc_read_mem(td, p, (vm_offset_t)sptr, buf, 1); if (error != 0) return (error); if (*buf == '\0') break; } error = 0; } return (error); } #define PROC_AUXV_MAX 256 /* Safety limit on auxv size. */ enum proc_vector_type { PROC_ARG, PROC_ENV, PROC_AUX, }; #ifdef COMPAT_FREEBSD32 static int get_proc_vector32(struct thread *td, struct proc *p, char ***proc_vectorp, size_t *vsizep, enum proc_vector_type type) { struct freebsd32_ps_strings pss; Elf32_Auxinfo aux; vm_offset_t vptr, ptr; uint32_t *proc_vector32; char **proc_vector; size_t vsize, size; int i, error; error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings), &pss, sizeof(pss)); if (error != 0) return (error); switch (type) { case PROC_ARG: vptr = (vm_offset_t)PTRIN(pss.ps_argvstr); vsize = pss.ps_nargvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(int32_t); break; case PROC_ENV: vptr = (vm_offset_t)PTRIN(pss.ps_envstr); vsize = pss.ps_nenvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(int32_t); break; case PROC_AUX: vptr = (vm_offset_t)PTRIN(pss.ps_envstr) + (pss.ps_nenvstr + 1) * sizeof(int32_t); if (vptr % 4 != 0) return (ENOEXEC); for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) { error = proc_read_mem(td, p, ptr, &aux, sizeof(aux)); if (error != 0) return (error); if (aux.a_type == AT_NULL) break; ptr += sizeof(aux); } if (aux.a_type != AT_NULL) return (ENOEXEC); vsize = i + 1; size = vsize * sizeof(aux); break; default: KASSERT(0, ("Wrong proc vector type: %d", type)); return (EINVAL); } proc_vector32 = malloc(size, M_TEMP, M_WAITOK); error = proc_read_mem(td, p, vptr, proc_vector32, size); if (error != 0) goto done; if (type == PROC_AUX) { *proc_vectorp = (char **)proc_vector32; *vsizep = vsize; return (0); } proc_vector = malloc(vsize * sizeof(char *), M_TEMP, M_WAITOK); for (i = 0; i < (int)vsize; i++) proc_vector[i] = PTRIN(proc_vector32[i]); *proc_vectorp = proc_vector; *vsizep = vsize; done: free(proc_vector32, M_TEMP); return (error); } #endif static int get_proc_vector(struct thread *td, struct proc *p, char ***proc_vectorp, size_t *vsizep, enum proc_vector_type type) { struct ps_strings pss; Elf_Auxinfo aux; vm_offset_t vptr, ptr; char **proc_vector; size_t vsize, size; int error, i; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32) != 0) return (get_proc_vector32(td, p, proc_vectorp, vsizep, type)); #endif error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings), &pss, sizeof(pss)); if (error != 0) return (error); switch (type) { case PROC_ARG: vptr = (vm_offset_t)pss.ps_argvstr; vsize = pss.ps_nargvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(char *); break; case PROC_ENV: vptr = (vm_offset_t)pss.ps_envstr; vsize = pss.ps_nenvstr; if (vsize > ARG_MAX) return (ENOEXEC); size = vsize * sizeof(char *); break; case PROC_AUX: /* * The aux array is just above env array on the stack. Check * that the address is naturally aligned. */ vptr = (vm_offset_t)pss.ps_envstr + (pss.ps_nenvstr + 1) * sizeof(char *); #if __ELF_WORD_SIZE == 64 if (vptr % sizeof(uint64_t) != 0) #else if (vptr % sizeof(uint32_t) != 0) #endif return (ENOEXEC); /* * We count the array size reading the aux vectors from the * stack until AT_NULL vector is returned. So (to keep the code * simple) we read the process stack twice: the first time here * to find the size and the second time when copying the vectors * to the allocated proc_vector. */ for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) { error = proc_read_mem(td, p, ptr, &aux, sizeof(aux)); if (error != 0) return (error); if (aux.a_type == AT_NULL) break; ptr += sizeof(aux); } /* * If the PROC_AUXV_MAX entries are iterated over, and we have * not reached AT_NULL, it is most likely we are reading wrong * data: either the process doesn't have auxv array or data has * been modified. Return the error in this case. */ if (aux.a_type != AT_NULL) return (ENOEXEC); vsize = i + 1; size = vsize * sizeof(aux); break; default: KASSERT(0, ("Wrong proc vector type: %d", type)); return (EINVAL); /* In case we are built without INVARIANTS. */ } proc_vector = malloc(size, M_TEMP, M_WAITOK); if (proc_vector == NULL) return (ENOMEM); error = proc_read_mem(td, p, vptr, proc_vector, size); if (error != 0) { free(proc_vector, M_TEMP); return (error); } *proc_vectorp = proc_vector; *vsizep = vsize; return (0); } #define GET_PS_STRINGS_CHUNK_SZ 256 /* Chunk size (bytes) for ps_strings operations. */ static int get_ps_strings(struct thread *td, struct proc *p, struct sbuf *sb, enum proc_vector_type type) { size_t done, len, nchr, vsize; int error, i; char **proc_vector, *sptr; char pss_string[GET_PS_STRINGS_CHUNK_SZ]; PROC_ASSERT_HELD(p); /* * We are not going to read more than 2 * (PATH_MAX + ARG_MAX) bytes. */ nchr = 2 * (PATH_MAX + ARG_MAX); error = get_proc_vector(td, p, &proc_vector, &vsize, type); if (error != 0) return (error); for (done = 0, i = 0; i < (int)vsize && done < nchr; i++) { /* * The program may have scribbled into its argv array, e.g. to * remove some arguments. If that has happened, break out * before trying to read from NULL. */ if (proc_vector[i] == NULL) break; for (sptr = proc_vector[i]; ; sptr += GET_PS_STRINGS_CHUNK_SZ) { error = proc_read_string(td, p, sptr, pss_string, sizeof(pss_string)); if (error != 0) goto done; len = strnlen(pss_string, GET_PS_STRINGS_CHUNK_SZ); if (done + len >= nchr) len = nchr - done - 1; sbuf_bcat(sb, pss_string, len); if (len != GET_PS_STRINGS_CHUNK_SZ) break; done += GET_PS_STRINGS_CHUNK_SZ; } sbuf_bcat(sb, "", 1); done += len + 1; } done: free(proc_vector, M_TEMP); return (error); } int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb) { return (get_ps_strings(curthread, p, sb, PROC_ARG)); } int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb) { return (get_ps_strings(curthread, p, sb, PROC_ENV)); } int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb) { size_t vsize, size; char **auxv; int error; error = get_proc_vector(td, p, &auxv, &vsize, PROC_AUX); if (error == 0) { #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32) != 0) size = vsize * sizeof(Elf32_Auxinfo); else #endif size = vsize * sizeof(Elf_Auxinfo); if (sbuf_bcat(sb, auxv, size) != 0) error = ENOMEM; free(auxv, M_TEMP); } return (error); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. */ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct pargs *newpa, *pa; struct proc *p; struct sbuf sb; int flags, error = 0, error2; if (namelen != 1) return (EINVAL); flags = PGET_CANSEE; if (req->newptr != NULL) flags |= PGET_ISCURRENT; error = pget((pid_t)name[0], flags, &p); if (error) return (error); pa = p->p_args; if (pa != NULL) { pargs_hold(pa); PROC_UNLOCK(p); error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length); pargs_drop(pa); } else if ((p->p_flag & (P_WEXIT | P_SYSTEM)) == 0) { _PHOLD(p); PROC_UNLOCK(p); sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); error = proc_getargv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); if (error == 0 && error2 != 0) error = error2; } else { PROC_UNLOCK(p); } if (error != 0 || req->newptr == NULL) return (error); if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) return (ENOMEM); newpa = pargs_alloc(req->newlen); error = SYSCTL_IN(req, newpa->ar_args, req->newlen); if (error != 0) { pargs_free(newpa); return (error); } PROC_LOCK(p); pa = p->p_args; p->p_args = newpa; PROC_UNLOCK(p); pargs_drop(pa); return (0); } /* * This sysctl allows a process to retrieve environment of another process. */ static int sysctl_kern_proc_env(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct sbuf sb; int error, error2; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); if ((p->p_flag & P_SYSTEM) != 0) { PRELE(p); return (0); } sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); error = proc_getenvv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); return (error != 0 ? error : error2); } /* * This sysctl allows a process to retrieve ELF auxiliary vector of * another process. */ static int sysctl_kern_proc_auxv(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct sbuf sb; int error, error2; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); if ((p->p_flag & P_SYSTEM) != 0) { PRELE(p); return (0); } sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req); error = proc_getauxv(curthread, p, &sb); error2 = sbuf_finish(&sb); PRELE(p); sbuf_delete(&sb); return (error != 0 ? error : error2); } /* * This sysctl allows a process to retrieve the path of the executable for * itself or another process. */ static int sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct vnode *vp; char *retbuf, *freebuf; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; } else { error = pget(*pidp, PGET_CANSEE, &p); if (error != 0) return (error); } vp = p->p_textvp; if (vp == NULL) { if (*pidp != -1) PROC_UNLOCK(p); return (0); } vref(vp); if (*pidp != -1) PROC_UNLOCK(p); error = vn_fullpath(req->td, vp, &retbuf, &freebuf); vrele(vp); if (error) return (error); error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1); free(freebuf, M_TEMP); return (error); } static int sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS) { struct proc *p; char *sv_name; int *name; int namelen; int error; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error != 0) return (error); sv_name = p->p_sysent->sv_name; PROC_UNLOCK(p); return (sysctl_handle_string(oidp, sv_name, 0, req)); } #ifdef KINFO_OVMENTRY_SIZE CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE); #endif #ifdef COMPAT_FREEBSD7 static int sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS) { vm_map_entry_t entry, tmp_entry; unsigned int last_timestamp; char *fullpath, *freepath; struct kinfo_ovmentry *kve; struct vattr va; struct ucred *cred; int error, *name; struct vnode *vp; struct proc *p; vm_map_t map; struct vmspace *vm; name = (int *)arg1; error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); vm = vmspace_acquire_ref(p); if (vm == NULL) { PRELE(p); return (ESRCH); } kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK); map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { vm_object_t obj, tobj, lobj; vm_offset_t addr; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; bzero(kve, sizeof(*kve)); kve->kve_structsize = sizeof(*kve); kve->kve_private_resident = 0; obj = entry->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); if (obj->shadow_count == 1) kve->kve_private_resident = obj->resident_page_count; } kve->kve_resident = 0; addr = entry->start; while (addr < entry->end) { if (pmap_extract(map->pmap, addr)) kve->kve_resident++; addr += PAGE_SIZE; } for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; } kve->kve_start = (void*)entry->start; kve->kve_end = (void*)entry->end; kve->kve_offset = (off_t)entry->offset; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; if (entry->eflags & MAP_ENTRY_NOCOREDUMP) kve->kve_flags |= KVME_FLAG_NOCOREDUMP; last_timestamp = map->timestamp; vm_map_unlock_read(map); kve->kve_fileid = 0; kve->kve_fsid = 0; freepath = NULL; fullpath = ""; if (lobj) { vp = NULL; switch (lobj->type) { case OBJT_DEFAULT: kve->kve_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kve->kve_type = KVME_TYPE_VNODE; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: kve->kve_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kve->kve_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kve->kve_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kve->kve_type = KVME_TYPE_DEAD; break; case OBJT_SG: kve->kve_type = KVME_TYPE_SG; break; default: kve->kve_type = KVME_TYPE_UNKNOWN; break; } if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); cred = curthread->td_ucred; vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, cred) == 0) { kve->kve_fileid = va.va_fileid; kve->kve_fsid = va.va_fsid; } vput(vp); } } else { kve->kve_type = KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); error = SYSCTL_OUT(req, kve, sizeof(*kve)); vm_map_lock_read(map); if (error) break; if (last_timestamp != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); PRELE(p); free(kve, M_TEMP); return (error); } #endif /* COMPAT_FREEBSD7 */ #ifdef KINFO_VMENTRY_SIZE CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE); #endif static void kern_proc_vmmap_resident(vm_map_t map, vm_map_entry_t entry, struct kinfo_vmentry *kve) { vm_object_t obj, tobj; vm_page_t m, m_adv; vm_offset_t addr; vm_paddr_t locked_pa; vm_pindex_t pi, pi_adv, pindex; locked_pa = 0; obj = entry->object.vm_object; addr = entry->start; m_adv = NULL; pi = OFF_TO_IDX(entry->offset); for (; addr < entry->end; addr += IDX_TO_OFF(pi_adv), pi += pi_adv) { if (m_adv != NULL) { m = m_adv; } else { pi_adv = OFF_TO_IDX(entry->end - addr); pindex = pi; for (tobj = obj;; tobj = tobj->backing_object) { m = vm_page_find_least(tobj, pindex); if (m != NULL) { if (m->pindex == pindex) break; if (pi_adv > m->pindex - pindex) { pi_adv = m->pindex - pindex; m_adv = m; } } if (tobj->backing_object == NULL) goto next; pindex += OFF_TO_IDX(tobj-> backing_object_offset); } } m_adv = NULL; if (m->psind != 0 && addr + pagesizes[1] <= entry->end && (addr & (pagesizes[1] - 1)) == 0 && (pmap_mincore(map->pmap, addr, &locked_pa) & MINCORE_SUPER) != 0) { kve->kve_flags |= KVME_FLAG_SUPER; pi_adv = OFF_TO_IDX(pagesizes[1]); } else { /* * We do not test the found page on validity. * Either the page is busy and being paged in, * or it was invalidated. The first case * should be counted as resident, the second * is not so clear; we do account both. */ pi_adv = 1; } kve->kve_resident += pi_adv; next:; } PA_UNLOCK_COND(locked_pa); } /* * Must be called with the process locked and will return unlocked. */ int kern_proc_vmmap_out(struct proc *p, struct sbuf *sb) { vm_map_entry_t entry, tmp_entry; struct vattr va; vm_map_t map; vm_object_t obj, tobj, lobj; char *fullpath, *freepath; struct kinfo_vmentry *kve; struct ucred *cred; struct vnode *vp; struct vmspace *vm; vm_offset_t addr; unsigned int last_timestamp; int error; PROC_LOCK_ASSERT(p, MA_OWNED); _PHOLD(p); PROC_UNLOCK(p); vm = vmspace_acquire_ref(p); if (vm == NULL) { PRELE(p); return (ESRCH); } kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK); error = 0; map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; addr = entry->end; bzero(kve, sizeof(*kve)); obj = entry->object.vm_object; if (obj != NULL) { for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { VM_OBJECT_RLOCK(tobj); lobj = tobj; } if (obj->backing_object == NULL) kve->kve_private_resident = obj->resident_page_count; if (!vmmap_skip_res_cnt) kern_proc_vmmap_resident(map, entry, kve); for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj && tobj != lobj) VM_OBJECT_RUNLOCK(tobj); } } else { lobj = NULL; } kve->kve_start = entry->start; kve->kve_end = entry->end; kve->kve_offset = entry->offset; if (entry->protection & VM_PROT_READ) kve->kve_protection |= KVME_PROT_READ; if (entry->protection & VM_PROT_WRITE) kve->kve_protection |= KVME_PROT_WRITE; if (entry->protection & VM_PROT_EXECUTE) kve->kve_protection |= KVME_PROT_EXEC; if (entry->eflags & MAP_ENTRY_COW) kve->kve_flags |= KVME_FLAG_COW; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) kve->kve_flags |= KVME_FLAG_NEEDS_COPY; if (entry->eflags & MAP_ENTRY_NOCOREDUMP) kve->kve_flags |= KVME_FLAG_NOCOREDUMP; if (entry->eflags & MAP_ENTRY_GROWS_UP) kve->kve_flags |= KVME_FLAG_GROWS_UP; if (entry->eflags & MAP_ENTRY_GROWS_DOWN) kve->kve_flags |= KVME_FLAG_GROWS_DOWN; last_timestamp = map->timestamp; vm_map_unlock_read(map); freepath = NULL; fullpath = ""; if (lobj != NULL) { vp = NULL; switch (lobj->type) { case OBJT_DEFAULT: kve->kve_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kve->kve_type = KVME_TYPE_VNODE; vp = lobj->handle; vref(vp); break; case OBJT_SWAP: kve->kve_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kve->kve_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kve->kve_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kve->kve_type = KVME_TYPE_DEAD; break; case OBJT_SG: kve->kve_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kve->kve_type = KVME_TYPE_MGTDEVICE; break; default: kve->kve_type = KVME_TYPE_UNKNOWN; break; } if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); kve->kve_ref_count = obj->ref_count; kve->kve_shadow_count = obj->shadow_count; VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); kve->kve_vn_type = vntype_to_kinfo(vp->v_type); cred = curthread->td_ucred; vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, cred) == 0) { kve->kve_vn_fileid = va.va_fileid; kve->kve_vn_fsid = va.va_fsid; kve->kve_vn_mode = MAKEIMODE(va.va_type, va.va_mode); kve->kve_vn_size = va.va_size; kve->kve_vn_rdev = va.va_rdev; kve->kve_status = KF_ATTR_VALID; } vput(vp); } } else { kve->kve_type = KVME_TYPE_NONE; kve->kve_ref_count = 0; kve->kve_shadow_count = 0; } strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) + strlen(kve->kve_path) + 1; kve->kve_structsize = roundup(kve->kve_structsize, sizeof(uint64_t)); if (sbuf_bcat(sb, kve, kve->kve_structsize) != 0) error = ENOMEM; vm_map_lock_read(map); if (error != 0) break; if (last_timestamp != map->timestamp) { vm_map_lookup_entry(map, addr - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); vmspace_free(vm); PRELE(p); free(kve, M_TEMP); return (error); } static int sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS) { struct proc *p; struct sbuf sb; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } error = kern_proc_vmmap_out(p, &sb); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #if defined(STACK) || defined(DDB) static int sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS) { struct kinfo_kstack *kkstp; int error, i, *name, numthreads; lwpid_t *lwpidarray; struct thread *td; struct stack *st; struct sbuf sb; struct proc *p; name = (int *)arg1; error = pget((pid_t)name[0], PGET_NOTINEXEC | PGET_WANTREAD, &p); if (error != 0) return (error); kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK); st = stack_create(); lwpidarray = NULL; numthreads = 0; PROC_LOCK(p); repeat: if (numthreads < p->p_numthreads) { if (lwpidarray != NULL) { free(lwpidarray, M_TEMP); lwpidarray = NULL; } numthreads = p->p_numthreads; PROC_UNLOCK(p); lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP, M_WAITOK | M_ZERO); PROC_LOCK(p); goto repeat; } i = 0; /* * XXXRW: During the below loop, execve(2) and countless other sorts * of changes could have taken place. Should we check to see if the * vmspace has been replaced, or the like, in order to prevent * giving a snapshot that spans, say, execve(2), with some threads * before and some after? Among other things, the credentials could * have changed, in which case the right to extract debug info might * no longer be assured. */ FOREACH_THREAD_IN_PROC(p, td) { KASSERT(i < numthreads, ("sysctl_kern_proc_kstack: numthreads")); lwpidarray[i] = td->td_tid; i++; } numthreads = i; for (i = 0; i < numthreads; i++) { td = thread_find(p, lwpidarray[i]); if (td == NULL) { continue; } bzero(kkstp, sizeof(*kkstp)); (void)sbuf_new(&sb, kkstp->kkst_trace, sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN); thread_lock(td); kkstp->kkst_tid = td->td_tid; if (TD_IS_SWAPPED(td)) kkstp->kkst_state = KKST_STATE_SWAPPED; else if (TD_IS_RUNNING(td)) kkstp->kkst_state = KKST_STATE_RUNNING; else { kkstp->kkst_state = KKST_STATE_STACKOK; stack_save_td(st, td); } thread_unlock(td); PROC_UNLOCK(p); stack_sbuf_print(&sb, st); sbuf_finish(&sb); sbuf_delete(&sb); error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp)); PROC_LOCK(p); if (error) break; } _PRELE(p); PROC_UNLOCK(p); if (lwpidarray != NULL) free(lwpidarray, M_TEMP); stack_destroy(st); free(kkstp, M_TEMP); return (error); } #endif /* * This sysctl allows a process to retrieve the full list of groups from * itself or another process. */ static int sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS) { pid_t *pidp = (pid_t *)arg1; unsigned int arglen = arg2; struct proc *p; struct ucred *cred; int error; if (arglen != 1) return (EINVAL); if (*pidp == -1) { /* -1 means this process */ p = req->td->td_proc; PROC_LOCK(p); } else { error = pget(*pidp, PGET_CANSEE, &p); if (error != 0) return (error); } cred = crhold(p->p_ucred); PROC_UNLOCK(p); error = SYSCTL_OUT(req, cred->cr_groups, cred->cr_ngroups * sizeof(gid_t)); crfree(cred); return (error); } /* * This sysctl allows a process to retrieve or/and set the resource limit for * another process. */ static int sysctl_kern_proc_rlimit(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct rlimit rlim; struct proc *p; u_int which; int flags, error; if (namelen != 2) return (EINVAL); which = (u_int)name[1]; if (which >= RLIM_NLIMITS) return (EINVAL); if (req->newptr != NULL && req->newlen != sizeof(rlim)) return (EINVAL); flags = PGET_HOLD | PGET_NOTWEXIT; if (req->newptr != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget((pid_t)name[0], flags, &p); if (error != 0) return (error); /* * Retrieve limit. */ if (req->oldptr != NULL) { PROC_LOCK(p); lim_rlimit(p, which, &rlim); PROC_UNLOCK(p); } error = SYSCTL_OUT(req, &rlim, sizeof(rlim)); if (error != 0) goto errout; /* * Set limit. */ if (req->newptr != NULL) { error = SYSCTL_IN(req, &rlim, sizeof(rlim)); if (error == 0) error = kern_proc_setrlimit(curthread, p, which, &rlim); } errout: PRELE(p); return (error); } /* * This sysctl allows a process to retrieve ps_strings structure location of * another process. */ static int sysctl_kern_proc_ps_strings(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; vm_offset_t ps_strings; int error; #ifdef COMPAT_FREEBSD32 uint32_t ps_strings32; #endif if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if ((req->flags & SCTL_MASK32) != 0) { /* * We return 0 if the 32 bit emulation request is for a 64 bit * process. */ ps_strings32 = SV_PROC_FLAG(p, SV_ILP32) != 0 ? PTROUT(p->p_sysent->sv_psstrings) : 0; PROC_UNLOCK(p); error = SYSCTL_OUT(req, &ps_strings32, sizeof(ps_strings32)); return (error); } #endif ps_strings = p->p_sysent->sv_psstrings; PROC_UNLOCK(p); error = SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings)); return (error); } /* * This sysctl allows a process to retrieve umask of another process. */ static int sysctl_kern_proc_umask(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int error; u_short fd_cmask; if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_WANTREAD, &p); if (error != 0) return (error); FILEDESC_SLOCK(p->p_fd); fd_cmask = p->p_fd->fd_cmask; FILEDESC_SUNLOCK(p->p_fd); PRELE(p); error = SYSCTL_OUT(req, &fd_cmask, sizeof(fd_cmask)); return (error); } /* * This sysctl allows a process to set and retrieve binary osreldate of * another process. */ static int sysctl_kern_proc_osrel(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; int flags, error, osrel; if (namelen != 1) return (EINVAL); if (req->newptr != NULL && req->newlen != sizeof(osrel)) return (EINVAL); flags = PGET_HOLD | PGET_NOTWEXIT; if (req->newptr != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget((pid_t)name[0], flags, &p); if (error != 0) return (error); error = SYSCTL_OUT(req, &p->p_osrel, sizeof(p->p_osrel)); if (error != 0) goto errout; if (req->newptr != NULL) { error = SYSCTL_IN(req, &osrel, sizeof(osrel)); if (error != 0) goto errout; if (osrel < 0) { error = EINVAL; goto errout; } p->p_osrel = osrel; } errout: PRELE(p); return (error); } static int sysctl_kern_proc_sigtramp(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct proc *p; struct kinfo_sigtramp kst; const struct sysentvec *sv; int error; #ifdef COMPAT_FREEBSD32 struct kinfo_sigtramp32 kst32; #endif if (namelen != 1) return (EINVAL); error = pget((pid_t)name[0], PGET_CANDEBUG, &p); if (error != 0) return (error); sv = p->p_sysent; #ifdef COMPAT_FREEBSD32 if ((req->flags & SCTL_MASK32) != 0) { bzero(&kst32, sizeof(kst32)); if (SV_PROC_FLAG(p, SV_ILP32)) { if (sv->sv_sigcode_base != 0) { kst32.ksigtramp_start = sv->sv_sigcode_base; kst32.ksigtramp_end = sv->sv_sigcode_base + *sv->sv_szsigcode; } else { kst32.ksigtramp_start = sv->sv_psstrings - *sv->sv_szsigcode; kst32.ksigtramp_end = sv->sv_psstrings; } } PROC_UNLOCK(p); error = SYSCTL_OUT(req, &kst32, sizeof(kst32)); return (error); } #endif bzero(&kst, sizeof(kst)); if (sv->sv_sigcode_base != 0) { kst.ksigtramp_start = (char *)sv->sv_sigcode_base; kst.ksigtramp_end = (char *)sv->sv_sigcode_base + *sv->sv_szsigcode; } else { kst.ksigtramp_start = (char *)sv->sv_psstrings - *sv->sv_szsigcode; kst.ksigtramp_end = (char *)sv->sv_psstrings; } PROC_UNLOCK(p); error = SYSCTL_OUT(req, &kst, sizeof(kst)); return (error); } SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT| CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Return process table, no threads"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_args, "Process argument list"); static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_env, "Process environment"); static SYSCTL_NODE(_kern_proc, KERN_PROC_AUXV, auxv, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_auxv, "Process ELF auxiliary vector"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD), sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table"); static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Return process table, no threads"); #ifdef COMPAT_FREEBSD7 static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries"); #endif static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries"); #if defined(STACK) || defined(DDB) static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks"); #endif static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups"); static SYSCTL_NODE(_kern_proc, KERN_PROC_RLIMIT, rlimit, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_rlimit, "Process resource limits"); static SYSCTL_NODE(_kern_proc, KERN_PROC_PS_STRINGS, ps_strings, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_ps_strings, "Process ps_strings location"); static SYSCTL_NODE(_kern_proc, KERN_PROC_UMASK, umask, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_umask, "Process umask"); static SYSCTL_NODE(_kern_proc, KERN_PROC_OSREL, osrel, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_osrel, "Process binary osreldate"); static SYSCTL_NODE(_kern_proc, KERN_PROC_SIGTRAMP, sigtramp, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc_sigtramp, "Process signal trampoline location"); Index: head/sys/kern/sched_ule.c =================================================================== --- head/sys/kern/sched_ule.c (revision 273265) +++ head/sys/kern/sched_ule.c (revision 273266) @@ -1,2893 +1,2893 @@ /*- * Copyright (c) 2002-2007, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements the ULE scheduler. ULE supports independent CPU * run queues and fine grain locking. It has superior interactive * performance under load even on uni-processor systems. * * etymology: * ULE is the last three letters in schedule. It owes its name to a * generic user created for a scheduling system by Paul Mikesell at * Isilon Systems and a general lack of creativity on the part of the author. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif #include #include #define KTR_ULE 0 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) /* * Thread scheduler specific section. All fields are protected * by the thread lock. */ struct td_sched { struct runq *ts_runq; /* Run-queue we're queued on. */ short ts_flags; /* TSF_* flags. */ - u_char ts_cpu; /* CPU that we have affinity for. */ + int ts_cpu; /* CPU that we have affinity for. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ static struct td_sched td_sched0; #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) /* * Priority ranges used for interactive and non-interactive timeshare * threads. The timeshare priorities are split up into four ranges. * The first range handles interactive threads. The last three ranges * (NHALF, x, and NHALF) handle non-interactive threads with the outer * ranges supporting nice values. */ #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE /* * Cpu percentage computation macros and defines. * * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. * SCHED_TICK_MAX: Maximum number of ticks before scaling back. * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. */ #define SCHED_TICK_SECS 10 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) #define SCHED_TICK_SHIFT 10 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed * by the ratio of ticks to the tick total. NHALF priorities at the start * and end of the MIN to MAX timeshare range are only reachable with negative * or positive nice respectively. * * PRI_RANGE: Priority range for utilization dependent priorities. * PRI_NRESV: Number of nice values. * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. * PRI_NICE: Determines the part of the priority inherited from nice. */ #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) #define SCHED_PRI_TICKS(ts) \ (SCHED_TICK_HZ((ts)) / \ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran * while cpu utilization includes all time not running. This more accurately * models the intent of the thread. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters determine the slice behavior for batch work. */ #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* * tickincr: Converts a stathz tick into a hz domain scaled by * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static int sched_interact = SCHED_INTERACT_THRESH; static int tickincr = 8 << SCHED_TICK_SHIFT; static int realstathz = 127; /* reset during boot. */ static int sched_slice = 10; /* reset during boot. */ static int sched_slice_min = 1; /* reset during boot. */ #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int preempt_thresh = PRI_MAX_IDLE; #else static int preempt_thresh = PRI_MIN_KERN; #endif #else static int preempt_thresh = 0; #endif static int static_boost = PRI_MIN_BATCH; static int sched_idlespins = 10000; static int sched_idlespinthresh = -1; /* * tdq - per processor runqs and statistics. All fields are protected by the * tdq_lock. The load and lowpri may be accessed without to avoid excess * locking in sched_pickcpu(); */ struct tdq { /* * Ordered to improve efficiency of cpu_search() and switch(). * tdq_lock is padded to avoid false sharing with tdq_load and * tdq_cpu_idle. */ struct mtx_padalign tdq_lock; /* run queue lock. */ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ volatile int tdq_load; /* Aggregate load. */ volatile int tdq_cpu_idle; /* cpu_idle() is active. */ int tdq_sysload; /* For loadavg, !ITHD load. */ int tdq_transferable; /* Transferable thread count. */ short tdq_switchcnt; /* Switches this tick. */ short tdq_oldswitchcnt; /* Switches last tick. */ u_char tdq_lowpri; /* Lowest priority thread. */ u_char tdq_ipipending; /* IPI pending. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; #endif } __aligned(64); /* Idle thread states and config. */ #define TDQ_RUNNING 1 #define TDQ_IDLE 2 #ifdef SMP struct cpu_group *cpu_top; /* CPU topology */ #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) #define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ static int affinity; static int steal_idle = 1; static int steal_thresh = 2; /* * One thread queue per processor. */ static struct tdq tdq_cpu[MAXCPU]; static struct tdq *balance_tdq; static int balance_ticks; static DPCPU_DEFINE(uint32_t, randomval); #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) #define TDQ_ID(x) ((int)((x) - tdq_cpu)) #else /* !SMP */ static struct tdq tdq_cpu; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) #define TDQ_CPU(x) (&tdq_cpu) #endif #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ static struct thread *tdq_choose(struct tdq *); static void tdq_setup(struct tdq *); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, struct thread *); static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *, int); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline void thread_unblock_switch(struct thread *, struct mtx *); static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent); #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); /* * Print the threads waiting on a run-queue. */ static void runq_print(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; int j; int i; for (i = 0; i < RQB_LEN; i++) { printf("\t\trunq bits %d 0x%zx\n", i, rq->rq_status.rqb_bits[i]); for (j = 0; j < RQB_BPW; j++) if (rq->rq_status.rqb_bits[i] & (1ul << j)) { pri = j + (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(td, rqh, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); } } } } /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ void tdq_print(int cpu) { struct tdq *tdq; tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); printf("\ttimeshare runq:\n"); runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); } static inline int sched_shouldpreempt(int pri, int cpri, int remote) { /* * If the new priority is not better than the current priority there is * nothing to do. */ if (pri >= cpri) return (0); /* * Always preempt idle. */ if (cpri >= PRI_MIN_IDLE) return (1); /* * If preemption is disabled don't preempt others. */ if (preempt_thresh == 0) return (0); /* * Preempt if we exceed the threshold. */ if (pri <= preempt_thresh) return (1); /* * If we're interactive or better and there is non-interactive * or worse running preempt only remote processors. */ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) return (1); return (0); } /* * Add a thread to the actual run-queue. Keeps transferable counts up to * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; u_char pri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td->td_sched; TD_SET_RUNQ(td); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } if (pri < PRI_MIN_BATCH) { ts->ts_runq = &tdq->tdq_realtime; } else if (pri <= PRI_MAX_BATCH) { ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX * realtime. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; pri = (pri + tdq->tdq_idx) % RQ_NQS; /* * This effectively shortens the queue by one so we * can have a one slot difference between idx and * ridx while we wait for threads to drain. */ if (tdq->tdq_ridx != tdq->tdq_idx && pri == tdq->tdq_ridx) pri = (unsigned char)(pri - 1) % RQ_NQS; } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, td, pri, flags); return; } else ts->ts_runq = &tdq->tdq_idle; runq_add(ts->ts_runq, td, flags); } /* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. Running threads are not on the queue and the * transferable count does not reflect them. */ static __inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; ts = td->td_sched; TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); } else runq_remove(ts->ts_runq, td); } /* * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load * for this thread to the referenced thread queue. */ static void tdq_load_add(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); tdq->tdq_load++; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload++; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Remove the load from a thread that is transitioning to a sleep state or * exiting. */ static void tdq_load_rem(struct tdq *tdq, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload--; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Bound timeshare latency by decreasing slice size as load increases. We * consider the maximum latency as the sum of the threads waiting to run * aside from curthread and target no more than sched_slice latency but * no less than sched_slice_min runtime. */ static inline int tdq_slice(struct tdq *tdq) { int load; /* * It is safe to use sys_load here because this is called from * contexts where timeshare threads are running and so there * cannot be higher priority load in the system. */ load = tdq->tdq_sysload - 1; if (load >= SCHED_SLICE_MIN_DIVISOR) return (sched_slice_min); if (load <= 1) return (sched_slice); return (sched_slice / load); } /* * Set lowpri to its exact value by searching the run-queue and * evaluating curthread. curthread may be passed as an optimization. */ static void tdq_setlowpri(struct tdq *tdq, struct thread *ctd) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (ctd == NULL) ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; td = tdq_choose(tdq); if (td == NULL || td->td_priority > ctd->td_priority) tdq->tdq_lowpri = ctd->td_priority; else tdq->tdq_lowpri = td->td_priority; } #ifdef SMP struct cpu_search { cpuset_t cs_mask; u_int cs_prefer; int cs_pri; /* Min priority for low. */ int cs_limit; /* Max load for low, min load for high. */ int cs_cpu; int cs_load; }; #define CPU_SEARCH_LOWEST 0x1 #define CPU_SEARCH_HIGHEST 0x2 #define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) #define CPUSET_FOREACH(cpu, mask) \ for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \ if (CPU_ISSET(cpu, &mask)) static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match); int __noinline cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low); int __noinline cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high); int __noinline cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high); /* * Search the tree of cpu_groups for the lowest or highest loaded cpu * according to the match argument. This routine actually compares the * load on all paths through the tree and finds the least loaded cpu on * the least loaded path, which may differ from the least loaded cpu in * the system. This balances work among caches and busses. * * This inline is instantiated in three forms below using constants for the * match argument. It is reduced to the minimum set for each case. It is * also recursive to the depth of the tree. */ static __always_inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high, const int match) { struct cpu_search lgroup; struct cpu_search hgroup; cpuset_t cpumask; struct cpu_group *child; struct tdq *tdq; int cpu, i, hload, lload, load, total, rnd, *rndptr; total = 0; cpumask = cg->cg_mask; if (match & CPU_SEARCH_LOWEST) { lload = INT_MAX; lgroup = *low; } if (match & CPU_SEARCH_HIGHEST) { hload = INT_MIN; hgroup = *high; } /* Iterate through the child CPU groups and then remaining CPUs. */ for (i = cg->cg_children, cpu = mp_maxid; ; ) { if (i == 0) { #ifdef HAVE_INLINE_FFSL cpu = CPU_FFS(&cpumask) - 1; #else while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask)) cpu--; #endif if (cpu < 0) break; child = NULL; } else child = &cg->cg_child[i - 1]; if (match & CPU_SEARCH_LOWEST) lgroup.cs_cpu = -1; if (match & CPU_SEARCH_HIGHEST) hgroup.cs_cpu = -1; if (child) { /* Handle child CPU group. */ CPU_NAND(&cpumask, &child->cg_mask); switch (match) { case CPU_SEARCH_LOWEST: load = cpu_search_lowest(child, &lgroup); break; case CPU_SEARCH_HIGHEST: load = cpu_search_highest(child, &hgroup); break; case CPU_SEARCH_BOTH: load = cpu_search_both(child, &lgroup, &hgroup); break; } } else { /* Handle child CPU. */ CPU_CLR(cpu, &cpumask); tdq = TDQ_CPU(cpu); load = tdq->tdq_load * 256; rndptr = DPCPU_PTR(randomval); rnd = (*rndptr = *rndptr * 69069 + 5) >> 26; if (match & CPU_SEARCH_LOWEST) { if (cpu == low->cs_prefer) load -= 64; /* If that CPU is allowed and get data. */ if (tdq->tdq_lowpri > lgroup.cs_pri && tdq->tdq_load <= lgroup.cs_limit && CPU_ISSET(cpu, &lgroup.cs_mask)) { lgroup.cs_cpu = cpu; lgroup.cs_load = load - rnd; } } if (match & CPU_SEARCH_HIGHEST) if (tdq->tdq_load >= hgroup.cs_limit && tdq->tdq_transferable && CPU_ISSET(cpu, &hgroup.cs_mask)) { hgroup.cs_cpu = cpu; hgroup.cs_load = load - rnd; } } total += load; /* We have info about child item. Compare it. */ if (match & CPU_SEARCH_LOWEST) { if (lgroup.cs_cpu >= 0 && (load < lload || (load == lload && lgroup.cs_load < low->cs_load))) { lload = load; low->cs_cpu = lgroup.cs_cpu; low->cs_load = lgroup.cs_load; } } if (match & CPU_SEARCH_HIGHEST) if (hgroup.cs_cpu >= 0 && (load > hload || (load == hload && hgroup.cs_load > high->cs_load))) { hload = load; high->cs_cpu = hgroup.cs_cpu; high->cs_load = hgroup.cs_load; } if (child) { i--; if (i == 0 && CPU_EMPTY(&cpumask)) break; } #ifndef HAVE_INLINE_FFSL else cpu--; #endif } return (total); } /* * cpu_search instantiations must pass constants to maintain the inline * optimization. */ int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low) { return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST); } int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high) { return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST); } int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low, struct cpu_search *high) { return cpu_search(cg, low, high, CPU_SEARCH_BOTH); } /* * Find the cpu with the least load via the least loaded path that has a * lowpri greater than pri pri. A pri of -1 indicates any priority is * acceptable. */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload, int prefer) { struct cpu_search low; low.cs_cpu = -1; low.cs_prefer = prefer; low.cs_mask = mask; low.cs_pri = pri; low.cs_limit = maxload; cpu_search_lowest(cg, &low); return low.cs_cpu; } /* * Find the cpu with the highest load via the highest loaded path. */ static inline int sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload) { struct cpu_search high; high.cs_cpu = -1; high.cs_mask = mask; high.cs_limit = minload; cpu_search_highest(cg, &high); return high.cs_cpu; } static void sched_balance_group(struct cpu_group *cg) { cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { high = sched_highest(cg, hmask, 1); /* Stop if there is no more CPU with transferrable threads. */ if (high == -1) break; CPU_CLR(high, &hmask); CPU_COPY(&hmask, &lmask); /* Stop if there is no more CPU left for low. */ if (CPU_EMPTY(&lmask)) break; anylow = 1; nextlow: low = sched_lowest(cg, lmask, -1, TDQ_CPU(high)->tdq_load - 1, high); /* Stop if we looked well and found no less loaded CPU. */ if (anylow && low == -1) break; /* Go to next high if we found no less loaded CPU. */ if (low == -1) continue; /* Transfer thread from high to low. */ if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) { /* CPU that got thread can no longer be a donor. */ CPU_CLR(low, &hmask); } else { /* * If failed, then there is no threads on high * that can run on this low. Drop low from low * mask and look for different one. */ CPU_CLR(low, &lmask); anylow = 0; goto nextlow; } } } static void sched_balance(void) { struct tdq *tdq; /* * Select a random time between .5 * balance_interval and * 1.5 * balance_interval. */ balance_ticks = max(balance_interval / 2, 1); balance_ticks += random() % balance_interval; if (smp_started == 0 || rebalance == 0) return; tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* * Lock two thread queues using their address to maintain lock order. */ static void tdq_lock_pair(struct tdq *one, struct tdq *two) { if (one < two) { TDQ_LOCK(one); TDQ_LOCK_FLAGS(two, MTX_DUPOK); } else { TDQ_LOCK(two); TDQ_LOCK_FLAGS(one, MTX_DUPOK); } } /* * Unlock two thread queues. Order is not important here. */ static void tdq_unlock_pair(struct tdq *one, struct tdq *two) { TDQ_UNLOCK(one); TDQ_UNLOCK(two); } /* * Transfer load between two imbalanced thread queues. */ static int sched_balance_pair(struct tdq *high, struct tdq *low) { int moved; int cpu; tdq_lock_pair(high, low); moved = 0; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). */ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load && (moved = tdq_move(high, low)) > 0) { /* * In case the target isn't the current cpu IPI it to force a * reschedule with the new workload. */ cpu = TDQ_ID(low); if (cpu != PCPU_GET(cpuid)) ipi_cpu(cpu, IPI_PREEMPT); } tdq_unlock_pair(high, low); return (moved); } /* * Move a thread from one thread queue to another. */ static int tdq_move(struct tdq *from, struct tdq *to) { struct td_sched *ts; struct thread *td; struct tdq *tdq; int cpu; TDQ_LOCK_ASSERT(from, MA_OWNED); TDQ_LOCK_ASSERT(to, MA_OWNED); tdq = from; cpu = TDQ_ID(to); td = tdq_steal(tdq, cpu); if (td == NULL) return (0); ts = td->td_sched; /* * Although the run queue is locked the thread may be blocked. Lock * it to clear this and acquire the run-queue lock. */ thread_lock(td); /* Drop recursive lock on from acquired via thread_lock(). */ TDQ_UNLOCK(from); sched_rem(td); ts->ts_cpu = cpu; td->td_lock = TDQ_LOCKPTR(to); tdq_add(to, td, SRQ_YIELDING); return (1); } /* * This tdq has idled. Try to steal a thread from another cpu and switch * to it. */ static int tdq_idled(struct tdq *tdq) { struct cpu_group *cg; struct tdq *steal; cpuset_t mask; int thresh; int cpu; if (smp_started == 0 || steal_idle == 0) return (1); CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); /* We don't want to be preempted while we're iterating. */ spinlock_enter(); for (cg = tdq->tdq_cg; cg != NULL; ) { if ((cg->cg_flags & CG_FLAG_THREAD) == 0) thresh = steal_thresh; else thresh = 1; cpu = sched_highest(cg, mask, thresh); if (cpu == -1) { cg = cg->cg_parent; continue; } steal = TDQ_CPU(cpu); CPU_CLR(cpu, &mask); tdq_lock_pair(tdq, steal); if (steal->tdq_load < thresh || steal->tdq_transferable == 0) { tdq_unlock_pair(tdq, steal); continue; } /* * If a thread was added while interrupts were disabled don't * steal one here. If we fail to acquire one due to affinity * restrictions loop again with this cpu removed from the * set. */ if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) { tdq_unlock_pair(tdq, steal); continue; } spinlock_exit(); TDQ_UNLOCK(steal); mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(curthread); return (0); } spinlock_exit(); return (1); } /* * Notify a remote cpu of new work. Sends an IPI if criteria are met. */ static void tdq_notify(struct tdq *tdq, struct thread *td) { struct thread *ctd; int pri; int cpu; if (tdq->tdq_ipipending) return; cpu = td->td_sched->ts_cpu; pri = td->td_priority; ctd = pcpu_find(cpu)->pc_curthread; if (!sched_shouldpreempt(pri, ctd->td_priority, 1)) return; /* * Make sure that our caller's earlier update to tdq_load is * globally visible before we read tdq_cpu_idle. Idle thread * accesses both of them without locks, and the order is important. */ mb(); if (TD_IS_IDLETHREAD(ctd)) { /* * If the MD code has an idle wakeup routine try that before * falling back to IPI. */ if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu)) return; } tdq->tdq_ipipending = 1; ipi_cpu(cpu, IPI_PREEMPT); } /* * Steals load from a timeshare queue. Honors the rotating queue head * index. */ static struct thread * runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; struct thread *td, *first; int bit; int i; rqb = &rq->rq_status; bit = start & (RQB_BPW -1); first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) continue; if (bit == 0) bit = RQB_FFS(rqb->rqb_bits[i]); for (; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) { if (first && THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); first = td; } } } if (start != 0) { start = 0; goto again; } if (first && THREAD_CAN_MIGRATE(first) && THREAD_CAN_SCHED(first, cpu)) return (first); return (NULL); } /* * Steals load from a standard linear queue. */ static struct thread * runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; struct thread *td; int word; int bit; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } } return (NULL); } /* * Attempt to steal a thread in priority order from a thread queue. */ static struct thread * tdq_steal(struct tdq *tdq, int cpu) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (td); if ((td = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) != NULL) return (td); return (runq_steal(&tdq->tdq_idle, cpu)); } /* * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the * current lock and returns with the assigned queue locked. */ static inline struct tdq * sched_setcpu(struct thread *td, int cpu, int flags) { struct tdq *tdq; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_CPU(cpu); td->td_sched->ts_cpu = cpu; /* * If the lock matches just return the queue. */ if (td->td_lock == TDQ_LOCKPTR(tdq)) return (tdq); #ifdef notyet /* * If the thread isn't running its lockptr is a * turnstile or a sleepqueue. We can just lock_set without * blocking. */ if (TD_CAN_RUN(td)) { TDQ_LOCK(tdq); thread_lock_set(td, TDQ_LOCKPTR(tdq)); return (tdq); } #endif /* * The hard case, migration, we need to block the thread first to * prevent order reversals with other cpus locks. */ spinlock_enter(); thread_lock_block(td); TDQ_LOCK(tdq); thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); spinlock_exit(); return (tdq); } SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t mask; int cpu, pri, self; self = PCPU_GET(cpuid); ts = td->td_sched; if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). */ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); /* * Prefer to run interrupt threads on the processors that generate * the interrupt. */ pri = td->td_priority; if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level && ts->ts_cpu != self) { SCHED_STAT_INC(pickcpu_intrbind); ts->ts_cpu = self; if (TDQ_CPU(self)->tdq_lowpri > pri) { SCHED_STAT_INC(pickcpu_affinity); return (ts->ts_cpu); } } /* * If the thread can run on the last cpu and the affinity has not * expired or it is idle run it there. */ tdq = TDQ_CPU(ts->ts_cpu); cg = tdq->tdq_cg; if (THREAD_CAN_SCHED(td, ts->ts_cpu) && tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { CPUSET_FOREACH(cpu, cg->cg_mask) { if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; } } else cpu = INT_MAX; if (cpu > mp_maxid) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } /* * Search for the last level cache CPU group in the tree. * Skip caches with expired affinity time and SMT groups. * Affinity to higher level caches will be handled less aggressively. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; if (!SCHED_AFFINITY(ts, cg->cg_level)) continue; ccg = cg; } if (ccg != NULL) cg = ccg; cpu = -1; /* Search the group for the less loaded idle CPU we can run now. */ mask = td->td_cpuset->cs_mask; if (cg != NULL && cg != cpu_top && CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0) cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu); /* Search globally for the less loaded CPU we can run now. */ if (cpu == -1) cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu); /* Search globally for the less loaded CPU. */ if (cpu == -1) cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu); KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu.")); /* * Compare the lowest loaded cpu to current cpu. */ if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri && TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE && TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } else SCHED_STAT_INC(pickcpu_lowest); if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); } #endif /* * Pick the highest priority task we have and return it. */ static struct thread * tdq_choose(struct tdq *tdq) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = runq_choose(&tdq->tdq_realtime); if (td != NULL) return (td); td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } td = runq_choose(&tdq->tdq_idle); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", td->td_priority)); return (td); } return (NULL); } /* * Initialize a thread queue. */ static void tdq_setup(struct tdq *tdq) { if (bootverbose) printf("ULE: setup cpu %d\n", TDQ_ID(tdq)); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN | MTX_RECURSE); #ifdef KTR snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), "CPU %d load", (int)TDQ_ID(tdq)); #endif } #ifdef SMP static void sched_setup_smp(void) { struct tdq *tdq; int i; cpu_top = smp_topo(); CPU_FOREACH(i) { tdq = TDQ_CPU(i); tdq_setup(tdq); tdq->tdq_cg = smp_topo_find(cpu_top, i); if (tdq->tdq_cg == NULL) panic("Can't find cpu group for %d\n", i); } balance_tdq = TDQ_SELF(); sched_balance(); } #endif /* * Setup the thread queues and initialize the topology based on MD * information. */ static void sched_setup(void *dummy) { struct tdq *tdq; tdq = TDQ_SELF(); #ifdef SMP sched_setup_smp(); #else tdq_setup(tdq); #endif /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF()); tdq_load_add(tdq, &thread0); tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } /* * This routine determines time constants after stathz and hz are setup. */ /* ARGSUSED */ static void sched_initticks(void *dummy) { int incr; realstathz = stathz ? stathz : hz; sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); /* * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ if (incr == 0) incr = 1; tickincr = incr; #ifdef SMP /* * Set the default balance interval now that we know * what realstathz is. */ balance_interval = realstathz; affinity = SCHED_AFFINITY_DEFAULT; #endif if (sched_idlespinthresh < 0) sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; } /* * This is the core of the interactivity algorithm. Determines a score based * on past behavior. It is the ratio of sleep time to run time scaled to * a [0, 100] integer. This is the voluntary sleep time of a process, which * differs from the cpu usage because it does not account for time spent * waiting on a run-queue. Would be prettier if we had floating point. */ static int sched_interact_score(struct thread *td) { struct td_sched *ts; int div; ts = td->td_sched; /* * The score is only needed if this is likely to be an interactive * task. Don't go through the expense of computing it if there's * no chance. */ if (sched_interact <= SCHED_INTERACT_HALF && ts->ts_runtime >= ts->ts_slptime) return (SCHED_INTERACT_HALF); if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); } if (ts->ts_slptime > ts->ts_runtime) { div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); return (ts->ts_runtime / div); } /* runtime == slptime */ if (ts->ts_runtime) return (SCHED_INTERACT_HALF); /* * This can happen if slptime and runtime are 0. */ return (0); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct thread *td) { int score; int pri; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; /* * If the score is interactive we place the thread in the realtime * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. * * The nice value of the process has a linear effect on the calculated * score. Negative nice values make it easier for a thread to be * considered interactive. */ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_INTERACT; pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) / sched_interact) * score; KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, ("sched_priority: invalid interactive priority %d score %d", pri, score)); } else { pri = SCHED_PRI_MIN; if (td->td_sched->ts_ticks) pri += min(SCHED_PRI_TICKS(td->td_sched), SCHED_PRI_RANGE - 1); pri += SCHED_PRI_NICE(td->td_proc->p_nice); KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, ("sched_priority: invalid priority %d: nice %d, " "ticks %d ftick %d ltick %d tick pri %d", pri, td->td_proc->p_nice, td->td_sched->ts_ticks, td->td_sched->ts_ftick, td->td_sched->ts_ltick, SCHED_PRI_TICKS(td->td_sched))); } sched_user_prio(td, pri); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. This * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) { struct td_sched *ts; u_int sum; ts = td->td_sched; sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * This only happens from two places: * 1) We have added an unusual amount of run time from fork_exit. * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { if (ts->ts_runtime > ts->ts_slptime) { ts->ts_runtime = SCHED_SLP_RUN_MAX; ts->ts_slptime = 1; } else { ts->ts_slptime = SCHED_SLP_RUN_MAX; ts->ts_runtime = 1; } return; } /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. Dividing by two here forces * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { ts->ts_runtime /= 2; ts->ts_slptime /= 2; return; } ts->ts_runtime = (ts->ts_runtime / 5) * 4; ts->ts_slptime = (ts->ts_slptime / 5) * 4; } /* * Scale back the interactivity history when a child thread is created. The * history is inherited from the parent but the thread may behave totally * differently. For example, a shell spawning a compiler process. We want * to learn that the compiler is behaving badly very quickly. */ static void sched_interact_fork(struct thread *td) { int ratio; int sum; sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; td->td_sched->ts_runtime /= ratio; td->td_sched->ts_slptime /= ratio; } } /* * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) { /* * Set up the scheduler specific parts of proc0. */ proc0.p_sched = NULL; /* XXX */ thread0.td_sched = &td_sched0; td_sched0.ts_ltick = ticks; td_sched0.ts_ftick = ticks; td_sched0.ts_slice = 0; } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most sched_slice stathz ticks. */ int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * Update the percent cpu tracking information when it is requested or * the total history exceeds the maximum. We keep a sliding history of * tick counts that slowly decays. This is less precise than the 4BSD * mechanism since it happens with less regular and frequent events. */ static void sched_pctcpu_update(struct td_sched *ts, int run) { int t = ticks; if (t - ts->ts_ltick >= SCHED_TICK_TARG) { ts->ts_ticks = 0; ts->ts_ftick = t - SCHED_TICK_TARG; } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) { ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) * (ts->ts_ltick - (t - SCHED_TICK_TARG)); ts->ts_ftick = t - SCHED_TICK_TARG; } if (run) ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT; ts->ts_ltick = t; } /* * Adjust the priority of a thread. Move it to the appropriate run-queue * if necessary. This is the back-end for several priority related * functions. */ static void sched_thread_priority(struct thread *td, u_char prio) { struct td_sched *ts; struct tdq *tdq; int oldpri; KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio < td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } ts = td->td_sched; THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. This could be optimized to not re-add in some * cases. */ if (TD_ON_RUNQ(td) && prio < td->td_priority) { sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING); return; } /* * If the thread is currently running we may have to adjust the lowpri * information so other cpus are aware of our current priority. */ if (TD_IS_RUNNING(td)) { tdq = TDQ_CPU(ts->ts_cpu); oldpri = td->td_priority; td->td_priority = prio; if (prio < tdq->tdq_lowpri) tdq->tdq_lowpri = prio; else if (tdq->tdq_lowpri == oldpri) tdq_setlowpri(tdq, td); return; } td->td_priority = prio; } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_thread_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regular priority is less * important than prio, the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_thread_priority(td, base_pri); } else sched_lend_prio(td, prio); } /* * Standard entry for setting the priority to an absolute value. */ void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't * ever lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_thread_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } /* * Set the base user priority, does not effect current running priority. */ void sched_user_prio(struct thread *td, u_char prio) { td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } /* * Handle migration from sched_switch(). This happens only for * cpu binding. */ static struct mtx * sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) { struct tdq *tdn; tdn = TDQ_CPU(td->td_sched->ts_cpu); #ifdef SMP tdq_load_rem(tdq, td); /* * Do the lock dance required to avoid LOR. We grab an extra * spinlock nesting to prevent preemption while we're * not holding either run-queue lock. */ spinlock_enter(); thread_lock_block(td); /* This releases the lock on tdq. */ /* * Acquire both run-queue locks before placing the thread on the new * run-queue to avoid deadlocks created by placing a thread with a * blocked lock on the run-queue of a remote processor. The deadlock * occurs when a third processor attempts to lock the two queues in * question while the target processor is spinning with its own * run-queue lock held while waiting for the blocked lock to clear. */ tdq_lock_pair(tdn, tdq); tdq_add(tdn, td, flags); tdq_notify(tdn, td); TDQ_UNLOCK(tdn); spinlock_exit(); #endif return (TDQ_LOCKPTR(tdn)); } /* * Variadic version of thread_lock_unblock() that does not assume td_lock * is blocked. */ static inline void thread_unblock_switch(struct thread *td, struct mtx *mtx) { atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, (uintptr_t)mtx); } /* * Switch threads. This function has to handle threads coming in while * blocked for some reason, running, or idle. It also must deal with * migrating a thread from one queue to another as running threads may * be assigned elsewhere via binding. */ void sched_switch(struct thread *td, struct thread *newtd, int flags) { struct tdq *tdq; struct td_sched *ts; struct mtx *mtx; int srqflag; int cpuid, preempted; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument")); cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); ts = td->td_sched; mtx = td->td_lock; sched_pctcpu_update(ts, 1); ts->ts_rltick = ticks; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; preempted = !((td->td_flags & TDF_SLICEEND) || (flags & SWT_RELINQUISH)); td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); td->td_owepreempt = 0; if (!TD_IS_IDLETHREAD(td)) tdq->tdq_switchcnt++; /* * The lock pointer in an idle thread should never change. Reset it * to CAN_RUN as well. */ if (TD_IS_IDLETHREAD(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); srqflag = preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid) tdq_runq_add(tdq, td, srqflag); else { KASSERT(THREAD_CAN_MIGRATE(td) || (ts->ts_flags & TSF_BOUND) != 0, ("Thread %p shouldn't migrate", td)); mtx = sched_switch_migrate(tdq, td, srqflag); } } else { /* This thread must be going to sleep. */ TDQ_LOCK(tdq); mtx = thread_lock_block(td); tdq_load_rem(tdq, td); } /* * We enter here with the thread blocked and assigned to the * appropriate cpu run-queue or sleep-queue and with the current * thread-queue locked. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); newtd = choosethread(); /* * Call the MD code to switch contexts if necessary. */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; sched_pctcpu_update(newtd->td_sched, 0); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif cpu_switch(td, newtd, mtx); /* * We may return from cpu_switch on a different cpu. However, * we always return with td_lock pointing to the current cpu's * run queue lock. */ cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { thread_unblock_switch(td, mtx); SDT_PROBE0(sched, , , remain__cpu); } /* * Assert that all went well and return. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; } /* * Adjust thread priorities as a result of a nice request. */ void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); thread_unlock(td); } } /* * Record the sleep time for the interactivity scorer. */ void sched_sleep(struct thread *td, int prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; if (TD_IS_SUSPENDED(td) || prio >= PSOCK) td->td_flags |= TDF_CANSWAP; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; if (static_boost == 1 && prio) sched_prio(td, prio); else if (static_boost && td->td_priority > static_boost) sched_prio(td, static_boost); } /* * Schedule a thread to resume execution and record how long it voluntarily * slept. We also update the pctcpu, interactivity, and priority. */ void sched_wakeup(struct thread *td) { struct td_sched *ts; int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; td->td_flags &= ~TDF_CANSWAP; /* * If we slept for more than a tick update our interactivity and * priority. */ slptick = td->td_slptick; td->td_slptick = 0; if (slptick && slptick != ticks) { ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts, 0); } /* * Reset the slice value since we slept and advanced the round-robin. */ ts->ts_slice = 0; sched_add(td, SRQ_BORING); } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct thread *td, struct thread *child) { THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(td->td_sched, 1); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. */ sched_interact_fork(child); sched_priority(child); td->td_sched->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Fork a new thread, may be within the same process. */ void sched_fork_thread(struct thread *td, struct thread *child) { struct td_sched *ts; struct td_sched *ts2; struct tdq *tdq; tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Initialize child. */ ts = td->td_sched; ts2 = child->td_sched; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* * Grab our parents cpu estimation information. */ ts2->ts_ticks = ts->ts_ticks; ts2->ts_ltick = ts->ts_ltick; ts2->ts_ftick = ts->ts_ftick; /* * Do not inherit any borrowed priority from the parent. */ child->td_priority = child->td_base_pri; /* * And update interactivity score. */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; /* Attempt to quickly learn interactivity. */ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; #ifdef KTR bzero(ts2->ts_name, sizeof(ts2->ts_name)); #endif } /* * Adjust the priority class of a thread. */ void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; td->td_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct thread *child) { struct thread *td; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", "prio:%d", child->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } /* * Penalize another thread for the time spent on this one. This helps to * worsen the priority and interactivity of processes which schedule batch * jobs such as make. This has little effect on the make process itself but * causes new processes spawned by it to receive worse scores immediately. */ void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", "prio:%d", child->td_priority); /* * Give the child's runtime to the parent without returning the * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. */ thread_lock(td); td->td_sched->ts_runtime += child->td_sched->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } void sched_preempt(struct thread *td) { struct tdq *tdq; SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); tdq->tdq_ipipending = 0; if (td->td_priority > tdq->tdq_lowpri) { int flags; flags = SW_INVOL | SW_PREEMPT; if (td->td_critnest > 1) td->td_owepreempt = 1; else if (TD_IS_IDLETHREAD(td)) mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL); else mi_switch(flags | SWT_REMOTEPREEMPT, NULL); } thread_unlock(td); } /* * Fix priorities on return to user-space. Priorities may be elevated due * to static priorities in msleep() or similar. */ void sched_userret(struct thread *td) { /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (td->td_priority != td->td_user_pri) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } } /* * Handle a stathz tick. This is really only relevant for timeshare * threads. */ void sched_clock(struct thread *td) { struct tdq *tdq; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); #ifdef SMP /* * We run the long term load balancer infrequently on the first cpu. */ if (balance_tdq == tdq) { if (balance_ticks && --balance_ticks == 0) sched_balance(); } #endif /* * Save the old switch count so we have a record of the last ticks * activity. Initialize the new switch count based on our load. * If there is some activity seed it to reflect that. */ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; tdq->tdq_switchcnt = tdq->tdq_load; /* * Advance the insert index once for each tick to ensure that all * threads get a chance to run. */ if (tdq->tdq_idx == tdq->tdq_ridx) { tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) tdq->tdq_ridx = tdq->tdq_idx; } ts = td->td_sched; sched_pctcpu_update(ts, 1); if (td->td_pri_class & PRI_FIFO_BIT) return; if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { /* * We used a tick; charge it to the thread so * that we can compute our interactivity. */ td->td_sched->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) { ts->ts_slice = 0; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } } /* * Called once per hz tick. */ void sched_tick(int cnt) { } /* * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ int sched_runnable(void) { struct tdq *tdq; int load; load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (tdq->tdq_load > 0) goto out; } else if (tdq->tdq_load - 1 > 0) goto out; load = 0; out: return (load); } /* * Choose the highest priority thread to run. The thread is removed from * the run-queue while running however the load remains. For SMP we set * the tdq in the global idle bitmask if it idles here. */ struct thread * sched_choose(void) { struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = tdq_choose(tdq); if (td) { tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; return (td); } tdq->tdq_lowpri = PRI_MAX_IDLE; return (PCPU_GET(idlethread)); } /* * Set owepreempt if necessary. Preemption never happens directly in ULE, * we always request it once we exit a critical section. */ static inline void sched_setpreempt(struct thread *td) { struct thread *ctd; int cpri; int pri; THREAD_LOCK_ASSERT(curthread, MA_OWNED); ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; if (pri < cpri) ctd->td_flags |= TDF_NEEDRESCHED; if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; if (!sched_shouldpreempt(pri, cpri, 0)) return; ctd->td_owepreempt = 1; } /* * Add a thread to a thread queue. Select the appropriate runq and add the * thread to it. This is the internal function called when the tdq is * predetermined. */ void tdq_add(struct tdq *tdq, struct thread *td, int flags) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; tdq_runq_add(tdq, td, flags); tdq_load_add(tdq, td); } /* * Select the target thread queue and add a thread to it. Request * preemption or IPI a remote processor if required. */ void sched_add(struct thread *td, int flags) { struct tdq *tdq; #ifdef SMP int cpu; #endif KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Recalculate the priority before we select the target cpu or * run-queue. */ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_priority(td); #ifdef SMP /* * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ cpu = sched_pickcpu(td, flags); tdq = sched_setcpu(td, cpu, flags); tdq_add(tdq, td, flags); if (cpu != PCPU_GET(cpuid)) { tdq_notify(tdq, td); return; } #else tdq = TDQ_SELF(); TDQ_LOCK(tdq); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ thread_lock_set(td, TDQ_LOCKPTR(tdq)); tdq_add(tdq, td, flags); #endif if (!(flags & SRQ_YIELDING)) sched_setpreempt(td); } /* * Remove a thread from a run-queue without running it. This is used * when we're stealing a thread from a remote queue. Otherwise all threads * exit by calling sched_exit_thread() and sched_throw() themselves. */ void sched_rem(struct thread *td) { struct tdq *tdq; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); tdq = TDQ_CPU(td->td_sched->ts_cpu); TDQ_LOCK_ASSERT(tdq, MA_OWNED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, td); tdq_load_rem(tdq, td); TD_SET_CAN_RUN(td); if (td->td_priority == tdq->tdq_lowpri) tdq_setlowpri(tdq, NULL); } /* * Fetch cpu utilization information. Updates on demand. */ fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct td_sched *ts; pctcpu = 0; ts = td->td_sched; if (ts == NULL) return (0); THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(ts, TD_IS_RUNNING(td)); if (ts->ts_ticks) { int rtick; /* How many rtick per second ? */ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } return (pctcpu); } /* * Enforce affinity settings for a thread. Called after adjustments to * cpumask. */ void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td->td_sched; if (THREAD_CAN_SCHED(td, ts->ts_cpu)) return; if (TD_ON_RUNQ(td)) { sched_rem(td); sched_add(td, SRQ_BORING); return; } if (!TD_IS_RUNNING(td)) return; /* * Force a switch before returning to userspace. If the * target thread is not running locally send an ipi to force * the issue. */ td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(ts->ts_cpu, IPI_PREEMPT); #endif } /* * Bind a thread to a target cpu. */ void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td->td_sched; if (ts->ts_flags & TSF_BOUND) sched_unbind(td); KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); ts->ts_flags |= TSF_BOUND; sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL, NULL); } /* * Release a bound thread. */ void sched_unbind(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); ts = td->td_sched; if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; sched_unpin(); } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_sched->ts_flags & TSF_BOUND); } /* * Basic yield call. */ void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH, NULL); thread_unlock(td); } /* * Return the total system load. */ int sched_load(void) { #ifdef SMP int total; int i; total = 0; CPU_FOREACH(i) total += TDQ_CPU(i)->tdq_sysload; return (total); #else return (TDQ_SELF()->tdq_sysload); #endif } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } #ifdef SMP #define TDQ_IDLESPIN(tdq) \ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) #else #define TDQ_IDLESPIN(tdq) 1 #endif /* * The actual idle process. */ void sched_idletd(void *dummy) { struct thread *td; struct tdq *tdq; int oldswitchcnt, switchcnt; int i; mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); THREAD_NO_SLEEPING(); oldswitchcnt = -1; for (;;) { if (tdq->tdq_load) { thread_lock(td); mi_switch(SW_VOL | SWT_IDLE, NULL); thread_unlock(td); } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #ifdef SMP if (switchcnt != oldswitchcnt) { oldswitchcnt = switchcnt; if (tdq_idled(tdq) == 0) continue; } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #else oldswitchcnt = switchcnt; #endif /* * If we're switching very frequently, spin while checking * for load rather than entering a low power state that * may require an IPI. However, don't do any busy * loops while on SMT machines as this simply steals * cycles from cores doing useful work. */ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { for (i = 0; i < sched_idlespins; i++) { if (tdq->tdq_load) break; cpu_spinwait(); } } /* If there was context switch during spin, restart it. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt) continue; /* Run main MD idle handler. */ tdq->tdq_cpu_idle = 1; /* * Make sure that tdq_cpu_idle update is globally visible * before cpu_idle() read tdq_load. The order is important * to avoid race with tdq_notify. */ mb(); cpu_idle(switchcnt * 4 > sched_idlespinthresh); tdq->tdq_cpu_idle = 0; /* * Account thread-less hardware interrupts and * other wakeup reasons equal to context switches. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (switchcnt != oldswitchcnt) continue; tdq->tdq_switchcnt++; oldswitchcnt++; } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { struct thread *newtd; struct tdq *tdq; tdq = TDQ_SELF(); if (td == NULL) { /* Correct spinlock nesting and acquire the correct lock. */ TDQ_LOCK(tdq); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } else { MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); tdq_load_rem(tdq, td); lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object); } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); newtd = choosethread(); TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd; cpu_throw(td, newtd); /* doesn't return */ } /* * This is called from fork_exit(). Just acquire the correct locks and * let fork do the rest of the work. */ void sched_fork_exit(struct thread *td) { struct tdq *tdq; int cpuid; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with the scheduler lock held. */ cpuid = PCPU_GET(cpuid); tdq = TDQ_CPU(cpuid); if (TD_IS_IDLETHREAD(td)) td->td_lock = TDQ_LOCKPTR(tdq); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); lock_profile_obtain_lock_success( &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__); } /* * Create on first use to catch odd startup conditons. */ char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td->td_sched; if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td->td_sched; ts->ts_name[0] = '\0'; } #endif #ifdef SMP /* * Build the CPU topology dump string. Is recursively called to collect * the topology tree. */ static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent) { char cpusetbuf[CPUSETBUFSIZ]; int i, first; sbuf_printf(sb, "%*s\n", indent, "", 1 + indent / 2, cg->cg_level); sbuf_printf(sb, "%*s ", indent, "", cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); first = TRUE; for (i = 0; i < MAXCPU; i++) { if (CPU_ISSET(i, &cg->cg_mask)) { if (!first) sbuf_printf(sb, ", "); else first = FALSE; sbuf_printf(sb, "%d", i); } } sbuf_printf(sb, "\n"); if (cg->cg_flags != 0) { sbuf_printf(sb, "%*s ", indent, ""); if ((cg->cg_flags & CG_FLAG_HTT) != 0) sbuf_printf(sb, "HTT group"); if ((cg->cg_flags & CG_FLAG_THREAD) != 0) sbuf_printf(sb, "THREAD group"); if ((cg->cg_flags & CG_FLAG_SMT) != 0) sbuf_printf(sb, "SMT group"); sbuf_printf(sb, "\n"); } if (cg->cg_children > 0) { sbuf_printf(sb, "%*s \n", indent, ""); for (i = 0; i < cg->cg_children; i++) sysctl_kern_sched_topology_spec_internal(sb, &cg->cg_child[i], indent+2); sbuf_printf(sb, "%*s \n", indent, ""); } sbuf_printf(sb, "%*s\n", indent, ""); return (0); } /* * Sysctl handler for retrieving topology dump. It's a wrapper for * the recursive sysctl_kern_smp_topology_spec_internal(). */ static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) { struct sbuf *topo; int err; KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); topo = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND); if (topo == NULL) return (ENOMEM); sbuf_printf(topo, "\n"); err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); sbuf_printf(topo, "\n"); if (err == 0) { sbuf_finish(topo); err = SYSCTL_OUT(req, sbuf_data(topo), sbuf_len(topo)); } sbuf_delete(topo); return (err); } #endif static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0, "Maximal (lowest) priority for preemption"); SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0, "Assign static kernel priorities to sleeping threads"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0, "Number of times idle thread will spin waiting for new work"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh, 0, "Threshold before we will permit idle thread spinning"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average period in stathz ticks to run the long-term balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, "Minimum load on remote CPU before we'll steal"); SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", "XML dump of detected CPU topology"); #endif /* ps compat. All cpu percentages from ULE are weighted. */ static int ccpu = 0; SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 273265) +++ head/sys/sys/proc.h (revision 273266) @@ -1,983 +1,985 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * t - thread lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct kaioinfo; struct kaudit_record; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; struct nlminfo; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct td_sched; struct thread; struct trapframe; struct turnstile; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cj) means (j) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cj) Real time. */ uint64_t rux_uticks; /* (cj) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cj) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cj) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ /* Cleared during fork1() */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ - u_char td_lastcpu; /* (t) Last cpu we were on. */ - u_char td_oncpu; /* (t) Which cpu we are on. */ + int td_lastcpu; /* (t) Last cpu we were on. */ + int td_oncpu; /* (t) Which cpu we are on. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ short td_locks; /* (k) Count of non-spin locks. */ short td_rw_rlocks; /* (k) Count of rwlock read locks. */ short td_lk_slocks; /* (k) Count of lockmgr shared locks. */ short td_stopsched; /* (k) Scheduler stopped. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ u_int td_estcpu; /* (t) estimated cpu utilization */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ struct ksiginfo td_dbgksi; /* (c) ksi reflected to debugger. */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ int td_dom_rr_idx; /* (k) RR Numa domain selection. */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or create_thread() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ union { register_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct td_sched *td_sched; /* (*) Scheduler-specific data. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m = (td)->td_lock; \ KASSERT((__m == &blocked_lock || __m == (lock)), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_UNUSED09 0x00000200 /* --available-- */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_UNUSED19 0x00080000 /* --available-- */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_UNUSED21 0x00200000 /* --available-- */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_UNUSED9 0x00000100 /* --available-- */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_DEVMEMIO 0x20000000 /* Accessing memory for /dev/mem */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Process limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cj) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ u_int p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ /* End area that is copied on creation. */ #define p_endcopy p_xstat u_short p_xstat; /* (c) Exit status; also stop sig. */ struct knlist p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ struct p_sched *p_sched; /* (*) Scheduler-specific data. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ struct cv p_dbgwait; /* (*) wait cv for debugger attach after fork. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ u_char p_throttled; /* (c) Flag for racct pcpu throttling */ /* * An orphan is the child that has beed re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id -#define NOCPU 0xff /* For when we aren't on a CPU. */ +#define NOCPU (-1) /* For when we aren't on a CPU. */ +#define NOCPU_OLD (255) +#define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KTHREAD 0x00004 /* Kernel thread (*). */ #define P_FOLLOWFORK 0x00008 /* Attach parent debugger to children. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_WKILLED 0x08000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x800000 /* Process is using HWPMCs */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_UNUSED1 0x2000000 #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STATCHILD 0x8000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_NONE 0 /* Unspecified switch. */ #define SWT_PREEMPT 1 /* Switching due to preemption. */ #define SWT_OWEPREEMPT 2 /* Switching due to opepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ #define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */ #define SWT_IWAIT 9 /* Waiting for interrupts. */ #define SWT_SUSPEND 10 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 11 /* Remote processor preempted. */ #define SWT_REMOTEWAKEIDLE 12 /* Remote processor preempted idle. */ #define SWT_COUNT 13 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define STOPEVENT(p, e, v) do { \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process")); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process not held")); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process held")); \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) ((td)->td_flags & TDF_CANSWAP) /* Control whether or not it is safe for curthread to sleep. */ #define THREAD_NO_SLEEPING() ((curthread)->td_no_sleeping++) #define THREAD_SLEEPING_OK() ((curthread)->td_no_sleeping--) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern LIST_HEAD(tidhashhead, thread) *tidhashtbl; extern u_long tidhash; extern struct rwlock tidhash_lock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread thread0; /* Primary thread in proc0. */ extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_locked(pid_t pid); struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. */ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, int, int, struct proc **, int *, int); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_yield(int); void kick_proc0(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags, struct thread *newtd); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); int setrunnable(struct thread *); void setsugid(struct proc *p); int should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int) __dead2; struct syscall_args; int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); void cpu_set_syscall_retval(struct thread *, int); void cpu_set_upcall(struct thread *td, struct thread *td0); void cpu_set_upcall_kse(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); int thread_single(int how); void thread_single_end(void); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); void thread_suspend_switch(struct thread *); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); int thread_unsuspend_one(struct thread *td); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: head/sys/sys/user.h =================================================================== --- head/sys/sys/user.h (revision 273265) +++ head/sys/sys/user.h (revision 273266) @@ -1,537 +1,539 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)user.h 8.2 (Berkeley) 9/23/93 * $FreeBSD$ */ #ifndef _SYS_USER_H_ #define _SYS_USER_H_ #include #ifndef _KERNEL /* stuff that *used* to be included by user.h, or is now needed */ #include #include #include #include #include #include #include #include #include #include /* XXX */ #include /* XXX */ #include /* XXX */ #include /* XXX */ #endif /* !_KERNEL */ #ifndef _SYS_RESOURCEVAR_H_ #include #endif #ifndef _SYS_SIGNALVAR_H_ #include #endif #ifndef _SYS_SOCKET_VAR_H_ #include #endif #include /* * KERN_PROC subtype ops return arrays of selected proc structure entries: * * This struct includes several arrays of spare space, with different arrays * for different standard C-types. When adding new variables to this struct, * the space for byte-aligned data should be taken from the ki_sparestring, * pointers from ki_spareptrs, word-aligned data from ki_spareints, and * doubleword-aligned data from ki_sparelongs. Make sure the space for new * variables come from the array which matches the size and alignment of * those variables on ALL hardware platforms, and then adjust the appropriate * KI_NSPARE_* value(s) to match. * * Always verify that sizeof(struct kinfo_proc) == KINFO_PROC_SIZE on all * platforms after you have added new variables. Note that if you change * the value of KINFO_PROC_SIZE, then many userland programs will stop * working until they are recompiled! * * Once you have added the new field, you will need to add code to initialize * it in two places: function fill_kinfo_proc in sys/kern/kern_proc.c and * function kvm_proclist in lib/libkvm/kvm_proc.c . */ -#define KI_NSPARE_INT 6 +#define KI_NSPARE_INT 4 #define KI_NSPARE_LONG 12 #define KI_NSPARE_PTR 6 #ifndef _KERNEL #ifndef KINFO_PROC_SIZE #error "Unknown architecture" #endif #endif /* !_KERNEL */ #define WMESGLEN 8 /* size of returned wchan message */ #define LOCKNAMELEN 8 /* size of returned lock name */ #define TDNAMLEN 16 /* size of returned thread name */ #define COMMLEN 19 /* size of returned ki_comm name */ #define KI_EMULNAMELEN 16 /* size of returned ki_emul */ #define KI_NGROUPS 16 /* number of groups in ki_groups */ #define LOGNAMELEN 17 /* size of returned ki_login */ #define LOGINCLASSLEN 17 /* size of returned ki_loginclass */ #ifndef BURN_BRIDGES #define OCOMMLEN TDNAMLEN #define ki_ocomm ki_tdname #endif /* Flags for the process credential. */ #define KI_CRF_CAPABILITY_MODE 0x00000001 /* * Steal a bit from ki_cr_flags to indicate that the cred had more than * KI_NGROUPS groups. */ #define KI_CRF_GRP_OVERFLOW 0x80000000 struct kinfo_proc { int ki_structsize; /* size of this structure */ int ki_layout; /* reserved: layout identifier */ struct pargs *ki_args; /* address of command arguments */ struct proc *ki_paddr; /* address of proc */ struct user *ki_addr; /* kernel virtual addr of u-area */ struct vnode *ki_tracep; /* pointer to trace file */ struct vnode *ki_textvp; /* pointer to executable file */ struct filedesc *ki_fd; /* pointer to open file info */ struct vmspace *ki_vmspace; /* pointer to kernel vmspace struct */ void *ki_wchan; /* sleep address */ pid_t ki_pid; /* Process identifier */ pid_t ki_ppid; /* parent process id */ pid_t ki_pgid; /* process group id */ pid_t ki_tpgid; /* tty process group id */ pid_t ki_sid; /* Process session ID */ pid_t ki_tsid; /* Terminal session ID */ short ki_jobc; /* job control counter */ short ki_spare_short1; /* unused (just here for alignment) */ dev_t ki_tdev; /* controlling tty dev */ sigset_t ki_siglist; /* Signals arrived but not delivered */ sigset_t ki_sigmask; /* Current signal mask */ sigset_t ki_sigignore; /* Signals being ignored */ sigset_t ki_sigcatch; /* Signals being caught by user */ uid_t ki_uid; /* effective user id */ uid_t ki_ruid; /* Real user id */ uid_t ki_svuid; /* Saved effective user id */ gid_t ki_rgid; /* Real group id */ gid_t ki_svgid; /* Saved effective group id */ short ki_ngroups; /* number of groups */ short ki_spare_short2; /* unused (just here for alignment) */ gid_t ki_groups[KI_NGROUPS]; /* groups */ vm_size_t ki_size; /* virtual size */ segsz_t ki_rssize; /* current resident set size in pages */ segsz_t ki_swrss; /* resident set size before last swap */ segsz_t ki_tsize; /* text size (pages) XXX */ segsz_t ki_dsize; /* data size (pages) XXX */ segsz_t ki_ssize; /* stack size (pages) */ u_short ki_xstat; /* Exit status for wait & stop signal */ u_short ki_acflag; /* Accounting flags */ fixpt_t ki_pctcpu; /* %cpu for process during ki_swtime */ u_int ki_estcpu; /* Time averaged value of ki_cpticks */ u_int ki_slptime; /* Time since last blocked */ u_int ki_swtime; /* Time swapped in or out */ u_int ki_cow; /* number of copy-on-write faults */ u_int64_t ki_runtime; /* Real time in microsec */ struct timeval ki_start; /* starting time */ struct timeval ki_childtime; /* time used by process children */ long ki_flag; /* P_* flags */ long ki_kiflag; /* KI_* flags (below) */ int ki_traceflag; /* Kernel trace points */ char ki_stat; /* S* process status */ signed char ki_nice; /* Process "nice" value */ char ki_lock; /* Process lock (prevent swap) count */ char ki_rqindex; /* Run queue index */ - u_char ki_oncpu; /* Which cpu we are on */ - u_char ki_lastcpu; /* Last cpu we were on */ + u_char ki_oncpu_old; /* Which cpu we are on (legacy) */ + u_char ki_lastcpu_old; /* Last cpu we were on (legacy) */ char ki_tdname[TDNAMLEN+1]; /* thread name */ char ki_wmesg[WMESGLEN+1]; /* wchan message */ char ki_login[LOGNAMELEN+1]; /* setlogin name */ char ki_lockname[LOCKNAMELEN+1]; /* lock name */ char ki_comm[COMMLEN+1]; /* command name */ char ki_emul[KI_EMULNAMELEN+1]; /* emulation name */ char ki_loginclass[LOGINCLASSLEN+1]; /* login class */ /* * When adding new variables, take space for char-strings from the * front of ki_sparestrings, and ints from the end of ki_spareints. * That way the spare room from both arrays will remain contiguous. */ char ki_sparestrings[50]; /* spare string space */ int ki_spareints[KI_NSPARE_INT]; /* spare room for growth */ + int ki_oncpu; /* Which cpu we are on */ + int ki_lastcpu; /* Last cpu we were on */ int ki_tracer; /* Pid of tracing process */ int ki_flag2; /* P2_* flags */ int ki_fibnum; /* Default FIB number */ u_int ki_cr_flags; /* Credential flags */ int ki_jid; /* Process jail ID */ int ki_numthreads; /* XXXKSE number of threads in total */ lwpid_t ki_tid; /* XXXKSE thread id */ struct priority ki_pri; /* process priority */ struct rusage ki_rusage; /* process rusage statistics */ /* XXX - most fields in ki_rusage_ch are not (yet) filled in */ struct rusage ki_rusage_ch; /* rusage of children processes */ struct pcb *ki_pcb; /* kernel virtual addr of pcb */ void *ki_kstack; /* kernel virtual addr of stack */ void *ki_udata; /* User convenience pointer */ struct thread *ki_tdaddr; /* address of thread */ /* * When adding new variables, take space for pointers from the * front of ki_spareptrs, and longs from the end of ki_sparelongs. * That way the spare room from both arrays will remain contiguous. */ void *ki_spareptrs[KI_NSPARE_PTR]; /* spare room for growth */ long ki_sparelongs[KI_NSPARE_LONG]; /* spare room for growth */ long ki_sflag; /* PS_* flags */ long ki_tdflags; /* XXXKSE kthread flag */ }; void fill_kinfo_proc(struct proc *, struct kinfo_proc *); /* XXX - the following two defines are temporary */ #define ki_childstime ki_rusage_ch.ru_stime #define ki_childutime ki_rusage_ch.ru_utime /* * Legacy PS_ flag. This moved to p_flag but is maintained for * compatibility. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ /* ki_sessflag values */ #define KI_CTTY 0x00000001 /* controlling tty vnode active */ #define KI_SLEADER 0x00000002 /* session leader */ #define KI_LOCKBLOCK 0x00000004 /* proc blocked on lock ki_lockname */ /* * This used to be the per-process structure containing data that * isn't needed in core when the process is swapped out, but now it * remains only for the benefit of a.out core dumps. */ struct user { struct pstats u_stats; /* *p_stats */ struct kinfo_proc u_kproc; /* eproc */ }; /* * The KERN_PROC_FILE sysctl allows a process to dump the file descriptor * array of another process. */ #define KF_ATTR_VALID 0x0001 #define KF_TYPE_NONE 0 #define KF_TYPE_VNODE 1 #define KF_TYPE_SOCKET 2 #define KF_TYPE_PIPE 3 #define KF_TYPE_FIFO 4 #define KF_TYPE_KQUEUE 5 #define KF_TYPE_CRYPTO 6 #define KF_TYPE_MQUEUE 7 #define KF_TYPE_SHM 8 #define KF_TYPE_SEM 9 #define KF_TYPE_PTS 10 #define KF_TYPE_PROCDESC 11 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 #define KF_VTYPE_VREG 1 #define KF_VTYPE_VDIR 2 #define KF_VTYPE_VBLK 3 #define KF_VTYPE_VCHR 4 #define KF_VTYPE_VLNK 5 #define KF_VTYPE_VSOCK 6 #define KF_VTYPE_VFIFO 7 #define KF_VTYPE_VBAD 8 #define KF_VTYPE_UNKNOWN 255 #define KF_FD_TYPE_CWD -1 /* Current working directory */ #define KF_FD_TYPE_ROOT -2 /* Root directory */ #define KF_FD_TYPE_JAIL -3 /* Jail directory */ #define KF_FD_TYPE_TRACE -4 /* Ktrace vnode */ #define KF_FD_TYPE_TEXT -5 /* Text vnode */ #define KF_FD_TYPE_CTTY -6 /* Controlling terminal */ #define KF_FLAG_READ 0x00000001 #define KF_FLAG_WRITE 0x00000002 #define KF_FLAG_APPEND 0x00000004 #define KF_FLAG_ASYNC 0x00000008 #define KF_FLAG_FSYNC 0x00000010 #define KF_FLAG_NONBLOCK 0x00000020 #define KF_FLAG_DIRECT 0x00000040 #define KF_FLAG_HASLOCK 0x00000080 #define KF_FLAG_SHLOCK 0x00000100 #define KF_FLAG_EXLOCK 0x00000200 #define KF_FLAG_NOFOLLOW 0x00000400 #define KF_FLAG_CREAT 0x00000800 #define KF_FLAG_TRUNC 0x00001000 #define KF_FLAG_EXCL 0x00002000 #define KF_FLAG_EXEC 0x00004000 /* * Old format. Has variable hidden padding due to alignment. * This is a compatability hack for pre-build 7.1 packages. */ #if defined(__amd64__) #define KINFO_OFILE_SIZE 1328 #endif #if defined(__i386__) #define KINFO_OFILE_SIZE 1324 #endif struct kinfo_ofile { int kf_structsize; /* Size of kinfo_file. */ int kf_type; /* Descriptor type. */ int kf_fd; /* Array index. */ int kf_ref_count; /* Reference count. */ int kf_flags; /* Flags. */ /* XXX Hidden alignment padding here on amd64 */ off_t kf_offset; /* Seek location. */ int kf_vnode_type; /* Vnode type. */ int kf_sock_domain; /* Socket domain. */ int kf_sock_type; /* Socket type. */ int kf_sock_protocol; /* Socket protocol. */ char kf_path[PATH_MAX]; /* Path to file, if any. */ struct sockaddr_storage kf_sa_local; /* Socket address. */ struct sockaddr_storage kf_sa_peer; /* Peer address. */ }; #if defined(__amd64__) || defined(__i386__) /* * This size should never be changed. If you really need to, you must provide * backward ABI compatibility by allocating a new sysctl MIB that will return * the new structure. The current structure has to be returned by the current * sysctl MIB. See how it is done for the kinfo_ofile structure. */ #define KINFO_FILE_SIZE 1392 #endif struct kinfo_file { int kf_structsize; /* Variable size of record. */ int kf_type; /* Descriptor type. */ int kf_fd; /* Array index. */ int kf_ref_count; /* Reference count. */ int kf_flags; /* Flags. */ int kf_pad0; /* Round to 64 bit alignment. */ int64_t kf_offset; /* Seek location. */ int kf_vnode_type; /* Vnode type. */ int kf_sock_domain; /* Socket domain. */ int kf_sock_type; /* Socket type. */ int kf_sock_protocol; /* Socket protocol. */ struct sockaddr_storage kf_sa_local; /* Socket address. */ struct sockaddr_storage kf_sa_peer; /* Peer address. */ union { struct { /* Address of so_pcb. */ uint64_t kf_sock_pcb; /* Address of inp_ppcb. */ uint64_t kf_sock_inpcb; /* Address of unp_conn. */ uint64_t kf_sock_unpconn; /* Send buffer state. */ uint16_t kf_sock_snd_sb_state; /* Receive buffer state. */ uint16_t kf_sock_rcv_sb_state; /* Round to 64 bit alignment. */ uint32_t kf_sock_pad0; } kf_sock; struct { /* Global file id. */ uint64_t kf_file_fileid; /* File size. */ uint64_t kf_file_size; /* Vnode filesystem id. */ uint32_t kf_file_fsid; /* File device. */ uint32_t kf_file_rdev; /* File mode. */ uint16_t kf_file_mode; /* Round to 64 bit alignment. */ uint16_t kf_file_pad0; uint32_t kf_file_pad1; } kf_file; struct { uint32_t kf_sem_value; uint16_t kf_sem_mode; } kf_sem; struct { uint64_t kf_pipe_addr; uint64_t kf_pipe_peer; uint32_t kf_pipe_buffer_cnt; /* Round to 64 bit alignment. */ uint32_t kf_pipe_pad0[3]; } kf_pipe; struct { uint32_t kf_pts_dev; /* Round to 64 bit alignment. */ uint32_t kf_pts_pad0[7]; } kf_pts; struct { pid_t kf_pid; } kf_proc; } kf_un; uint16_t kf_status; /* Status flags. */ uint16_t kf_pad1; /* Round to 32 bit alignment. */ int _kf_ispare0; /* Space for more stuff. */ cap_rights_t kf_cap_rights; /* Capability rights. */ uint64_t _kf_cap_spare; /* Space for future cap_rights_t. */ /* Truncated before copyout in sysctl */ char kf_path[PATH_MAX]; /* Path to file, if any. */ }; /* * The KERN_PROC_VMMAP sysctl allows a process to dump the VM layout of * another process as a series of entries. */ #define KVME_TYPE_NONE 0 #define KVME_TYPE_DEFAULT 1 #define KVME_TYPE_VNODE 2 #define KVME_TYPE_SWAP 3 #define KVME_TYPE_DEVICE 4 #define KVME_TYPE_PHYS 5 #define KVME_TYPE_DEAD 6 #define KVME_TYPE_SG 7 #define KVME_TYPE_MGTDEVICE 8 #define KVME_TYPE_UNKNOWN 255 #define KVME_PROT_READ 0x00000001 #define KVME_PROT_WRITE 0x00000002 #define KVME_PROT_EXEC 0x00000004 #define KVME_FLAG_COW 0x00000001 #define KVME_FLAG_NEEDS_COPY 0x00000002 #define KVME_FLAG_NOCOREDUMP 0x00000004 #define KVME_FLAG_SUPER 0x00000008 #define KVME_FLAG_GROWS_UP 0x00000010 #define KVME_FLAG_GROWS_DOWN 0x00000020 #if defined(__amd64__) #define KINFO_OVMENTRY_SIZE 1168 #endif #if defined(__i386__) #define KINFO_OVMENTRY_SIZE 1128 #endif struct kinfo_ovmentry { int kve_structsize; /* Size of kinfo_vmmapentry. */ int kve_type; /* Type of map entry. */ void *kve_start; /* Starting address. */ void *kve_end; /* Finishing address. */ int kve_flags; /* Flags on map entry. */ int kve_resident; /* Number of resident pages. */ int kve_private_resident; /* Number of private pages. */ int kve_protection; /* Protection bitmask. */ int kve_ref_count; /* VM obj ref count. */ int kve_shadow_count; /* VM obj shadow count. */ char kve_path[PATH_MAX]; /* Path to VM obj, if any. */ void *_kve_pspare[8]; /* Space for more stuff. */ off_t kve_offset; /* Mapping offset in object */ uint64_t kve_fileid; /* inode number if vnode */ dev_t kve_fsid; /* dev_t of vnode location */ int _kve_ispare[3]; /* Space for more stuff. */ }; #if defined(__amd64__) || defined(__i386__) #define KINFO_VMENTRY_SIZE 1160 #endif struct kinfo_vmentry { int kve_structsize; /* Variable size of record. */ int kve_type; /* Type of map entry. */ uint64_t kve_start; /* Starting address. */ uint64_t kve_end; /* Finishing address. */ uint64_t kve_offset; /* Mapping offset in object */ uint64_t kve_vn_fileid; /* inode number if vnode */ uint32_t kve_vn_fsid; /* dev_t of vnode location */ int kve_flags; /* Flags on map entry. */ int kve_resident; /* Number of resident pages. */ int kve_private_resident; /* Number of private pages. */ int kve_protection; /* Protection bitmask. */ int kve_ref_count; /* VM obj ref count. */ int kve_shadow_count; /* VM obj shadow count. */ int kve_vn_type; /* Vnode type. */ uint64_t kve_vn_size; /* File size. */ uint32_t kve_vn_rdev; /* Device id if device. */ uint16_t kve_vn_mode; /* File mode. */ uint16_t kve_status; /* Status flags. */ int _kve_ispare[12]; /* Space for more stuff. */ /* Truncated before copyout in sysctl */ char kve_path[PATH_MAX]; /* Path to VM obj, if any. */ }; /* * The KERN_PROC_KSTACK sysctl allows a process to dump the kernel stacks of * another process as a series of entries. Each stack is represented by a * series of symbol names and offsets as generated by stack_sbuf_print(9). */ #define KKST_MAXLEN 1024 #define KKST_STATE_STACKOK 0 /* Stack is valid. */ #define KKST_STATE_SWAPPED 1 /* Stack swapped out. */ #define KKST_STATE_RUNNING 2 /* Stack ephemeral. */ #if defined(__amd64__) || defined(__i386__) #define KINFO_KSTACK_SIZE 1096 #endif struct kinfo_kstack { lwpid_t kkst_tid; /* ID of thread. */ int kkst_state; /* Validity of stack. */ char kkst_trace[KKST_MAXLEN]; /* String representing stack. */ int _kkst_ispare[16]; /* Space for more stuff. */ }; struct kinfo_sigtramp { void *ksigtramp_start; void *ksigtramp_end; void *ksigtramp_spare[4]; }; #ifdef _KERNEL /* Flags for kern_proc_out function. */ #define KERN_PROC_NOTHREADS 0x1 #define KERN_PROC_MASK32 0x2 struct sbuf; /* * The kern_proc out functions are helper functions to dump process * miscellaneous kinfo structures to sbuf. The main consumers are KERN_PROC * sysctls but they may also be used by other kernel subsystems. * * The functions manipulate the process locking state and expect the process * to be locked on enter. On return the process is unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen); int kern_proc_out(struct proc *p, struct sbuf *sb, int flags); int kern_proc_vmmap_out(struct proc *p, struct sbuf *sb); int vntype_to_kinfo(int vtype); #endif /* !_KERNEL */ #endif