Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 285632) +++ head/sys/kern/init_main.c (revision 285633) @@ -1,875 +1,878 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread thread0 __aligned(16); struct vmspace vmspace0; struct proc *initproc; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused, struct syscall_args *sa __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_mask = 0, .sv_sigsize = 0, .sv_sigtbl = NULL, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_prepsyscall = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, }; /* *************************************************************************** **** **** The two following SYSINIT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM; p->p_flag2 = 0; p->p_state = PRS_NORMAL; knlist_init_mtx(&p->p_klist, &p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = 0; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); vm_domain_policy_init(&td->td_vm_dom_policy); vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1); vm_domain_policy_init(&p->p_vm_dom_policy); vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_prison = &prison0; newcred->cr_loginclass = loginclass_find("default"); proc_set_cred_init(p, newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_INVOKE(process_init, p); EVENTHANDLER_INVOKE(thread_init, td); EVENTHANDLER_INVOKE(process_ctor, p); EVENTHANDLER_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ PROC_STATUNLOCK(p); p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); static void random_init(void *dummy __unused) { /* * After CPU has been started we have some randomness on most * platforms via get_cyclecount(). For platforms that don't * we will reseed random(9) in proc0_post() as well. */ srandom(get_cyclecount()); } SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = sys_execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in it's own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { struct ucred *newcred, *oldcred; + struct thread *td; int error; error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc, NULL, 0); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling); oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); - cred_update_thread(FIRST_THREAD_IN_PROC(initproc)); + td = FIRST_THREAD_IN_PROC(initproc); + crfree(td->td_ucred); + td->td_ucred = crhold(initproc->p_ucred); PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); Index: head/sys/kern/kern_prot.c =================================================================== --- head/sys/kern/kern_prot.c (revision 285632) +++ head/sys/kern/kern_prot.c (revision 285633) @@ -1,2264 +1,2246 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2000-2001 Robert N. M. Watson. * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 */ /* * System calls related to processes and protection */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef REGRESSION FEATURE(regression, "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)"); #endif #if defined(INET) || defined(INET6) #include #include #endif #include #include static MALLOC_DEFINE(M_CRED, "cred", "credentials"); SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy"); static void crextend(struct ucred *cr, int n); static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups); #ifndef _SYS_SYSPROTO_H_ struct getpid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getpid(struct thread *td, struct getpid_args *uap) { struct proc *p = td->td_proc; td->td_retval[0] = p->p_pid; #if defined(COMPAT_43) td->td_retval[1] = kern_getppid(td); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getppid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getppid(struct thread *td, struct getppid_args *uap) { td->td_retval[0] = kern_getppid(td); return (0); } int kern_getppid(struct thread *td) { struct proc *p = td->td_proc; struct proc *pp; int ppid; PROC_LOCK(p); if (!(p->p_flag & P_TRACED)) { ppid = p->p_pptr->p_pid; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); sx_slock(&proctree_lock); pp = proc_realparent(p); ppid = pp->p_pid; sx_sunlock(&proctree_lock); } return (ppid); } /* * Get process group ID; note that POSIX getpgrp takes no parameter. */ #ifndef _SYS_SYSPROTO_H_ struct getpgrp_args { int dummy; }; #endif int sys_getpgrp(struct thread *td, struct getpgrp_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* Get an arbitary pid's process group id */ #ifndef _SYS_SYSPROTO_H_ struct getpgid_args { pid_t pid; }; #endif int sys_getpgid(struct thread *td, struct getpgid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* * Get an arbitary pid's session id. */ #ifndef _SYS_SYSPROTO_H_ struct getsid_args { pid_t pid; }; #endif int sys_getsid(struct thread *td, struct getsid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_session->s_sid; PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct getuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getuid(struct thread *td, struct getuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_ruid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_uid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct geteuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_geteuid(struct thread *td, struct geteuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_uid; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getgid(struct thread *td, struct getgid_args *uap) { td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_groups[0]; #endif return (0); } /* * Get effective group ID. The "egid" is groups[0], and could be obtained * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getegid(struct thread *td, struct getegid_args *uap) { td->td_retval[0] = td->td_ucred->cr_groups[0]; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif int sys_getgroups(struct thread *td, register struct getgroups_args *uap) { struct ucred *cred; u_int ngrp; int error; cred = td->td_ucred; ngrp = cred->cr_ngroups; if (uap->gidsetsize == 0) { error = 0; goto out; } if (uap->gidsetsize < ngrp) return (EINVAL); error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t)); out: td->td_retval[0] = ngrp; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setsid_args { int dummy; }; #endif /* ARGSUSED */ int sys_setsid(register struct thread *td, struct setsid_args *uap) { struct pgrp *pgrp; int error; struct proc *p = td->td_proc; struct pgrp *newpgrp; struct session *newsess; error = 0; pgrp = NULL; newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); error = EPERM; } else { (void)enterpgrp(p, p->p_pid, newpgrp, newsess); td->td_retval[0] = p->p_pid; newpgrp = NULL; newsess = NULL; } sx_xunlock(&proctree_lock); if (newpgrp != NULL) free(newpgrp, M_PGRP); if (newsess != NULL) free(newsess, M_SESSION); return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pid must be caller or child of caller (ESRCH) * if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ #ifndef _SYS_SYSPROTO_H_ struct setpgid_args { int pid; /* target process id */ int pgid; /* target pgrp id */ }; #endif /* ARGSUSED */ int sys_setpgid(struct thread *td, register struct setpgid_args *uap) { struct proc *curp = td->td_proc; register struct proc *targp; /* target process */ register struct pgrp *pgrp; /* target pgrp */ int error; struct pgrp *newpgrp; if (uap->pgid < 0) return (EINVAL); error = 0; newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; goto done; } if (!inferior(targp)) { PROC_UNLOCK(targp); error = ESRCH; goto done; } if ((error = p_cansee(td, targp))) { PROC_UNLOCK(targp); goto done; } if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) { PROC_UNLOCK(targp); error = EPERM; goto done; } if (targp->p_flag & P_EXEC) { PROC_UNLOCK(targp); error = EACCES; goto done; } PROC_UNLOCK(targp); } else targp = curp; if (SESS_LEADER(targp)) { error = EPERM; goto done; } if (uap->pgid == 0) uap->pgid = targp->p_pid; if ((pgrp = pgfind(uap->pgid)) == NULL) { if (uap->pgid == targp->p_pid) { error = enterpgrp(targp, uap->pgid, newpgrp, NULL); if (error == 0) newpgrp = NULL; } else error = EPERM; } else { if (pgrp == targp->p_pgrp) { PGRP_UNLOCK(pgrp); goto done; } if (pgrp->pg_id != targp->p_pid && pgrp->pg_session != curp->p_session) { PGRP_UNLOCK(pgrp); error = EPERM; goto done; } PGRP_UNLOCK(pgrp); error = enterthispgrp(targp, pgrp); } done: sx_xunlock(&proctree_lock); KASSERT((error == 0) || (newpgrp != NULL), ("setpgid failed and newpgrp is NULL")); if (newpgrp != NULL) free(newpgrp, M_PGRP); return (error); } /* * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD * compatible. It says that setting the uid/gid to euid/egid is a special * case of "appropriate privilege". Once the rules are expanded out, this * basically means that setuid(nnn) sets all three id's, in all permitted * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) * does not set the saved id - this is dangerous for traditional BSD * programs. For this reason, we *really* do not want to set * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. */ #define POSIX_APPENDIX_B_4_2_2 #ifndef _SYS_SYSPROTO_H_ struct setuid_args { uid_t uid; }; #endif /* ARGSUSED */ int sys_setuid(struct thread *td, struct setuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t uid; struct uidinfo *uip; int error; uid = uap->uid; AUDIT_ARG_UID(uid); newcred = crget(); uip = uifind(uid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setuid(oldcred, uid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setuid(geteuid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setuid(xx)" sets all * three id's (assuming you have privs). * * Notes on the logic. We do things in three steps. * 1: We determine if the euid is going to change, and do EPERM * right away. We unconditionally change the euid later if this * test is satisfied, simplifying that part of the logic. * 2: We determine if the real and/or saved uids are going to * change. Determined by compile options. * 3: Change euid last. (after tests in #2 for "appropriate privs") */ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ #ifdef _POSIX_SAVED_IDS uid != oldcred->cr_svuid && /* allow setuid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETUID, 0)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or uid == euid) * If so, we are changing the real uid and/or saved uid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ uid == oldcred->cr_uid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETUID, 0) == 0) #endif { /* * Set the real uid and transfer proc count to new user. */ if (uid != oldcred->cr_ruid) { change_ruid(newcred, uip); setsugid(p); } /* * Set saved uid * * XXX always set saved uid even if not _POSIX_SAVED_IDS, as * the security of seteuid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (uid != oldcred->cr_svuid) { change_svuid(newcred, uid); setsugid(p); } } /* * In all permitted cases, we are changing the euid. */ if (uid != oldcred->cr_uid) { change_euid(newcred, uip); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif uifree(uip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(uip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct seteuid_args { uid_t euid; }; #endif /* ARGSUSED */ int sys_seteuid(struct thread *td, struct seteuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid; struct uidinfo *euip; int error; euid = uap->euid; AUDIT_ARG_EUID(euid); newcred = crget(); euip = uifind(euid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_seteuid(oldcred, euid); if (error) goto fail; #endif if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, 0)) != 0) goto fail; /* * Everything's okay, do it. */ if (oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgid_args { gid_t gid; }; #endif /* ARGSUSED */ int sys_setgid(struct thread *td, struct setgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t gid; int error; gid = uap->gid; AUDIT_ARG_GID(gid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgid(oldcred, gid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setgid(getegid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setgid(xx)" sets all * three id's (assuming you have privs). * * For notes on the logic here, see setuid() above. */ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ #ifdef _POSIX_SAVED_IDS gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID, 0)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or gid == egid) * If so, we are changing the real uid and saved gid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ gid == oldcred->cr_groups[0] || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID, 0) == 0) #endif { /* * Set real gid */ if (oldcred->cr_rgid != gid) { change_rgid(newcred, gid); setsugid(p); } /* * Set saved gid * * XXX always set saved gid even if not _POSIX_SAVED_IDS, as * the security of setegid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } /* * In all cases permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes. */ if (oldcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setegid_args { gid_t egid; }; #endif /* ARGSUSED */ int sys_setegid(struct thread *td, struct setegid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid; int error; egid = uap->egid; AUDIT_ARG_EGID(egid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setegid(oldcred, egid); if (error) goto fail; #endif if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, 0)) != 0) goto fail; if (oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif /* ARGSUSED */ int sys_setgroups(struct thread *td, struct setgroups_args *uap) { gid_t smallgroups[XU_NGROUPS]; gid_t *groups; u_int gidsetsize; int error; gidsetsize = uap->gidsetsize; if (gidsetsize > ngroups_max + 1) return (EINVAL); if (gidsetsize > XU_NGROUPS) groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK); else groups = smallgroups; error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t)); if (error == 0) error = kern_setgroups(td, gidsetsize, groups); if (gidsetsize > XU_NGROUPS) free(groups, M_TEMP); return (error); } int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int error; MPASS(ngrp <= ngroups_max + 1); AUDIT_ARG_GROUPSET(groups, ngrp); newcred = crget(); crextend(newcred, ngrp); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgroups(oldcred, ngrp, groups); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0); if (error) goto fail; if (ngrp == 0) { /* * setgroups(0, NULL) is a legitimate way of clearing the * groups vector on non-BSD systems (which generally do not * have the egid in the groups[0]). We risk security holes * when running non-BSD software if we do not do the same. */ newcred->cr_ngroups = 1; } else { crsetgroups_locked(newcred, ngrp, groups); } setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setreuid_args { uid_t ruid; uid_t euid; }; #endif /* ARGSUSED */ int sys_setreuid(register struct thread *td, struct setreuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setreuid(oldcred, ruid, euid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid) || (euid != (uid_t)-1 && euid != oldcred->cr_uid && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, 0)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && newcred->cr_svuid != newcred->cr_uid) { change_svuid(newcred, newcred->cr_uid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setregid_args { gid_t rgid; gid_t egid; }; #endif /* ARGSUSED */ int sys_setregid(register struct thread *td, struct setregid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid; int error; egid = uap->egid; rgid = uap->rgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setregid(oldcred, rgid, egid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, 0)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && newcred->cr_svgid != newcred->cr_groups[0]) { change_svgid(newcred, newcred->cr_groups[0]); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } /* * setresuid(ruid, euid, suid) is like setreuid except control over the saved * uid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresuid_args { uid_t ruid; uid_t euid; uid_t suid; }; #endif /* ARGSUSED */ int sys_setresuid(register struct thread *td, struct setresuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid, suid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; suid = uap->suid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); AUDIT_ARG_SUID(suid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresuid(oldcred, ruid, euid, suid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid && ruid != oldcred->cr_uid) || (euid != (uid_t)-1 && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid && euid != oldcred->cr_uid) || (suid != (uid_t)-1 && suid != oldcred->cr_ruid && suid != oldcred->cr_svuid && suid != oldcred->cr_uid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, 0)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { change_svuid(newcred, suid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } /* * setresgid(rgid, egid, sgid) is like setregid except control over the saved * gid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresgid_args { gid_t rgid; gid_t egid; gid_t sgid; }; #endif /* ARGSUSED */ int sys_setresgid(register struct thread *td, struct setresgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid, sgid; int error; egid = uap->egid; rgid = uap->rgid; sgid = uap->sgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); AUDIT_ARG_SGID(sgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && rgid != oldcred->cr_groups[0]) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && egid != oldcred->cr_groups[0]) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && sgid != oldcred->cr_groups[0])) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, 0)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { change_svgid(newcred, sgid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getresuid_args { uid_t *ruid; uid_t *euid; uid_t *suid; }; #endif /* ARGSUSED */ int sys_getresuid(register struct thread *td, struct getresuid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->ruid) error1 = copyout(&cred->cr_ruid, uap->ruid, sizeof(cred->cr_ruid)); if (uap->euid) error2 = copyout(&cred->cr_uid, uap->euid, sizeof(cred->cr_uid)); if (uap->suid) error3 = copyout(&cred->cr_svuid, uap->suid, sizeof(cred->cr_svuid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct getresgid_args { gid_t *rgid; gid_t *egid; gid_t *sgid; }; #endif /* ARGSUSED */ int sys_getresgid(register struct thread *td, struct getresgid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->rgid) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) error2 = copyout(&cred->cr_groups[0], uap->egid, sizeof(cred->cr_groups[0])); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct issetugid_args { int dummy; }; #endif /* ARGSUSED */ int sys_issetugid(register struct thread *td, struct issetugid_args *uap) { struct proc *p = td->td_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use P_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ PROC_LOCK(p); td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; PROC_UNLOCK(p); return (0); } int sys___setugid(struct thread *td, struct __setugid_args *uap) { #ifdef REGRESSION struct proc *p; p = td->td_proc; switch (uap->flag) { case 0: PROC_LOCK(p); p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); return (0); case 1: PROC_LOCK(p); p->p_flag |= P_SUGID; PROC_UNLOCK(p); return (0); default: return (EINVAL); } #else /* !REGRESSION */ return (ENOSYS); #endif /* REGRESSION */ } /* * Check if gid is a member of the group set. */ int groupmember(gid_t gid, struct ucred *cred) { int l; int h; int m; if (cred->cr_groups[0] == gid) return(1); /* * If gid was not our primary group, perform a binary search * of the supplemental groups. This is possible because we * sort the groups in crsetgroups(). */ l = 1; h = cred->cr_ngroups; while (l < h) { m = l + ((h - l) / 2); if (cred->cr_groups[m] < gid) l = m + 1; else h = m; } if ((l < cred->cr_ngroups) && (cred->cr_groups[l] == gid)) return (1); return (0); } /* * Test the active securelevel against a given level. securelevel_gt() * implements (securelevel > level). securelevel_ge() implements * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * * Due to care taken when setting the securelevel, we know that no jail will * be less secure that its parent (or the physical system), so it is sufficient * to test the current jail only. * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. */ int securelevel_gt(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0); } /* * 'see_other_uids' determines whether or not visibility of processes * and sockets with credentials holding different real uids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_uids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, &see_other_uids, 0, "Unprivileged processes may see subjects/objects with different real uid"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_other_uids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_seeotheruids(struct ucred *u1, struct ucred *u2) { if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, 0) != 0) return (ESRCH); } return (0); } /* * 'see_other_gids' determines whether or not visibility of processes * and sockets with credentials holding different real gids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_gids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, &see_other_gids, 0, "Unprivileged processes may see subjects/objects with different real gid"); /* * Determine if u1 can "see" the subject specified by u2, according to the * 'see_other_gids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_seeothergids(struct ucred *u1, struct ucred *u2) { int i, match; if (!see_other_gids) { match = 0; for (i = 0; i < u1->cr_ngroups; i++) { if (groupmember(u1->cr_groups[i], u2)) match = 1; if (match) break; } if (!match) { if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, 0) != 0) return (ESRCH); } } return (0); } /*- * Determine if u1 "can see" the subject specified by u2. * Returns: 0 for permitted, an errno value otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_cansee(struct ucred *u1, struct ucred *u2) { int error; if ((error = prison_check(u1, u2))) return (error); #ifdef MAC if ((error = mac_cred_check_visible(u1, u2))) return (error); #endif if ((error = cr_seeotheruids(u1, u2))) return (error); if ((error = cr_seeothergids(u1, u2))) return (error); return (0); } /*- * Determine if td "can see" the subject specified by p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect p->p_ucred must be held. td really * should be curthread. * References: td and p must be valid for the lifetime of the call */ int p_cansee(struct thread *td, struct proc *p) { /* Wrap cr_cansee() for all functionality. */ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); return (cr_cansee(td->td_ucred, p->p_ucred)); } /* * 'conservative_signals' prevents the delivery of a broad class of * signals by unprivileged processes to processes that have changed their * credentials since the last invocation of execve(). This can prevent * the leakage of cached information or retained privileges as a result * of a common class of signal-related vulnerabilities. However, this * may interfere with some applications that expect to be able to * deliver these signals to peer processes after having given up * privilege. */ static int conservative_signals = 1; SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW, &conservative_signals, 0, "Unprivileged processes prevented from " "sending certain signals to processes whose credentials have changed"); /*- * Determine whether cred may deliver the specified signal to proc. * Returns: 0 for permitted, an errno value otherwise. * Locks: A lock must be held for proc. * References: cred and proc must be valid for the lifetime of the call. */ int cr_cansignal(struct ucred *cred, struct proc *proc, int signum) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); /* * Jail semantics limit the scope of signalling to proc in the * same jail as cred, if cred is in jail. */ error = prison_check(cred, proc->p_ucred); if (error) return (error); #ifdef MAC if ((error = mac_proc_check_signal(cred, proc, signum))) return (error); #endif if ((error = cr_seeotheruids(cred, proc->p_ucred))) return (error); if ((error = cr_seeothergids(cred, proc->p_ucred))) return (error); /* * UNIX signal semantics depend on the status of the P_SUGID * bit on the target process. If the bit is set, then additional * restrictions are placed on the set of available signals. */ if (conservative_signals && (proc->p_flag & P_SUGID)) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: /* * Generally, permit job and terminal control * signals. */ break; default: /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, 0); if (error) return (error); } } /* * Generally, the target credential's ruid or svuid must match the * subject credential's ruid or euid. */ if (cred->cr_ruid != proc->p_ucred->cr_ruid && cred->cr_ruid != proc->p_ucred->cr_svuid && cred->cr_uid != proc->p_ucred->cr_ruid && cred->cr_uid != proc->p_ucred->cr_svuid) { error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, 0); if (error) return (error); } return (0); } /*- * Determine whether td may deliver the specified signal to p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must be * held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansignal(struct thread *td, struct proc *p, int signum) { KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); /* * UNIX signalling semantics require that processes in the same * session always be able to deliver SIGCONT to one another, * overriding the remaining protections. */ /* XXX: This will require an additional lock of some sort. */ if (signum == SIGCONT && td->td_proc->p_session == p->p_session) return (0); /* * Some compat layers use SIGTHR and higher signals for * communication between different kernel threads of the same * process, so that they expect that it's always possible to * deliver them, even for suid applications where cr_cansignal() can * deny such ability for security consideration. It should be * pretty safe to do since the only way to create two processes * with the same p_leader is via rfork(2). */ if (td->td_proc->p_leader != NULL && signum >= SIGTHR && signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader) return (0); return (cr_cansignal(td->td_ucred, p, signum)); } /*- * Determine whether td may reschedule p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansched(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_sched(td->td_ucred, p))) return (error); #endif if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_seeothergids(td->td_ucred, p->p_ucred))) return (error); if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid && td->td_ucred->cr_uid != p->p_ucred->cr_ruid) { error = priv_check(td, PRIV_SCHED_DIFFCRED); if (error) return (error); } return (0); } /* * The 'unprivileged_proc_debug' flag may be used to disable a variety of * unprivileged inter-process debugging services, including some procfs * functionality, ptrace(), and ktrace(). In the past, inter-process * debugging has been involved in a variety of security problems, and sites * not requiring the service might choose to disable it when hardening * systems. * * XXX: Should modifying and reading this variable require locking? * XXX: data declarations should be together near the beginning of the file. */ static int unprivileged_proc_debug = 1; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW, &unprivileged_proc_debug, 0, "Unprivileged processes may use process debugging facilities"); /*- * Determine whether td may debug p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_candebug(struct thread *td, struct proc *p) { int credentialchanged, error, grpsubset, i, uidsubset; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (!unprivileged_proc_debug) { error = priv_check(td, PRIV_DEBUG_UNPRIV); if (error) return (error); } if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_debug(td->td_ucred, p))) return (error); #endif if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_seeothergids(td->td_ucred, p->p_ucred))) return (error); /* * Is p's group set a subset of td's effective group set? This * includes p's egid, group access list, rgid, and svgid. */ grpsubset = 1; for (i = 0; i < p->p_ucred->cr_ngroups; i++) { if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { grpsubset = 0; break; } } grpsubset = grpsubset && groupmember(p->p_ucred->cr_rgid, td->td_ucred) && groupmember(p->p_ucred->cr_svgid, td->td_ucred); /* * Are the uids present in p's credential equal to td's * effective uid? This includes p's euid, svuid, and ruid. */ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && td->td_ucred->cr_uid == p->p_ucred->cr_svuid && td->td_ucred->cr_uid == p->p_ucred->cr_ruid); /* * Has the credential of the process changed since the last exec()? */ credentialchanged = (p->p_flag & P_SUGID); /* * If p's gids aren't a subset, or the uids aren't a subset, * or the credential has changed, require appropriate privilege * for td to debug p. */ if (!grpsubset || !uidsubset) { error = priv_check(td, PRIV_DEBUG_DIFFCRED); if (error) return (error); } if (credentialchanged) { error = priv_check(td, PRIV_DEBUG_SUGID); if (error) return (error); } /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); } /* * Can't trace a process that's currently exec'ing. * * XXX: Note, this is not a security policy decision, it's a * basic correctness/functionality decision. Therefore, this check * should be moved to the caller's of p_candebug(). */ if ((p->p_flag & P_INEXEC) != 0) return (EBUSY); /* Denied explicitely */ if ((p->p_flag2 & P2_NOTRACE) != 0) { error = priv_check(td, PRIV_DEBUG_DENIED); if (error != 0) return (error); } return (0); } /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseesocket(struct ucred *cred, struct socket *so) { int error; error = prison_check(cred, so->so_cred); if (error) return (ENOENT); #ifdef MAC error = mac_socket_check_visible(cred, so); if (error) return (error); #endif if (cr_seeotheruids(cred, so->so_cred)) return (ENOENT); if (cr_seeothergids(cred, so->so_cred)) return (ENOENT); return (0); } #if defined(INET) || defined(INET6) /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp) { int error; error = prison_check(cred, inp->inp_cred); if (error) return (ENOENT); #ifdef MAC INP_LOCK_ASSERT(inp); error = mac_inpcb_check_visible(cred, inp); if (error) return (error); #endif if (cr_seeotheruids(cred, inp->inp_cred)) return (ENOENT); if (cr_seeothergids(cred, inp->inp_cred)) return (ENOENT); return (0); } #endif /*- * Determine whether td can wait for the exit of p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_canwait(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_wait(td->td_ucred, p))) return (error); #endif #if 0 /* XXXMAC: This could have odd effects on some shells. */ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); #endif return (0); } /* * Allocate a zeroed cred structure. */ struct ucred * crget(void) { register struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); refcount_init(&cr->cr_ref, 1); #ifdef AUDIT audit_cred_init(cr); #endif #ifdef MAC mac_cred_init(cr); #endif cr->cr_groups = cr->cr_smallgroups; cr->cr_agroups = sizeof(cr->cr_smallgroups) / sizeof(cr->cr_smallgroups[0]); return (cr); } /* * Claim another reference to a ucred structure. */ struct ucred * crhold(struct ucred *cr) { refcount_acquire(&cr->cr_ref); return (cr); } /* * Free a cred structure. Throws away space when ref count gets to 0. */ void crfree(struct ucred *cr) { KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); if (refcount_release(&cr->cr_ref)) { /* * Some callers of crget(), such as nfs_statfs(), * allocate a temporary credential, but don't * allocate a uidinfo structure. */ if (cr->cr_uidinfo != NULL) uifree(cr->cr_uidinfo); if (cr->cr_ruidinfo != NULL) uifree(cr->cr_ruidinfo); /* * Free a prison, if any. */ if (cr->cr_prison != NULL) prison_free(cr->cr_prison); if (cr->cr_loginclass != NULL) loginclass_free(cr->cr_loginclass); #ifdef AUDIT audit_cred_destroy(cr); #endif #ifdef MAC mac_cred_destroy(cr); #endif if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); free(cr, M_CRED); } } /* * Copy a ucred's contents from a template. Does not block. */ void crcopy(struct ucred *dest, struct ucred *src) { KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred")); bcopy(&src->cr_startcopy, &dest->cr_startcopy, (unsigned)((caddr_t)&src->cr_endcopy - (caddr_t)&src->cr_startcopy)); crsetgroups(dest, src->cr_ngroups, src->cr_groups); uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); prison_hold(dest->cr_prison); loginclass_hold(dest->cr_loginclass); #ifdef AUDIT audit_cred_copy(src, dest); #endif #ifdef MAC mac_cred_copy(src, dest); #endif } /* * Dup cred struct to a new held one. */ struct ucred * crdup(struct ucred *cr) { struct ucred *newcr; newcr = crget(); crcopy(newcr, cr); return (newcr); } /* * Fill in a struct xucred based on a struct ucred. */ void cru2x(struct ucred *cr, struct xucred *xcr) { int ngroups; bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = cr->cr_uid; ngroups = MIN(cr->cr_ngroups, XU_NGROUPS); xcr->cr_ngroups = ngroups; bcopy(cr->cr_groups, xcr->cr_groups, ngroups * sizeof(*cr->cr_groups)); } /* - * small routine to swap a thread's current ucred for the correct one taken - * from the process. - */ -void -cred_update_thread(struct thread *td) -{ - struct proc *p; - struct ucred *cred; - - p = td->td_proc; - cred = td->td_ucred; - PROC_LOCK_ASSERT(p, MA_OWNED); - td->td_ucred = crhold(p->p_ucred); - if (cred != NULL) - crfree(cred); -} - -/* * Set initial process credentials. * Callers are responsible for providing the reference for provided credentials. */ void proc_set_cred_init(struct proc *p, struct ucred *newcred) { p->p_ucred = newcred; } /* * Change process credentials. * Callers are responsible for providing the reference for passed credentials * and for freeing old ones. * * Process has to be locked except when it does not have credentials (as it * should not be visible just yet) or when newcred is NULL (as this can be * only used when the process is about to be freed, at which point it should * not be visible anymore). */ struct ucred * proc_set_cred(struct proc *p, struct ucred *newcred) { struct ucred *oldcred; MPASS(p->p_ucred != NULL); if (newcred == NULL) MPASS(p->p_state == PRS_ZOMBIE); else PROC_LOCK_ASSERT(p, MA_OWNED); oldcred = p->p_ucred; p->p_ucred = newcred; if (newcred != NULL) PROC_UPDATE_COW(p); return (oldcred); } struct ucred * crcopysafe(struct proc *p, struct ucred *cr) { struct ucred *oldcred; int groups; PROC_LOCK_ASSERT(p, MA_OWNED); oldcred = p->p_ucred; while (cr->cr_agroups < oldcred->cr_agroups) { groups = oldcred->cr_agroups; PROC_UNLOCK(p); crextend(cr, groups); PROC_LOCK(p); oldcred = p->p_ucred; } crcopy(cr, oldcred); return (oldcred); } /* * Extend the passed in credential to hold n items. */ static void crextend(struct ucred *cr, int n) { int cnt; /* Truncate? */ if (n <= cr->cr_agroups) return; /* * We extend by 2 each time since we're using a power of two * allocator until we need enough groups to fill a page. * Once we're allocating multiple pages, only allocate as many * as we actually need. The case of processes needing a * non-power of two number of pages seems more likely than * a real world process that adds thousands of groups one at a * time. */ if ( n < PAGE_SIZE / sizeof(gid_t) ) { if (cr->cr_agroups == 0) cnt = MINALLOCSIZE / sizeof(gid_t); else cnt = cr->cr_agroups * 2; while (cnt < n) cnt *= 2; } else cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t)); /* Free the old array. */ if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO); cr->cr_agroups = cnt; } /* * Copy groups in to a credential, preserving any necessary invariants. * Currently this includes the sorting of all supplemental gids. * crextend() must have been called before hand to ensure sufficient * space is available. */ static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups) { int i; int j; gid_t g; KASSERT(cr->cr_agroups >= ngrp, ("cr_ngroups is too small")); bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t)); cr->cr_ngroups = ngrp; /* * Sort all groups except cr_groups[0] to allow groupmember to * perform a binary search. * * XXX: If large numbers of groups become common this should * be replaced with shell sort like linux uses or possibly * heap sort. */ for (i = 2; i < ngrp; i++) { g = cr->cr_groups[i]; for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--) cr->cr_groups[j + 1] = cr->cr_groups[j]; cr->cr_groups[j + 1] = g; } } /* * Copy groups in to a credential after expanding it if required. * Truncate the list to (ngroups_max + 1) if it is too large. */ void crsetgroups(struct ucred *cr, int ngrp, gid_t *groups) { if (ngrp > ngroups_max + 1) ngrp = ngroups_max + 1; crextend(cr, ngrp); crsetgroups_locked(cr, ngrp, groups); } /* * Get login name, if available. */ #ifndef _SYS_SYSPROTO_H_ struct getlogin_args { char *namebuf; u_int namelen; }; #endif /* ARGSUSED */ int sys_getlogin(struct thread *td, struct getlogin_args *uap) { char login[MAXLOGNAME]; struct proc *p = td->td_proc; size_t len; if (uap->namelen > MAXLOGNAME) uap->namelen = MAXLOGNAME; PROC_LOCK(p); SESS_LOCK(p->p_session); len = strlcpy(login, p->p_session->s_login, uap->namelen) + 1; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (len > uap->namelen) return (ERANGE); return (copyout(login, uap->namebuf, len)); } /* * Set login name. */ #ifndef _SYS_SYSPROTO_H_ struct setlogin_args { char *namebuf; }; #endif /* ARGSUSED */ int sys_setlogin(struct thread *td, struct setlogin_args *uap) { struct proc *p = td->td_proc; int error; char logintmp[MAXLOGNAME]; CTASSERT(sizeof(p->p_session->s_login) >= sizeof(logintmp)); error = priv_check(td, PRIV_PROC_SETLOGIN); if (error) return (error); error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL); if (error != 0) { if (error == ENAMETOOLONG) error = EINVAL; return (error); } PROC_LOCK(p); SESS_LOCK(p->p_session); strcpy(p->p_session->s_login, logintmp); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); return (0); } void setsugid(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_SUGID; if (!(p->p_pfsflags & PF_ISUGID)) p->p_stops = 0; } /*- * Change a process's effective uid. * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_euid(struct ucred *newcred, struct uidinfo *euip) { newcred->cr_uid = euip->ui_uid; uihold(euip); uifree(newcred->cr_uidinfo); newcred->cr_uidinfo = euip; } /*- * Change a process's effective gid. * Side effects: newcred->cr_gid will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_egid(struct ucred *newcred, gid_t egid) { newcred->cr_groups[0] = egid; } /*- * Change a process's real uid. * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo * will be updated, and the old and new cr_ruidinfo proc * counts will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_ruid(struct ucred *newcred, struct uidinfo *ruip) { (void)chgproccnt(newcred->cr_ruidinfo, -1, 0); newcred->cr_ruid = ruip->ui_uid; uihold(ruip); uifree(newcred->cr_ruidinfo); newcred->cr_ruidinfo = ruip; (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); } /*- * Change a process's real gid. * Side effects: newcred->cr_rgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_rgid(struct ucred *newcred, gid_t rgid) { newcred->cr_rgid = rgid; } /*- * Change a process's saved uid. * Side effects: newcred->cr_svuid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svuid(struct ucred *newcred, uid_t svuid) { newcred->cr_svuid = svuid; } /*- * Change a process's saved gid. * Side effects: newcred->cr_svgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svgid(struct ucred *newcred, gid_t svgid) { newcred->cr_svgid = svgid; } Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 285632) +++ head/sys/kern/kern_resource.c (revision 285633) @@ -1,1452 +1,1438 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct rwlock uihashtbl_lock; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); static int donice(struct thread *td, struct proc *chgp, int n); static struct uidinfo *uilookup(uid_t uid); static void ruxagg_locked(struct rusage_ext *rux, struct thread *td); static __inline int lim_shared(struct plimit *limp); /* * Resource controls and accounting. */ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif int sys_getpriority(struct thread *td, register struct getpriority_args *uap) { struct proc *p; struct pgrp *pg; int error, low; error = 0; low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) low = td->td_proc->p_nice; else { p = pfind(uap->who); if (p == NULL) break; if (p_cansee(td, p) == 0) low = p->p_nice; PROC_UNLOCK(p); } break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0 && p->p_ucred->cr_uid == uap->who) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (low == PRIO_MAX + 1 && error == 0) error = ESRCH; td->td_retval[0] = low; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif int sys_setpriority(struct thread *td, struct setpriority_args *uap) { struct proc *curp, *p; struct pgrp *pg; int found = 0, error = 0; curp = td->td_proc; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) { PROC_LOCK(curp); error = donice(td, curp, uap->prio); PROC_UNLOCK(curp); } else { p = pfind(uap->who); if (p == NULL) break; error = p_cansee(td, p); if (error == 0) error = donice(td, p, uap->prio); PROC_UNLOCK(p); } found++; break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p->p_ucred->cr_uid == uap->who && p_cansee(td, p) == 0) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (found == 0 && error == 0) error = ESRCH; return (error); } /* * Set "nice" for a (whole) process. */ static int donice(struct thread *td, struct proc *p, int n) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = p_cansched(td, p))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); sched_nice(p, n); return (0); } static int unprivileged_idprio; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_idprio, CTLFLAG_RW, &unprivileged_idprio, 0, "Allow non-root users to set an idle priority"); /* * Set realtime priority for LWP. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_thread_args { int function; lwpid_t lwpid; struct rtprio *rtp; }; #endif int sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap) { struct proc *p; struct rtprio rtp; struct thread *td1; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->lwpid == 0 || uap->lwpid == td->td_tid) { p = td->td_proc; td1 = td; PROC_LOCK(p); } else { /* Only look up thread in current process */ td1 = tdfind(uap->lwpid, curproc->p_pid); if (td1 == NULL) return (ESRCH); p = td1->td_proc; } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; pri_to_rtp(td1, &rtp); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ /* * Realtime priority has to be restricted for reasons which * should be obvious. However, for idleprio processes, there is * a potential for system deadlock if an idleprio process gains * a lock on a resource that other processes need (and the * idleprio process can't run due to a CPU-bound normal * process). Fix me! XXX * * This problem is not only related to idleprio process. * A user level program can obtain a file lock and hold it * indefinitely. Additionally, without idleprio processes it is * still conceivable that a program with low priority will never * get to run. In short, allowing this feature might make it * easier to lock a resource indefinitely, but it is not the * only thing that makes it possible. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME || (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && unprivileged_idprio == 0)) { error = priv_check(td, PRIV_SCHED_RTPRIO); if (error) break; } error = rtp_to_pri(&rtp, td1); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } /* * Set realtime priority. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif int sys_rtprio(struct thread *td, register struct rtprio_args *uap) { struct proc *p; struct thread *tdp; struct rtprio rtp; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; /* * Return OUR priority if no pid specified, * or if one is, report the highest priority * in the process. There isn't much more you can do as * there is only room to return a single priority. * Note: specifying our own pid is not the same * as leaving it zero. */ if (uap->pid == 0) { pri_to_rtp(td, &rtp); } else { struct rtprio rtp2; rtp.type = RTP_PRIO_IDLE; rtp.prio = RTP_PRIO_MAX; FOREACH_THREAD_IN_PROC(p, tdp) { pri_to_rtp(tdp, &rtp2); if (rtp2.type < rtp.type || (rtp2.type == rtp.type && rtp2.prio < rtp.prio)) { rtp.type = rtp2.type; rtp.prio = rtp2.prio; } } } PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* * Disallow setting rtprio in most cases if not superuser. * See the comment in sys_rtprio_thread about idprio * threads holding a lock. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME || (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && !unprivileged_idprio)) { error = priv_check(td, PRIV_SCHED_RTPRIO); if (error) break; } /* * If we are setting our own priority, set just our * thread but if we are doing another process, * do all the threads on that process. If we * specify our own pid we do the latter. */ if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { FOREACH_THREAD_IN_PROC(p, td) { if ((error = rtp_to_pri(&rtp, td)) != 0) break; } } break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } int rtp_to_pri(struct rtprio *rtp, struct thread *td) { u_char newpri, oldclass, oldpri; switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_REALTIME + rtp->prio; break; case RTP_PRIO_NORMAL: if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) return (EINVAL); newpri = PRI_MIN_TIMESHARE + rtp->prio; break; case RTP_PRIO_IDLE: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_IDLE + rtp->prio; break; default: return (EINVAL); } thread_lock(td); oldclass = td->td_pri_class; sched_class(td, rtp->type); /* XXX fix */ oldpri = td->td_user_pri; sched_user_prio(td, newpri); if (td->td_user_pri != oldpri && (oldclass != RTP_PRIO_NORMAL || td->td_pri_class != RTP_PRIO_NORMAL)) sched_prio(td, td->td_user_pri); if (TD_ON_UPILOCK(td) && oldpri != newpri) { critical_enter(); thread_unlock(td); umtx_pi_adjust(td, oldpri); critical_exit(); } else thread_unlock(td); return (0); } void pri_to_rtp(struct thread *td, struct rtprio *rtp) { thread_lock(td); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; break; case PRI_TIMESHARE: rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE; break; case PRI_IDLE: rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE; break; default: break; } rtp->type = td->td_pri_class; thread_unlock(td); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int osetrlimit(struct thread *td, register struct osetrlimit_args *uap) { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; error = kern_setrlimit(td, uap->which, &lim); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int ogetrlimit(struct thread *td, register struct ogetrlimit_args *uap) { struct orlimit olim; struct rlimit rl; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rl); /* * XXX would be more correct to convert only RLIM_INFINITY to the * old RLIM_INFINITY and fail with EOVERFLOW for other larger * values. Most 64->32 and 32->16 conversions, including not * unimportant ones of uids are even more broken than what we * do here (they blindly truncate). We don't do this correctly * here since we have little experience with EOVERFLOW yet. * Elsewhere, getuid() can't fail... */ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur; olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max; error = copyout(&olim, uap->rlp, sizeof(olim)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif int sys_setrlimit(struct thread *td, register struct __setrlimit_args *uap) { struct rlimit alim; int error; if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit)))) return (error); error = kern_setrlimit(td, uap->which, &alim); return (error); } static void lim_cb(void *arg) { struct rlimit rlim; struct thread *td; struct proc *p; p = arg; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if the process exceeds its cpu resource allocation. If * it reaches the max, arrange to kill the process in ast(). */ if (p->p_cpulimit == RLIM_INFINITY) return; PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); } PROC_STATUNLOCK(p); if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) { lim_rlimit_proc(p, RLIMIT_CPU, &rlim); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim.rlim_max) p->p_cpulimit += 5; kern_psignal(p, SIGXCPU); } } if ((p->p_flag & P_WEXIT) == 0) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); } int kern_setrlimit(struct thread *td, u_int which, struct rlimit *limp) { return (kern_proc_setrlimit(td, td->td_proc, which, limp)); } int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp) { struct plimit *newlim, *oldlim; register struct rlimit *alimp; struct rlimit oldssiz; int error; if (which >= RLIM_NLIMITS) return (EINVAL); /* * Preserve historical bugs by treating negative limits as unsigned. */ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; oldssiz.rlim_cur = 0; newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; alimp = &oldlim->pl_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) { PROC_UNLOCK(p); lim_free(newlim); return (error); } if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; lim_copy(newlim, oldlim); alimp = &newlim->pl_rlimit[which]; switch (which) { case RLIMIT_CPU: if (limp->rlim_cur != RLIM_INFINITY && p->p_cpulimit == RLIM_INFINITY) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); p->p_cpulimit = limp->rlim_cur; break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) limp->rlim_cur = maxdsiz; if (limp->rlim_max > maxdsiz) limp->rlim_max = maxdsiz; break; case RLIMIT_STACK: if (limp->rlim_cur > maxssiz) limp->rlim_cur = maxssiz; if (limp->rlim_max > maxssiz) limp->rlim_max = maxssiz; oldssiz = *alimp; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(&oldssiz, RLIMIT_STACK); break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; if (limp->rlim_cur < 1) limp->rlim_cur = 1; if (limp->rlim_max < 1) limp->rlim_max = 1; break; } if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(limp, which); *alimp = *limp; p->p_limit = newlim; PROC_UPDATE_COW(p); PROC_UNLOCK(p); lim_free(oldlim); if (which == RLIMIT_STACK && /* * Skip calls from exec_new_vmspace(), done when stack is * not mapped yet. */ (td != curthread || (p->p_flag & P_INEXEC) == 0)) { /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. */ if (limp->rlim_cur != oldssiz.rlim_cur) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > oldssiz.rlim_cur) { prot = p->p_sysent->sv_stackprot; size = limp->rlim_cur - oldssiz.rlim_cur; addr = p->p_sysent->sv_usrstack - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = oldssiz.rlim_cur - limp->rlim_cur; addr = p->p_sysent->sv_usrstack - oldssiz.rlim_cur; } addr = trunc_page(addr); size = round_page(size); (void)vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE); } } return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int sys_getrlimit(struct thread *td, register struct __getrlimit_args *uap) { struct rlimit rlim; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rlim); error = copyout(&rlim, uap->rlp, sizeof(struct rlimit)); return (error); } /* * Transform the running time and tick information for children of proc p * into user and system time usage. */ void calccru(struct proc *p, struct timeval *up, struct timeval *sp) { PROC_LOCK_ASSERT(p, MA_OWNED); calcru1(p, &p->p_crux, up, sp); } /* * Transform the running time and tick information in proc p into user * and system time usage. If appropriate, include the current time slice * on this CPU. */ void calcru(struct proc *p, struct timeval *up, struct timeval *sp) { struct thread *td; uint64_t runtime, u; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_STATLOCK_ASSERT(p, MA_OWNED); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ td = curthread; if (td->td_proc == p) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } /* Make sure the per-thread stats are current. */ FOREACH_THREAD_IN_PROC(p, td) { if (td->td_incruntime == 0) continue; ruxagg(p, td); } calcru1(p, &p->p_rux, up, sp); } /* Collect resource usage for a single thread. */ void rufetchtd(struct thread *td, struct rusage *ru) { struct proc *p; uint64_t runtime, u; p = td->td_proc; PROC_STATLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we are getting stats for the current thread, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ if (td == curthread) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } ruxagg(p, td); *ru = td->td_ru; calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime); } static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp) { /* {user, system, interrupt, total} {ticks, usec}: */ uint64_t ut, uu, st, su, it, tt, tu; ut = ruxp->rux_uticks; st = ruxp->rux_sticks; it = ruxp->rux_iticks; tt = ut + st + it; if (tt == 0) { /* Avoid divide by zero */ st = 1; tt = 1; } tu = cputick2usec(ruxp->rux_runtime); if ((int64_t)tu < 0) { /* XXX: this should be an assert /phk */ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n", (intmax_t)tu, p->p_pid, p->p_comm); tu = ruxp->rux_tu; } if (tu >= ruxp->rux_tu) { /* * The normal case, time increased. * Enforce monotonicity of bucketed numbers. */ uu = (tu * ut) / tt; if (uu < ruxp->rux_uu) uu = ruxp->rux_uu; su = (tu * st) / tt; if (su < ruxp->rux_su) su = ruxp->rux_su; } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) { /* * When we calibrate the cputicker, it is not uncommon to * see the presumably fixed frequency increase slightly over * time as a result of thermal stabilization and NTP * discipline (of the reference clock). We therefore ignore * a bit of backwards slop because we expect to catch up * shortly. We use a 3 microsecond limit to catch low * counts and a 1% limit for high counts. */ uu = ruxp->rux_uu; su = ruxp->rux_su; tu = ruxp->rux_tu; } else { /* tu < ruxp->rux_tu */ /* * What happened here was likely that a laptop, which ran at * a reduced clock frequency at boot, kicked into high gear. * The wisdom of spamming this message in that case is * dubious, but it might also be indicative of something * serious, so lets keep it and hope laptops can be made * more truthful about their CPU speed via ACPI. */ printf("calcru: runtime went backwards from %ju usec " "to %ju usec for pid %d (%s)\n", (uintmax_t)ruxp->rux_tu, (uintmax_t)tu, p->p_pid, p->p_comm); uu = (tu * ut) / tt; su = (tu * st) / tt; } ruxp->rux_uu = uu; ruxp->rux_su = su; ruxp->rux_tu = tu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif int sys_getrusage(register struct thread *td, register struct getrusage_args *uap) { struct rusage ru; int error; error = kern_getrusage(td, uap->who, &ru); if (error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_getrusage(struct thread *td, int who, struct rusage *rup) { struct proc *p; int error; error = 0; p = td->td_proc; PROC_LOCK(p); switch (who) { case RUSAGE_SELF: rufetchcalc(p, rup, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_CHILDREN: *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_THREAD: PROC_STATLOCK(p); thread_lock(td); rufetchtd(td, rup); thread_unlock(td); PROC_STATUNLOCK(p); break; default: error = EINVAL; } PROC_UNLOCK(p); return (error); } void rucollect(struct rusage *ru, struct rusage *ru2) { long *ip, *ip2; int i; if (ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2) { rux->rux_runtime += rux2->rux_runtime; rux->rux_uticks += rux2->rux_uticks; rux->rux_sticks += rux2->rux_sticks; rux->rux_iticks += rux2->rux_iticks; rux->rux_uu += rux2->rux_uu; rux->rux_su += rux2->rux_su; rux->rux_tu += rux2->rux_tu; rucollect(ru, ru2); } /* * Aggregate tick counts into the proc's rusage_ext. */ static void ruxagg_locked(struct rusage_ext *rux, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); PROC_STATLOCK_ASSERT(td->td_proc, MA_OWNED); rux->rux_runtime += td->td_incruntime; rux->rux_uticks += td->td_uticks; rux->rux_sticks += td->td_sticks; rux->rux_iticks += td->td_iticks; } void ruxagg(struct proc *p, struct thread *td) { thread_lock(td); ruxagg_locked(&p->p_rux, td); ruxagg_locked(&td->td_rux, td); td->td_incruntime = 0; td->td_uticks = 0; td->td_iticks = 0; td->td_sticks = 0; thread_unlock(td); } /* * Update the rusage_ext structure and fetch a valid aggregate rusage * for proc p if storage for one is supplied. */ void rufetch(struct proc *p, struct rusage *ru) { struct thread *td; PROC_STATLOCK_ASSERT(p, MA_OWNED); *ru = p->p_ru; if (p->p_numthreads > 0) { FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); rucollect(ru, &td->td_ru); } } } /* * Atomically perform a rufetch and a calcru together. * Consumers, can safely assume the calcru is executed only once * rufetch is completed. */ void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp) { PROC_STATLOCK(p); rufetch(p, ru); calcru(p, up, sp); PROC_STATUNLOCK(p); } /* * Allocate a new resource limits structure and initialize its * reference count and mutex pointer. */ struct plimit * lim_alloc() { struct plimit *limp; limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK); refcount_init(&limp->pl_refcnt, 1); return (limp); } struct plimit * lim_hold(struct plimit *limp) { refcount_acquire(&limp->pl_refcnt); return (limp); } static __inline int lim_shared(struct plimit *limp) { return (limp->pl_refcnt > 1); } void lim_fork(struct proc *p1, struct proc *p2) { PROC_LOCK_ASSERT(p1, MA_OWNED); PROC_LOCK_ASSERT(p2, MA_OWNED); p2->p_limit = lim_hold(p1->p_limit); callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0); if (p1->p_cpulimit != RLIM_INFINITY) callout_reset_sbt(&p2->p_limco, SBT_1S, 0, lim_cb, p2, C_PREL(1)); } void lim_free(struct plimit *limp) { if (refcount_release(&limp->pl_refcnt)) free((void *)limp, M_PLIMIT); } /* * Make a copy of the plimit structure. * We share these structures copy-on-write after fork. */ void lim_copy(struct plimit *dst, struct plimit *src) { KASSERT(!lim_shared(dst), ("lim_copy to shared limit")); bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit)); } /* * Return the hard limit for a particular system resource. The * which parameter specifies the index into the rlimit array. */ rlim_t lim_max(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_max); } rlim_t lim_max_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_max); } /* * Return the current (soft) limit for a particular system resource. * The which parameter which specifies the index into the rlimit array */ rlim_t lim_cur(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_cur); } rlim_t lim_cur_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_cur); } /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'. */ void lim_rlimit(struct thread *td, int which, struct rlimit *rlp) { struct proc *p = td->td_proc; MPASS(td == curthread); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = td->td_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp) { PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = p->p_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void uihashinit() { uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); rw_init(&uihashtbl_lock, "uidinfo hash"); } /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_lock must be locked. * Increase refcount on uidinfo struct returned. */ static struct uidinfo * uilookup(uid_t uid) { struct uihashhead *uipp; struct uidinfo *uip; rw_assert(&uihashtbl_lock, RA_LOCKED); uipp = UIHASH(uid); LIST_FOREACH(uip, uipp, ui_hash) if (uip->ui_uid == uid) { uihold(uip); break; } return (uip); } /* * Find or allocate a struct uidinfo for a particular uid. * Returns with uidinfo struct referenced. * uifree() should be called on a struct uidinfo when released. */ struct uidinfo * uifind(uid_t uid) { struct uidinfo *new_uip, *uip; rw_rlock(&uihashtbl_lock); uip = uilookup(uid); rw_runlock(&uihashtbl_lock); if (uip != NULL) return (uip); new_uip = malloc(sizeof(*new_uip), M_UIDINFO, M_WAITOK | M_ZERO); racct_create(&new_uip->ui_racct); refcount_init(&new_uip->ui_ref, 1); new_uip->ui_uid = uid; mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF); rw_wlock(&uihashtbl_lock); /* * There's a chance someone created our uidinfo while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate uidinfo. */ if ((uip = uilookup(uid)) == NULL) { LIST_INSERT_HEAD(UIHASH(uid), new_uip, ui_hash); rw_wunlock(&uihashtbl_lock); uip = new_uip; } else { rw_wunlock(&uihashtbl_lock); racct_destroy(&new_uip->ui_racct); mtx_destroy(&new_uip->ui_vmsize_mtx); free(new_uip, M_UIDINFO); } return (uip); } /* * Place another refcount on a uidinfo struct. */ void uihold(struct uidinfo *uip) { refcount_acquire(&uip->ui_ref); } /*- * Since uidinfo structs have a long lifetime, we use an * opportunistic refcounting scheme to avoid locking the lookup hash * for each release. * * If the refcount hits 0, we need to free the structure, * which means we need to lock the hash. * Optimal case: * After locking the struct and lowering the refcount, if we find * that we don't need to free, simply unlock and return. * Suboptimal case: * If refcount lowering results in need to free, bump the count * back up, lose the lock and acquire the locks in the proper * order to try again. */ void uifree(struct uidinfo *uip) { int old; /* Prepare for optimal case. */ old = uip->ui_ref; if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1)) return; /* Prepare for suboptimal case. */ rw_wlock(&uihashtbl_lock); if (refcount_release(&uip->ui_ref) == 0) { rw_wunlock(&uihashtbl_lock); return; } racct_destroy(&uip->ui_racct); LIST_REMOVE(uip, ui_hash); rw_wunlock(&uihashtbl_lock); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %ld\n", uip->ui_uid, uip->ui_sbsize); if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); if (uip->ui_vmsize != 0) printf("freeing uidinfo: uid = %d, swapuse = %lld\n", uip->ui_uid, (unsigned long long)uip->ui_vmsize); mtx_destroy(&uip->ui_vmsize_mtx); free(uip, M_UIDINFO); } #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void *arg2, void *arg3) { struct uidinfo *uip; struct uihashhead *uih; rw_rlock(&uihashtbl_lock); for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) { LIST_FOREACH(uip, uih, ui_hash) { (callback)(uip->ui_racct, arg2, arg3); } } rw_runlock(&uihashtbl_lock); } #endif static inline int chglimit(struct uidinfo *uip, long *limit, int diff, rlim_t max, const char *name) { /* Don't allow them to exceed max, but allow subtraction. */ if (diff > 0 && max != 0) { if (atomic_fetchadd_long(limit, (long)diff) + diff > max) { atomic_subtract_long(limit, (long)diff); return (0); } } else { atomic_add_long(limit, (long)diff); if (*limit < 0) printf("negative %s for uid = %d\n", name, uip->ui_uid); } return (1); } /* * Change the count associated with number of processes * a given user is using. When 'max' is 0, don't enforce a limit */ int chgproccnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_proccnt, diff, max, "proccnt")); } /* * Change the total socket buffer size a user has used. */ int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t max) { int diff, rv; diff = to - *hiwat; if (diff > 0 && max == 0) { rv = 0; } else { rv = chglimit(uip, &uip->ui_sbsize, diff, max, "sbsize"); if (rv != 0) *hiwat = to; } return (rv); } /* * Change the count associated with number of pseudo-terminals * a given user is using. When 'max' is 0, don't enforce a limit */ int chgptscnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_ptscnt, diff, max, "ptscnt")); } int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_kqcnt, diff, max, "kqcnt")); } - -void -lim_update_thread(struct thread *td) -{ - struct proc *p; - struct plimit *lim; - - p = td->td_proc; - lim = td->td_limit; - PROC_LOCK_ASSERT(p, MA_OWNED); - td->td_limit = lim_hold(p->p_limit); - if (lim != NULL) - lim_free(lim); -} Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 285632) +++ head/sys/kern/kern_thread.c (revision 285633) @@ -1,1198 +1,1210 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); EVENTHANDLER_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); td->td_sched = (struct td_sched *)&td[1]; umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 16 - 1, 0); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant.. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { - if (td->td_ucred) + if (td->td_ucred != NULL) crfree(td->td_ucred); - if (td->td_limit) + if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; + struct ucred *oldcred; + struct plimit *oldlimit; p = td->td_proc; + oldcred = NULL; + oldlimit = NULL; PROC_LOCK(p); - if (td->td_ucred != p->p_ucred) - cred_update_thread(td); - if (td->td_limit != p->p_limit) - lim_update_thread(td); + if (td->td_ucred != p->p_ucred) { + oldcred = td->td_ucred; + td->td_ucred = crhold(p->p_ucred); + } + if (td->td_limit != p->p_limit) { + oldlimit = td->td_limit; + td->td_limit = lim_hold(p->p_limit); + } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); + if (oldcred != NULL) + crfree(oldcred); + if (oldlimit != NULL) + lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* XXXSMP */ /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); PCPU_INC(cnt.v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); return (0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); tidhash_remove(td); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); PROC_LOCK(p); tdsigcleanup(td); umtx_thread_exit(td); PROC_SLOCK(p); thread_stopped(p); thread_exit(); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: head/sys/sys/resourcevar.h =================================================================== --- head/sys/sys/resourcevar.h (revision 285632) +++ head/sys/sys/resourcevar.h (revision 285633) @@ -1,165 +1,163 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)resourcevar.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_RESOURCEVAR_H_ #define _SYS_RESOURCEVAR_H_ #include #include #ifdef _KERNEL #include #include #endif /* * Kernel per-process accounting / statistics * (not necessarily resident except when running). * * Locking key: * b - created at fork, never changes * c - locked by proc mtx * k - only accessed by curthread * w - locked by proc itim lock * w2 - locked by proc prof lock */ struct pstats { #define pstat_startzero p_cru struct rusage p_cru; /* Stats for reaped children. */ struct itimerval p_timer[3]; /* (w) Virtual-time timers. */ #define pstat_endzero pstat_startcopy #define pstat_startcopy p_prof struct uprof { /* Profile arguments. */ caddr_t pr_base; /* (c + w2) Buffer base. */ u_long pr_size; /* (c + w2) Buffer size. */ u_long pr_off; /* (c + w2) PC offset. */ u_long pr_scale; /* (c + w2) PC scaling. */ } p_prof; #define pstat_endcopy p_start struct timeval p_start; /* (b) Starting time. */ }; #ifdef _KERNEL /* * Kernel shareable process resource limits. Because this structure * is moderately large but changes infrequently, it is normally * shared copy-on-write after forks. */ struct plimit { struct rlimit pl_rlimit[RLIM_NLIMITS]; int pl_refcnt; /* number of references */ }; struct racct; /*- * Per uid resource consumption. This structure is used to track * the total resource consumption (process count, socket buffer size, * etc) for the uid and impose limits. * * Locking guide: * (a) Constant from inception * (b) Lockless, updated using atomics * (c) Locked by global uihashtbl_lock * (d) Locked by the ui_vmsize_mtx */ struct uidinfo { LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */ struct mtx ui_vmsize_mtx; vm_ooffset_t ui_vmsize; /* (d) swap reservation by uid */ long ui_sbsize; /* (b) socket buffer space consumed */ long ui_proccnt; /* (b) number of processes */ long ui_ptscnt; /* (b) number of pseudo-terminals */ long ui_kqcnt; /* (b) number of kqueues */ uid_t ui_uid; /* (a) uid */ u_int ui_ref; /* (b) reference count */ #ifdef RACCT struct racct *ui_racct; /* (a) resource accounting */ #endif }; #define UIDINFO_VMSIZE_LOCK(ui) mtx_lock(&((ui)->ui_vmsize_mtx)) #define UIDINFO_VMSIZE_UNLOCK(ui) mtx_unlock(&((ui)->ui_vmsize_mtx)) struct proc; struct rusage_ext; struct thread; void addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks); void addupc_task(struct thread *td, uintfptr_t pc, u_int ticks); void calccru(struct proc *p, struct timeval *up, struct timeval *sp); void calcru(struct proc *p, struct timeval *up, struct timeval *sp); int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max); int chgproccnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t maxval); int chgptscnt(struct uidinfo *uip, int diff, rlim_t maxval); int fuswintr(void *base); int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp); struct plimit *lim_alloc(void); void lim_copy(struct plimit *dst, struct plimit *src); rlim_t lim_cur(struct thread *td, int which); rlim_t lim_cur_proc(struct proc *p, int which); void lim_fork(struct proc *p1, struct proc *p2); void lim_free(struct plimit *limp); struct plimit *lim_hold(struct plimit *limp); rlim_t lim_max(struct thread *td, int which); rlim_t lim_max_proc(struct proc *p, int which); void lim_rlimit(struct thread *td, int which, struct rlimit *rlp); void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp); void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2); void rucollect(struct rusage *ru, struct rusage *ru2); void rufetch(struct proc *p, struct rusage *ru); void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp); void rufetchtd(struct thread *td, struct rusage *ru); void ruxagg(struct proc *p, struct thread *td); int suswintr(void *base, int word); struct uidinfo *uifind(uid_t uid); void uifree(struct uidinfo *uip); void uihashinit(void); void uihold(struct uidinfo *uip); #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void *arg2, void *arg3); #endif -void lim_update_thread(struct thread *td); - #endif /* _KERNEL */ #endif /* !_SYS_RESOURCEVAR_H_ */ Index: head/sys/sys/ucred.h =================================================================== --- head/sys/sys/ucred.h (revision 285632) +++ head/sys/sys/ucred.h (revision 285633) @@ -1,119 +1,118 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ucred.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_UCRED_H_ #define _SYS_UCRED_H_ #include struct loginclass; #define XU_NGROUPS 16 /* * Credentials. * * Please do not inspect cr_uid directly to determine superuserness. The * priv(9) interface should be used to check for privilege. */ #if defined(_KERNEL) || defined(_WANT_UCRED) struct ucred { u_int cr_ref; /* reference count */ #define cr_startcopy cr_uid uid_t cr_uid; /* effective user id */ uid_t cr_ruid; /* real user id */ uid_t cr_svuid; /* saved user id */ int cr_ngroups; /* number of groups */ gid_t cr_rgid; /* real group id */ gid_t cr_svgid; /* saved group id */ struct uidinfo *cr_uidinfo; /* per euid resource consumption */ struct uidinfo *cr_ruidinfo; /* per ruid resource consumption */ struct prison *cr_prison; /* jail(2) */ struct loginclass *cr_loginclass; /* login class */ u_int cr_flags; /* credential flags */ void *cr_pspare2[2]; /* general use 2 */ #define cr_endcopy cr_label struct label *cr_label; /* MAC label */ struct auditinfo_addr cr_audit; /* Audit properties. */ gid_t *cr_groups; /* groups */ int cr_agroups; /* Available groups */ gid_t cr_smallgroups[XU_NGROUPS]; /* storage for small groups */ }; #define NOCRED ((struct ucred *)0) /* no credential available */ #define FSCRED ((struct ucred *)-1) /* filesystem credential */ #endif /* _KERNEL || _WANT_UCRED */ /* * Flags for cr_flags. */ #define CRED_FLAG_CAPMODE 0x00000001 /* In capability mode. */ /* * This is the external representation of struct ucred. */ struct xucred { u_int cr_version; /* structure layout version */ uid_t cr_uid; /* effective user id */ short cr_ngroups; /* number of groups */ gid_t cr_groups[XU_NGROUPS]; /* groups */ void *_cr_unused1; /* compatibility with old ucred */ }; #define XUCRED_VERSION 0 /* This can be used for both ucred and xucred structures. */ #define cr_gid cr_groups[0] #ifdef _KERNEL struct proc; struct thread; void change_egid(struct ucred *newcred, gid_t egid); void change_euid(struct ucred *newcred, struct uidinfo *euip); void change_rgid(struct ucred *newcred, gid_t rgid); void change_ruid(struct ucred *newcred, struct uidinfo *ruip); void change_svgid(struct ucred *newcred, gid_t svgid); void change_svuid(struct ucred *newcred, uid_t svuid); void crcopy(struct ucred *dest, struct ucred *src); struct ucred *crcopysafe(struct proc *p, struct ucred *cr); struct ucred *crdup(struct ucred *cr); -void cred_update_thread(struct thread *td); void proc_set_cred_init(struct proc *p, struct ucred *cr); struct ucred *proc_set_cred(struct proc *p, struct ucred *cr); void crfree(struct ucred *cr); struct ucred *crget(void); struct ucred *crhold(struct ucred *cr); void cru2x(struct ucred *cr, struct xucred *xcr); void crsetgroups(struct ucred *cr, int n, gid_t *groups); int groupmember(gid_t gid, struct ucred *cred); #endif /* _KERNEL */ #endif /* !_SYS_UCRED_H_ */