diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index a8df9f84a29a..f39f5b8955ed 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,921 +1,921 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include #include "opt_ddb.h" #include "opt_kdb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. 
*/
static struct session session0;
static struct pgrp pgrp0;
struct	proc proc0;
struct thread0_storage thread0_st __aligned(32);
struct	vmspace vmspace0;
struct	proc *initproc;

/*
 * No-op handler: ignores both arguments and always returns 0.  It is the
 * initial value of the lkpi_alloc_current function pointer below.
 */
int
linux_alloc_current_noop(struct thread *td __unused, int flags __unused)
{
	return (0);
}
/* Function pointer, initialised to the no-op handler above. */
int (*lkpi_alloc_current)(struct thread *, int) = linux_alloc_current_noop;

/* BOOTHOWTO may be predefined by the build; it defaults to 0 otherwise. */
#ifndef BOOTHOWTO
#define	BOOTHOWTO	0
#endif
int	boothowto = BOOTHOWTO;	/* initialized so that it can be patched */
SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0,
	"Boot control flags, passed from loader");

/* BOOTVERBOSE may be predefined by the build; it defaults to 0 otherwise. */
#ifndef BOOTVERBOSE
#define	BOOTVERBOSE	0
#endif
int	bootverbose = BOOTVERBOSE;
SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0,
	"Control the output of verbose kernel messages");

#ifdef VERBOSE_SYSINIT
/*
 * We'll use the defined value of VERBOSE_SYSINIT from the kernel config to
 * dictate the default VERBOSE_SYSINIT behavior.  Significant values for this
 * option and associated tunable are:
 * - 0, 'compiled in but silent by default'
 * - 1, 'compiled in but verbose by default' (default)
 */
int	verbose_sysinit = VERBOSE_SYSINIT;
TUNABLE_INT("debug.verbose_sysinit", &verbose_sysinit);
#endif

#ifdef INVARIANTS
FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance");
#endif

/*
 * This ensures that there is at least one entry so that the sysinit_set
 * symbol is not undefined.  A subsystem ID of SI_SUB_DUMMY is never
 * executed.
 */
SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);

/*
 * The sysinit linker set compiled into the kernel.  These are placed onto the
 * sysinit list by mi_startup; sysinit_add can add (e.g., from klds) additional
 * sysinits to the linked list but the linker set here does not change.
 */
SET_DECLARE(sysinit_set, struct sysinit);

/*
 * The sysinit lists.  Items are moved to sysinit_done_list when done.
*/ static STAILQ_HEAD(sysinitlist, sysinit) sysinit_list; static struct sysinitlist sysinit_done_list = STAILQ_HEAD_INITIALIZER(sysinit_done_list); /* * Compare two sysinits; return -1, 0, or 1 if a comes before, at the same time * as, or after b. */ static int sysinit_compar(struct sysinit *a, struct sysinit *b, void *thunk __unused) { if (a->subsystem < b->subsystem) return (-1); if (a->subsystem > b->subsystem) return (1); if (a->order < b->order) return (-1); if (a->order > b->order) return (1); return (0); } static void sysinit_mklist(struct sysinitlist *list, struct sysinit **set, struct sysinit **set_end) { struct sysinit **sipp; TSENTER(); TSENTER2("listify"); STAILQ_INIT(list); for (sipp = set; sipp < set_end; sipp++) STAILQ_INSERT_TAIL(list, *sipp, next); TSEXIT2("listify"); TSENTER2("mergesort"); STAILQ_MERGESORT(list, NULL, sysinit_compar, sysinit, next); TSEXIT2("mergesort"); TSEXIT(); } /* * Merge a new sysinit set into the sysinit list. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinitlist new_list; TSENTER(); /* Construct a sorted list from the new sysinits. */ sysinit_mklist(&new_list, set, set_end); /* Merge the new list into the existing one. */ TSENTER2("STAILQ_MERGE"); STAILQ_MERGE(&sysinit_list, &new_list, NULL, sysinit_compar, sysinit, next); TSEXIT2("STAILQ_MERGE"); TSEXIT(); } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. 
* * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit *sip; int last; #if defined(VERBOSE_SYSINIT) int verbose; #endif TSENTER(); if (boothowto & RB_VERBOSE) bootverbose++; /* Construct and sort sysinit list. */ sysinit_mklist(&sysinit_list, SET_BEGIN(sysinit_set), SET_LIMIT(sysinit_set)); last = SI_SUB_COPYRIGHT; #if defined(VERBOSE_SYSINIT) verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Perform each system initialization task from the ordered list. Note * that if sysinit_list is modified (e.g. by a KLD) we will nonetheless * always perform the earlist-sorted sysinit at each step; using the * STAILQ_FOREACH macro would result in items being skipped if inserted * earlier than the "current item". */ while ((sip = STAILQ_FIRST(&sysinit_list)) != NULL) { STAILQ_REMOVE_HEAD(&sysinit_list, next); STAILQ_INSERT_TAIL(&sysinit_done_list, sip, next); if (sip->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if (sip->subsystem > last) BOOTTRACE_INIT("sysinit 0x%7x", sip->subsystem); #if defined(VERBOSE_SYSINIT) if (sip->subsystem > last && verbose_sysinit != 0) { verbose = 1; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)sip->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)sip->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, sip->udata); else #endif printf(" %p(%p)... 
", sip->func, sip->udata); } #endif /* Call function */ (*(sip->func))(sip->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ last = sip->subsystem; } TSEXIT(); /* Here so we don't overlap with start_init. */ BOOTTRACE("mi_startup done"); mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_FOURTH, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_FOURTH, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_FIFTH, print_caddr_t, diag_warn); #endif #if __SIZEOF_LONG__ == 4 static char ilp32_warn[] = "WARNING: 32-bit kernels are deprecated and may be removed in FreeBSD 15.0.\n"; SYSINIT(ilp32warn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH, print_caddr_t, ilp32_warn); SYSINIT(ilp32warn2, SI_SUB_LAST, SI_ORDER_FIFTH, print_caddr_t, ilp32_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td 
__unused, int error __unused)
{

	/* Stub for the null sysentvec: panics if it is ever called. */
	panic("null_set_syscall_retval");
}

/* Stub for the null sysentvec: intentionally does nothing. */
static void
null_set_fork_retval(struct thread *td __unused)
{

}

/*
 * Placeholder system-call vector.  proc0_init() below installs it as
 * proc0's p_sysent; its syscall argument/return hooks are the panicking
 * stubs above and every other callback is NULL.
 */
struct sysentvec null_sysvec = {
	.sv_size	= 0,
	.sv_table	= NULL,
	.sv_fixup	= NULL,
	.sv_sendsig	= NULL,
	.sv_sigcode	= NULL,
	.sv_szsigcode	= NULL,
	.sv_name	= "null",
	.sv_coredump	= NULL,
	.sv_minsigstksz	= 0,
	.sv_minuser	= VM_MIN_ADDRESS,
	.sv_maxuser	= VM_MAXUSER_ADDRESS,
	.sv_usrstack	= USRSTACK,
	.sv_psstrings	= PS_STRINGS,
	.sv_psstringssz	= sizeof(struct ps_strings),
	.sv_stackprot	= VM_PROT_ALL,
	.sv_copyout_strings	= NULL,
	.sv_setregs	= NULL,
	.sv_fixlimit	= NULL,
	.sv_maxssiz	= NULL,
	.sv_flags	= 0,
	.sv_set_syscall_retval = null_set_syscall_retval,
	.sv_fetch_syscall_args = null_fetch_syscall_args,
	.sv_syscallnames = NULL,
	.sv_schedtail	= NULL,
	.sv_thread_detach = NULL,
	.sv_trap	= NULL,
	.sv_set_fork_retval = null_set_fork_retval,
	.sv_regset_begin = NULL,
	.sv_regset_end  = NULL,
};

/*
 * The two following SYSINIT's are proc0 specific glue code.  I am not
 * convinced that they can not be safely combined, but their order of
 * operation has been maintained as the same as the original init_main.c
 * for right now.
 */
/*
 * proc0_init: SYSINIT callback (SI_SUB_INTRINSIC, SI_ORDER_FIRST) that fills
 * in the statically allocated proc0/thread0: list and hash linkage, process
 * group and session, scheduling fields, credentials, signal state, file
 * descriptor tables, resource limits and the vmspace0 address space.
 * Requires Giant (GIANT_REQUIRED).
 */
/* ARGSUSED*/
static void
proc0_init(void *dummy __unused)
{
	struct proc *p;
	struct thread *td;
	struct ucred *newcred;
	struct uidinfo tmpuinfo;
	struct loginclass tmplc = {
		.lc_name = "",
	};
	vm_paddr_t pageablemem;
	int i;

	GIANT_REQUIRED;
	p = &proc0;
	td = &thread0;

	/*
	 * Initialize magic number and osrel.
	 */
	p->p_magic = P_MAGIC;
	p->p_osrel = osreldate;

	/*
	 * Initialize thread and process structures.
	 */
	procinit();	/* set up proc zone */
	threadinit();	/* set up UMA zones */

	/*
	 * Initialise scheduler resources.
	 * Add scheduler specific parts to proc, thread as needed.
	 */
	schedinit();	/* scheduler gets its house in order */

	/*
	 * Create process 0 (the swapper).
	 */
	LIST_INSERT_HEAD(&allproc, p, p_list);
	LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
	mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
	sx_init(&pgrp0.pg_killsx, "killpg racer");
	p->p_pgrp = &pgrp0;
	LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
	LIST_INIT(&pgrp0.pg_members);
	LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);

	pgrp0.pg_session = &session0;
	mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
	refcount_init(&session0.s_count, 1);
	session0.s_leader = p;

	p->p_sysent = &null_sysvec;
	p->p_flag = P_SYSTEM | P_INMEM | P_KPROC;
	p->p_flag2 = 0;
	p->p_state = PRS_NORMAL;
	p->p_klist = knlist_alloc(&p->p_mtx);
	STAILQ_INIT(&p->p_ktr);
	p->p_nice = NZERO;
	td->td_tid = THREAD0_TID;
	tidhash_add(td);
	TD_SET_STATE(td, TDS_RUNNING);
	td->td_pri_class = PRI_TIMESHARE;
	td->td_user_pri = PUSER;
	td->td_base_user_pri = PUSER;
	td->td_lend_user_pri = PRI_MAX;
	td->td_priority = PVM;
	td->td_base_pri = PVM;
	td->td_oncpu = curcpu;
	td->td_flags = TDF_INMEM;
	td->td_pflags = TDP_KTHREAD;
	td->td_cpuset = cpuset_thread0();
	td->td_domain.dr_policy = td->td_cpuset->cs_domain;
	prison0_init();
	p->p_peers = 0;
	p->p_leader = p;
	p->p_reaper = p;
	p->p_treeflag |= P_TREE_REAPER;
	LIST_INIT(&p->p_reaplist);

	/* The process is named "kernel"; its first thread is "swapper". */
	strncpy(p->p_comm, "kernel", sizeof (p->p_comm));
	strncpy(td->td_name, "swapper", sizeof (td->td_name));

	callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0);
	callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
	callout_init(&td->td_slpcallout, 1);
	TAILQ_INIT(&p->p_kqtim_stop);

	/* Create credentials. */
	newcred = crget();
	newcred->cr_ngroups = 1;	/* group 0 */
	/* A hack to prevent uifind from tripping over NULL pointers. */
	curthread->td_ucred = newcred;
	tmpuinfo.ui_uid = 1;
	newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo;
	newcred->cr_uidinfo = uifind(0);
	newcred->cr_ruidinfo = uifind(0);
	newcred->cr_loginclass = &tmplc;
	newcred->cr_loginclass = loginclass_find("default");
	/* End hack. creds get properly set later with thread_cow_get_proc */
	curthread->td_ucred = NULL;
	newcred->cr_prison = &prison0;
	newcred->cr_users++; /* avoid assertion failure */
	/*
	 * NOTE(review): the two lines below carry the unified-diff '-'/'+'
	 * markers of the patch this file contains; the '+' line is the
	 * post-patch statement.
	 */
-	proc_set_cred_init(p, newcred);
+	p->p_ucred = crcowget(newcred);
	newcred->cr_users--;
	crfree(newcred);
#ifdef AUDIT
	audit_cred_kproc0(newcred);
#endif
#ifdef MAC
	mac_cred_create_swapper(newcred);
#endif
	/* Create sigacts. */
	p->p_sigacts = sigacts_alloc();

	/* Initialize signal state for process 0. */
	siginit(&proc0);

	/* Create the file descriptor table. */
	p->p_pd = pdinit(NULL, false);
	p->p_fd = fdinit();
	p->p_fdtol = NULL;

	/* Create the limits structures. */
	p->p_limit = lim_alloc();
	/* Start with every limit unlimited, then override selected ones. */
	for (i = 0; i < RLIM_NLIMITS; i++)
		p->p_limit->pl_rlimit[i].rlim_cur =
		    p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
	p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
	    p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
	p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
	    p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
	p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
	p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
	p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
	p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
	/* Cast to avoid overflow on i386/PAE. */
	pageablemem = ptoa((vm_paddr_t)vm_free_count());
	p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
	    p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem;
	p->p_cpulimit = RLIM_INFINITY;

	PROC_LOCK(p);
	thread_cow_get_proc(td, p);
	PROC_UNLOCK(p);

	/* Initialize resource accounting structures. */
	racct_create(&p->p_racct);

	p->p_stats = pstats_alloc();

	/* Allocate a prototype map so we have something to fork. */
	p->p_vmspace = &vmspace0;
	refcount_init(&vmspace0.vm_refcnt, 1);
	pmap_pinit0(vmspace_pmap(&vmspace0));

	/*
	 * proc0 is not expected to enter usermode, so there is no special
	 * handling for sv_minuser here, like is done for exec_new_vmspace().
	 */
	vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0),
	    p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);

	/*
	 * Call the init and ctor for the new thread and proc.  We wait
	 * to do this until all other structures are fairly sane.
	 */
	EVENTHANDLER_DIRECT_INVOKE(process_init, p);
	EVENTHANDLER_DIRECT_INVOKE(thread_init, td);
#ifdef KDTRACE_HOOKS
	kdtrace_proc_ctor(p);
	kdtrace_thread_ctor(td);
#endif
	EVENTHANDLER_DIRECT_INVOKE(process_ctor, p);
	EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td);

	/*
	 * Charge root for one process.
	 */
	(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
	PROC_LOCK(p);
	racct_add_force(p, RACCT_NPROC, 1);
	PROC_UNLOCK(p);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);

/*
 * proc0_post: SYSINIT callback (SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST).
 * For every process not in state PRS_NEW it stamps p_start with the current
 * uptime and zeroes the accumulated runtime/tick counters of the process
 * and of each of its threads, then resets the per-CPU switchtime and
 * switchticks values.
 */
/* ARGSUSED*/
static void
proc0_post(void *dummy __unused)
{
	struct proc *p;
	struct rusage ru;
	struct thread *td;

	/*
	 * Now we can look at the time, having had a chance to verify the
	 * time from the filesystem.  Pretend that proc0 started now.
	 */
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		microuptime(&p->p_stats->p_start);
		PROC_STATLOCK(p);
		rufetch(p, &ru);	/* Clears thread stats */
		p->p_rux.rux_runtime = 0;
		p->p_rux.rux_uticks = 0;
		p->p_rux.rux_sticks = 0;
		p->p_rux.rux_iticks = 0;
		PROC_STATUNLOCK(p);
		FOREACH_THREAD_IN_PROC(p, td) {
			td->td_runtime = 0;
		}
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
	PCPU_SET(switchtime, cpu_ticks());
	PCPU_SET(switchticks, ticks);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL);

/*
 ***************************************************************************
 ****
 ****	The following SYSINIT's and glue code should be moved to the
 ****	respective files on a per subsystem basis.
**** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { struct image_args args; int error; char *var, *path; char *free_init_path, *tmp_init_path; struct thread *td; struct proc *p; struct vmspace *oldvmspace; TSENTER(); /* Here so we don't overlap with mi_startup. */ td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. 
*/ kern_unsetenv("kern.geom.eli.passphrase"); /* For Multicons, report which console is primary to both */ if (boothowto & RB_MULTIPLE) { if (boothowto & RB_SERIAL) printf("Dual Console: Serial Primary, Video Secondary\n"); else printf("Dual Console: Video Primary, Serial Secondary\n"); } if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } free_init_path = tmp_init_path = strdup(init_path, M_TEMP); while ((path = strsep(&tmp_init_path, ":")) != NULL) { if (bootverbose) printf("start_init: trying %s\n", path); memset(&args, 0, sizeof(args)); error = exec_alloc_args(&args); if (error != 0) panic("%s: Can't allocate space for init arguments %d", __func__, error); error = exec_args_add_fname(&args, path, UIO_SYSSPACE); if (error != 0) panic("%s: Can't add fname %d", __func__, error); error = exec_args_add_arg(&args, path, UIO_SYSSPACE); if (error != 0) panic("%s: Can't add argv[0] %d", __func__, error); if (boothowto & RB_SINGLE) error = exec_args_add_arg(&args, "-s", UIO_SYSSPACE); if (error != 0) panic("%s: Can't add argv[0] %d", __func__, error); /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ KASSERT((td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); oldvmspace = p->p_vmspace; error = kern_execve(td, &args, NULL, oldvmspace); KASSERT(error != 0, ("kern_execve returned success, not EJUSTRETURN")); if (error == EJUSTRETURN) { exec_cleanup(td, oldvmspace); free(free_init_path, M_TEMP); TSEXIT(); return; } if (error != ENOENT) printf("exec %s: error %d\n", path, error); } free(free_init_path, M_TEMP); printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in its own address space. We do this * early to reserve pid 1. Note special case - do not make it * runnable yet, init execution is started when userspace can be served. 
*/
/*
 * create_init: SYSINIT callback (SI_SUB_CREATE_INIT, SI_ORDER_FIRST).
 * Forks from thread0 with RFFDG | RFPROC | RFSTOPPED, storing the new
 * process in initproc, and panics if the fork fails.  It then gives the new
 * process its own copy of the credentials and arranges for its first thread
 * to run start_init().
 */
static void
create_init(const void *udata __unused)
{
	struct fork_req fr;
	struct ucred *newcred, *oldcred;
	struct thread *td;
	int error;

	bzero(&fr, sizeof(fr));
	fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
	fr.fr_procp = &initproc;
	error = fork1(&thread0, &fr);
	if (error)
		panic("cannot fork init: %d\n", error);
	KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
	/* divorce init's credentials from the kernel's */
	newcred = crget();
	sx_xlock(&proctree_lock);
	PROC_LOCK(initproc);
	initproc->p_flag |= P_SYSTEM | P_INMEM;
	initproc->p_treeflag |= P_TREE_REAPER;
	oldcred = initproc->p_ucred;
	crcopy(newcred, oldcred);
#ifdef MAC
	mac_cred_create_init(newcred);
#endif
#ifdef AUDIT
	audit_cred_proc1(newcred);
#endif
	proc_set_cred(initproc, newcred);
	/* Re-point the first thread's cached credentials at the new ones. */
	td = FIRST_THREAD_IN_PROC(initproc);
	crcowfree(td);
	td->td_realucred = crcowget(initproc->p_ucred);
	td->td_ucred = td->td_realucred;
	PROC_UNLOCK(initproc);
	sx_xunlock(&proctree_lock);
	/* The old credential reference is dropped after the locks are released. */
	crfree(oldcred);
	cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc),
	    start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);

/*
 * Make it runnable now.
 *
 * kick_init: SYSINIT callback (SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE) that
 * marks initproc's first thread as able to run and hands it to sched_add().
 */
static void
kick_init(const void *udata __unused)
{
	struct thread *td;

	td = FIRST_THREAD_IN_PROC(initproc);
	/*
	 * NOTE(review): no thread_unlock() is visible here; presumably
	 * sched_add() releases the thread lock -- confirm.
	 */
	thread_lock(td);
	TD_SET_CAN_RUN(td);
	sched_add(td, SRQ_BORING);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);

/*
 * DDB(4).
 */

#ifdef DDB
/*
 * Print one sysinit entry: the symbol name found for the entry itself, its
 * subsystem and order values, and its function pointer, function name and
 * argument.  Output goes through db_printf() when ddb is true and through
 * printf() otherwise.  A NULL sip only produces a diagnostic line.
 */
static void
db_show_print_syinit(struct sysinit *sip, bool ddb)
{
	const char *sname, *funcname;
	c_db_sym_t sym;
	db_expr_t  offset;

/* Local output helper; undefined again at the end of the function. */
#define xprint(...)							\
	if (ddb)							\
		db_printf(__VA_ARGS__);					\
	else								\
		printf(__VA_ARGS__)

	if (sip == NULL) {
		xprint("%s: no sysinit * given\n", __func__);
		return;
	}

	sym = db_search_symbol((vm_offset_t)sip, DB_STGY_ANY, &offset);
	db_symbol_values(sym, &sname, NULL);
	sym = db_search_symbol((vm_offset_t)sip->func, DB_STGY_PROC, &offset);
	db_symbol_values(sym, &funcname, NULL);
	/* A name that could not be resolved is printed as an empty string. */
	xprint("%s(%p)\n", (sname != NULL) ? sname : "", sip);
	xprint(" %#08x %#08x\n", sip->subsystem, sip->order);
	xprint(" %p(%s)(%p)\n",
	    sip->func, (funcname != NULL) ? funcname : "", sip->udata);
#undef xprint
}

/*
 * DDB "show sysinit" command: prints a header, then every entry of
 * sysinit_done_list followed by every entry of sysinit_list, stopping as
 * soon as db_pager_quit is set.
 */
DB_SHOW_COMMAND_FLAGS(sysinit, db_show_sysinit, DB_CMD_MEMSAFE)
{
	struct sysinit *sip;

	db_printf("SYSINIT vs Name(Ptr)\n");
	db_printf(" Subsystem Order\n");
	db_printf(" Function(Name)(Arg)\n");
	STAILQ_FOREACH(sip, &sysinit_done_list, next) {
		db_show_print_syinit(sip, true);
		if (db_pager_quit)
			return;
	}
	STAILQ_FOREACH(sip, &sysinit_list, next) {
		db_show_print_syinit(sip, true);
		if (db_pager_quit)
			break;
	}
}
#endif /* DDB */
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index aaa46a64ef9f..3080bd11123d 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -1,1239 +1,1239 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_fork_func_t dtrace_fasttrap_fork; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int"); #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int sys_fork(struct thread *td, struct fork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } /* ARGUSED */ int sys_pdfork(struct thread *td, struct pdfork_args *uap) { struct fork_req fr; int error, fd, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPROCDESC; fr.fr_pidp = &pid; fr.fr_pd_fd = &fd; 
fr.fr_pd_flags = uap->flags; AUDIT_ARG_FFLAGS(uap->flags); /* * It is necessary to return fd by reference because 0 is a valid file * descriptor number, and the child needs to be able to distinguish * itself from the parent using the return value. */ error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; error = copyout(&fd, uap->fdp, sizeof(fd)); } return (error); } /* ARGSUSED */ int sys_vfork(struct thread *td, struct vfork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int sys_rfork(struct thread *td, struct rfork_args *uap) { struct fork_req fr; int error, pid; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); /* RFSPAWN must not appear with others */ if ((uap->flags & RFSPAWN) != 0 && uap->flags != RFSPAWN) return (EINVAL); AUDIT_ARG_FFLAGS(uap->flags); bzero(&fr, sizeof(fr)); if ((uap->flags & RFSPAWN) != 0) { fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; fr.fr_flags2 = FR2_DROPSIG_CAUGHT; } else { fr.fr_flags = uap->flags; } fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int __exclusive_cache_line nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. 
*/
static int randompid = 0;

/*
 * Handler for the kern.randompid sysctl.  Reports the current modulus and,
 * on a write, stores a sanitised value: 0 disables the feature, 1 picks a
 * random modulus, anything negative or above pid_max - 100 becomes
 * pid_max - 100, and anything below 100 is raised to 100.  The update is
 * made with allproc_lock held exclusively.
 */
static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
	int error, pid;

	error = sysctl_wire_old_buffer(req, sizeof(int));
	if (error != 0)
		return(error);
	sx_xlock(&allproc_lock);
	pid = randompid;
	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error == 0 && req->newptr != NULL) {
		if (pid == 0)
			randompid = 0;
		else if (pid == 1)
			/* generate a random PID modulus between 100 and 1123 */
			randompid = 100 + arc4random() % 1024;
		else if (pid < 0 || pid > pid_max - 100)
			/* out of range */
			randompid = pid_max - 100;
		else if (pid < 100)
			/* Make it reasonable */
			randompid = 100;
		else
			randompid = pid;
	}
	sx_xunlock(&allproc_lock);
	return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, randompid,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
    sysctl_kern_randompid, "I",
    "Random PID modulus. Special values: 0: disable, 1: choose random value");

/* ID bitmaps defined elsewhere; fork_findpid() consults all four. */
extern bitstr_t proc_id_pidmap;
extern bitstr_t proc_id_grpidmap;
extern bitstr_t proc_id_sessidmap;
extern bitstr_t proc_id_reapmap;

/*
 * Find an unused process ID
 *
 * If RFHIGHPID is set (used during system boot), do not allocate
 * low-numbered pids.
 *
 * The search starts at lastpid + 1 (plus a random offset when randompid is
 * non-zero and RFHIGHPID is clear), wraps to 2 on reaching pid_max, and
 * skips any ID that is also set in the group, session or reaper bitmaps.
 * The chosen ID is marked in proc_id_pidmap before procid_lock is dropped.
 */
static int
fork_findpid(int flags)
{
	pid_t result;
	int trypid, random;

	/*
	 * Avoid calling arc4random with procid_lock held.
	 */
	random = 0;
	if (__predict_false(randompid))
		random = arc4random() % randompid;

	mtx_lock(&procid_lock);

	trypid = lastpid + 1;
	if (flags & RFHIGHPID) {
		if (trypid < 10)
			trypid = 10;
	} else {
		trypid += random;
	}
retry:
	if (trypid >= pid_max)
		trypid = 2;

	/* result is -1 when no clear bit exists in [trypid, pid_max). */
	bit_ffc_at(&proc_id_pidmap, trypid, pid_max, &result);
	if (result == -1) {
		KASSERT(trypid != 2, ("unexpectedly ran out of IDs"));
		trypid = 2;
		goto retry;
	}
	if (bit_test(&proc_id_grpidmap, result) ||
	    bit_test(&proc_id_sessidmap, result) ||
	    bit_test(&proc_id_reapmap, result)) {
		trypid = result + 1;
		goto retry;
	}

	/*
	 * RFHIGHPID does not mess with the lastpid counter during boot.
	 */
	if ((flags & RFHIGHPID) == 0)
		lastpid = result;

	bit_set(&proc_id_pidmap, result);
	mtx_unlock(&procid_lock);

	return (result);
}

/*
 * Handle a fork request that does not create a new process (RFPROC clear;
 * asserted below): apply the requested vmspace and file-descriptor changes
 * to the calling process itself.  Returns ERESTART when thread_single()
 * returns non-zero, otherwise the result of vm_forkproc().
 */
static int
fork_norfproc(struct thread *td, int flags)
{
	struct proc *p1;
	int error;

	KASSERT((flags & RFPROC) == 0,
	    ("fork_norfproc called with RFPROC set"));
	p1 = td->td_proc;

	/*
	 * Quiesce other threads if necessary.  If RFMEM is not specified we
	 * must ensure that other threads do not concurrently create a second
	 * process sharing the vmspace, see vmspace_unshare().
	 */
	if ((p1->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS &&
	    ((flags & (RFCFDG | RFFDG)) != 0 || (flags & RFMEM) == 0)) {
		PROC_LOCK(p1);
		if (thread_single(p1, SINGLE_BOUNDARY)) {
			PROC_UNLOCK(p1);
			return (ERESTART);
		}
		PROC_UNLOCK(p1);
	}

	error = vm_forkproc(td, NULL, NULL, NULL, flags);
	if (error != 0)
		goto fail;

	/*
	 * Close all file descriptors.
	 */
	if ((flags & RFCFDG) != 0) {
		struct filedesc *fdtmp;
		struct pwddesc *pdtmp;

		/* Build the replacement tables before freeing the old ones. */
		pdtmp = pdinit(td->td_proc->p_pd, false);
		fdtmp = fdinit();
		pdescfree(td);
		fdescfree(td);
		p1->p_fd = fdtmp;
		p1->p_pd = pdtmp;
	}

	/*
	 * Unshare file descriptors (from parent).
	 */
	if ((flags & RFFDG) != 0) {
		fdunshare(td);
		pdunshare(td);
	}

fail:
	/* Same condition as the quiesce test above: end single-threading. */
	if ((p1->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS &&
	    ((flags & (RFCFDG | RFFDG)) != 0 || (flags & RFMEM) == 0)) {
		PROC_LOCK(p1);
		thread_single_end(p1, SINGLE_BOUNDARY);
		PROC_UNLOCK(p1);
	}
	return (error);
}

static void
do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2,
    struct vmspace *vm2, struct file *fp_procdesc)
{
	struct proc *p1, *pptr;
	struct filedesc *fd;
	struct filedesc_to_leader *fdtol;
	struct pwddesc *pd;
	struct sigacts *newsigacts;

	p1 = td->td_proc;

	PROC_LOCK(p1);
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    __rangeof(struct proc, p_startcopy, p_endcopy));
	pargs_hold(p2->p_args);
	PROC_UNLOCK(p1);

	bzero(&p2->p_startzero,
	    __rangeof(struct proc, p_startzero, p_endzero));

	/* Tell the prison that we exist.
*/ prison_proc_hold(p2->p_ucred->cr_prison); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = fork_findpid(fr->fr_flags); AUDIT_ARG_PID(p2->p_pid); TSFORK(p2->p_pid, p1->p_pid); sx_xlock(&allproc_lock); LIST_INSERT_HEAD(&allproc, p2, p_list); allproc_gen++; prison_proc_link(p2->p_ucred->cr_prison, p2); sx_xunlock(&allproc_lock); sx_xlock(PIDHASHLOCK(p2->p_pid)); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); sx_xunlock(PIDHASHLOCK(p2->p_pid)); tidhash_add(td2); /* * Malloc things while we don't hold any locks. */ if (fr->fr_flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (fr->fr_flags & RFCFDG) { pd = pdinit(p1->p_pd, false); fd = fdinit(); fdtol = NULL; } else if (fr->fr_flags & RFFDG) { if (fr->fr_flags2 & FR2_SHARE_PATHS) pd = pdshare(p1->p_pd); else pd = pdcopy(p1->p_pd); fd = fdcopy(p1->p_fd); fdtol = NULL; } else { if (fr->fr_flags2 & FR2_SHARE_PATHS) pd = pdcopy(p1->p_pd); else pd = pdshare(p1->p_pd); fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((fr->fr_flags & RFTHREAD) != 0) { /* * Shared file descriptor table, and shared * process leaders. */ fdtol = filedesc_to_leader_share(p1->p_fdtol, p1->p_fd); } else { /* * Shared file descriptor table, and different * process leaders. */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. 
*/ PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name)); td2->td_sigstk = td->td_sigstk; td2->td_flags = TDF_INMEM; td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); /* * Request AST to check for TDP_RFPPWAIT. Do it here * to avoid calling thread_lock() again. */ if ((fr->fr_flags & RFPPWAIT) != 0) ast_sched_locked(td, TDA_VFORK); thread_unlock(td); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = P_INMEM; p2->p_flag2 = p1->p_flag2 & (P2_ASLR_DISABLE | P2_ASLR_ENABLE | P2_ASLR_IGNSTART | P2_NOTRACE | P2_NOTRACE_EXEC | P2_PROTMAX_ENABLE | P2_PROTMAX_DISABLE | P2_TRAPCAP | P2_STKGAP_DISABLE | P2_STKGAP_DISABLE_EXEC | P2_NO_NEW_PRIVS | P2_WXORX_DISABLE | P2_WXORX_ENABLE_EXEC); p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); if (fr->fr_flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; if ((fr->fr_flags2 & (FR2_DROPSIG_CAUGHT | FR2_KPROC)) != 0) { mtx_lock(&p2->p_sigacts->ps_mtx); if ((fr->fr_flags2 & FR2_DROPSIG_CAUGHT) != 0) sig_drop_caught(p2); if ((fr->fr_flags2 & FR2_KPROC) != 0) p2->p_sigacts->ps_flag |= PS_NOCLDWAIT; mtx_unlock(&p2->p_sigacts->ps_mtx); } } if (fr->fr_flags & RFTSIGZMB) p2->p_sigparent = RFTSIGNUM(fr->fr_flags); else if (fr->fr_flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; if ((fr->fr_flags2 & FR2_KPROC) != 0) { p2->p_flag |= P_SYSTEM | P_KPROC; td2->td_pflags |= TDP_KTHREAD; } p2->p_textvp = p1->p_textvp; p2->p_textdvp = p1->p_textdvp; p2->p_fd = fd; p2->p_fdtol = fdtol; p2->p_pd = pd; if 
(p1->p_flag2 & P2_INHERIT_PROTECTED) { p2->p_flag |= P_PROTECTED; p2->p_flag2 |= P2_INHERIT_PROTECTED; } /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); thread_cow_get_proc(td2, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * Bump references to the text vnode and directory, and copy * the hardlink name. */ if (p2->p_textvp != NULL) vrefact(p2->p_textvp); if (p2->p_textdvp != NULL) vrefact(p2->p_textdvp); p2->p_binname = p1->p_binname == NULL ? NULL : strdup(p1->p_binname, M_PARGS); /* * Set up linkage for kernel based threading. */ if ((fr->fr_flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); kern_psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. 
*/ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= (td->td_pflags & (TDP_ALTSTACK | TDP_SIGFASTBLOCK)); SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (fr->fr_flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); TAILQ_INIT(&p2->p_kqtim_stop); /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if ((fr->fr_flags & RFNOWAIT) != 0) { pptr = p1->p_reaper; p2->p_reaper = pptr; } else { p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ? p1 : p1->p_reaper; pptr = p1; } p2->p_pptr = pptr; p2->p_oppid = pptr->p_pid; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_reaplist); LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1 && p1 != initproc) { p2->p_reapsubtree = p2->p_pid; proc_id_set_cond(PROC_ID_REAP, p2->p_pid); } sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); #ifdef KTRACE ktrprocfork(p1, p2); #endif /* * Finish creating the child process. It will return via a different * execution path later. 
(ie: directly into user mode) */ vm_forkproc(td, p2, td2, vm2, fr->fr_flags); if (fr->fr_flags == (RFFDG | RFPROC)) { VM_CNT_INC(v_forks); VM_CNT_ADD(v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { VM_CNT_INC(v_vforks); VM_CNT_ADD(v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { VM_CNT_INC(v_kthreads); VM_CNT_ADD(v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { VM_CNT_INC(v_rforks); VM_CNT_ADD(v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Associate the process descriptor with the process before anything * can happen that might cause that process to need the descriptor. * However, don't do this until after fork(2) can no longer fail. */ if (fr->fr_flags & RFPROCDESC) procdesc_new(p2, fr->fr_pd_flags); /* * Both processes are set up, now check if any loadable modules want * to adjust anything. */ EVENTHANDLER_DIRECT_INVOKE(process_fork, p1, p2, fr->fr_flags); /* * Set the child start time and mark the process as being complete. */ PROC_LOCK(p2); PROC_LOCK(p1); microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the new process so that any * tracepoints inherited from the parent can be removed. We have to do * this only after p_state is PRS_NORMAL since the fasttrap module will * use pfind() later on. */ if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork) dtrace_fasttrap_fork(p1, p2); #endif if (fr->fr_flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; td->td_dbgflags |= TDB_VFORK; } PROC_UNLOCK(p2); /* * Tell any interested parties about the new process. */ knote_fork(p1->p_klist, p2->p_pid); /* * Now can be swapped. 
*/ _PRELE(p1); PROC_UNLOCK(p1); SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags); if (fr->fr_flags & RFPROCDESC) { procdesc_finit(p2->p_procdesc, fp_procdesc); fdrop(fp_procdesc, td); } /* * Speculative check for PTRACE_FORK. PTRACE_FORK is not * synced with forks in progress so it is OK if we miss it * if being set atm. */ if ((p1->p_ptevents & PTRACE_FORK) != 0) { sx_xlock(&proctree_lock); PROC_LOCK(p2); /* * p1->p_ptevents & p1->p_pptr are protected by both * process and proctree locks for modifications, * so owning proctree_lock allows the race-free read. */ if ((p1->p_ptevents & PTRACE_FORK) != 0) { /* * Arrange for debugger to receive the fork event. * * We can report PL_FLAG_FORKED regardless of * P_FOLLOWFORK settings, but it does not make a sense * for runaway child. */ td->td_dbgflags |= TDB_FORK; td->td_dbg_forked = p2->p_pid; td2->td_dbgflags |= TDB_STOPATFORK; proc_set_traced(p2, true); CTR2(KTR_PTRACE, "do_fork: attaching to new child pid %d: oppid %d", p2->p_pid, p2->p_oppid); proc_reparent(p2, p1->p_pptr, false); } PROC_UNLOCK(p2); sx_xunlock(&proctree_lock); } racct_proc_fork_done(p2); if ((fr->fr_flags & RFSTOPPED) == 0) { if (fr->fr_pidp != NULL) *fr->fr_pidp = p2->p_pid; /* * If RFSTOPPED not requested, make child runnable and * add to run queue. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); } else { *fr->fr_procp = p2; } } static void ast_vfork(struct thread *td, int tda __unused) { struct proc *p, *p2; MPASS(td->td_pflags & TDP_RFPPWAIT); p = td->td_proc; /* * Preserve synchronization semantics of vfork. If * waiting for child to exec or exit, fork set * P_PPWAIT on child, and there we sleep on our proc * (in case of exit). * * Do it after the ptracestop() above is finished, to * not block our debugger until child execs or exits * to finish vfork wait. 
*/ td->td_pflags &= ~TDP_RFPPWAIT; p2 = td->td_rfppwait_p; again: PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) { PROC_LOCK(p); if (thread_suspend_check_needed()) { PROC_UNLOCK(p2); thread_suspend_check(0); PROC_UNLOCK(p); goto again; } else { PROC_UNLOCK(p); } cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz); } PROC_UNLOCK(p2); if (td->td_dbgflags & TDB_VFORK) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_VFORK) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~TDB_VFORK; PROC_UNLOCK(p); } } int fork1(struct thread *td, struct fork_req *fr) { struct proc *p1, *newproc; struct thread *td2; struct vmspace *vm2; struct ucred *cred; struct file *fp_procdesc; struct pgrp *pg; vm_ooffset_t mem_charged; int error, nprocs_new; static int curfail; static struct timeval lastfail; int flags, pages; bool killsx_locked, singlethreaded; flags = fr->fr_flags; pages = fr->fr_pages; if ((flags & RFSTOPPED) != 0) MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL); else MPASS(fr->fr_procp == NULL); /* Check for the undefined or unimplemented flags. */ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) return (EINVAL); /* Signal value requires RFTSIGZMB. */ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) return (EINVAL); /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* Check the validity of the signal number. */ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) return (EINVAL); if ((flags & RFPROCDESC) != 0) { /* Can't not create a process yet get a process descriptor. */ if ((flags & RFPROC) == 0) return (EINVAL); /* Must provide a place to put a procdesc if creating one. */ if (fr->fr_pd_fd == NULL) return (EINVAL); /* Check if we are using supported flags. */ if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0) return (EINVAL); } p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. 
*/ if ((flags & RFPROC) == 0) { if (fr->fr_procp != NULL) *fr->fr_procp = NULL; else if (fr->fr_pidp != NULL) *fr->fr_pidp = 0; return (fork_norfproc(td, flags)); } fp_procdesc = NULL; newproc = NULL; vm2 = NULL; killsx_locked = false; singlethreaded = false; /* * Increment the nprocs resource before allocations occur. * Although process entries are dynamically created, we still * keep a global limit on the maximum number we will * create. There are hard-limits as to the number of processes * that can run, established by the KVA and memory usage for * the process data. * * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; if (nprocs_new >= maxproc - 10) { if (priv_check_cred(td->td_ucred, PRIV_MAXPROC) != 0 || nprocs_new >= maxproc) { error = EAGAIN; sx_xlock(&allproc_lock); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxproc limit exceeded by uid %u " "(pid %d); see tuning(7) and " "login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } sx_xunlock(&allproc_lock); goto fail2; } } /* * If we are possibly multi-threaded, and there is a process * sending a signal to our group right now, ensure that our * other threads cannot be chosen for the signal queueing. * Otherwise, this might delay signal action, and make the new * child escape the signaling. */ pg = p1->p_pgrp; if (p1->p_numthreads > 1) { if (sx_try_slock(&pg->pg_killsx) != 0) { killsx_locked = true; } else { PROC_LOCK(p1); if (thread_single(p1, SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); error = ERESTART; goto fail2; } PROC_UNLOCK(p1); singlethreaded = true; } } /* * Atomically check for signals and block processes from sending * a signal to our process group until the child is visible. 
*/ if (!killsx_locked && sx_slock_sig(&pg->pg_killsx) != 0) { error = ERESTART; goto fail2; } if (__predict_false(p1->p_pgrp != pg || sig_intr() != 0)) { /* * Either the process was moved to other process * group, or there is pending signal. sx_slock_sig() * does not check for signals if not sleeping for the * lock. */ sx_sunlock(&pg->pg_killsx); killsx_locked = false; error = ERESTART; goto fail2; } else { killsx_locked = true; } /* * If required, create a process descriptor in the parent first; we * will abandon it if something goes wrong. We don't finit() until * later. */ if (flags & RFPROCDESC) { error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd, fr->fr_pd_flags, fr->fr_pd_fcaps); if (error != 0) goto fail2; AUDIT_ARG_FD(*fr->fr_pd_fd); } mem_charged = 0; if (pages == 0) pages = kstack_pages; /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); td2 = FIRST_THREAD_IN_PROC(newproc); if (td2 == NULL) { td2 = thread_alloc(pages); if (td2 == NULL) { error = ENOMEM; goto fail2; } proc_linkup(newproc, td2); } else { kmsan_thread_alloc(td2); if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) { if (td2->td_kstack != 0) vm_thread_dispose(td2); if (!thread_alloc_stack(td2, pages)) { error = ENOMEM; goto fail2; } } } if ((flags & RFMEM) == 0) { vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); if (vm2 == NULL) { error = ENOMEM; goto fail2; } if (!swap_reserve(mem_charged)) { /* * The swap reservation failed. The accounting * from the entries of the copied vm2 will be * subtracted in vmspace_free(), so force the * reservation there. */ swap_reserve_force(mem_charged); error = ENOMEM; goto fail2; } } else vm2 = NULL; /* * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ - proc_set_cred_init(newproc, td->td_ucred); + newproc->p_ucred = crcowget(td->td_ucred); /* * Initialize resource accounting for the child process. 
*/ error = racct_proc_fork(p1, newproc); if (error != 0) { error = EAGAIN; goto fail1; } #ifdef MAC mac_proc_init(newproc); #endif newproc->p_klist = knlist_alloc(&newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. */ cred = td->td_ucred; if (!chgproccnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPROC))) { if (priv_check_cred(cred, PRIV_PROC_LIMIT) != 0) goto fail0; chgproccnt(cred->cr_ruidinfo, 1, 0); } do_fork(td, fr, newproc, td2, vm2, fp_procdesc); error = 0; goto cleanup; fail0: error = EAGAIN; #ifdef MAC mac_proc_destroy(newproc); #endif racct_proc_exit(newproc); fail1: proc_unset_cred(newproc); fail2: if (vm2 != NULL) vmspace_free(vm2); uma_zfree(proc_zone, newproc); if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) { fdclose(td, fp_procdesc, *fr->fr_pd_fd); fdrop(fp_procdesc, td); } atomic_add_int(&nprocs, -1); cleanup: if (killsx_locked) sx_sunlock(&pg->pg_killsx); if (singlethreaded) { PROC_LOCK(p1); thread_single_end(p1, SINGLE_BOUNDARY); PROC_UNLOCK(p1); } if (error != 0) pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(void (*callout)(void *, struct trapframe *), void *arg, struct trapframe *frame) { struct proc *p; struct thread *td; struct thread *dtd; kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED); td = curthread; p = td->td_proc; KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)", td, td_get_sched(td), p->p_pid, td->td_name); sched_fork_exit(td); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. 
*/ if ((dtd = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(dtd); } thread_unlock(td); /* * cpu_fork_kthread_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KPROC) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", td->td_name, p->p_pid); kthread_exit(); } mtx_assert(&Giant, MA_NOTOWNED); if (p->p_sysent->sv_schedtail != NULL) (p->p_sysent->sv_schedtail)(td); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. This function is passed in to fork_exit() * as the first parameter and is called when returning to a new * userland process. */ void fork_return(struct thread *td, struct trapframe *frame) { struct proc *p; p = td->td_proc; if (td->td_dbgflags & TDB_STOPATFORK) { PROC_LOCK(p); if ((p->p_flag & P_TRACED) != 0) { /* * Inform the debugger if one is still present. */ td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP; ptracestop(td, SIGSTOP, NULL); td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX); } else { /* * ... otherwise clear the request. */ td->td_dbgflags &= ~TDB_STOPATFORK; } PROC_UNLOCK(p); } else if (p->p_flag & P_TRACED) { /* * This is the start of a new thread in a traced * process. Report a system call exit event. */ PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; if ((p->p_ptevents & PTRACE_SCX) != 0 || (td->td_dbgflags & TDB_BORN) != 0) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_BORN); PROC_UNLOCK(p); } /* * If the prison was killed mid-fork, die along with it. 
*/ if (!prison_isalive(td->td_ucred->cr_prison)) exit1(td, 0, SIGKILL); userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(td->td_sa.code, 0, 0); #endif } static void fork_init(void *arg __unused) { ast_register(TDA_VFORK, ASTR_ASTF_REQUIRED | ASTR_TDP, TDP_RFPPWAIT, ast_vfork); } SYSINIT(fork, SI_SUB_INTRINSIC, SI_ORDER_ANY, fork_init, NULL); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index 00eb2fccdeef..6d4d5ef47926 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -1,2531 +1,2520 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2000-2001 Robert N. M. Watson. * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 */ /* * System calls related to processes and protection */ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_43 #include #endif #include #include #include #include #include #include #include #include #include #ifdef REGRESSION FEATURE(regression, "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)"); #endif #include #include static MALLOC_DEFINE(M_CRED, "cred", "credentials"); SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "BSD security policy"); static void crfree_final(struct ucred *cr); static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups); static int cr_canseeotheruids(struct ucred *u1, struct ucred *u2); static int cr_canseeothergids(struct ucred *u1, struct ucred *u2); static int cr_canseejailproc(struct ucred *u1, struct ucred *u2); #ifndef _SYS_SYSPROTO_H_ struct getpid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getpid(struct thread *td, struct getpid_args *uap) { struct proc *p = td->td_proc; td->td_retval[0] = p->p_pid; #if defined(COMPAT_43) if (SV_PROC_FLAG(p, SV_AOUT)) td->td_retval[1] = kern_getppid(td); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getppid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getppid(struct thread *td, struct getppid_args *uap) { 
td->td_retval[0] = kern_getppid(td); return (0); } int kern_getppid(struct thread *td) { struct proc *p = td->td_proc; return (p->p_oppid); } /* * Get process group ID; note that POSIX getpgrp takes no parameter. */ #ifndef _SYS_SYSPROTO_H_ struct getpgrp_args { int dummy; }; #endif int sys_getpgrp(struct thread *td, struct getpgrp_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* Get an arbitrary pid's process group id */ #ifndef _SYS_SYSPROTO_H_ struct getpgid_args { pid_t pid; }; #endif int sys_getpgid(struct thread *td, struct getpgid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* * Get an arbitrary pid's session id. */ #ifndef _SYS_SYSPROTO_H_ struct getsid_args { pid_t pid; }; #endif int sys_getsid(struct thread *td, struct getsid_args *uap) { return (kern_getsid(td, uap->pid)); } int kern_getsid(struct thread *td, pid_t pid) { struct proc *p; int error; if (pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_session->s_sid; PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct getuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getuid(struct thread *td, struct getuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_ruid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_uid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct geteuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_geteuid(struct thread *td, struct geteuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_uid; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgid_args { int dummy; }; #endif /* ARGSUSED */ 
int sys_getgid(struct thread *td, struct getgid_args *uap) { td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_groups[0]; #endif return (0); } /* * Get effective group ID. The "egid" is groups[0], and could be obtained * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getegid(struct thread *td, struct getegid_args *uap) { td->td_retval[0] = td->td_ucred->cr_groups[0]; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { int gidsetsize; gid_t *gidset; }; #endif int sys_getgroups(struct thread *td, struct getgroups_args *uap) { struct ucred *cred; int ngrp, error; cred = td->td_ucred; ngrp = cred->cr_ngroups; if (uap->gidsetsize == 0) { error = 0; goto out; } if (uap->gidsetsize < ngrp) return (EINVAL); error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t)); out: td->td_retval[0] = ngrp; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setsid_args { int dummy; }; #endif /* ARGSUSED */ int sys_setsid(struct thread *td, struct setsid_args *uap) { struct pgrp *pgrp; int error; struct proc *p = td->td_proc; struct pgrp *newpgrp; struct session *newsess; pgrp = NULL; newpgrp = uma_zalloc(pgrp_zone, M_WAITOK); newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); again: error = 0; sx_xlock(&proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); error = EPERM; } else { error = enterpgrp(p, p->p_pid, newpgrp, newsess); if (error == ERESTART) goto again; MPASS(error == 0); td->td_retval[0] = p->p_pid; newpgrp = NULL; newsess = NULL; } sx_xunlock(&proctree_lock); uma_zfree(pgrp_zone, newpgrp); free(newsess, M_SESSION); return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pid must be caller or child of caller (ESRCH) 
* if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ #ifndef _SYS_SYSPROTO_H_ struct setpgid_args { int pid; /* target process id */ int pgid; /* target pgrp id */ }; #endif /* ARGSUSED */ int sys_setpgid(struct thread *td, struct setpgid_args *uap) { struct proc *curp = td->td_proc; struct proc *targp; /* target process */ struct pgrp *pgrp; /* target pgrp */ int error; struct pgrp *newpgrp; if (uap->pgid < 0) return (EINVAL); newpgrp = uma_zalloc(pgrp_zone, M_WAITOK); again: error = 0; sx_xlock(&proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; goto done; } if (!inferior(targp)) { PROC_UNLOCK(targp); error = ESRCH; goto done; } if ((error = p_cansee(td, targp))) { PROC_UNLOCK(targp); goto done; } if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) { PROC_UNLOCK(targp); error = EPERM; goto done; } if (targp->p_flag & P_EXEC) { PROC_UNLOCK(targp); error = EACCES; goto done; } PROC_UNLOCK(targp); } else targp = curp; if (SESS_LEADER(targp)) { error = EPERM; goto done; } if (uap->pgid == 0) uap->pgid = targp->p_pid; if ((pgrp = pgfind(uap->pgid)) == NULL) { if (uap->pgid == targp->p_pid) { error = enterpgrp(targp, uap->pgid, newpgrp, NULL); if (error == 0) newpgrp = NULL; } else error = EPERM; } else { if (pgrp == targp->p_pgrp) { PGRP_UNLOCK(pgrp); goto done; } if (pgrp->pg_id != targp->p_pid && pgrp->pg_session != curp->p_session) { PGRP_UNLOCK(pgrp); error = EPERM; goto done; } PGRP_UNLOCK(pgrp); error = enterthispgrp(targp, pgrp); } done: KASSERT(error == 0 || newpgrp != NULL, ("setpgid failed and newpgrp is NULL")); if (error == ERESTART) goto again; sx_xunlock(&proctree_lock); uma_zfree(pgrp_zone, newpgrp); return (error); } /* * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD * compatible. 
It says that setting the uid/gid to euid/egid is a special * case of "appropriate privilege". Once the rules are expanded out, this * basically means that setuid(nnn) sets all three id's, in all permitted * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) * does not set the saved id - this is dangerous for traditional BSD * programs. For this reason, we *really* do not want to set * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. */ #define POSIX_APPENDIX_B_4_2_2 #ifndef _SYS_SYSPROTO_H_ struct setuid_args { uid_t uid; }; #endif /* ARGSUSED */ /* setuid system call: builds a private copy of the process credential, applies the permission rules described below, updates the real, saved and effective uid wherever they differ from uid, and installs the copy with proc_set_cred(). Returns 0, or the error from the MAC hook or privilege check, in which case the new credential is released unused. */ int sys_setuid(struct thread *td, struct setuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t uid; struct uidinfo *uip; int error; uid = uap->uid; AUDIT_ARG_UID(uid); newcred = crget(); uip = uifind(uid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setuid(oldcred, uid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setuid(geteuid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setuid(xx)" sets all * three id's (assuming you have privs). * * Notes on the logic. We do things in three steps. * 1: We determine if the euid is going to change, and do EPERM * right away. We unconditionally change the euid later if this * test is satisfied, simplifying that part of the logic. * 2: We determine if the real and/or saved uids are going to * change. Determined by compile options. * 3: Change euid last.
(after tests in #2 for "appropriate privs") */ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ #ifdef _POSIX_SAVED_IDS uid != oldcred->cr_svuid && /* allow setuid(saved uid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETUID)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or uid == euid) * If so, we are changing the real uid and/or saved uid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ uid == oldcred->cr_uid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETUID) == 0) #endif { /* * Set the real uid and transfer proc count to new user. */ if (uid != oldcred->cr_ruid) { change_ruid(newcred, uip); setsugid(p); } /* * Set saved uid * * XXX always set saved uid even if not _POSIX_SAVED_IDS, as * the security of seteuid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (uid != oldcred->cr_svuid) { change_svuid(newcred, uid); setsugid(p); } } /* * In all permitted cases, we are changing the euid. */ if (uid != oldcred->cr_uid) { change_euid(newcred, uip); setsugid(p); } proc_set_cred(p, newcred); /* NOTE(review): the extra hold below is taken under RACCT but released under RCTL after the process lock is dropped - presumably RCTL implies RACCT; confirm. */ #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(uip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(uip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct seteuid_args { uid_t euid; }; #endif /* ARGSUSED */ /* seteuid system call: changes only the effective uid. The request is allowed without privilege when euid equals the real or saved uid; otherwise PRIV_CRED_SETEUID is required. Returns 0, or the error from the MAC hook or privilege check. */ int sys_seteuid(struct thread *td, struct seteuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid; struct uidinfo *euip; int error; euid = uap->euid; AUDIT_ARG_EUID(euid); newcred = crget(); euip = uifind(euid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes.
*/ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_seteuid(oldcred, euid); if (error) goto fail; #endif if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID)) != 0) goto fail; /* * Everything's okay, do it. */ if (oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgid_args { gid_t gid; }; #endif /* ARGSUSED */ /* setgid system call: group counterpart of sys_setuid(). Builds a private copy of the process credential, applies the permission rules described below, updates the real, saved and effective gid wherever they differ from gid, and installs the copy with proc_set_cred(). Returns 0, or the error from the MAC hook or privilege check. */ int sys_setgid(struct thread *td, struct setgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t gid; int error; gid = uap->gid; AUDIT_ARG_GID(gid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgid(oldcred, gid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setgid(getegid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setgid(xx)" sets all * three id's (assuming you have privs). * * For notes on the logic here, see setuid() above. */ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ #ifdef _POSIX_SAVED_IDS gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or gid == egid) * If so, we are changing the real gid and saved gid.
*/ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ gid == oldcred->cr_groups[0] || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0) #endif { /* * Set real gid */ if (oldcred->cr_rgid != gid) { change_rgid(newcred, gid); setsugid(p); } /* * Set saved gid * * XXX always set saved gid even if not _POSIX_SAVED_IDS, as * the security of setegid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } /* * In all permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes. */ if (oldcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setegid_args { gid_t egid; }; #endif /* ARGSUSED */ /* setegid system call: changes only the effective gid (cr_groups[0]). The request is allowed without privilege when egid equals the real or saved gid; otherwise PRIV_CRED_SETEGID is required. Returns 0, or the error from the MAC hook or privilege check. */ int sys_setegid(struct thread *td, struct setegid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid; int error; egid = uap->egid; AUDIT_ARG_EGID(egid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setegid(oldcred, egid); if (error) goto fail; #endif if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0) goto fail; if (oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { int gidsetsize; gid_t *gidset; }; #endif /* ARGSUSED */ int sys_setgroups(struct thread *td, struct setgroups_args *uap) { gid_t smallgroups[XU_NGROUPS]; gid_t *groups; int gidsetsize, error; gidsetsize = uap->gidsetsize; if (gidsetsize >
ngroups_max + 1 || gidsetsize < 0) return (EINVAL); if (gidsetsize > XU_NGROUPS) groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK); else groups = smallgroups; error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t)); if (error == 0) error = kern_setgroups(td, gidsetsize, groups); if (gidsetsize > XU_NGROUPS) free(groups, M_TEMP); return (error); } int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int error; MPASS(ngrp <= ngroups_max + 1); AUDIT_ARG_GROUPSET(groups, ngrp); newcred = crget(); crextend(newcred, ngrp); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgroups(oldcred, ngrp, groups); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS); if (error) goto fail; if (ngrp == 0) { /* * setgroups(0, NULL) is a legitimate way of clearing the * groups vector on non-BSD systems (which generally do not * have the egid in the groups[0]). We risk security holes * when running non-BSD software if we do not do the same. 
*/ newcred->cr_ngroups = 1; } else { crsetgroups_locked(newcred, ngrp, groups); } setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setreuid_args { uid_t ruid; uid_t euid; }; #endif /* ARGSUSED */ int sys_setreuid(struct thread *td, struct setreuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setreuid(oldcred, ruid, euid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid) || (euid != (uid_t)-1 && euid != oldcred->cr_uid && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && newcred->cr_svuid != newcred->cr_uid) { change_svuid(newcred, newcred->cr_uid); setsugid(p); } proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setregid_args { gid_t rgid; gid_t egid; }; #endif /* ARGSUSED */ int sys_setregid(struct thread *td, struct setregid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid; int 
error; egid = uap->egid; rgid = uap->rgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setregid(oldcred, rgid, egid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && newcred->cr_svgid != newcred->cr_groups[0]) { change_svgid(newcred, newcred->cr_groups[0]); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } /* * setresuid(ruid, euid, suid) is like setreuid except control over the saved * uid is explicit. 
*/ #ifndef _SYS_SYSPROTO_H_ struct setresuid_args { uid_t ruid; uid_t euid; uid_t suid; }; #endif /* ARGSUSED */ int sys_setresuid(struct thread *td, struct setresuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid, suid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; suid = uap->suid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); AUDIT_ARG_SUID(suid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresuid(oldcred, ruid, euid, suid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid && ruid != oldcred->cr_uid) || (euid != (uid_t)-1 && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid && euid != oldcred->cr_uid) || (suid != (uid_t)-1 && suid != oldcred->cr_ruid && suid != oldcred->cr_svuid && suid != oldcred->cr_uid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { change_svuid(newcred, suid); setsugid(p); } proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } /* * setresgid(rgid, egid, sgid) is like setregid except control over the saved * gid is explicit. 
*/ #ifndef _SYS_SYSPROTO_H_ struct setresgid_args { gid_t rgid; gid_t egid; gid_t sgid; }; #endif /* ARGSUSED */ int sys_setresgid(struct thread *td, struct setresgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid, sgid; int error; egid = uap->egid; rgid = uap->rgid; sgid = uap->sgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); AUDIT_ARG_SGID(sgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && rgid != oldcred->cr_groups[0]) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && egid != oldcred->cr_groups[0]) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && sgid != oldcred->cr_groups[0])) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { change_svgid(newcred, sgid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getresuid_args { uid_t *ruid; uid_t *euid; uid_t *suid; }; #endif /* ARGSUSED */ int sys_getresuid(struct thread *td, struct getresuid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->ruid) error1 = copyout(&cred->cr_ruid, uap->ruid, sizeof(cred->cr_ruid)); if (uap->euid) error2 = copyout(&cred->cr_uid, uap->euid, sizeof(cred->cr_uid)); if (uap->suid) error3 = copyout(&cred->cr_svuid, uap->suid, sizeof(cred->cr_svuid)); return (error1 ? error1 : error2 ? 
error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct getresgid_args { gid_t *rgid; gid_t *egid; gid_t *sgid; }; #endif /* ARGSUSED */ int sys_getresgid(struct thread *td, struct getresgid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->rgid) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) error2 = copyout(&cred->cr_groups[0], uap->egid, sizeof(cred->cr_groups[0])); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct issetugid_args { int dummy; }; #endif /* ARGSUSED */ int sys_issetugid(struct thread *td, struct issetugid_args *uap) { struct proc *p = td->td_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use P_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; return (0); } int sys___setugid(struct thread *td, struct __setugid_args *uap) { #ifdef REGRESSION struct proc *p; p = td->td_proc; switch (uap->flag) { case 0: PROC_LOCK(p); p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); return (0); case 1: PROC_LOCK(p); p->p_flag |= P_SUGID; PROC_UNLOCK(p); return (0); default: return (EINVAL); } #else /* !REGRESSION */ return (ENOSYS); #endif /* REGRESSION */ } /* * Returns whether gid designates a supplementary group in cred. */ static bool supplementary_group_member(gid_t gid, struct ucred *cred) { int l, h, m; /* * Perform a binary search of the supplemental groups. This is possible * because we sort the groups in crsetgroups(). 
*/ l = 1; h = cred->cr_ngroups; while (l < h) { m = l + (h - l) / 2; if (cred->cr_groups[m] < gid) l = m + 1; else h = m; } return (l < cred->cr_ngroups && cred->cr_groups[l] == gid); } /* * Check if gid is a member of the (effective) group set (i.e., effective and * supplementary groups). */ bool groupmember(gid_t gid, struct ucred *cred) { if (gid == cred->cr_groups[0]) return (true); return (supplementary_group_member(gid, cred)); } /* * Check if gid is a member of the real group set (i.e., real and supplementary * groups). */ bool realgroupmember(gid_t gid, struct ucred *cred) { if (gid == cred->cr_rgid) return (true); return (supplementary_group_member(gid, cred)); } /* * Test the active securelevel against a given level. securelevel_gt() * implements (securelevel > level). securelevel_ge() implements * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * * Due to care taken when setting the securelevel, we know that no jail will * be less secure that its parent (or the physical system), so it is sufficient * to test the current jail only. * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. */ int securelevel_gt(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0); } /* * 'see_other_uids' determines whether or not visibility of processes * and sockets with credentials holding different real uids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. 
*/ static int see_other_uids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, &see_other_uids, 0, "Unprivileged processes may see subjects/objects with different real uid"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_other_uids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_canseeotheruids(struct ucred *u1, struct ucred *u2) { if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { if (priv_check_cred(u1, PRIV_SEEOTHERUIDS) != 0) return (ESRCH); } return (0); } /* * 'see_other_gids' determines whether or not visibility of processes * and sockets with credentials holding different real gids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_gids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, &see_other_gids, 0, "Unprivileged processes may see subjects/objects with different real gid"); /* * Determine if u1 can "see" the subject specified by u2, according to the * 'see_other_gids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_canseeothergids(struct ucred *u1, struct ucred *u2) { if (!see_other_gids) { if (realgroupmember(u1->cr_rgid, u2)) return (0); for (int i = 1; i < u1->cr_ngroups; i++) if (realgroupmember(u1->cr_groups[i], u2)) return (0); if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) != 0) return (ESRCH); } return (0); } /* * 'see_jail_proc' determines whether or not visibility of processes and * sockets with credentials holding different jail ids is possible using a * variety of system MIBs. * * XXX: data declarations should be together near the beginning of the file. 
*/ static int see_jail_proc = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_jail_proc, CTLFLAG_RW, &see_jail_proc, 0, "Unprivileged processes may see subjects/objects with different jail ids"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_jail_proc' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_canseejailproc(struct ucred *u1, struct ucred *u2) { if (see_jail_proc || /* Policy deactivated. */ u1->cr_prison == u2->cr_prison || /* Same jail. */ priv_check_cred(u1, PRIV_SEEJAILPROC) == 0) /* Privileged. */ return (0); return (ESRCH); } /* * Helper for cr_cansee*() functions to abide by system-wide security.bsd.see_* * policies. Determines if u1 "can see" u2 according to these policies. * Returns: 0 for permitted, ESRCH otherwise */ int cr_bsd_visible(struct ucred *u1, struct ucred *u2) { int error; if ((error = cr_canseeotheruids(u1, u2))) return (error); if ((error = cr_canseeothergids(u1, u2))) return (error); if ((error = cr_canseejailproc(u1, u2))) return (error); return (0); } /*- * Determine if u1 "can see" the subject specified by u2. * Returns: 0 for permitted, an errno value otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_cansee(struct ucred *u1, struct ucred *u2) { int error; if ((error = prison_check(u1, u2))) return (error); #ifdef MAC if ((error = mac_cred_check_visible(u1, u2))) return (error); #endif if ((error = cr_bsd_visible(u1, u2))) return (error); return (0); } /*- * Determine if td "can see" the subject specified by p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect p->p_ucred must be held. td really * should be curthread. 
* References: td and p must be valid for the lifetime of the call */ int p_cansee(struct thread *td, struct proc *p) { /* Wrap cr_cansee() for all functionality. */ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); return (cr_cansee(td->td_ucred, p->p_ucred)); } /* * 'conservative_signals' prevents the delivery of a broad class of * signals by unprivileged processes to processes that have changed their * credentials since the last invocation of execve(). This can prevent * the leakage of cached information or retained privileges as a result * of a common class of signal-related vulnerabilities. However, this * may interfere with some applications that expect to be able to * deliver these signals to peer processes after having given up * privilege. */ static int conservative_signals = 1; SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW, &conservative_signals, 0, "Unprivileged processes prevented from " "sending certain signals to processes whose credentials have changed"); /*- * Determine whether cred may deliver the specified signal to proc. * Returns: 0 for permitted, an errno value otherwise. * Locks: A lock must be held for proc. * References: cred and proc must be valid for the lifetime of the call. */ int cr_cansignal(struct ucred *cred, struct proc *proc, int signum) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); /* * Jail semantics limit the scope of signalling to proc in the * same jail as cred, if cred is in jail. */ error = prison_check(cred, proc->p_ucred); if (error) return (error); #ifdef MAC if ((error = mac_proc_check_signal(cred, proc, signum))) return (error); #endif if ((error = cr_bsd_visible(cred, proc->p_ucred))) return (error); /* * UNIX signal semantics depend on the status of the P_SUGID * bit on the target process. If the bit is set, then additional * restrictions are placed on the set of available signals. 
*/ if (conservative_signals && (proc->p_flag & P_SUGID)) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: /* * Generally, permit job and terminal control * signals. */ break; default: /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID); if (error) return (error); } } /* * Generally, the target credential's ruid or svuid must match the * subject credential's ruid or euid. */ if (cred->cr_ruid != proc->p_ucred->cr_ruid && cred->cr_ruid != proc->p_ucred->cr_svuid && cred->cr_uid != proc->p_ucred->cr_ruid && cred->cr_uid != proc->p_ucred->cr_svuid) { error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED); if (error) return (error); } return (0); } /*- * Determine whether td may deliver the specified signal to p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must be * held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansignal(struct thread *td, struct proc *p, int signum) { KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); /* * UNIX signalling semantics require that processes in the same * session always be able to deliver SIGCONT to one another, * overriding the remaining protections. */ /* XXX: This will require an additional lock of some sort. */ if (signum == SIGCONT && td->td_proc->p_session == p->p_session) return (0); /* * Some compat layers use SIGTHR and higher signals for * communication between different kernel threads of the same * process, so that they expect that it's always possible to * deliver them, even for suid applications where cr_cansignal() can * deny such ability for security consideration. 
It should be * pretty safe to do since the only way to create two processes * with the same p_leader is via rfork(2). */ if (td->td_proc->p_leader != NULL && signum >= SIGTHR && signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader) return (0); return (cr_cansignal(td->td_ucred, p, signum)); } /*- * Determine whether td may reschedule p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansched(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_sched(td->td_ucred, p))) return (error); #endif if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred))) return (error); if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid && td->td_ucred->cr_uid != p->p_ucred->cr_ruid) { error = priv_check(td, PRIV_SCHED_DIFFCRED); if (error) return (error); } return (0); } /* * Handle getting or setting the prison's unprivileged_proc_debug * value. */ static int sysctl_unprivileged_proc_debug(SYSCTL_HANDLER_ARGS) { int error, val; val = prison_allow(req->td->td_ucred, PR_ALLOW_UNPRIV_DEBUG); error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && val != 1) return (EINVAL); prison_set_allow(req->td->td_ucred, PR_ALLOW_UNPRIV_DEBUG, val); return (0); } /* * The 'unprivileged_proc_debug' flag may be used to disable a variety of * unprivileged inter-process debugging services, including some procfs * functionality, ptrace(), and ktrace(). 
In the past, inter-process * debugging has been involved in a variety of security problems, and sites * not requiring the service might choose to disable it when hardening * systems. */ SYSCTL_PROC(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_SECURE | CTLFLAG_MPSAFE, 0, 0, sysctl_unprivileged_proc_debug, "I", "Unprivileged processes may use process debugging facilities"); /*- * Determine whether td may debug p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_candebug(struct thread *td, struct proc *p) { int error, grpsubset, i, uidsubset; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = priv_check(td, PRIV_DEBUG_UNPRIV))) return (error); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_debug(td->td_ucred, p))) return (error); #endif if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred))) return (error); /* * Is p's group set a subset of td's effective group set? This * includes p's egid, group access list, rgid, and svgid. */ grpsubset = 1; for (i = 0; i < p->p_ucred->cr_ngroups; i++) { if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { grpsubset = 0; break; } } grpsubset = grpsubset && groupmember(p->p_ucred->cr_rgid, td->td_ucred) && groupmember(p->p_ucred->cr_svgid, td->td_ucred); /* * Are the uids present in p's credential equal to td's * effective uid? This includes p's euid, svuid, and ruid. 
*/ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && td->td_ucred->cr_uid == p->p_ucred->cr_svuid && td->td_ucred->cr_uid == p->p_ucred->cr_ruid); /* * If p's gids aren't a subset, or the uids aren't a subset, * or the credential has changed, require appropriate privilege * for td to debug p. */ if (!grpsubset || !uidsubset) { error = priv_check(td, PRIV_DEBUG_DIFFCRED); if (error) return (error); } /* * Has the credential of the process changed since the last exec()? */ if ((p->p_flag & P_SUGID) != 0) { error = priv_check(td, PRIV_DEBUG_SUGID); if (error) return (error); } /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); } /* * Can't trace a process that's currently exec'ing. * * XXX: Note, this is not a security policy decision, it's a * basic correctness/functionality decision. Therefore, this check * should be moved to the caller's of p_candebug(). */ if ((p->p_flag & P_INEXEC) != 0) return (EBUSY); /* Denied explicitly */ if ((p->p_flag2 & P2_NOTRACE) != 0) { error = priv_check(td, PRIV_DEBUG_DENIED); if (error != 0) return (error); } return (0); } /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseesocket(struct ucred *cred, struct socket *so) { int error; error = prison_check(cred, so->so_cred); if (error) return (ENOENT); #ifdef MAC error = mac_socket_check_visible(cred, so); if (error) return (error); #endif if (cr_bsd_visible(cred, so->so_cred)) return (ENOENT); return (0); } /*- * Determine whether td can wait for the exit of p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. 
* References: td and p must be valid for the lifetime of the call */ int p_canwait(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_wait(td->td_ucred, p))) return (error); #endif #if 0 /* XXXMAC: This could have odd effects on some shells. */ if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred))) return (error); #endif return (0); } /* * Credential management. * * struct ucred objects are rarely allocated but gain and lose references all * the time (e.g., on struct file alloc/dealloc) turning refcount updates into * a significant source of cache-line ping ponging. Common cases are worked * around by modifying thread-local counter instead if the cred to operate on * matches td_realucred. * * The counter is split into 2 parts: * - cr_users -- total count of all struct proc and struct thread objects * which have given cred in p_ucred and td_ucred respectively * - cr_ref -- the actual ref count, only valid if cr_users == 0 * * If users == 0 then cr_ref behaves similarly to refcount(9), in particular if * the count reaches 0 the object is freeable. * If users > 0 and curthread->td_realucred == cred, then updates are performed * against td_ucredref. * In other cases updates are performed against cr_ref. * * Changing td_realucred into something else decrements cr_users and transfers * accumulated updates. 
*/
/*
 * Register one more user (a proc or thread pointing at cr) and take the
 * reference that goes with it, both under cr_mtx.  The credential must
 * already have at least one user.
 * Returns: cr itself.
 */
struct ucred *
crcowget(struct ucred *cr)
{

	mtx_lock(&cr->cr_mtx);
	KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p",
	    __func__, cr->cr_users, cr));
	cr->cr_users++;
	cr->cr_ref++;
	mtx_unlock(&cr->cr_mtx);
	return (cr);
}

/*
 * Detach td from its td_realucred.
 *
 * The thread-local td_ucredref delta is folded into cr_ref and the thread
 * is removed from cr_users.  td->td_realucred is cleared; td->td_ucred is
 * left untouched.
 * Returns: the credential when td was its last user (its reference is kept
 * and handed to the caller), otherwise NULL after dropping that reference.
 */
static struct ucred *
crunuse(struct thread *td)
{
	struct ucred *cr, *crold;

	MPASS(td->td_realucred == td->td_ucred);
	cr = td->td_realucred;
	mtx_lock(&cr->cr_mtx);
	/* Transfer the updates accumulated locally by this thread. */
	cr->cr_ref += td->td_ucredref;
	td->td_ucredref = 0;
	KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p",
	    __func__, cr->cr_users, cr));
	cr->cr_users--;
	if (cr->cr_users == 0) {
		/* Last user: cr_ref is authoritative again; caller frees. */
		KASSERT(cr->cr_ref > 0, ("%s: ref %ld not > 0 on cred %p",
		    __func__, cr->cr_ref, cr));
		crold = cr;
	} else {
		cr->cr_ref--;
		crold = NULL;
	}
	mtx_unlock(&cr->cr_mtx);
	td->td_realucred = NULL;
	return (crold);
}

/*
 * Batched form of dropping users: remove 'users' users from cr, apply the
 * accumulated reference delta 'ref', and drop one reference per removed
 * user.  The credential is destroyed via crfree_final() when neither users
 * nor references remain.
 */
static void
crunusebatch(struct ucred *cr, int users, int ref)
{

	KASSERT(users > 0, ("%s: passed users %d not > 0 ; cred %p",
	    __func__, users, cr));
	mtx_lock(&cr->cr_mtx);
	/* NOTE(review): the message says "not >" but the check is ">=". */
	KASSERT(cr->cr_users >= users, ("%s: users %d not > %d on cred %p",
	    __func__, cr->cr_users, users, cr));
	cr->cr_users -= users;
	cr->cr_ref += ref;
	cr->cr_ref -= users;
	if (cr->cr_users > 0) {
		mtx_unlock(&cr->cr_mtx);
		return;
	}
	KASSERT(cr->cr_ref >= 0, ("%s: ref %ld not >= 0 on cred %p",
	    __func__, cr->cr_ref, cr));
	if (cr->cr_ref > 0) {
		mtx_unlock(&cr->cr_mtx);
		return;
	}
	/* Reached with cr_mtx still locked; no unlock on this path. */
	crfree_final(cr);
}

/*
 * Detach td from its credential and release the credential if td was the
 * last user.
 */
void
crcowfree(struct thread *td)
{
	struct ucred *cr;

	cr = crunuse(td);
	if (cr != NULL)
		crfree(cr);
}

/*
 * Switch curthread over to its process's current credential.
 *
 * Requires the process lock.  When the thread already uses p->p_ucred
 * nothing happens.
 * Returns: NULL when nothing changed or the old credential still has users;
 * otherwise the old credential, as returned by crunuse().
 */
struct ucred *
crcowsync(void)
{
	struct thread *td;
	struct proc *p;
	struct ucred *crnew, *crold;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);

	MPASS(td->td_realucred == td->td_ucred);
	if (td->td_realucred == p->p_ucred)
		return (NULL);

	/* Acquire the new credential before detaching from the old one. */
	crnew = crcowget(p->p_ucred);
	crold = crunuse(td);
	td->td_realucred = crnew;
	td->td_ucred = td->td_realucred;
	return (crold);
}

/*
 * Batching.
*/ void credbatch_add(struct credbatch *crb, struct thread *td) { struct ucred *cr; MPASS(td->td_realucred != NULL); MPASS(td->td_realucred == td->td_ucred); MPASS(TD_GET_STATE(td) == TDS_INACTIVE); cr = td->td_realucred; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); if (crb->cred != cr) { if (crb->users > 0) { MPASS(crb->cred != NULL); crunusebatch(crb->cred, crb->users, crb->ref); crb->users = 0; crb->ref = 0; } } crb->cred = cr; crb->users++; crb->ref += td->td_ucredref; td->td_ucredref = 0; td->td_realucred = NULL; } void credbatch_final(struct credbatch *crb) { MPASS(crb->cred != NULL); MPASS(crb->users > 0); crunusebatch(crb->cred, crb->users, crb->ref); } /* * Allocate a zeroed cred structure. */ struct ucred * crget(void) { struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); mtx_init(&cr->cr_mtx, "cred", NULL, MTX_DEF); cr->cr_ref = 1; #ifdef AUDIT audit_cred_init(cr); #endif #ifdef MAC mac_cred_init(cr); #endif cr->cr_groups = cr->cr_smallgroups; cr->cr_agroups = sizeof(cr->cr_smallgroups) / sizeof(cr->cr_smallgroups[0]); return (cr); } /* * Claim another reference to a ucred structure. */ struct ucred * crhold(struct ucred *cr) { struct thread *td; td = curthread; if (__predict_true(td->td_realucred == cr)) { KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); td->td_ucredref++; return (cr); } mtx_lock(&cr->cr_mtx); cr->cr_ref++; mtx_unlock(&cr->cr_mtx); return (cr); } /* * Free a cred structure. Throws away space when ref count gets to 0. 
*/ void crfree(struct ucred *cr) { struct thread *td; td = curthread; if (__predict_true(td->td_realucred == cr)) { KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); td->td_ucredref--; return; } mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users >= 0, ("%s: users %d not >= 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_ref--; if (cr->cr_users > 0) { mtx_unlock(&cr->cr_mtx); return; } KASSERT(cr->cr_ref >= 0, ("%s: ref %ld not >= 0 on cred %p", __func__, cr->cr_ref, cr)); if (cr->cr_ref > 0) { mtx_unlock(&cr->cr_mtx); return; } crfree_final(cr); } static void crfree_final(struct ucred *cr) { KASSERT(cr->cr_users == 0, ("%s: users %d not == 0 on cred %p", __func__, cr->cr_users, cr)); KASSERT(cr->cr_ref == 0, ("%s: ref %ld not == 0 on cred %p", __func__, cr->cr_ref, cr)); /* * Some callers of crget(), such as nfs_statfs(), allocate a temporary * credential, but don't allocate a uidinfo structure. */ if (cr->cr_uidinfo != NULL) uifree(cr->cr_uidinfo); if (cr->cr_ruidinfo != NULL) uifree(cr->cr_ruidinfo); if (cr->cr_prison != NULL) prison_free(cr->cr_prison); if (cr->cr_loginclass != NULL) loginclass_free(cr->cr_loginclass); #ifdef AUDIT audit_cred_destroy(cr); #endif #ifdef MAC mac_cred_destroy(cr); #endif mtx_destroy(&cr->cr_mtx); if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); free(cr, M_CRED); } /* * Copy a ucred's contents from a template. Does not block. 
*/
/*
 * dest must be held by exactly one reference (asserted).  The fields between
 * cr_startcopy and cr_endcopy are copied bytewise, the flags and the group
 * list are copied explicitly, and a hold is taken on every object whose
 * pointer was duplicated by the bytewise copy.
 */
void
crcopy(struct ucred *dest, struct ucred *src)
{

	KASSERT(dest->cr_ref == 1, ("crcopy of shared ucred"));
	/* Copies [cr_startcopy, cr_endcopy); cr_endcopy itself is excluded. */
	bcopy(&src->cr_startcopy, &dest->cr_startcopy,
	    (unsigned)((caddr_t)&src->cr_endcopy -
		(caddr_t)&src->cr_startcopy));
	dest->cr_flags = src->cr_flags;
	/* crsetgroups() grows dest's group array if it is too small. */
	crsetgroups(dest, src->cr_ngroups, src->cr_groups);
	/* dest now shares these objects with src; account for that. */
	uihold(dest->cr_uidinfo);
	uihold(dest->cr_ruidinfo);
	prison_hold(dest->cr_prison);
	loginclass_hold(dest->cr_loginclass);
#ifdef AUDIT
	audit_cred_copy(src, dest);
#endif
#ifdef MAC
	mac_cred_copy(src, dest);
#endif
}

/*
 * Dup cred struct to a new held one.
 * The result comes from crget() and is filled in with crcopy().
 */
struct ucred *
crdup(struct ucred *cr)
{
	struct ucred *newcr;

	newcr = crget();
	crcopy(newcr, cr);
	return (newcr);
}

/*
 * Fill in a struct xucred based on a struct ucred.
 * The output is zeroed first; at most XU_NGROUPS groups are exported.
 */
void
cru2x(struct ucred *cr, struct xucred *xcr)
{
	int ngroups;

	bzero(xcr, sizeof(*xcr));
	xcr->cr_version = XUCRED_VERSION;
	xcr->cr_uid = cr->cr_uid;

	/* Clamp to the fixed-size array in struct xucred. */
	ngroups = MIN(cr->cr_ngroups, XU_NGROUPS);
	xcr->cr_ngroups = ngroups;
	bcopy(cr->cr_groups, xcr->cr_groups,
	    ngroups * sizeof(*cr->cr_groups));
}

/*
 * Like cru2x() for the thread's credential, additionally recording the pid
 * of the thread's process.
 */
void
cru2xt(struct thread *td, struct xucred *xcr)
{

	cru2x(td->td_ucred, xcr);
	xcr->cr_pid = td->td_proc->p_pid;
}

-/*
- * Set initial process credentials.
- * Callers are responsible for providing the reference for provided credentials.
- */
-void
-proc_set_cred_init(struct proc *p, struct ucred *newcred)
-{
-
-	p->p_ucred = crcowget(newcred);
-}
-
/*
 * Change process credentials.
 * Callers are responsible for providing the reference for passed credentials
 * and for freeing old ones.
 *
 * The process must be locked and must already have credentials, and newcred
 * must be a valid credential that is not yet in use by any process or
 * thread; all of this is asserted by proc_set_cred().
*/ void proc_set_cred(struct proc *p, struct ucred *newcred) { struct ucred *cr; cr = p->p_ucred; MPASS(cr != NULL); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(newcred->cr_users == 0, ("%s: users %d not 0 on cred %p", __func__, newcred->cr_users, newcred)); mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users--; mtx_unlock(&cr->cr_mtx); p->p_ucred = newcred; newcred->cr_users = 1; PROC_UPDATE_COW(p); } void proc_unset_cred(struct proc *p) { struct ucred *cr; MPASS(p->p_state == PRS_ZOMBIE || p->p_state == PRS_NEW); cr = p->p_ucred; p->p_ucred = NULL; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); mtx_lock(&cr->cr_mtx); cr->cr_users--; if (cr->cr_users == 0) KASSERT(cr->cr_ref > 0, ("%s: ref %ld not > 0 on cred %p", __func__, cr->cr_ref, cr)); mtx_unlock(&cr->cr_mtx); crfree(cr); } struct ucred * crcopysafe(struct proc *p, struct ucred *cr) { struct ucred *oldcred; int groups; PROC_LOCK_ASSERT(p, MA_OWNED); oldcred = p->p_ucred; while (cr->cr_agroups < oldcred->cr_agroups) { groups = oldcred->cr_agroups; PROC_UNLOCK(p); crextend(cr, groups); PROC_LOCK(p); oldcred = p->p_ucred; } crcopy(cr, oldcred); return (oldcred); } /* * Extend the passed in credential to hold n items. */ void crextend(struct ucred *cr, int n) { int cnt; /* Truncate? */ if (n <= cr->cr_agroups) return; /* * We extend by 2 each time since we're using a power of two * allocator until we need enough groups to fill a page. * Once we're allocating multiple pages, only allocate as many * as we actually need. The case of processes needing a * non-power of two number of pages seems more likely than * a real world process that adds thousands of groups one at a * time. 
*/ if ( n < PAGE_SIZE / sizeof(gid_t) ) { if (cr->cr_agroups == 0) cnt = MAX(1, MINALLOCSIZE / sizeof(gid_t)); else cnt = cr->cr_agroups * 2; while (cnt < n) cnt *= 2; } else cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t)); /* Free the old array. */ if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO); cr->cr_agroups = cnt; } /* * Copy groups in to a credential, preserving any necessary invariants. * Currently this includes the sorting of all supplemental gids. * crextend() must have been called before hand to ensure sufficient * space is available. */ static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups) { int i; int j; gid_t g; KASSERT(cr->cr_agroups >= ngrp, ("cr_ngroups is too small")); bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t)); cr->cr_ngroups = ngrp; /* * Sort all groups except cr_groups[0] to allow groupmember to * perform a binary search. * * XXX: If large numbers of groups become common this should * be replaced with shell sort like linux uses or possibly * heap sort. */ for (i = 2; i < ngrp; i++) { g = cr->cr_groups[i]; for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--) cr->cr_groups[j + 1] = cr->cr_groups[j]; cr->cr_groups[j + 1] = g; } } /* * Copy groups in to a credential after expanding it if required. * Truncate the list to (ngroups_max + 1) if it is too large. */ void crsetgroups(struct ucred *cr, int ngrp, gid_t *groups) { if (ngrp > ngroups_max + 1) ngrp = ngroups_max + 1; crextend(cr, ngrp); crsetgroups_locked(cr, ngrp, groups); } /* * Get login name, if available. 
*/ #ifndef _SYS_SYSPROTO_H_ struct getlogin_args { char *namebuf; u_int namelen; }; #endif /* ARGSUSED */ int sys_getlogin(struct thread *td, struct getlogin_args *uap) { char login[MAXLOGNAME]; struct proc *p = td->td_proc; size_t len; if (uap->namelen > MAXLOGNAME) uap->namelen = MAXLOGNAME; PROC_LOCK(p); SESS_LOCK(p->p_session); len = strlcpy(login, p->p_session->s_login, uap->namelen) + 1; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (len > uap->namelen) return (ERANGE); return (copyout(login, uap->namebuf, len)); } /* * Set login name. */ #ifndef _SYS_SYSPROTO_H_ struct setlogin_args { char *namebuf; }; #endif /* ARGSUSED */ int sys_setlogin(struct thread *td, struct setlogin_args *uap) { struct proc *p = td->td_proc; int error; char logintmp[MAXLOGNAME]; CTASSERT(sizeof(p->p_session->s_login) >= sizeof(logintmp)); error = priv_check(td, PRIV_PROC_SETLOGIN); if (error) return (error); error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL); if (error != 0) { if (error == ENAMETOOLONG) error = EINVAL; return (error); } AUDIT_ARG_LOGIN(logintmp); PROC_LOCK(p); SESS_LOCK(p->p_session); strcpy(p->p_session->s_login, logintmp); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); return (0); } void setsugid(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_SUGID; } /*- * Change a process's effective uid. * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_euid(struct ucred *newcred, struct uidinfo *euip) { newcred->cr_uid = euip->ui_uid; uihold(euip); uifree(newcred->cr_uidinfo); newcred->cr_uidinfo = euip; } /*- * Change a process's effective gid. * Side effects: newcred->cr_gid will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_egid(struct ucred *newcred, gid_t egid) { newcred->cr_groups[0] = egid; } /*- * Change a process's real uid. 
* Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo
 *               will be updated, and the old and new cr_ruidinfo proc
 *               counts will be updated.
 * References: newcred must be an exclusive credential reference for the
 *             duration of the call.
 */
void
change_ruid(struct ucred *newcred, struct uidinfo *ruip)
{

	/* Uncharge the outgoing uidinfo while it is still installed. */
	(void)chgproccnt(newcred->cr_ruidinfo, -1, 0);
	newcred->cr_ruid = ruip->ui_uid;
	/* Hold the new uidinfo before releasing the one being replaced. */
	uihold(ruip);
	uifree(newcred->cr_ruidinfo);
	newcred->cr_ruidinfo = ruip;
	/* Charge the uidinfo that is installed now. */
	(void)chgproccnt(newcred->cr_ruidinfo, 1, 0);
}

/*-
 * Change a process's real gid.
 * Side effects: newcred->cr_rgid will be updated.
 * References: newcred must be an exclusive credential reference for the
 *             duration of the call.
 */
void
change_rgid(struct ucred *newcred, gid_t rgid)
{

	newcred->cr_rgid = rgid;
}

/*-
 * Change a process's saved uid.
 * Side effects: newcred->cr_svuid will be updated.
 * References: newcred must be an exclusive credential reference for the
 *             duration of the call.
 */
void
change_svuid(struct ucred *newcred, uid_t svuid)
{

	newcred->cr_svuid = svuid;
}

/*-
 * Change a process's saved gid.
 * Side effects: newcred->cr_svgid will be updated.
 * References: newcred must be an exclusive credential reference for the
 *             duration of the call.
 */
void
change_svgid(struct ucred *newcred, gid_t svgid)
{

	newcred->cr_svgid = svgid;
}

/*
 * Boolean knob exported as security.bsd.allow_ptrace; defaults to true.
 * NOTE(review): the description string is worded as "Deny ..." although the
 * variable is named allow_ptrace; presumably it is the value false that
 * denies ptrace(2) -- confirm against the code that reads this variable.
 */
bool allow_ptrace = true;
SYSCTL_BOOL(_security_bsd, OID_AUTO, allow_ptrace, CTLFLAG_RWTUN,
    &allow_ptrace, 0,
    "Deny ptrace(2) use by returning ENOSYS");
diff --git a/sys/sys/ucred.h b/sys/sys/ucred.h
index 7c9e46e47774..3f8a70ab9c90 100644
--- a/sys/sys/ucred.h
+++ b/sys/sys/ucred.h
@@ -1,165 +1,164 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ucred.h 8.4 (Berkeley) 1/9/95 */ #ifndef _SYS_UCRED_H_ #define _SYS_UCRED_H_ #if defined(_KERNEL) || defined(_WANT_UCRED) #include #include #endif #include struct loginclass; #define XU_NGROUPS 16 /* * Credentials. * * Please do not inspect cr_uid directly to determine superuserness. The * priv(9) interface should be used to check for privilege. * * Lock reference: * c - cr_mtx * * Unmarked fields are constant after creation. * * See "Credential management" comment in kern_prot.c for more information. 
*/
#if defined(_KERNEL) || defined(_WANT_UCRED)
/*
 * Field order matters: crcopy() copies everything from cr_startcopy up to,
 * but not including, cr_endcopy with a single bcopy().
 */
struct ucred {
	struct mtx cr_mtx;
	long	cr_ref;			/* (c) reference count */
	u_int	cr_users;		/* (c) proc + thread using this cred */
	u_int	cr_flags;		/* credential flags */
	struct auditinfo_addr	cr_audit;	/* Audit properties. */
	/* First field of the range copied bytewise by crcopy(). */
#define	cr_startcopy cr_uid
	uid_t	cr_uid;			/* effective user id */
	uid_t	cr_ruid;		/* real user id */
	uid_t	cr_svuid;		/* saved user id */
	int	cr_ngroups;		/* number of groups */
	gid_t	cr_rgid;		/* real group id */
	gid_t	cr_svgid;		/* saved group id */
	struct uidinfo	*cr_uidinfo;	/* per euid resource consumption */
	struct uidinfo	*cr_ruidinfo;	/* per ruid resource consumption */
	struct prison	*cr_prison;	/* jail(2) */
	struct loginclass	*cr_loginclass; /* login class */
	void 	*cr_pspare2[2];		/* general use 2 */
	/* First field past the range copied bytewise by crcopy(). */
#define	cr_endcopy	cr_label
	struct label	*cr_label;	/* MAC label */
	gid_t	*cr_groups;		/* groups; entry 0 is cr_gid */
	int	cr_agroups;		/* Available groups (array capacity) */
	/* cr_groups points here until crextend() allocates a larger array. */
	gid_t   cr_smallgroups[XU_NGROUPS];	/* storage for small groups */
};
#define	NOCRED	((struct ucred *)0)	/* no credential available */
#define	FSCRED	((struct ucred *)-1)	/* filesystem credential */
#endif /* _KERNEL || _WANT_UCRED */

/*
 * Flags for cr_flags.
 */
#define	CRED_FLAG_CAPMODE	0x00000001	/* In capability mode. */

/*
 * This is the external representation of struct ucred.
 * It is filled in by cru2x() and cru2xt().
 */
struct xucred {
	u_int	cr_version;		/* structure layout version */
	uid_t	cr_uid;			/* effective user id */
	short	cr_ngroups;		/* number of groups */
	gid_t	cr_groups[XU_NGROUPS];	/* groups */
	union {
		void	*_cr_unused1;	/* compatibility with old ucred */
		pid_t	cr_pid;		/* set by cru2xt() */
	};
};
#define	XUCRED_VERSION	0

/* This can be used for both ucred and xucred structures.
*/
#define	cr_gid cr_groups[0]

#ifdef _KERNEL
struct proc;
struct thread;

/*
 * Accumulator used to release the credentials of several threads with as
 * few lock operations as possible; see credbatch_add().
 */
struct credbatch {
	struct ucred *cred;	/* credential currently being accumulated */
	int users;		/* users to remove from 'cred' */
	int ref;		/* summed thread-private reference deltas */
};

/* Reset a batch to the empty state. */
static inline void
credbatch_prep(struct credbatch *crb)
{
	crb->cred = NULL;
	crb->users = 0;
	crb->ref = 0;
}
void	credbatch_add(struct credbatch *crb, struct thread *td);
/* Intentionally empty. */
static inline void
credbatch_process(struct credbatch *crb __unused)
{

}
void	credbatch_final(struct credbatch *crb);

/* Setters operating on a credential held exclusively by the caller. */
void	change_egid(struct ucred *newcred, gid_t egid);
void	change_euid(struct ucred *newcred, struct uidinfo *euip);
void	change_rgid(struct ucred *newcred, gid_t rgid);
void	change_ruid(struct ucred *newcred, struct uidinfo *ruip);
void	change_svgid(struct ucred *newcred, gid_t svgid);
void	change_svuid(struct ucred *newcred, uid_t svuid);
/* Allocation, copying and reference management. */
void	crcopy(struct ucred *dest, struct ucred *src);
struct ucred	*crcopysafe(struct proc *p, struct ucred *cr);
struct ucred	*crdup(struct ucred *cr);
void	crextend(struct ucred *cr, int n);
-void	proc_set_cred_init(struct proc *p, struct ucred *cr);
void	proc_set_cred(struct proc *p, struct ucred *cr);
void	proc_unset_cred(struct proc *p);
void	crfree(struct ucred *cr);
struct ucred	*crcowsync(void);
struct ucred	*crget(void);
struct ucred	*crhold(struct ucred *cr);
struct ucred	*crcowget(struct ucred *cr);
void	crcowfree(struct thread *td);
/* Export to the external representation. */
void	cru2x(struct ucred *cr, struct xucred *xcr);
void	cru2xt(struct thread *td, struct xucred *xcr);
/* Group list handling. */
void	crsetgroups(struct ucred *cr, int n, gid_t *groups);
bool	groupmember(gid_t gid, struct ucred *cred);
bool	realgroupmember(gid_t gid, struct ucred *cred);
#endif /* _KERNEL */

#endif /* !_SYS_UCRED_H_ */