Index: stable/10/sys/compat/linux/linux_misc.c =================================================================== --- stable/10/sys/compat/linux/linux_misc.c (revision 302228) +++ stable/10/sys/compat/linux/linux_misc.c (revision 302229) @@ -1,2520 +1,2520 @@ /*- * Copyright (c) 2002 Doug Rabson * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_kdtrace.h" #include #include #include #if defined(__i386__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include #include #include #include #include /** * Special DTrace provider for the linuxulator. * * In this file we define the provider for the entire linuxulator. All * modules (= files of the linuxulator) use it. * * We define a different name depending on the emulated bitsize, see * ../..//linux{,32}/linux.h, e.g.: * native bitsize = linuxulator * amd64, 32bit emulation = linuxulator32 */ LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE); int stclohz; /* Statistics clock frequency */ static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = { RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK, RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE, RLIMIT_MEMLOCK, RLIMIT_AS }; struct l_sysinfo { l_long uptime; /* Seconds since boot */ l_ulong loads[3]; /* 1, 5, and 15 minute load averages */ #define LINUX_SYSINFO_LOADS_SCALE 65536 l_ulong totalram; /* Total usable main memory size */ l_ulong freeram; /* Available memory size */ l_ulong sharedram; /* Amount of shared memory */ l_ulong bufferram; /* Memory used by buffers */ l_ulong totalswap; /* Total swap space size */ l_ulong freeswap; /* swap space still available */ l_ushort procs; /* Number of current processes */ l_ushort pads; l_ulong totalbig; l_ulong freebig; l_uint mem_unit; char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */ }; struct l_pselect6arg { l_uintptr_t ss; l_size_t ss_len; }; static int linux_utimensat_nsec_valid(l_long); int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { struct l_sysinfo sysinfo; vm_object_t object; int i, j; struct timespec ts; bzero(&sysinfo, sizeof(sysinfo)); getnanouptime(&ts); if (ts.tv_nsec != 0) ts.tv_sec++; sysinfo.uptime = ts.tv_sec; /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) sysinfo.loads[i] = averunnable.ldavg[i] * LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; sysinfo.freeram = sysinfo.totalram - cnt.v_wire_count * PAGE_SIZE; sysinfo.sharedram = 0; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) if (object->shadow_count > 1) sysinfo.sharedram += object->resident_page_count; mtx_unlock(&vm_object_list_mtx); sysinfo.sharedram *= PAGE_SIZE; sysinfo.bufferram = 0; swap_pager_status(&i, &j); sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; sysinfo.procs = nprocs; /* The following are only present in newer Linux kernels. */ sysinfo.totalbig = 0; sysinfo.freebig = 0; sysinfo.mem_unit = 1; return (copyout(&sysinfo, args->info, sizeof(sysinfo))); } int linux_alarm(struct thread *td, struct linux_alarm_args *args) { struct itimerval it, old_it; u_int secs; int error; #ifdef DEBUG if (ldebug(alarm)) printf(ARGS(alarm, "%u"), args->secs); #endif secs = args->secs; /* * Linux alarm() is always successful. Limit secs to INT32_MAX / 2 * to match kern_setitimer()'s limit to avoid error from it. * * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit * platforms. */ if (secs > INT32_MAX / 2) secs = INT32_MAX / 2; it.it_value.tv_sec = secs; it.it_value.tv_usec = 0; timevalclear(&it.it_interval); error = kern_setitimer(td, ITIMER_REAL, &it, &old_it); KASSERT(error == 0, ("kern_setitimer returns %d", error)); if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) || old_it.it_value.tv_usec >= 500000) old_it.it_value.tv_sec++; td->td_retval[0] = old_it.it_value.tv_sec; return (0); } int linux_brk(struct thread *td, struct linux_brk_args *args) { struct vmspace *vm = td->td_proc->p_vmspace; vm_offset_t new, old; struct obreak_args /* { char * nsize; } */ tmp; #ifdef DEBUG if (ldebug(brk)) printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend); #endif old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize); new = (vm_offset_t)args->dsend; tmp.nsize = (char *)new; if (((caddr_t)new > vm->vm_daddr) && !sys_obreak(td, &tmp)) td->td_retval[0] = (long)new; else td->td_retval[0] = (long)old; return (0); } #if defined(__i386__) /* XXX: what about amd64/linux32? */ int linux_uselib(struct thread *td, struct linux_uselib_args *args) { struct nameidata ni; struct vnode *vp; struct exec *a_out; struct vattr attr; vm_offset_t vmaddr; unsigned long file_offset; unsigned long bss_size; char *library; ssize_t aresid; int error, locked, writecount; LCONVPATHEXIST(td, args->library, &library); #ifdef DEBUG if (ldebug(uselib)) printf(ARGS(uselib, "%s"), library); #endif a_out = NULL; locked = 0; vp = NULL; NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, library, td); error = namei(&ni); LFREEPATH(library); if (error) goto cleanup; vp = ni.ni_vp; NDFREE(&ni, NDF_ONLY_PNBUF); /* * From here on down, we have a locked vnode that must be unlocked. * XXX: The code below largely duplicates exec_check_permissions(). */ locked = 1; /* Writable? */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) goto cleanup; if (writecount != 0) { error = ETXTBSY; goto cleanup; } /* Executable? */ error = VOP_GETATTR(vp, &attr, td->td_ucred); if (error) goto cleanup; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) { /* EACCESS is what exec(2) returns. */ error = ENOEXEC; goto cleanup; } /* Sensible size? */ if (attr.va_size == 0) { error = ENOEXEC; goto cleanup; } /* Can we access it? */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) goto cleanup; /* * XXX: This should use vn_open() so that it is properly authorized, * and to reduce code redundancy all over the place here. * XXX: Not really, it duplicates far more of exec_check_permissions() * than vn_open(). */ #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VREAD); if (error) goto cleanup; #endif error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error) goto cleanup; /* Pull in executable header into exec_map */ error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0); if (error) goto cleanup; /* Is it a Linux binary ? */ if (((a_out->a_magic >> 16) & 0xff) != 0x64) { error = ENOEXEC; goto cleanup; } /* * While we are here, we should REALLY do some more checks */ /* Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: /* ZMAGIC */ file_offset = 1024; break; case 0314: /* QMAGIC */ file_offset = 0; break; default: error = ENOEXEC; goto cleanup; } bss_size = round_page(a_out->a_bss); /* Check various fields in header for validity/bounds. */ if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) { error = ENOEXEC; goto cleanup; } /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > attr.va_size) { error = EFAULT; goto cleanup; } /* * text/data/bss must not exceed limits * XXX - this is not complete. it should check current usage PLUS * the resources needed by this library. */ PROC_LOCK(td->td_proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA) || racct_set(td->td_proc, RACCT_DATA, a_out->a_data + bss_size) != 0) { PROC_UNLOCK(td->td_proc); error = ENOMEM; goto cleanup; } PROC_UNLOCK(td->td_proc); /* * Prevent more writers. * XXX: Note that if any of the VM operations fail below we don't * clear this flag. */ VOP_SET_TEXT(vp); /* * Lock no longer needed */ locked = 0; VOP_UNLOCK(vp, 0); /* * Check if file_offset page aligned. Currently we cannot handle * misalinged file offsets, and so we read in the entire image * (what a waste). */ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("uselib: Non page aligned binary %lu\n", file_offset); #endif /* Map text+data read/write/execute */ /* a_entry is the load address and is page aligned */ vmaddr = trunc_page(a_out->a_entry); /* get anon user mapping, read+write+execute */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset, a_out->a_text + a_out->a_data, UIO_USERSPACE, 0, td->td_ucred, NOCRED, &aresid, td); if (error != 0) goto cleanup; if (aresid != 0) { error = ENOEXEC; goto cleanup; } } else { #ifdef DEBUG printf("uselib: Page aligned binary %lu\n", file_offset); #endif /* * for QMAGIC, a_entry is 20 bytes beyond the load address * to skip the executable header */ vmaddr = trunc_page(a_out->a_entry); /* * Map it all into the process's space as a single * copy-on-write "data" segment. */ error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset); if (error) goto cleanup; } #ifdef DEBUG printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0], ((long *)vmaddr)[1]); #endif if (bss_size != 0) { /* Calculate BSS start address */ vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data; /* allocate some 'anon' space */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; } cleanup: /* Unlock vnode if needed */ if (locked) VOP_UNLOCK(vp, 0); /* Release the temporary mapping. */ if (a_out) kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE); return (error); } #endif /* __i386__ */ int linux_select(struct thread *td, struct linux_select_args *args) { l_timeval ltv; struct timeval tv0, tv1, utv, *tvp; int error; #ifdef DEBUG if (ldebug(select)) printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds, (void *)args->readfds, (void *)args->writefds, (void *)args->exceptfds, (void *)args->timeout); #endif /* * Store current time for computation of the amount of * time left. */ if (args->timeout) { if ((error = copyin(args->timeout, <v, sizeof(ltv)))) goto select_out; utv.tv_sec = ltv.tv_sec; utv.tv_usec = ltv.tv_usec; #ifdef DEBUG if (ldebug(select)) printf(LMSG("incoming timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif if (itimerfix(&utv)) { /* * The timeval was invalid. Convert it to something * valid that will act as it does under Linux. */ utv.tv_sec += utv.tv_usec / 1000000; utv.tv_usec %= 1000000; if (utv.tv_usec < 0) { utv.tv_sec -= 1; utv.tv_usec += 1000000; } if (utv.tv_sec < 0) timevalclear(&utv); } microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_select(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, LINUX_NFDBITS); #ifdef DEBUG if (ldebug(select)) printf(LMSG("real select returns %d"), error); #endif if (error) goto select_out; if (args->timeout) { if (td->td_retval[0]) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); #ifdef DEBUG if (ldebug(select)) printf(LMSG("outgoing timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif ltv.tv_sec = utv.tv_sec; ltv.tv_usec = utv.tv_usec; if ((error = copyout(<v, args->timeout, sizeof(ltv)))) goto select_out; } select_out: #ifdef DEBUG if (ldebug(select)) printf(LMSG("select_out -> %d"), error); #endif return (error); } int linux_mremap(struct thread *td, struct linux_mremap_args *args) { struct munmap_args /* { void *addr; size_t len; } */ bsd_args; int error = 0; #ifdef DEBUG if (ldebug(mremap)) printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"), (void *)(uintptr_t)args->addr, (unsigned long)args->old_len, (unsigned long)args->new_len, (unsigned long)args->flags); #endif if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) { td->td_retval[0] = 0; return (EINVAL); } /* * Check for the page alignment. * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK. */ if (args->addr & PAGE_MASK) { td->td_retval[0] = 0; return (EINVAL); } args->new_len = round_page(args->new_len); args->old_len = round_page(args->old_len); if (args->new_len > args->old_len) { td->td_retval[0] = 0; return (ENOMEM); } if (args->new_len < args->old_len) { bsd_args.addr = (caddr_t)((uintptr_t)args->addr + args->new_len); bsd_args.len = args->old_len - args->new_len; error = sys_munmap(td, &bsd_args); } td->td_retval[0] = error ? 0 : (uintptr_t)args->addr; return (error); } #define LINUX_MS_ASYNC 0x0001 #define LINUX_MS_INVALIDATE 0x0002 #define LINUX_MS_SYNC 0x0004 int linux_msync(struct thread *td, struct linux_msync_args *args) { struct msync_args bsd_args; bsd_args.addr = (caddr_t)(uintptr_t)args->addr; bsd_args.len = (uintptr_t)args->len; bsd_args.flags = args->fl & ~LINUX_MS_SYNC; return (sys_msync(td, &bsd_args)); } int linux_time(struct thread *td, struct linux_time_args *args) { struct timeval tv; l_time_t tm; int error; #ifdef DEBUG if (ldebug(time)) printf(ARGS(time, "*")); #endif microtime(&tv); tm = tv.tv_sec; if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm)))) return (error); td->td_retval[0] = tm; return (0); } struct l_times_argv { l_clock_t tms_utime; l_clock_t tms_stime; l_clock_t tms_cutime; l_clock_t tms_cstime; }; /* * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value. * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK * auxiliary vector entry. */ #define CLK_TCK 100 #define CONVOTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK)) #define CONVNTCK(r) (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz)) #define CONVTCK(r) (linux_kernver(td) >= LINUX_KERNVER_2004000 ? \ CONVNTCK(r) : CONVOTCK(r)) int linux_times(struct thread *td, struct linux_times_args *args) { struct timeval tv, utime, stime, cutime, cstime; struct l_times_argv tms; struct proc *p; int error; #ifdef DEBUG if (ldebug(times)) printf(ARGS(times, "*")); #endif if (args->buf != NULL) { p = td->td_proc; PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &utime, &stime); PROC_STATUNLOCK(p); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = CONVTCK(utime); tms.tms_stime = CONVTCK(stime); tms.tms_cutime = CONVTCK(cutime); tms.tms_cstime = CONVTCK(cstime); if ((error = copyout(&tms, args->buf, sizeof(tms)))) return (error); } microuptime(&tv); td->td_retval[0] = (int)CONVTCK(tv); return (0); } int linux_newuname(struct thread *td, struct linux_newuname_args *args) { struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; char *p; #ifdef DEBUG if (ldebug(newuname)) printf(ARGS(newuname, "*")); #endif linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); bzero(&utsname, sizeof(utsname)); strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME); getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME); getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME); strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME); strlcpy(utsname.version, version, LINUX_MAX_UTSNAME); for (p = utsname.version; *p != '\0'; ++p) if (*p == '\n') { *p = '\0'; break; } strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME); return (copyout(&utsname, args->buf, sizeof(utsname))); } struct l_utimbuf { l_time_t l_actime; l_time_t l_modtime; }; int linux_utime(struct thread *td, struct linux_utime_args *args) { struct timeval tv[2], *tvp; struct l_utimbuf lut; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utime)) printf(ARGS(utime, "%s, *"), fname); #endif if (args->times) { if ((error = copyin(args->times, &lut, sizeof lut))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = lut.l_actime; tv[0].tv_usec = 0; tv[1].tv_sec = lut.l_modtime; tv[1].tv_usec = 0; tvp = tv; } else tvp = NULL; error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } int linux_utimes(struct thread *td, struct linux_utimes_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utimes)) printf(ARGS(utimes, "%s, *"), fname); #endif if (args->tptr != NULL) { if ((error = copyin(args->tptr, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } static int linux_utimensat_nsec_valid(l_long nsec) { if (nsec == LINUX_UTIME_OMIT || nsec == LINUX_UTIME_NOW) return (0); if (nsec >= 0 && nsec <= 999999999) return (0); return (1); } int linux_utimensat(struct thread *td, struct linux_utimensat_args *args) { struct l_timespec l_times[2]; struct timespec times[2], *timesp = NULL; char *path = NULL; int error, dfd, flags = 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; #ifdef DEBUG if (ldebug(utimensat)) printf(ARGS(utimensat, "%d, *"), dfd); #endif if (args->flags & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); if (args->times != NULL) { error = copyin(args->times, l_times, sizeof(l_times)); if (error != 0) return (error); if (linux_utimensat_nsec_valid(l_times[0].tv_nsec) != 0 || linux_utimensat_nsec_valid(l_times[1].tv_nsec) != 0) return (EINVAL); times[0].tv_sec = l_times[0].tv_sec; switch (l_times[0].tv_nsec) { case LINUX_UTIME_OMIT: times[0].tv_nsec = UTIME_OMIT; break; case LINUX_UTIME_NOW: times[0].tv_nsec = UTIME_NOW; break; default: times[0].tv_nsec = l_times[0].tv_nsec; } times[1].tv_sec = l_times[1].tv_sec; switch (l_times[1].tv_nsec) { case LINUX_UTIME_OMIT: times[1].tv_nsec = UTIME_OMIT; break; case LINUX_UTIME_NOW: times[1].tv_nsec = UTIME_NOW; break; default: times[1].tv_nsec = l_times[1].tv_nsec; break; } timesp = times; /* This breaks POSIX, but is what the Linux kernel does * _on purpose_ (documented in the man page for utimensat(2)), * so we must follow that behaviour. */ if (times[0].tv_nsec == UTIME_OMIT && times[1].tv_nsec == UTIME_OMIT) return (0); } if (args->pathname != NULL) LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); else if (args->flags != 0) return (EINVAL); if (args->flags & LINUX_AT_SYMLINK_NOFOLLOW) flags |= AT_SYMLINK_NOFOLLOW; if (path == NULL) error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE); else { error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp, UIO_SYSSPACE, flags); LFREEPATH(path); } return (error); } int linux_futimesat(struct thread *td, struct linux_futimesat_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->filename, &fname, dfd); #ifdef DEBUG if (ldebug(futimesat)) printf(ARGS(futimesat, "%s, *"), fname); #endif if (args->utimes != NULL) { if ((error = copyin(args->utimes, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } int linux_common_wait(struct thread *td, int pid, int *status, int options, struct rusage *ru) { int error, tmpstat; error = kern_wait(td, pid, &tmpstat, options, ru); if (error) return (error); if (status) { tmpstat &= 0xffff; if (WIFSIGNALED(tmpstat)) tmpstat = (tmpstat & 0xffffff80) | bsd_to_linux_signal(WTERMSIG(tmpstat)); else if (WIFSTOPPED(tmpstat)) tmpstat = (tmpstat & 0xffff00ff) | (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8); else if (WIFCONTINUED(tmpstat)) tmpstat = 0xffff; error = copyout(&tmpstat, status, sizeof(int)); } return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_waitpid(struct thread *td, struct linux_waitpid_args *args) { struct linux_wait4_args wait4_args; #ifdef DEBUG if (ldebug(waitpid)) printf(ARGS(waitpid, "%d, %p, %d"), args->pid, (void *)args->status, args->options); #endif wait4_args.pid = args->pid; wait4_args.status = args->status; wait4_args.options = args->options; wait4_args.rusage = NULL; return (linux_wait4(td, &wait4_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_wait4(struct thread *td, struct linux_wait4_args *args) { int error, options; struct rusage ru, *rup; #ifdef DEBUG if (ldebug(wait4)) printf(ARGS(wait4, "%d, %p, %d, %p"), args->pid, (void *)args->status, args->options, (void *)args->rusage); #endif if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL)) return (EINVAL); options = WEXITED; linux_to_bsd_waitopts(args->options, &options); if (args->rusage != NULL) rup = &ru; else rup = NULL; error = linux_common_wait(td, args->pid, args->status, options, rup); if (error != 0) return (error); if (args->rusage != NULL) error = linux_copyout_rusage(&ru, args->rusage); return (error); } int linux_waitid(struct thread *td, struct linux_waitid_args *args) { int status, options, sig; struct __wrusage wru; siginfo_t siginfo; l_siginfo_t lsi; idtype_t idtype; struct proc *p; int error; options = 0; linux_to_bsd_waitopts(args->options, &options); if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED)) return (EINVAL); if (!(options & (WEXITED | WUNTRACED | WCONTINUED))) return (EINVAL); switch (args->idtype) { case LINUX_P_ALL: idtype = P_ALL; break; case LINUX_P_PID: if (args->id <= 0) return (EINVAL); idtype = P_PID; break; case LINUX_P_PGID: if (args->id <= 0) return (EINVAL); idtype = P_PGID; break; default: return (EINVAL); } error = kern_wait6(td, idtype, args->id, &status, options, &wru, &siginfo); if (error != 0) return (error); if (args->rusage != NULL) { error = linux_copyout_rusage(&wru.wru_children, args->rusage); if (error != 0) return (error); } if (args->info != NULL) { p = td->td_proc; if (td->td_retval[0] == 0) bzero(&lsi, sizeof(lsi)); else { sig = bsd_to_linux_signal(siginfo.si_signo); siginfo_to_lsiginfo(&siginfo, &lsi, sig); } error = copyout(&lsi, args->info, sizeof(lsi)); } td->td_retval[0] = 0; return (error); } int linux_mknod(struct thread *td, struct linux_mknod_args *args) { char *path; int error; LCONVPATHCREAT(td, args->path, &path); #ifdef DEBUG if (ldebug(mknod)) printf(ARGS(mknod, "%s, %d, %ju"), path, args->mode, (uintmax_t)args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifo(td, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknod(td, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_open(td, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } int linux_mknodat(struct thread *td, struct linux_mknodat_args *args) { char *path; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHCREAT_AT(td, args->filename, &path, dfd); #ifdef DEBUG if (ldebug(mknodat)) printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, dfd, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } /* * UGH! This is just about the dumbest idea I've ever heard!! */ int linux_personality(struct thread *td, struct linux_personality_args *args) { #ifdef DEBUG if (ldebug(personality)) printf(ARGS(personality, "%lu"), (unsigned long)args->per); #endif if (args->per != 0) return (EINVAL); /* Yes Jim, it's still a Linux... */ td->td_retval[0] = 0; return (0); } struct l_itimerval { l_timeval it_interval; l_timeval it_value; }; #define B2L_ITIMERVAL(bip, lip) \ (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \ (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \ (bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \ (bip)->it_value.tv_usec = (lip)->it_value.tv_usec; int linux_setitimer(struct thread *td, struct linux_setitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv, oitv; #ifdef DEBUG if (ldebug(setitimer)) printf(ARGS(setitimer, "%p, %p"), (void *)uap->itv, (void *)uap->oitv); #endif if (uap->itv == NULL) { uap->itv = uap->oitv; return (linux_getitimer(td, (struct linux_getitimer_args *)uap)); } error = copyin(uap->itv, &ls, sizeof(ls)); if (error != 0) return (error); B2L_ITIMERVAL(&aitv, &ls); #ifdef DEBUG if (ldebug(setitimer)) { printf("setitimer: value: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec); printf("setitimer: interval: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec); } #endif error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); B2L_ITIMERVAL(&ls, &oitv); return (copyout(&ls, uap->oitv, sizeof(ls))); } int linux_getitimer(struct thread *td, struct linux_getitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv; #ifdef DEBUG if (ldebug(getitimer)) printf(ARGS(getitimer, "%p"), (void *)uap->itv); #endif error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); B2L_ITIMERVAL(&ls, &aitv); return (copyout(&ls, uap->itv, sizeof(ls))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_nice(struct thread *td, struct linux_nice_args *args) { struct setpriority_args bsd_args; bsd_args.which = PRIO_PROCESS; bsd_args.who = 0; /* current process */ bsd_args.prio = args->inc; return (sys_setpriority(td, &bsd_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_setgroups(struct thread *td, struct linux_setgroups_args *args) { struct ucred *newcred, *oldcred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int ngrp, error; struct proc *p; ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= ngroups_max + 1) return (EINVAL); linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK); error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t)); if (error) goto out; newcred = crget(); crextend(newcred, ngrp + 1); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; crcopy(newcred, oldcred); /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. */ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) { PROC_UNLOCK(p); crfree(newcred); goto out; } if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(p); - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); error = 0; out: free(linux_gidset, M_LINUX); return (error); } int linux_getgroups(struct thread *td, struct linux_getgroups_args *args) { struct ucred *cred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. */ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; return (0); } if (ngrp < bsd_gidsetsz) return (EINVAL); ngrp = 0; linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset), M_LINUX, M_WAITOK); while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t)); free(linux_gidset, M_LINUX); if (error) return (error); td->td_retval[0] = ngrp; return (0); } int linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args) { struct rlimit bsd_rlim; struct l_rlimit rlim; u_int which; int error; #ifdef DEBUG if (ldebug(setrlimit)) printf(ARGS(setrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); error = copyin(args->rlim, &rlim, sizeof(rlim)); if (error) return (error); bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur; bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max; return (kern_setrlimit(td, which, &bsd_rlim)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args) { struct l_rlimit rlim; struct proc *p = td->td_proc; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(old_getrlimit)) printf(ARGS(old_getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); PROC_LOCK(p); lim_rlimit(p, which, &bsd_rlim); PROC_UNLOCK(p); #ifdef COMPAT_LINUX32 rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur; if (rlim.rlim_cur == UINT_MAX) rlim.rlim_cur = INT_MAX; rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max; if (rlim.rlim_max == UINT_MAX) rlim.rlim_max = INT_MAX; #else rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur; if (rlim.rlim_cur == ULONG_MAX) rlim.rlim_cur = LONG_MAX; rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max; if (rlim.rlim_max == ULONG_MAX) rlim.rlim_max = LONG_MAX; #endif return (copyout(&rlim, args->rlim, sizeof(rlim))); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args) { struct l_rlimit rlim; struct proc *p = td->td_proc; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(getrlimit)) printf(ARGS(getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); PROC_LOCK(p); lim_rlimit(p, which, &bsd_rlim); PROC_UNLOCK(p); rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur; rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } int linux_sched_setscheduler(struct thread *td, struct linux_sched_setscheduler_args *args) { struct sched_param sched_param; struct thread *tdt; int error, policy; #ifdef DEBUG if (ldebug(sched_setscheduler)) printf(ARGS(sched_setscheduler, "%d, %d, %p"), args->pid, args->policy, (const void *)args->param); #endif switch (args->policy) { case LINUX_SCHED_OTHER: policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: policy = SCHED_FIFO; break; case LINUX_SCHED_RR: policy = SCHED_RR; break; default: return (EINVAL); } error = copyin(args->param, &sched_param, sizeof(sched_param)); if (error) return (error); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_setscheduler(td, tdt, policy, &sched_param); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getscheduler(struct thread *td, struct linux_sched_getscheduler_args *args) { struct thread *tdt; int error, policy; #ifdef DEBUG if (ldebug(sched_getscheduler)) printf(ARGS(sched_getscheduler, "%d"), args->pid); #endif tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getscheduler(td, tdt, &policy); PROC_UNLOCK(tdt->td_proc); switch (policy) { case SCHED_OTHER: td->td_retval[0] = LINUX_SCHED_OTHER; break; case SCHED_FIFO: td->td_retval[0] = LINUX_SCHED_FIFO; break; case SCHED_RR: td->td_retval[0] = LINUX_SCHED_RR; break; } return (error); } int linux_sched_get_priority_max(struct thread *td, struct linux_sched_get_priority_max_args *args) { struct sched_get_priority_max_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_max)) printf(ARGS(sched_get_priority_max, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_max(td, &bsd)); } int linux_sched_get_priority_min(struct thread *td, struct linux_sched_get_priority_min_args *args) { struct sched_get_priority_min_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_min)) printf(ARGS(sched_get_priority_min, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_min(td, &bsd)); } #define REBOOT_CAD_ON 0x89abcdef #define REBOOT_CAD_OFF 0 #define REBOOT_HALT 0xcdef0123 #define REBOOT_RESTART 0x01234567 #define REBOOT_RESTART2 0xA1B2C3D4 #define REBOOT_POWEROFF 0x4321FEDC #define REBOOT_MAGIC1 0xfee1dead #define REBOOT_MAGIC2 0x28121969 #define REBOOT_MAGIC2A 0x05121996 #define REBOOT_MAGIC2B 0x16041998 int linux_reboot(struct thread *td, struct linux_reboot_args *args) { struct reboot_args bsd_args; #ifdef DEBUG if (ldebug(reboot)) printf(ARGS(reboot, "0x%x"), args->cmd); #endif if (args->magic1 != REBOOT_MAGIC1) return (EINVAL); switch (args->magic2) { case REBOOT_MAGIC2: case REBOOT_MAGIC2A: case REBOOT_MAGIC2B: break; default: return (EINVAL); } switch (args->cmd) { case REBOOT_CAD_ON: case REBOOT_CAD_OFF: return (priv_check(td, PRIV_REBOOT)); case REBOOT_HALT: bsd_args.opt = RB_HALT; break; case REBOOT_RESTART: case REBOOT_RESTART2: bsd_args.opt = 0; break; case REBOOT_POWEROFF: bsd_args.opt = RB_POWEROFF; break; default: return (EINVAL); } return (sys_reboot(td, &bsd_args)); } /* * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that * are assumed to be preserved. The following lightweight syscalls fixes * this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c * * linux_getpid() - MP SAFE * linux_getgid() - MP SAFE * linux_getuid() - MP SAFE */ int linux_getpid(struct thread *td, struct linux_getpid_args *args) { #ifdef DEBUG if (ldebug(getpid)) printf(ARGS(getpid, "")); #endif td->td_retval[0] = td->td_proc->p_pid; return (0); } int linux_gettid(struct thread *td, struct linux_gettid_args *args) { struct linux_emuldata *em; #ifdef DEBUG if (ldebug(gettid)) printf(ARGS(gettid, "")); #endif em = em_find(td); KASSERT(em != NULL, ("gettid: emuldata not found.\n")); td->td_retval[0] = em->em_tid; return (0); } int linux_getppid(struct thread *td, struct linux_getppid_args *args) { #ifdef DEBUG if (ldebug(getppid)) printf(ARGS(getppid, "")); #endif PROC_LOCK(td->td_proc); td->td_retval[0] = td->td_proc->p_pptr->p_pid; PROC_UNLOCK(td->td_proc); return (0); } int linux_getgid(struct thread *td, struct linux_getgid_args *args) { #ifdef DEBUG if (ldebug(getgid)) printf(ARGS(getgid, "")); #endif td->td_retval[0] = td->td_ucred->cr_rgid; return (0); } int linux_getuid(struct thread *td, struct linux_getuid_args *args) { #ifdef DEBUG if (ldebug(getuid)) printf(ARGS(getuid, "")); #endif td->td_retval[0] = td->td_ucred->cr_ruid; return (0); } int linux_getsid(struct thread *td, struct linux_getsid_args *args) { struct getsid_args bsd; #ifdef DEBUG if (ldebug(getsid)) printf(ARGS(getsid, "%i"), args->pid); #endif bsd.pid = args->pid; return (sys_getsid(td, &bsd)); } int linux_nosys(struct thread *td, struct nosys_args *ignore) { return (ENOSYS); } int linux_getpriority(struct thread *td, struct linux_getpriority_args *args) { struct getpriority_args bsd_args; int error; #ifdef DEBUG if (ldebug(getpriority)) printf(ARGS(getpriority, "%i, %i"), args->which, args->who); #endif bsd_args.which = args->which; bsd_args.who = args->who; error = sys_getpriority(td, &bsd_args); td->td_retval[0] = 20 - td->td_retval[0]; return (error); } int linux_sethostname(struct thread *td, struct linux_sethostname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(sethostname)) printf(ARGS(sethostname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname, args->len, 0, 0)); } int linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(setdomainname)) printf(ARGS(setdomainname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_NISDOMAINNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->name, args->len, 0, 0)); } int linux_exit_group(struct thread *td, struct linux_exit_group_args *args) { #ifdef DEBUG if (ldebug(exit_group)) printf(ARGS(exit_group, "%i"), args->error_code); #endif LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid, args->error_code); /* * XXX: we should send a signal to the parent if * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?) * as it doesnt occur often. */ exit1(td, W_EXITCODE(args->error_code, 0)); /* NOTREACHED */ } #define _LINUX_CAPABILITY_VERSION 0x19980330 struct l_user_cap_header { l_int version; l_int pid; }; struct l_user_cap_data { l_int effective; l_int permitted; l_int inheritable; }; int linux_capget(struct thread *td, struct linux_capget_args *args) { struct l_user_cap_header luch; struct l_user_cap_data lucd; int error; if (args->hdrp == NULL) return (EFAULT); error = copyin(args->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); if (luch.version != _LINUX_CAPABILITY_VERSION) { luch.version = _LINUX_CAPABILITY_VERSION; error = copyout(&luch, args->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); if (args->datap) { /* * The current implementation doesn't support setting * a capability (it's essentially a stub) so indicate * that no capabilities are currently set or available * to request. */ bzero (&lucd, sizeof(lucd)); error = copyout(&lucd, args->datap, sizeof(lucd)); } return (error); } int linux_capset(struct thread *td, struct linux_capset_args *args) { struct l_user_cap_header luch; struct l_user_cap_data lucd; int error; if (args->hdrp == NULL || args->datap == NULL) return (EFAULT); error = copyin(args->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); if (luch.version != _LINUX_CAPABILITY_VERSION) { luch.version = _LINUX_CAPABILITY_VERSION; error = copyout(&luch, args->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); error = copyin(args->datap, &lucd, sizeof(lucd)); if (error != 0) return (error); /* We currently don't support setting any capabilities. */ if (lucd.effective || lucd.permitted || lucd.inheritable) { linux_msg(td, "capset effective=0x%x, permitted=0x%x, " "inheritable=0x%x is not implemented", (int)lucd.effective, (int)lucd.permitted, (int)lucd.inheritable); return (EPERM); } return (0); } int linux_prctl(struct thread *td, struct linux_prctl_args *args) { int error = 0, max_size; struct proc *p = td->td_proc; char comm[LINUX_MAX_COMM_LEN]; struct linux_emuldata *em; int pdeath_signal; #ifdef DEBUG if (ldebug(prctl)) printf(ARGS(prctl, "%d, %ju, %ju, %ju, %ju"), args->option, (uintmax_t)args->arg2, (uintmax_t)args->arg3, (uintmax_t)args->arg4, (uintmax_t)args->arg5); #endif switch (args->option) { case LINUX_PR_SET_PDEATHSIG: if (!LINUX_SIG_VALID(args->arg2)) return (EINVAL); em = em_find(td); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); em->pdeath_signal = args->arg2; break; case LINUX_PR_GET_PDEATHSIG: em = em_find(td); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); pdeath_signal = em->pdeath_signal; error = copyout(&pdeath_signal, (void *)(register_t)args->arg2, sizeof(pdeath_signal)); break; case LINUX_PR_GET_KEEPCAPS: /* * Indicate that we always clear the effective and * permitted capability sets when the user id becomes * non-zero (actually the capability sets are simply * always zero in the current implementation). */ td->td_retval[0] = 0; break; case LINUX_PR_SET_KEEPCAPS: /* * Ignore requests to keep the effective and permitted * capability sets when the user id becomes non-zero. */ break; case LINUX_PR_SET_NAME: /* * To be on the safe side we need to make sure to not * overflow the size a linux program expects. We already * do this here in the copyin, so that we don't need to * check on copyout. */ max_size = MIN(sizeof(comm), sizeof(p->p_comm)); error = copyinstr((void *)(register_t)args->arg2, comm, max_size, NULL); /* Linux silently truncates the name if it is too long. */ if (error == ENAMETOOLONG) { /* * XXX: copyinstr() isn't documented to populate the * array completely, so do a copyin() to be on the * safe side. This should be changed in case * copyinstr() is changed to guarantee this. */ error = copyin((void *)(register_t)args->arg2, comm, max_size - 1); comm[max_size - 1] = '\0'; } if (error) return (error); PROC_LOCK(p); strlcpy(p->p_comm, comm, sizeof(p->p_comm)); PROC_UNLOCK(p); break; case LINUX_PR_GET_NAME: PROC_LOCK(p); strlcpy(comm, p->p_comm, sizeof(comm)); PROC_UNLOCK(p); error = copyout(comm, (void *)(register_t)args->arg2, strlen(comm) + 1); break; default: error = EINVAL; break; } return (error); } int linux_sched_setparam(struct thread *td, struct linux_sched_setparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error; #ifdef DEBUG if (ldebug(sched_setparam)) printf(ARGS(sched_setparam, "%d, *"), uap->pid); #endif error = copyin(uap->param, &sched_param, sizeof(sched_param)); if (error) return (error); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_setparam(td, tdt, &sched_param); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getparam(struct thread *td, struct linux_sched_getparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error; #ifdef DEBUG if (ldebug(sched_getparam)) printf(ARGS(sched_getparam, "%d, *"), uap->pid); #endif tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getparam(td, tdt, &sched_param); PROC_UNLOCK(tdt->td_proc); if (error == 0) error = copyout(&sched_param, uap->param, sizeof(sched_param)); return (error); } /* * Get affinity of a process. */ int linux_sched_getaffinity(struct thread *td, struct linux_sched_getaffinity_args *args) { int error; struct thread *tdt; struct cpuset_getaffinity_args cga; #ifdef DEBUG if (ldebug(sched_getaffinity)) printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid, args->len); #endif if (args->len < sizeof(cpuset_t)) return (EINVAL); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); PROC_UNLOCK(tdt->td_proc); cga.level = CPU_LEVEL_WHICH; cga.which = CPU_WHICH_TID; cga.id = tdt->td_tid; cga.cpusetsize = sizeof(cpuset_t); cga.mask = (cpuset_t *) args->user_mask_ptr; if ((error = sys_cpuset_getaffinity(td, &cga)) == 0) td->td_retval[0] = sizeof(cpuset_t); return (error); } /* * Set affinity of a process. */ int linux_sched_setaffinity(struct thread *td, struct linux_sched_setaffinity_args *args) { struct cpuset_setaffinity_args csa; struct thread *tdt; #ifdef DEBUG if (ldebug(sched_setaffinity)) printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid, args->len); #endif if (args->len < sizeof(cpuset_t)) return (EINVAL); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); PROC_UNLOCK(tdt->td_proc); csa.level = CPU_LEVEL_WHICH; csa.which = CPU_WHICH_TID; csa.id = tdt->td_tid; csa.cpusetsize = sizeof(cpuset_t); csa.mask = (cpuset_t *) args->user_mask_ptr; return (sys_cpuset_setaffinity(td, &csa)); } struct linux_rlimit64 { uint64_t rlim_cur; uint64_t rlim_max; }; int linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args) { struct rlimit rlim, nrlim; struct linux_rlimit64 lrlim; struct proc *p; u_int which; int flags; int error; #ifdef DEBUG if (ldebug(prlimit64)) printf(ARGS(prlimit64, "%d, %d, %p, %p"), args->pid, args->resource, (void *)args->new, (void *)args->old); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); if (args->new != NULL) { /* * Note. Unlike FreeBSD where rlim is signed 64-bit Linux * rlim is unsigned 64-bit. FreeBSD treats negative limits * as INFINITY so we do not need a conversion even. */ error = copyin(args->new, &nrlim, sizeof(nrlim)); if (error != 0) return (error); } flags = PGET_HOLD | PGET_NOTWEXIT; if (args->new != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget(args->pid, flags, &p); if (error != 0) return (error); if (args->old != NULL) { PROC_LOCK(p); lim_rlimit(p, which, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur == RLIM_INFINITY) lrlim.rlim_cur = LINUX_RLIM_INFINITY; else lrlim.rlim_cur = rlim.rlim_cur; if (rlim.rlim_max == RLIM_INFINITY) lrlim.rlim_max = LINUX_RLIM_INFINITY; else lrlim.rlim_max = rlim.rlim_max; error = copyout(&lrlim, args->old, sizeof(lrlim)); if (error != 0) goto out; } if (args->new != NULL) error = kern_proc_setrlimit(td, p, which, &nrlim); out: PRELE(p); return (error); } int linux_pselect6(struct thread *td, struct linux_pselect6_args *args) { struct timeval utv, tv0, tv1, *tvp; struct l_pselect6arg lpse6; struct l_timespec lts; struct timespec uts; l_sigset_t l_ss; sigset_t *ssp; sigset_t ss; int error; ssp = NULL; if (args->sig != NULL) { error = copyin(args->sig, &lpse6, sizeof(lpse6)); if (error != 0) return (error); if (lpse6.ss_len != sizeof(l_ss)) return (EINVAL); if (lpse6.ss != 0) { error = copyin(PTRIN(lpse6.ss), &l_ss, sizeof(l_ss)); if (error != 0) return (error); linux_to_bsd_sigset(&l_ss, &ss); ssp = &ss; } } /* * Currently glibc changes nanosecond number to microsecond. * This mean losing precision but for now it is hardly seen. */ if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec(&uts, <s); if (error != 0) return (error); TIMESPEC_TO_TIMEVAL(&utv, &uts); if (itimerfix(&utv)) return (EINVAL); microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_pselect(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, ssp, LINUX_NFDBITS); if (error == 0 && args->tsp != NULL) { if (td->td_retval[0] != 0) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); TIMEVAL_TO_TIMESPEC(&utv, &uts); native_to_linux_timespec(<s, &uts); error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } int linux_ppoll(struct thread *td, struct linux_ppoll_args *args) { struct timespec ts0, ts1; struct l_timespec lts; struct timespec uts, *tsp; l_sigset_t l_ss; sigset_t *ssp; sigset_t ss; int error; if (args->sset != NULL) { if (args->ssize != sizeof(l_ss)) return (EINVAL); error = copyin(args->sset, &l_ss, sizeof(l_ss)); if (error) return (error); linux_to_bsd_sigset(&l_ss, &ss); ssp = &ss; } else ssp = NULL; if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error) return (error); error = linux_to_native_timespec(&uts, <s); if (error != 0) return (error); nanotime(&ts0); tsp = &uts; } else tsp = NULL; error = kern_poll(td, args->fds, args->nfds, tsp, ssp); if (error == 0 && args->tsp != NULL) { if (td->td_retval[0]) { nanotime(&ts1); timespecsub(&ts1, &ts0); timespecsub(&uts, &ts1); if (uts.tv_sec < 0) timespecclear(&uts); } else timespecclear(&uts); native_to_linux_timespec(<s, &uts); error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } #if defined(DEBUG) || defined(KTR) /* XXX: can be removed when every ldebug(...) and KTR stuff are removed. */ #ifdef COMPAT_LINUX32 #define L_MAXSYSCALL LINUX32_SYS_MAXSYSCALL #else #define L_MAXSYSCALL LINUX_SYS_MAXSYSCALL #endif u_char linux_debug_map[howmany(L_MAXSYSCALL, sizeof(u_char))]; static int linux_debug(int syscall, int toggle, int global) { if (global) { char c = toggle ? 0 : 0xff; memset(linux_debug_map, c, sizeof(linux_debug_map)); return (0); } if (syscall < 0 || syscall >= L_MAXSYSCALL) return (EINVAL); if (toggle) clrbit(linux_debug_map, syscall); else setbit(linux_debug_map, syscall); return (0); } #undef L_MAXSYSCALL /* * Usage: sysctl linux.debug=.<0/1> * * E.g.: sysctl linux.debug=21.0 * * As a special case, syscall "all" will apply to all syscalls globally. */ #define LINUX_MAX_DEBUGSTR 16 int linux_sysctl_debug(SYSCTL_HANDLER_ARGS) { char value[LINUX_MAX_DEBUGSTR], *p; int error, sysc, toggle; int global = 0; value[0] = '\0'; error = sysctl_handle_string(oidp, value, LINUX_MAX_DEBUGSTR, req); if (error || req->newptr == NULL) return (error); for (p = value; *p != '\0' && *p != '.'; p++); if (*p == '\0') return (EINVAL); *p++ = '\0'; sysc = strtol(value, NULL, 0); toggle = strtol(p, NULL, 0); if (strcmp(value, "all") == 0) global = 1; error = linux_debug(sysc, toggle, global); return (error); } #endif /* DEBUG || KTR */ int linux_sched_rr_get_interval(struct thread *td, struct linux_sched_rr_get_interval_args *uap) { struct timespec ts; struct l_timespec lts; struct thread *tdt; int error; /* * According to man in case the invalid pid specified * EINVAL should be returned. */ if (uap->pid < 0) return (EINVAL); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_rr_get_interval_td(td, tdt, &ts); PROC_UNLOCK(tdt->td_proc); if (error != 0) return (error); native_to_linux_timespec(<s, &ts); return (copyout(<s, uap->interval, sizeof(lts))); } /* * In case when the Linux thread is the initial thread in * the thread group thread id is equal to the process id. * Glibc depends on this magic (assert in pthread_getattr_np.c). */ struct thread * linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid) { struct linux_emuldata *em; struct thread *tdt; struct proc *p; tdt = NULL; if (tid == 0 || tid == td->td_tid) { tdt = td; PROC_LOCK(tdt->td_proc); } else if (tid > PID_MAX) tdt = tdfind(tid, pid); else { /* * Initial thread where the tid equal to the pid. */ p = pfind(tid); if (p != NULL) { if (SV_PROC_ABI(p) != SV_ABI_LINUX) { /* * p is not a Linuxulator process. */ PROC_UNLOCK(p); return (NULL); } FOREACH_THREAD_IN_PROC(p, tdt) { em = em_find(tdt); if (tid == em->em_tid) return (tdt); } PROC_UNLOCK(p); } return (NULL); } return (tdt); } void linux_to_bsd_waitopts(int options, int *bsdopts) { if (options & LINUX_WNOHANG) *bsdopts |= WNOHANG; if (options & LINUX_WUNTRACED) *bsdopts |= WUNTRACED; if (options & LINUX_WEXITED) *bsdopts |= WEXITED; if (options & LINUX_WCONTINUED) *bsdopts |= WCONTINUED; if (options & LINUX_WNOWAIT) *bsdopts |= WNOWAIT; if (options & __WCLONE) *bsdopts |= WLINUXCLONE; } Index: stable/10/sys/compat/linux/linux_uid16.c =================================================================== --- stable/10/sys/compat/linux/linux_uid16.c (revision 302228) +++ stable/10/sys/compat/linux/linux_uid16.c (revision 302229) @@ -1,441 +1,441 @@ /*- * Copyright (c) 2001 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_kdtrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /** * DTrace probes in this module. */ LIN_SDT_PROBE_DEFINE3(uid16, linux_chown16, entry, "char *", "l_uid16_t", "l_gid16_t"); LIN_SDT_PROBE_DEFINE1(uid16, linux_chown16, conv_path, "char *"); LIN_SDT_PROBE_DEFINE1(uid16, linux_chown16, return, "int"); LIN_SDT_PROBE_DEFINE3(uid16, linux_lchown16, entry, "char *", "l_uid16_t", "l_gid16_t"); LIN_SDT_PROBE_DEFINE1(uid16, linux_lchown16, conv_path, "char *"); LIN_SDT_PROBE_DEFINE1(uid16, linux_lchown16, return, "int"); LIN_SDT_PROBE_DEFINE2(uid16, linux_setgroups16, entry, "l_uint", "l_gid16_t *"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setgroups16, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setgroups16, priv_check_cred_error, "int"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setgroups16, return, "int"); LIN_SDT_PROBE_DEFINE2(uid16, linux_getgroups16, entry, "l_uint", "l_gid16_t *"); LIN_SDT_PROBE_DEFINE1(uid16, linux_getgroups16, copyout_error, "int"); LIN_SDT_PROBE_DEFINE1(uid16, linux_getgroups16, return, "int"); LIN_SDT_PROBE_DEFINE0(uid16, linux_getgid16, entry); LIN_SDT_PROBE_DEFINE1(uid16, linux_getgid16, return, "int"); LIN_SDT_PROBE_DEFINE0(uid16, linux_getuid16, entry); LIN_SDT_PROBE_DEFINE1(uid16, linux_getuid16, return, "int"); LIN_SDT_PROBE_DEFINE0(uid16, linux_getegid16, entry); LIN_SDT_PROBE_DEFINE1(uid16, linux_getegid16, return, "int"); LIN_SDT_PROBE_DEFINE0(uid16, linux_geteuid16, entry); LIN_SDT_PROBE_DEFINE1(uid16, linux_geteuid16, return, "int"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setgid16, entry, "l_gid16_t"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setgid16, return, "int"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setuid16, entry, "l_uid16_t"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setuid16, return, "int"); LIN_SDT_PROBE_DEFINE2(uid16, linux_setregid16, entry, "l_gid16_t", "l_gid16_t"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setregid16, return, "int"); LIN_SDT_PROBE_DEFINE2(uid16, linux_setreuid16, entry, "l_uid16_t", "l_uid16_t"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setreuid16, return, "int"); LIN_SDT_PROBE_DEFINE3(uid16, linux_setresgid16, entry, "l_gid16_t", "l_gid16_t", "l_gid16_t"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setresgid16, return, "int"); LIN_SDT_PROBE_DEFINE3(uid16, linux_setresuid16, entry, "l_uid16_t", "l_uid16_t", "l_uid16_t"); LIN_SDT_PROBE_DEFINE1(uid16, linux_setresuid16, return, "int"); DUMMY(setfsuid16); DUMMY(setfsgid16); DUMMY(getresuid16); DUMMY(getresgid16); #define CAST_NOCHG(x) ((x == 0xFFFF) ? -1 : x) int linux_chown16(struct thread *td, struct linux_chown16_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); /* * The DTrace probes have to be after the LCONVPATHEXIST, as * LCONVPATHEXIST may return on its own and we do not want to * have a stray entry without the corresponding return. */ LIN_SDT_PROBE3(uid16, linux_chown16, entry, args->path, args->uid, args->gid); LIN_SDT_PROBE1(uid16, linux_chown16, conv_path, path); error = kern_chown(td, path, UIO_SYSSPACE, CAST_NOCHG(args->uid), CAST_NOCHG(args->gid)); LFREEPATH(path); LIN_SDT_PROBE1(uid16, linux_chown16, return, error); return (error); } int linux_lchown16(struct thread *td, struct linux_lchown16_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); /* * The DTrace probes have to be after the LCONVPATHEXIST, as * LCONVPATHEXIST may return on its own and we do not want to * have a stray entry without the corresponding return. */ LIN_SDT_PROBE3(uid16, linux_lchown16, entry, args->path, args->uid, args->gid); LIN_SDT_PROBE1(uid16, linux_lchown16, conv_path, path); error = kern_lchown(td, path, UIO_SYSSPACE, CAST_NOCHG(args->uid), CAST_NOCHG(args->gid)); LFREEPATH(path); LIN_SDT_PROBE1(uid16, linux_lchown16, return, error); return (error); } int linux_setgroups16(struct thread *td, struct linux_setgroups16_args *args) { struct ucred *newcred, *oldcred; l_gid16_t *linux_gidset; gid_t *bsd_gidset; int ngrp, error; struct proc *p; LIN_SDT_PROBE2(uid16, linux_setgroups16, entry, args->gidsetsize, args->gidset); ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= ngroups_max + 1) { LIN_SDT_PROBE1(uid16, linux_setgroups16, return, EINVAL); return (EINVAL); } linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK); error = copyin(args->gidset, linux_gidset, ngrp * sizeof(l_gid16_t)); if (error) { LIN_SDT_PROBE1(uid16, linux_setgroups16, copyin_error, error); LIN_SDT_PROBE1(uid16, linux_setgroups16, return, error); free(linux_gidset, M_LINUX); return (error); } newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = crcopysafe(p, newcred); /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. */ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) { PROC_UNLOCK(p); crfree(newcred); LIN_SDT_PROBE1(uid16, linux_setgroups16, priv_check_cred_error, error); goto out; } if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(td->td_proc); - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); error = 0; out: free(linux_gidset, M_LINUX); LIN_SDT_PROBE1(uid16, linux_setgroups16, return, error); return (error); } int linux_getgroups16(struct thread *td, struct linux_getgroups16_args *args) { struct ucred *cred; l_gid16_t *linux_gidset; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; LIN_SDT_PROBE2(uid16, linux_getgroups16, entry, args->gidsetsize, args->gidset); cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. */ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; LIN_SDT_PROBE1(uid16, linux_getgroups16, return, 0); return (0); } if (ngrp < bsd_gidsetsz) { LIN_SDT_PROBE1(uid16, linux_getgroups16, return, EINVAL); return (EINVAL); } ngrp = 0; linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset), M_LINUX, M_WAITOK); while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } error = copyout(linux_gidset, args->gidset, ngrp * sizeof(l_gid16_t)); free(linux_gidset, M_LINUX); if (error) { LIN_SDT_PROBE1(uid16, linux_getgroups16, copyout_error, error); LIN_SDT_PROBE1(uid16, linux_getgroups16, return, error); return (error); } td->td_retval[0] = ngrp; LIN_SDT_PROBE1(uid16, linux_getgroups16, return, 0); return (0); } /* * The FreeBSD native getgid(2) and getuid(2) also modify td->td_retval[1] * when COMPAT_43 is defined. This clobbers registers that are assumed to * be preserved. The following lightweight syscalls fixes this. See also * linux_getpid(2), linux_getgid(2) and linux_getuid(2) in linux_misc.c * * linux_getgid16() - MP SAFE * linux_getuid16() - MP SAFE */ int linux_getgid16(struct thread *td, struct linux_getgid16_args *args) { LIN_SDT_PROBE0(uid16, linux_getgid16, entry); td->td_retval[0] = td->td_ucred->cr_rgid; LIN_SDT_PROBE1(uid16, linux_getgid16, return, 0); return (0); } int linux_getuid16(struct thread *td, struct linux_getuid16_args *args) { LIN_SDT_PROBE0(uid16, linux_getuid16, entry); td->td_retval[0] = td->td_ucred->cr_ruid; LIN_SDT_PROBE1(uid16, linux_getuid16, return, 0); return (0); } int linux_getegid16(struct thread *td, struct linux_getegid16_args *args) { struct getegid_args bsd; int error; LIN_SDT_PROBE0(uid16, linux_getegid16, entry); error = sys_getegid(td, &bsd); LIN_SDT_PROBE1(uid16, linux_getegid16, return, error); return (error); } int linux_geteuid16(struct thread *td, struct linux_geteuid16_args *args) { struct geteuid_args bsd; int error; LIN_SDT_PROBE0(uid16, linux_geteuid16, entry); error = sys_geteuid(td, &bsd); LIN_SDT_PROBE1(uid16, linux_geteuid16, return, error); return (error); } int linux_setgid16(struct thread *td, struct linux_setgid16_args *args) { struct setgid_args bsd; int error; LIN_SDT_PROBE1(uid16, linux_setgid16, entry, args->gid); bsd.gid = args->gid; error = sys_setgid(td, &bsd); LIN_SDT_PROBE1(uid16, linux_setgid16, return, error); return (error); } int linux_setuid16(struct thread *td, struct linux_setuid16_args *args) { struct setuid_args bsd; int error; LIN_SDT_PROBE1(uid16, linux_setuid16, entry, args->uid); bsd.uid = args->uid; error = sys_setuid(td, &bsd); LIN_SDT_PROBE1(uid16, linux_setuid16, return, error); return (error); } int linux_setregid16(struct thread *td, struct linux_setregid16_args *args) { struct setregid_args bsd; int error; LIN_SDT_PROBE2(uid16, linux_setregid16, entry, args->rgid, args->egid); bsd.rgid = CAST_NOCHG(args->rgid); bsd.egid = CAST_NOCHG(args->egid); error = sys_setregid(td, &bsd); LIN_SDT_PROBE1(uid16, linux_setregid16, return, error); return (error); } int linux_setreuid16(struct thread *td, struct linux_setreuid16_args *args) { struct setreuid_args bsd; int error; LIN_SDT_PROBE2(uid16, linux_setreuid16, entry, args->ruid, args->euid); bsd.ruid = CAST_NOCHG(args->ruid); bsd.euid = CAST_NOCHG(args->euid); error = sys_setreuid(td, &bsd); LIN_SDT_PROBE1(uid16, linux_setreuid16, return, error); return (error); } int linux_setresgid16(struct thread *td, struct linux_setresgid16_args *args) { struct setresgid_args bsd; int error; LIN_SDT_PROBE3(uid16, linux_setresgid16, entry, args->rgid, args->egid, args->sgid); bsd.rgid = CAST_NOCHG(args->rgid); bsd.egid = CAST_NOCHG(args->egid); bsd.sgid = CAST_NOCHG(args->sgid); error = sys_setresgid(td, &bsd); LIN_SDT_PROBE1(uid16, linux_setresgid16, return, error); return (error); } int linux_setresuid16(struct thread *td, struct linux_setresuid16_args *args) { struct setresuid_args bsd; int error; LIN_SDT_PROBE3(uid16, linux_setresuid16, entry, args->ruid, args->euid, args->suid); bsd.ruid = CAST_NOCHG(args->ruid); bsd.euid = CAST_NOCHG(args->euid); bsd.suid = CAST_NOCHG(args->suid); error = sys_setresuid(td, &bsd); LIN_SDT_PROBE1(uid16, linux_setresuid16, return, error); return (error); } Index: stable/10/sys/kern/init_main.c =================================================================== --- stable/10/sys/kern/init_main.c (revision 302228) +++ stable/10/sys/kern/init_main.c (revision 302229) @@ -1,870 +1,872 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread thread0 __aligned(16); struct vmspace vmspace0; struct proc *initproc; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused, struct syscall_args *sa __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_mask = 0, .sv_sigsize = 0, .sv_sigtbl = NULL, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_prepsyscall = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* *************************************************************************** **** **** The two following SYSINIT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; + struct ucred *newcred; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KTHREAD; p->p_flag2 = 0; p->p_state = PRS_NORMAL; knlist_init_mtx(&p->p_klist, &p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = 0; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); /* Create credentials. */ - p->p_ucred = crget(); - p->p_ucred->cr_ngroups = 1; /* group 0 */ - p->p_ucred->cr_uidinfo = uifind(0); - p->p_ucred->cr_ruidinfo = uifind(0); - p->p_ucred->cr_prison = &prison0; - p->p_ucred->cr_loginclass = loginclass_find("default"); + newcred = crget(); + newcred->cr_ngroups = 1; /* group 0 */ + newcred->cr_uidinfo = uifind(0); + newcred->cr_ruidinfo = uifind(0); + newcred->cr_prison = &prison0; + newcred->cr_loginclass = loginclass_find("default"); + proc_set_cred(p, newcred); #ifdef AUDIT - audit_cred_kproc0(p->p_ucred); + audit_cred_kproc0(newcred); #endif #ifdef MAC - mac_cred_create_swapper(p->p_ucred); + mac_cred_create_swapper(newcred); #endif - td->td_ucred = crhold(p->p_ucred); + td->td_ucred = crhold(newcred); /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ pageablemem = ptoa((vm_paddr_t)cnt.v_free_count); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ pmap_pinit0(vmspace_pmap(&vmspace0)); p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_INVOKE(process_init, p); EVENTHANDLER_INVOKE(thread_init, td); EVENTHANDLER_INVOKE(process_ctor, p); EVENTHANDLER_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ PROC_STATUNLOCK(p); p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); static void random_init(void *dummy __unused) { /* * After CPU has been started we have some randomness on most * platforms via get_cyclecount(). For platforms that don't * we will reseed random(9) in proc0_post() as well. */ srandom(get_cyclecount()); } SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ unsetenv("kern.geom.eli.passphrase"); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = sys_execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kproc_create(), but runs in it's own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { struct ucred *newcred, *oldcred; int error; error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc, NULL, 0); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling); oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif - initproc->p_ucred = newcred; + proc_set_cred(initproc, newcred); PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cred_update_thread(FIRST_THREAD_IN_PROC(initproc)); cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL); Index: stable/10/sys/kern/kern_exec.c =================================================================== --- stable/10/sys/kern/kern_exec.c (revision 302228) +++ stable/10/sys/kern/kern_exec.c (revision 302229) @@ -1,1548 +1,1548 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *"); SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero); SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; } #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL, *binvp = NULL; cap_rights_t rights; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, kernel, , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; binvp = nd.ni_vp; imgp->vp = binvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, cap_rights_init(&rights, CAP_FEXECVE), &binvp); if (error) goto exec_fail; vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(binvp); imgp->vp = binvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(binvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(binvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(binvp); vm_object_deallocate(imgp->object); imgp->object = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); /* * Do the best to calculate the full path to the image file. */ if (imgp->auxargs != NULL && ((args->fname != NULL && args->fname[0] == '/') || vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0)) imgp->execpath = args->fname; if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); /* Get a reference to the vnode prior to locking the proc */ VREF(binvp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } else { oldsigacts = NULL; newsigacts = NULL; /* satisfy gcc */ } PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; oldcred = p->p_ucred; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); setugidsafety(td); error = fdcheckstd(td); if (error != 0) goto done1; newcred = crdup(oldcred); euip = uifind(attr.va_uid); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); /* * Set the new credentials. */ if (attr.va_mode & S_ISUID) change_euid(newcred, euip); if (attr.va_mode & S_ISGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, newcred, imgp->vp, interpvplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); - p->p_ucred = newcred; + proc_set_cred(p, newcred); } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); newcred = crdup(oldcred); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); - p->p_ucred = newcred; + proc_set_cred(p, newcred); } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = binvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, kernel, , exec__success, args->fname); VOP_UNLOCK(imgp->vp, 0); done1: /* * Free any resources malloc'd earlier that we didn't use. */ if (euip != NULL) uifree(euip); if (newcred != NULL) crfree(oldcred); /* * Handle deferred decrement of ref counts. */ if (textvp != NULL) vrele(textvp); if (binvp && error != 0) vrele(binvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif vn_lock(imgp->vp, LK_SHARED | LK_RETRY); pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); vput(imgp->vp); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { PROC_LOCK(p); td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, kernel, , exec__failure, error); done2: #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { object->flags |= OBJ_COLORED; object->pg_color = 0; } #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL); if (ma[0]->valid != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) { if (ma[0] != NULL) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); } VM_OBJECT_WUNLOCK(object); return (EIO); } } vm_page_xunbusy(ma[0]); vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error) { vm_object_deallocate(obj); return (error); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); #ifdef __ia64__ /* Allocate a new register stack */ error = vm_map_stack(map, IA64_BACKINGSTORE, (vm_size_t)ssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP); if (error) return (error); #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. Returns zero if the allocation succeeds * and ENOMEM otherwise. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX); return (args->buf != NULL ? 0 : ENOMEM); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { kmap_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: stable/10/sys/kern/kern_exit.c =================================================================== --- stable/10/sys/kern/kern_exit.c (revision 302228) +++ stable/10/sys/kern/kern_exit.c (revision 302229) @@ -1,1367 +1,1367 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_procdesc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exit; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, kernel, , exit, "int"); /* Hook for NFS teardown procedure. */ void (*nlminfo_release_p)(struct proc *p); struct proc * proc_realparent(struct proc *child) { struct proc *p, *parent; sx_assert(&proctree_lock, SX_LOCKED); if ((child->p_treeflag & P_TREE_ORPHANED) == 0) { if (child->p_oppid == 0 || child->p_pptr->p_pid == child->p_oppid) parent = child->p_pptr; else parent = initproc; return (parent); } for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) { /* Cannot use LIST_PREV(), since the list head is not known. */ p = __containerof(p->p_orphan.le_prev, struct proc, p_orphan.le_next); KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0, ("missing P_ORPHAN %p", p)); } parent = __containerof(p->p_orphan.le_prev, struct proc, p_orphans.lh_first); return (parent); } void reaper_abandon_children(struct proc *p, bool exiting) { struct proc *p1, *p2, *ptmp; sx_assert(&proctree_lock, SX_LOCKED); KASSERT(p != initproc, ("reaper_abandon_children for initproc")); if ((p->p_treeflag & P_TREE_REAPER) == 0) return; p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { LIST_REMOVE(p2, p_reapsibling); p2->p_reaper = p1; p2->p_reapsubtree = p->p_reapsubtree; LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling); if (exiting && p2->p_pptr == p) { PROC_LOCK(p2); proc_reparent(p2, p1); PROC_UNLOCK(p2); } } KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty")); p->p_treeflag &= ~P_TREE_REAPER; } static void clear_orphan(struct proc *p) { struct proc *p1; sx_assert(&proctree_lock, SA_XLOCKED); if ((p->p_treeflag & P_TREE_ORPHANED) == 0) return; if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) { p1 = LIST_NEXT(p, p_orphan); if (p1 != NULL) p1->p_treeflag |= P_TREE_FIRST_ORPHAN; p->p_treeflag &= ~P_TREE_FIRST_ORPHAN; } LIST_REMOVE(p, p_orphan); p->p_treeflag &= ~P_TREE_ORPHANED; } /* * exit -- death of process. */ void sys_sys_exit(struct thread *td, struct sys_exit_args *uap) { exit1(td, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state to * zombie, and unlink proc from allproc and parent's lists. Save exit status * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct proc *p, *nq, *q, *t; struct thread *tdt; struct vnode *ttyvp = NULL; mtx_assert(&Giant, MA_NOTOWNED); p = td->td_proc; /* * XXX in case we're rebooting we just let init die in order to * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. */ if (p == initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * Deref SU mp, since the thread does not return to userspace. */ if (softdep_ast_cleanup != NULL) softdep_ast_cleanup(); /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); /* * First check if some other thread or external request got * here before us. If so, act appropriately: exit or suspend. * We must ensure that stop requests are handled before we set * P_WEXIT. */ thread_suspend_check(0); while (p->p_flag & P_HADTHREADS) { /* * Kill off the other threads. This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (!thread_single(p, SINGLE_EXIT)) /* * All other activity in this process is now * stopped. Threading support has been turned * off. */ break; /* * Recheck for new stop or suspend requests which * might appear while process lock was dropped in * thread_single(). */ thread_suspend_check(0); } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); racct_sub(p, RACCT_NTHR, 1); /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have * released their reference to us. Note that if they have * requested S_EXIT stops we will block here until they ack * via PIOCCONT. */ _STOPEVENT(p, S_EXIT, rv); /* * Ignore any pending request to stop due to a stop signal. * Once P_WEXIT is set, future requests will be ignored as * well. */ p->p_flag &= ~P_STOPPED_SIG; KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped")); /* * Note that we are exiting and do another wakeup of anyone in * PIOCWAIT in case they aren't listening for S_EXIT stops or * decided to wait again after we told them we are exiting. */ p->p_flag |= P_WEXIT; wakeup(&p->p_stype); /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); p->p_xstat = rv; /* Let event handler change exit status */ PROC_UNLOCK(p); /* Drain the limit callout while we don't have the proc locked */ callout_drain(&p->p_limco); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader? */ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); kern_psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); rv = p->p_xstat; /* Event handler could change exit status */ stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * If this process has an nlminfo data area (for lockd), release it */ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL) (*nlminfo_release_p)(p); /* * Close open files and release open-file table. * This may block! */ fdescfree(td); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Remove ourself from our leader's peer list and wake our leader. */ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); vmspace_exit(td); sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp = p->p_session; struct tty *tp; /* * s_ttyp is not zero'd; we use this to indicate that * the session once had a controlling terminal. (for * logging and informational purposes) */ SESS_LOCK(sp); ttyvp = sp->s_ttyvp; tp = sp->s_ttyp; sp->s_ttyvp = NULL; sp->s_ttydp = NULL; sp->s_leader = NULL; SESS_UNLOCK(sp); /* * Signal foreground pgrp and revoke access to * controlling terminal if it has not been revoked * already. * * Because the TTY may have been revoked in the mean * time and could already have a new session associated * with it, make sure we don't send a SIGHUP to a * foreground process group that does not belong to this * session. */ if (tp != NULL) { tty_lock(tp); if (tp->t_session == sp) tty_signal_pgrp(tp, SIGHUP); tty_unlock(tp); } if (ttyvp != NULL) { sx_xunlock(&proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp, 0); } sx_xlock(&proctree_lock); } } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); /* Release the TTY now we've unlocked everything. */ if (ttyvp != NULL) vrele(ttyvp); #ifdef KTRACE ktrprocexit(td); #endif /* * Release reference to text vnode */ if (p->p_textvp != NULL) { vrele(p->p_textvp); p->p_textvp = NULL; } /* * Release our limits structure. */ lim_free(p->p_limit); p->p_limit = NULL; tidhash_remove(td); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); /* * Call machine-dependent code to release any * machine-dependent resources other than the address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); /* * Reparent all children processes: * - traced ones to the original parent (or init if we are that parent) * - the rest to init */ sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(q->p_reaper); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); q->p_sigparent = SIGCHLD; if (!(q->p_flag & P_TRACED)) { proc_reparent(q, q->p_reaper); } else { /* * Traced processes are killed since their existence * means someone is screwing up. */ t = proc_realparent(q); if (t == p) { proc_reparent(q, q->p_reaper); } else { PROC_LOCK(t); proc_reparent(q, t); PROC_UNLOCK(t); } /* * Since q was found on our children list, the * proc_reparent() call moved q to the orphan * list due to present P_TRACED flag. Clear * orphan link for q now while q is locked. */ clear_orphan(q); q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); FOREACH_THREAD_IN_PROC(q, tdt) tdt->td_dbgflags &= ~TDB_SUSPEND; kern_psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Also get rid of our orphans. */ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) { PROC_LOCK(q); CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid, q->p_pid); clear_orphan(q); PROC_UNLOCK(q); } /* Save exit status. */ PROC_LOCK(p); p->p_xthread = td; /* Tell the prison that we are gone. */ prison_proc_free(p->p_ucred->cr_prison); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exit if it * has declared an interest. */ if (dtrace_fasttrap_exit) dtrace_fasttrap_exit(p); #endif /* * Notify interested parties of our demise. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); #ifdef KDTRACE_HOOKS int reason = CLD_EXITED; if (WCOREDUMP(rv)) reason = CLD_DUMPED; else if (WIFSIGNALED(rv)) reason = CLD_KILLED; SDT_PROBE1(proc, kernel, , exit, reason); #endif /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ knlist_clear(&p->p_klist, 1); /* * If this is a process with a descriptor, we may not need to deliver * a signal to the parent. proctree_lock is held over * procdesc_exit() to serialize concurrent calls to close() and * exit(). */ #ifdef PROCDESC if (p->p_procdesc == NULL || procdesc_exit(p)) { #endif /* * Notify parent that we're gone. If parent has the * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN, * notify process 1 instead (and hope it will handle this * situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, p->p_reaper); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * Notify parent, so in case he was wait(2)ing or * executing waitpid(2) with our pid, he will * continue. */ wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) childproc_exited(p); else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) childproc_exited(p); else /* LINUX thread */ kern_psignal(p->p_pptr, p->p_sigparent); } #ifdef PROCDESC } else PROC_LOCK(p->p_pptr); #endif sx_xunlock(&proctree_lock); /* * The state PRS_ZOMBIE prevents other proesses from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); sched_exit(p->p_pptr, td); umtx_thread_exit(td); PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. */ knlist_destroy(&p->p_klist); /* * Save our children's rusage information in our exit rusage. */ PROC_STATLOCK(p); ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); PROC_STATUNLOCK(p); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifndef _SYS_SYSPROTO_H_ struct abort2_args { char *why; int nargs; void **args; }; #endif int sys_abort2(struct thread *td, struct abort2_args *uap) { struct proc *p = td->td_proc; struct sbuf *sb; void *uargs[16]; int error, i, sig; /* * Do it right now so we can log either proper call of abort2(), or * note, that invalid argument was passed. 512 is big enough to * handle 16 arguments' descriptions with additional comments. */ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN); sbuf_clear(sb); sbuf_printf(sb, "%s(pid %d uid %d) aborted: ", p->p_comm, p->p_pid, td->td_ucred->cr_uid); /* * Since we can't return from abort2(), send SIGKILL in cases, where * abort2() was called improperly */ sig = SIGKILL; /* Prevent from DoSes from user-space. */ if (uap->nargs < 0 || uap->nargs > 16) goto out; if (uap->nargs > 0) { if (uap->args == NULL) goto out; error = copyin(uap->args, uargs, uap->nargs * sizeof(void *)); if (error != 0) goto out; } /* * Limit size of 'reason' string to 128. Will fit even when * maximal number of arguments was chosen to be logged. */ if (uap->why != NULL) { error = sbuf_copyin(sb, uap->why, 128); if (error < 0) goto out; } else { sbuf_printf(sb, "(null)"); } if (uap->nargs > 0) { sbuf_printf(sb, "("); for (i = 0;i < uap->nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); sbuf_printf(sb, ")"); } /* * Final stage: arguments were proper, string has been * successfully copied from userspace, and copying pointers * from user-space succeed. */ sig = SIGABRT; out: if (sig == SIGKILL) { sbuf_trim(sb); sbuf_printf(sb, " (Reason text inaccessible)"); } sbuf_cat(sb, "\n"); sbuf_finish(sb); log(LOG_INFO, "%s", sbuf_data(sb)); sbuf_delete(sb); exit1(td, W_EXITCODE(0, sig)); return (0); } #ifdef COMPAT_43 /* * The dirty work is handled by kern_wait(). */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * The dirty work is handled by kern_wait(). */ int sys_wait4(struct thread *td, struct wait4_args *uap) { struct rusage ru, *rup; int error, status; if (uap->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, uap->pid, &status, uap->options, rup); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int sys_wait6(struct thread *td, struct wait6_args *uap) { struct __wrusage wru, *wrup; siginfo_t si, *sip; idtype_t idtype; id_t id; int error, status; idtype = uap->idtype; id = uap->id; if (uap->wrusage != NULL) wrup = &wru; else wrup = NULL; if (uap->info != NULL) { sip = &si; bzero(sip, sizeof(*sip)); } else sip = NULL; /* * We expect all callers of wait6() to know about WEXITED and * WTRAPPED. */ error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->wrusage != NULL && error == 0) error = copyout(&wru, uap->wrusage, sizeof(wru)); if (uap->info != NULL && error == 0) error = copyout(&si, uap->info, sizeof(si)); return (error); } /* * Reap the remains of a zombie process and optionally return status and * rusage. Asserts and will release both the proctree_lock and the process * lock as part of its work. */ void proc_reap(struct thread *td, struct proc *p, int *status, int options) { struct proc *q, *t; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); q = td->td_proc; PROC_SUNLOCK(p); td->td_retval[0] = p->p_pid; if (status) *status = p->p_xstat; /* convert to int */ if (options & WNOWAIT) { /* * Only poll, returning the status. Caller does not wish to * release the proc struct just yet. */ PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return; } PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); /* * If we got the child via a ptrace 'attach', we need to give it back * to the old parent. */ if (p->p_oppid != 0 && p->p_oppid != p->p_pptr->p_pid) { PROC_UNLOCK(p); t = proc_realparent(p); PROC_LOCK(t); PROC_LOCK(p); CTR2(KTR_PTRACE, "wait: traced child %d moved back to parent %d", p->p_pid, t->p_pid); proc_reparent(p, t); p->p_oppid = 0; PROC_UNLOCK(p); pksignal(t, SIGCHLD, p->p_ksi); wakeup(t); cv_broadcast(&p->p_pwait); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return; } p->p_oppid = 0; PROC_UNLOCK(p); /* * Remove other references to this process to ensure we have an * exclusive reference. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); LIST_REMOVE(p, p_reapsibling); PROC_LOCK(p); clear_orphan(p); PROC_UNLOCK(p); leavepgrp(p); #ifdef PROCDESC if (p->p_procdesc != NULL) procdesc_reap(p); #endif sx_xunlock(&proctree_lock); /* * As a side effect of this lock, we know that all other writes to * this proc are visible now, so no more locking is needed for p. */ PROC_LOCK(p); p->p_xstat = 0; /* XXX: why? */ PROC_UNLOCK(p); PROC_LOCK(q); ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux); PROC_UNLOCK(q); /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Destroy resource accounting information associated with the process. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } #endif racct_proc_exit(p); /* * Free credentials, arguments, and sigacts. */ crfree(p->p_ucred); - p->p_ucred = NULL; + proc_set_cred(p, NULL); pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * Do any thread-system specific cleanups. */ thread_wait(p); /* * Give vm and machine-dependent layer a chance to free anything that * cpu_exit couldn't release while still running in process context. */ vm_waitproc(p); #ifdef MAC mac_proc_destroy(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); uma_zfree(proc_zone, p); atomic_add_int(&nprocs, -1); } static int proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo, int check_only) { struct rusage *rup; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK(p); switch (idtype) { case P_ALL: break; case P_PID: if (p->p_pid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_PGID: if (p->p_pgid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_SID: if (p->p_session->s_sid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_UID: if (p->p_ucred->cr_uid != (uid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_GID: if (p->p_ucred->cr_gid != (gid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_JAILID: if (p->p_ucred->cr_prison->pr_id != (int)id) { PROC_UNLOCK(p); return (0); } break; /* * It seems that the thread structures get zeroed out * at process exit. This makes it impossible to * support P_SETID, P_CID or P_CPUID. */ default: PROC_UNLOCK(p); return (0); } if (p_canwait(td, p)) { PROC_UNLOCK(p); return (0); } if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) { PROC_UNLOCK(p); return (0); } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); return (0); } if (siginfo != NULL) { bzero(siginfo, sizeof(*siginfo)); siginfo->si_errno = 0; /* * SUSv4 requires that the si_signo value is always * SIGCHLD. Obey it despite the rfork(2) interface * allows to request other signal for child exit * notification. */ siginfo->si_signo = SIGCHLD; /* * This is still a rough estimate. We will fix the * cases TRAPPED, STOPPED, and CONTINUED later. */ if (WCOREDUMP(p->p_xstat)) { siginfo->si_code = CLD_DUMPED; siginfo->si_status = WTERMSIG(p->p_xstat); } else if (WIFSIGNALED(p->p_xstat)) { siginfo->si_code = CLD_KILLED; siginfo->si_status = WTERMSIG(p->p_xstat); } else { siginfo->si_code = CLD_EXITED; siginfo->si_status = WEXITSTATUS(p->p_xstat); } siginfo->si_pid = p->p_pid; siginfo->si_uid = p->p_ucred->cr_uid; /* * The si_addr field would be useful additional * detail, but apparently the PC value may be lost * when we reach this point. bzero() above sets * siginfo->si_addr to NULL. */ } /* * There should be no reason to limit resources usage info to * exited processes only. A snapshot about any resources used * by a stopped process may be exactly what is needed. */ if (wrusage != NULL) { rup = &wrusage->wru_self; *rup = p->p_ru; PROC_STATLOCK(p); calcru(p, &rup->ru_utime, &rup->ru_stime); PROC_STATUNLOCK(p); rup = &wrusage->wru_children; *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); } if (p->p_state == PRS_ZOMBIE && !check_only) { PROC_SLOCK(p); proc_reap(td, p, status, options); return (-1); } PROC_UNLOCK(p); return (1); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { struct __wrusage wru, *wrup; idtype_t idtype; id_t id; int ret; /* * Translate the special pid values into the (idtype, pid) * pair for kern_wait6. The WAIT_MYPGRP case is handled by * kern_wait6() on its own. */ if (pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (pid < 0) { idtype = P_PGID; id = (id_t)-pid; } else { idtype = P_PID; id = (id_t)pid; } if (rusage != NULL) wrup = &wru; else wrup = NULL; /* * For backward compatibility we implicitly add flags WEXITED * and WTRAPPED here. */ options |= WEXITED | WTRAPPED; ret = kern_wait6(td, idtype, id, status, options, wrup, NULL); if (rusage != NULL) *rusage = wru.wru_self; return (ret); } int kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo) { struct proc *p, *q; int error, nfound, ret; AUDIT_ARG_VALUE((int)idtype); /* XXX - This is likely wrong! */ AUDIT_ARG_PID((pid_t)id); /* XXX - This may be wrong! */ AUDIT_ARG_VALUE(options); q = td->td_proc; if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) { PROC_LOCK(q); id = (id_t)q->p_pgid; PROC_UNLOCK(q); idtype = P_PGID; } /* If we don't know the option, just return. */ if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT | WEXITED | WTRAPPED | WLINUXCLONE)) != 0) return (EINVAL); if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) { /* * We will be unable to find any matching processes, * because there are no known events to look for. * Prefer to return error instead of blocking * indefinitely. */ return (EINVAL); } loop: if (q->p_flag & P_STATCHILD) { PROC_LOCK(q); q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { ret = proc_to_reap(td, p, idtype, id, status, options, wrusage, siginfo, 0); if (ret == 0) continue; else if (ret == 1) nfound++; else return (0); PROC_LOCK(p); PROC_SLOCK(p); if ((options & WTRAPPED) != 0 && (p->p_flag & P_TRACED) != 0 && (p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0 && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0)) { PROC_SUNLOCK(p); if ((options & WNOWAIT) == 0) p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if (status != NULL) *status = W_STOPCODE(p->p_xstat); if (siginfo != NULL) { siginfo->si_status = p->p_xstat; siginfo->si_code = CLD_TRAPPED; } if ((options & WNOWAIT) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } CTR4(KTR_PTRACE, "wait: returning trapped pid %d status %#x (xstat %d) xthread %d", p->p_pid, W_STOPCODE(p->p_xstat), p->p_xstat, p->p_xthread != NULL ? p->p_xthread->td_tid : -1); PROC_UNLOCK(p); return (0); } if ((options & WUNTRACED) != 0 && (p->p_flag & P_STOPPED_SIG) != 0 && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0)) { PROC_SUNLOCK(p); if ((options & WNOWAIT) == 0) p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if (status != NULL) *status = W_STOPCODE(p->p_xstat); if (siginfo != NULL) { siginfo->si_status = p->p_xstat; siginfo->si_code = CLD_STOPPED; } if ((options & WNOWAIT) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } PROC_UNLOCK(p); return (0); } PROC_SUNLOCK(p); if ((options & WCONTINUED) != 0 && (p->p_flag & P_CONTINUED) != 0) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if ((options & WNOWAIT) == 0) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } PROC_UNLOCK(p); if (status != NULL) *status = SIGCONT; if (siginfo != NULL) { siginfo->si_status = SIGCONT; siginfo->si_code = CLD_CONTINUED; } return (0); } PROC_UNLOCK(p); } /* * Look in the orphans list too, to allow the parent to * collect it's child exit status even if child is being * debugged. * * Debugger detaches from the parent upon successful * switch-over from parent to child. At this point due to * re-parenting the parent loses the child to debugger and a * wait4(2) call would report that it has no children to wait * for. By maintaining a list of orphans we allow the parent * to successfully wait until the child becomes a zombie. */ if (nfound == 0) { LIST_FOREACH(p, &q->p_orphans, p_orphan) { ret = proc_to_reap(td, p, idtype, id, NULL, options, NULL, NULL, 1); if (ret != 0) { KASSERT(ret != -1, ("reaped an orphan (pid %d)", (int)td->td_retval[0])); nfound++; break; } } } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; error = 0; } else error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) return (error); goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; PROC_LOCK(child->p_pptr); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); clear_orphan(child); if (child->p_flag & P_TRACED) { if (LIST_EMPTY(&child->p_pptr->p_orphans)) { child->p_treeflag |= P_TREE_FIRST_ORPHAN; LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child, p_orphan); } else { LIST_INSERT_AFTER(LIST_FIRST(&child->p_pptr->p_orphans), child, p_orphan); } child->p_treeflag |= P_TREE_ORPHANED; } child->p_pptr = parent; } Index: stable/10/sys/kern/kern_fork.c =================================================================== --- stable/10/sys/kern/kern_fork.c (revision 302228) +++ stable/10/sys/kern/kern_fork.c (revision 302229) @@ -1,1106 +1,1107 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include "opt_procdesc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_fork_func_t dtrace_fasttrap_fork; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, kernel, , create, "struct proc *", "struct proc *", "int"); #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int sys_fork(struct thread *td, struct fork_args *uap) { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } /* ARGUSED */ int sys_pdfork(td, uap) struct thread *td; struct pdfork_args *uap; { #ifdef PROCDESC int error, fd; struct proc *p2; /* * It is necessary to return fd by reference because 0 is a valid file * descriptor number, and the child needs to be able to distinguish * itself from the parent using the return value. */ error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2, &fd, uap->flags); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; error = copyout(&fd, uap->fdp, sizeof(fd)); } return (error); #else return (ENOSYS); #endif } /* ARGSUSED */ int sys_vfork(struct thread *td, struct vfork_args *uap) { int error, flags; struct proc *p2; flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; error = fork1(td, flags, 0, &p2, NULL, 0); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } int sys_rfork(struct thread *td, struct rfork_args *uap) { struct proc *p2; int error; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); AUDIT_ARG_FFLAGS(uap->flags); error = fork1(td, uap->flags, 0, &p2, NULL, 0); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. */ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > pid_max - 100) /* out of range */ pid = pid_max - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); static int fork_findpid(int flags) { struct proc *p; int trypid; static int pidchecked = 0; /* * Requires allproc_lock in order to iterate over the list * of processes, and proctree_lock to access p_pgrp. */ sx_assert(&allproc_lock, SX_LOCKED); sx_assert(&proctree_lock, SX_LOCKED); /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= pid_max) { trypid = trypid % pid_max; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. * * Avoid reuse of the process group id, session id or * the reaper subtree id. Note that for process group * and sessions, the amount of reserved pids is * limited by process limit. For the subtree ids, the * id is kept reserved only while there is a * non-reaped process in the subtree, so amount of * reserved pids is limited by process limit times * two. */ p = LIST_FIRST(&allproc); again: for (; p != NULL; p = LIST_NEXT(p, p_list)) { while (p->p_pid == trypid || p->p_reapsubtree == trypid || (p->p_pgrp != NULL && (p->p_pgrp->pg_id == trypid || (p->p_session != NULL && p->p_session->s_sid == trypid)))) { trypid++; if (trypid >= pidchecked) goto retry; } if (p->p_pid > trypid && pidchecked > p->p_pid) pidchecked = p->p_pid; if (p->p_pgrp != NULL) { if (p->p_pgrp->pg_id > trypid && pidchecked > p->p_pgrp->pg_id) pidchecked = p->p_pgrp->pg_id; if (p->p_session != NULL && p->p_session->s_sid > trypid && pidchecked > p->p_session->s_sid) pidchecked = p->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; p = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; return (trypid); } static int fork_norfproc(struct thread *td, int flags) { int error; struct proc *p1; KASSERT((flags & RFPROC) == 0, ("fork_norfproc called with RFPROC set")); p1 = td->td_proc; if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); if (thread_single(p1, SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } error = vm_forkproc(td, NULL, NULL, NULL, flags); if (error) goto fail; /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd); fdescfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent). */ if (flags & RFFDG) fdunshare(td); fail: if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); thread_single_end(p1, SINGLE_BOUNDARY); PROC_UNLOCK(p1); } return (error); } static void do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2, struct vmspace *vm2, int pdflags) { struct proc *p1, *pptr; int p2_held, trypid; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct sigacts *newsigacts; sx_assert(&proctree_lock, SX_SLOCKED); sx_assert(&allproc_lock, SX_XLOCKED); p2_held = 0; p1 = td->td_proc; trypid = fork_findpid(flags); sx_sunlock(&proctree_lock); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG_PID(p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); allproc_gen++; LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); tidhash_add(td2); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); pargs_hold(p2->p_args); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); p2->p_treeflag = 0; p2->p_filemon = NULL; - p2->p_ucred = crhold(td->td_ucred); + crhold(td->td_ucred); + proc_set_cred(p2, td->td_ucred); /* Tell the prison that we exist. */ prison_proc_hold(p2->p_ucred->cr_prison); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (flags & RFCFDG) { fd = fdinit(p1->p_fd); fdtol = NULL; } else if (flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((flags & RFTHREAD) != 0) { /* * Shared file descriptor table, and shared * process leaders. */ fdtol = p1->p_fdtol; FILEDESC_XLOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and different * process leaders. */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); td2->td_su = NULL; bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name)); td2->td_sigstk = td->td_sigstk; td2->td_flags = TDF_INMEM; td2->td_lend_user_pri = PRI_MAX; td2->td_dbg_sc_code = td->td_dbg_sc_code; td2->td_dbg_sc_narg = td->td_dbg_sc_narg; #ifdef VIMAGE td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); thread_unlock(td); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = P_INMEM; p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC); p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); td2->td_ucred = crhold(p2->p_ucred); if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (flags & RFTSIGZMB) p2->p_sigparent = RFTSIGNUM(flags); else if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; if (p1->p_flag2 & P2_INHERIT_PROTECTED) { p2->p_flag |= P_PROTECTED; p2->p_flag2 |= P2_INHERIT_PROTECTED; } /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs). */ if (p2->p_textvp) vref(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); kern_psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= td->td_pflags & TDP_ALTSTACK; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if ((flags & RFNOWAIT) != 0) { pptr = p1->p_reaper; p2->p_reaper = pptr; } else { p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ? p1 : p1->p_reaper; pptr = p1; } p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_reaplist); LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1) p2->p_reapsubtree = p2->p_pid; else p2->p_reapsubtree = p1->p_reapsubtree; sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); #ifdef KTRACE ktrprocfork(p1, p2); #endif /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, vm2, flags); if (flags == (RFFDG | RFPROC)) { PCPU_INC(cnt.v_forks); PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { PCPU_INC(cnt.v_vforks); PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { PCPU_INC(cnt.v_kthreads); PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { PCPU_INC(cnt.v_rforks); PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } #ifdef PROCDESC /* * Associate the process descriptor with the process before anything * can happen that might cause that process to need the descriptor. * However, don't do this until after fork(2) can no longer fail. */ if (flags & RFPROCDESC) procdesc_new(p2, pdflags); #endif /* * Both processes are set up, now check if any loadable modules want * to adjust anything. */ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); /* * Set the child start time and mark the process as being complete. */ PROC_LOCK(p2); PROC_LOCK(p1); microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the new process so that any * tracepoints inherited from the parent can be removed. We have to do * this only after p_state is PRS_NORMAL since the fasttrap module will * use pfind() later on. */ if ((flags & RFMEM) == 0 && dtrace_fasttrap_fork) dtrace_fasttrap_fork(p1, p2); #endif if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED | P_FOLLOWFORK)) { /* * Arrange for debugger to receive the fork event. * * We can report PL_FLAG_FORKED regardless of * P_FOLLOWFORK settings, but it does not make a sense * for runaway child. */ td->td_dbgflags |= TDB_FORK; td->td_dbg_forked = p2->p_pid; td2->td_dbgflags |= TDB_STOPATFORK; _PHOLD(p2); p2_held = 1; } if (flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; } PROC_UNLOCK(p2); if ((flags & RFSTOPPED) == 0) { /* * If RFSTOPPED not requested, make child runnable and * add to run queue. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); } /* * Now can be swapped. */ _PRELE(p1); PROC_UNLOCK(p1); /* * Tell any interested parties about the new process. */ knote_fork(&p1->p_klist, p2->p_pid); SDT_PROBE3(proc, kernel, , create, p2, p1, flags); /* * Wait until debugger is attached to child. */ PROC_LOCK(p2); while ((td2->td_dbgflags & TDB_STOPATFORK) != 0) cv_wait(&p2->p_dbgwait, &p2->p_mtx); if (p2_held) _PRELE(p2); PROC_UNLOCK(p2); } int fork1(struct thread *td, int flags, int pages, struct proc **procp, int *procdescp, int pdflags) { struct proc *p1, *newproc; struct thread *td2; struct vmspace *vm2; #ifdef PROCDESC struct file *fp_procdesc; #endif vm_ooffset_t mem_charged; int error, nprocs_new, ok; static int curfail; static struct timeval lastfail; /* Check for the undefined or unimplemented flags. */ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) return (EINVAL); /* Signal value requires RFTSIGZMB. */ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) return (EINVAL); /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* Check the validity of the signal number. */ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) return (EINVAL); #ifdef PROCDESC if ((flags & RFPROCDESC) != 0) { /* Can't not create a process yet get a process descriptor. */ if ((flags & RFPROC) == 0) return (EINVAL); /* Must provide a place to put a procdesc if creating one. */ if (procdescp == NULL) return (EINVAL); } #endif p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { *procp = NULL; return (fork_norfproc(td, flags)); } #ifdef PROCDESC fp_procdesc = NULL; #endif newproc = NULL; vm2 = NULL; /* * Increment the nprocs resource before allocations occur. * Although process entries are dynamically created, we still * keep a global limit on the maximum number we will * create. There are hard-limits as to the number of processes * that can run, established by the KVA and memory usage for * the process data. * * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) { sx_xlock(&allproc_lock); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxproc limit exceeded by uid %u (pid %d); " "see tuning(7) and login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } sx_xunlock(&allproc_lock); error = EAGAIN; goto fail1; } #ifdef PROCDESC /* * If required, create a process descriptor in the parent first; we * will abandon it if something goes wrong. We don't finit() until * later. */ if (flags & RFPROCDESC) { error = falloc(td, &fp_procdesc, procdescp, 0); if (error != 0) goto fail1; } #endif mem_charged = 0; if (pages == 0) pages = KSTACK_PAGES; /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); td2 = FIRST_THREAD_IN_PROC(newproc); if (td2 == NULL) { td2 = thread_alloc(pages); if (td2 == NULL) { error = ENOMEM; goto fail1; } proc_linkup(newproc, td2); } else { if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) { if (td2->td_kstack != 0) vm_thread_dispose(td2); if (!thread_alloc_stack(td2, pages)) { error = ENOMEM; goto fail1; } } } if ((flags & RFMEM) == 0) { vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); if (vm2 == NULL) { error = ENOMEM; goto fail1; } if (!swap_reserve(mem_charged)) { /* * The swap reservation failed. The accounting * from the entries of the copied vm2 will be * substracted in vmspace_free(), so force the * reservation there. */ swap_reserve_force(mem_charged); error = ENOMEM; goto fail1; } } else vm2 = NULL; /* * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ - newproc->p_ucred = p1->p_ucred; + proc_set_cred(newproc, p1->p_ucred); /* * Initialize resource accounting for the child process. */ error = racct_proc_fork(p1, newproc); if (error != 0) { error = EAGAIN; goto fail1; } #ifdef MAC mac_proc_init(newproc); #endif knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); sx_xlock(&allproc_lock); /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. * * XXXRW: Can we avoid privilege here if it's not needed? */ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0); if (error == 0) ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0); else { PROC_LOCK(p1); ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, lim_cur(p1, RLIMIT_NPROC)); PROC_UNLOCK(p1); } if (ok) { do_fork(td, flags, newproc, td2, vm2, pdflags); /* * Return child proc pointer to parent. */ *procp = newproc; #ifdef PROCDESC if (flags & RFPROCDESC) { procdesc_finit(newproc->p_procdesc, fp_procdesc); fdrop(fp_procdesc, td); } #endif racct_proc_fork_done(newproc); return (0); } error = EAGAIN; sx_sunlock(&proctree_lock); sx_xunlock(&allproc_lock); #ifdef MAC mac_proc_destroy(newproc); #endif racct_proc_exit(newproc); fail1: if (vm2 != NULL) vmspace_free(vm2); uma_zfree(proc_zone, newproc); #ifdef PROCDESC if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) { fdclose(td->td_proc->p_fd, fp_procdesc, *procdescp, td); fdrop(fp_procdesc, td); } #endif atomic_add_int(&nprocs, -1); pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(void (*callout)(void *, struct trapframe *), void *arg, struct trapframe *frame) { struct proc *p; struct thread *td; struct thread *dtd; td = curthread; p = td->td_proc; KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)", td, td->td_sched, p->p_pid, td->td_name); sched_fork_exit(td); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((dtd = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(dtd); } thread_unlock(td); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KTHREAD) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", td->td_name, p->p_pid); kthread_exit(); } mtx_assert(&Giant, MA_NOTOWNED); if (p->p_sysent->sv_schedtail != NULL) (p->p_sysent->sv_schedtail)(td); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. Giant is not held on entry, and must not * be held on return. This function is passed in to fork_exit() as the * first parameter and is called when returning to a new userland process. */ void fork_return(struct thread *td, struct trapframe *frame) { struct proc *p, *dbg; p = td->td_proc; if (td->td_dbgflags & TDB_STOPATFORK) { sx_xlock(&proctree_lock); PROC_LOCK(p); if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED | P_FOLLOWFORK)) { /* * If debugger still wants auto-attach for the * parent's children, do it now. */ dbg = p->p_pptr->p_pptr; p->p_flag |= P_TRACED; p->p_oppid = p->p_pptr->p_pid; CTR2(KTR_PTRACE, "fork_return: attaching to new child pid %d: oppid %d", p->p_pid, p->p_oppid); proc_reparent(p, dbg); sx_xunlock(&proctree_lock); td->td_dbgflags |= TDB_CHILD | TDB_SCX; ptracestop(td, SIGSTOP); td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX); } else { /* * ... otherwise clear the request. */ sx_xunlock(&proctree_lock); td->td_dbgflags &= ~TDB_STOPATFORK; cv_broadcast(&p->p_dbgwait); } PROC_UNLOCK(p); } else if (p->p_flag & P_TRACED) { /* * This is the start of a new thread in a traced * process. Report a system call exit event. */ PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; _STOPEVENT(p, S_SCX, td->td_dbg_sc_code); if ((p->p_stops & S_PT_SCX) != 0) ptracestop(td, SIGTRAP); td->td_dbgflags &= ~TDB_SCX; PROC_UNLOCK(p); } userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif } Index: stable/10/sys/kern/kern_jail.c =================================================================== --- stable/10/sys/kern/kern_jail.c (revision 302228) +++ stable/10/sys/kern/kern_jail.c (revision 302229) @@ -1,4861 +1,4861 @@ /*- * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #ifdef INET6 #include #endif /* INET6 */ #endif /* DDB */ #include #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); /* allprison, allprison_racct and lastprid are protected by allprison_lock. */ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int do_jail_attach(struct thread *td, struct prison *pr); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static char *prison_path(struct prison *pr1, struct prison *pr2); static void prison_remove_one(struct prison *pr); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif #ifdef INET static int _prison_check_ip4(struct prison *pr, struct in_addr *ia); static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4); #endif #ifdef INET6 static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6); static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 #define PD_DEUREF 0x02 #define PD_LOCKED 0x04 #define PD_LIST_SLOCKED 0x08 #define PD_LIST_XLOCKED 0x10 /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static char *pr_flag_names[] = { [0] = "persist", #ifdef INET [7] = "ip4.saddrsel", #endif #ifdef INET6 [8] = "ip6.saddrsel", #endif }; const size_t pr_flag_names_size = sizeof(pr_flag_names); static char *pr_flag_nonames[] = { [0] = "nopersist", #ifdef INET [7] = "ip4.nosaddrsel", #endif #ifdef INET6 [8] = "ip6.nosaddrsel", #endif }; const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames); struct jailsys_flags { const char *name; unsigned disable; unsigned new; } pr_flag_jailsys[] = { { "host", 0, PR_HOST }, #ifdef VIMAGE { "vnet", 0, PR_VNET }, #endif #ifdef INET { "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER }, #endif #ifdef INET6 { "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER }, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); static char *pr_allow_names[] = { "allow.set_hostname", "allow.sysvipc", "allow.raw_sockets", "allow.chflags", "allow.mount", "allow.quotas", "allow.socket_af", "allow.mount.devfs", "allow.mount.nullfs", "allow.mount.zfs", "allow.mount.procfs", "allow.mount.tmpfs", "allow.mount.fdescfs", "allow.mount.linprocfs", "allow.mount.linsysfs", }; const size_t pr_allow_names_size = sizeof(pr_allow_names); static char *pr_allow_nonames[] = { "allow.noset_hostname", "allow.nosysvipc", "allow.noraw_sockets", "allow.nochflags", "allow.nomount", "allow.noquotas", "allow.nosocket_af", "allow.mount.nodevfs", "allow.mount.nonullfs", "allow.mount.nozfs", "allow.mount.noprocfs", "allow.mount.notmpfs", "allow.mount.nofdescfs", "allow.mount.nolinprocfs", "allow.mount.nolinsysfs", }; const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames); #define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); } #ifdef INET static int qcmp_v4(const void *ip1, const void *ip2) { in_addr_t iaa, iab; /* * We need to compare in HBO here to get the list sorted as expected * by the result of the code. Sorting NBO addresses gives you * interesting results. If you do not understand, do not try. */ iaa = ntohl(((const struct in_addr *)ip1)->s_addr); iab = ntohl(((const struct in_addr *)ip2)->s_addr); /* * Do not simply return the difference of the two numbers, the int is * not wide enough. */ if (iaa > iab) return (1); else if (iaa < iab) return (-1); else return (0); } #endif #ifdef INET6 static int qcmp_v6(const void *ip1, const void *ip2) { const struct in6_addr *ia6a, *ia6b; int i, rc; ia6a = (const struct in6_addr *)ip1; ia6b = (const struct in6_addr *)ip2; rc = 0; for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) { if (ia6a->s6_addr[i] > ia6b->s6_addr[i]) rc = 1; else if (ia6a->s6_addr[i] < ia6b->s6_addr[i]) rc = -1; } return (rc); } #endif /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + sizeof(pr_allow_names) / sizeof(pr_allow_names[0]) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs, fi; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { optiov[opt.uio_iovcnt].iov_base = (jail_default_allow & (1 << fi)) ? pr_allow_names[fi] : pr_allow_nonames[fi]; optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct in_addr *ip4; #endif #ifdef INET6 struct in6_addr *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; #if defined(INET) || defined(INET6) struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int born, created, cuflags, descend, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int fi, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; int fullpath_disabled; #if defined(INET) || defined(INET6) int ii, ij; #endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) { if (pr_flag_names[fi] == NULL) continue; vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi); vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi); } ch_flags |= pr_flags; for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!pr_flag_jailsys[fi].disable) { error = EINVAL; goto done_free; } pr_flags |= pr_flag_jailsys[fi].disable; break; case JAIL_SYS_NEW: pr_flags |= pr_flag_jailsys[fi].new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable; } if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi); vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); if (error == ENOENT) domain = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER | PR_IP4_DISABLE; if (ip4s == 0) pr_flags |= PR_IP4_USER | PR_IP4_DISABLE; else { pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER; ip4s /= sizeof(*ip4); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); bcopy(op, ip4, ip4s * sizeof(*ip4)); /* * IP addresses are all sorted but ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well for "loopback" traffic in case * source address selection cannot find any more fitting * address to connect from. */ if (ip4s > 1) qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses it is their problem. * * We do not have to care about byte order for these * checks so we will do them in NBO. */ for (ii = 0; ii < ip4s; ii++) { if (ip4[ii].s_addr == INADDR_ANY || ip4[ii].s_addr == INADDR_BROADCAST) { error = EINVAL; goto done_free; } if ((ii+1) < ip4s && (ip4[0].s_addr == ip4[ii+1].s_addr || ip4[ii].s_addr == ip4[ii+1].s_addr)) { error = EINVAL; goto done_free; } } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER | PR_IP6_DISABLE; if (ip6s == 0) pr_flags |= PR_IP6_USER | PR_IP6_DISABLE; else { pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER; ip6s /= sizeof(*ip6); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); bcopy(op, ip6, ip6s * sizeof(*ip6)); if (ip6s > 1) qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6); for (ii = 0; ii < ip6s; ii++) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { error = EINVAL; goto done_free; } if ((ii+1) < ip6s && (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) { error = EINVAL; goto done_free; } } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || len >= OSRELEASELEN) { error = EINVAL; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } fullpath_disabled = 0; root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) path = g_path; else if (error == ENODEV) { /* proceed if sysctl debug.disablefullpath == 1 */ fullpath_disabled = 1; if (len < 2 || (len == 2 && path[0] == '/')) path = NULL; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root, 0); if (fullpath_disabled) { /* Leave room for a real-root full pathname. */ if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/") ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) { error = ENAMETOOLONG; vrele(root); goto done_free; } } } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. */ pr = NULL; ppr = mypr; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); if (jid != 0) { /* * See if a requested jid already exists. There is an * information leak here if the jid exists but is not within * the caller's jail hierarchy. Jail creators will get EEXIST * even though they cannot see the jail, and CREATE | UPDATE * will return ENOENT which is not normally a valid error. */ if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_unlock_list; } pr = prison_find(jid); if (pr != NULL) { ppr = pr->pr_parent; /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { mtx_unlock(&pr->pr_mtx); error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_unlock_list; } if (!prison_ischild(mypr, pr)) { mtx_unlock(&pr->pr_mtx); pr = NULL; } else if (pr->pr_uref == 0) { if (!(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } else if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } if (pr == NULL) { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { mtx_unlock(&pr->pr_mtx); error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_unlock_list; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } mtx_unlock(&ppr->pr_mtx); *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && tpr->pr_ref > 0 && !strcmp(tpr->pr_name + pnamelen, namelc)) { if (pr == NULL && cuflags != JAIL_CREATE) { mtx_lock(&tpr->pr_mtx); if (tpr->pr_ref > 0) { /* * Use this jail * for updates. */ if (tpr->pr_uref > 0) { pr = tpr; break; } deadpr = tpr; } mtx_unlock(&tpr->pr_mtx); } else if (tpr->pr_uref > 0) { /* * Create, or update(jid): * name must not exist in an * active sibling jail. */ error = EEXIST; if (pr != NULL) mtx_unlock(&pr->pr_mtx); vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_unlock_list; } } } /* If no active jail is found, use a dying one. */ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto name_again; } pr = deadpr; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_unlock_list; } /* If there's no prison to update, create a new one and link it in. */ if (pr == NULL) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_unlock_list; } created = 1; mtx_lock(&ppr->pr_mtx); if (ppr->pr_ref == 0) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", prison_name(mypr, ppr)); goto done_unlock_list; } ppr->pr_ref++; ppr->pr_uref++; mtx_unlock(&ppr->pr_mtx); pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); if (jid == 0) { /* Find the next free jid. */ jid = lastprid + 1; findnext: if (jid == JAIL_MAX) jid = 1; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id < jid) continue; if (tpr->pr_id > jid || tpr->pr_ref == 0) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } if (jid == lastprid) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); free(pr, M_PRISON); prison_deref(ppr, PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED); goto done_releroot; } jid++; goto findnext; } lastprid = jid; } else { /* * The jail already has a jid (that did not yet exist), * so just find where to insert it. */ TAILQ_FOREACH(tpr, &allprison, pr_list) if (tpr->pr_id >= jid) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } } if (tpr == NULL) TAILQ_INSERT_TAIL(&allprison, pr, pr_list); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; pr->pr_parent = ppr; pr->pr_id = jid; /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; if (ppr->pr_ip4 != NULL) { pr->pr_ip4s = ppr->pr_ip4s; pr->pr_ip4 = malloc(pr->pr_ip4s * sizeof(struct in_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); } } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; if (ppr->pr_ip6 != NULL) { pr->pr_ip6s = ppr->pr_ip6s; pr->pr_ip6 = malloc(pr->pr_ip6s * sizeof(struct in6_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); } } #endif } #endif /* Source address selection is always on by default. */ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strcpy(pr->pr_osrelease, ppr->pr_osrelease); else strcpy(pr->pr_osrelease, osrelstr); LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); #ifdef VIMAGE /* Allocate a new vnet if specified. */ pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet; #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an erorr. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) { prison_deref(pr, PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* * New prisons do not yet have a reference, because we do not * want others to see the incomplete prison once the * allprison_lock is downgraded. */ } else { created = 0; /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ pr->pr_ref++; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref_locked; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref_locked; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref_locked; } #endif } /* Do final error checking before setting anything. */ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref_locked; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref_locked; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref_locked; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref_locked; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref_locked; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if (ppr->pr_flags & PR_IP4) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. Don't worry * about the parent being unlocked, as any * setting is done with allprison_lock held. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } if (ip4s > 1) { for (ii = ij = 1; ii < ip4s; ii++) { if (ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) continue; for (; ij < ppr->pr_ip4s; ij++) if (ip4[ii].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) break; } if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } } } /* * Check for conflicting IP addresses. We permit them * if there is no more than one IP on each jail. If * there is a duplicate on a jail with more than one * IP stop checking and return error. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP4_USER)) continue; descend = 0; if (tpr->pr_ip4 == NULL || (ip4s == 1 && tpr->pr_ip4s == 1)) continue; for (ii = 0; ii < ip4s; ii++) { if (_prison_check_ip4(tpr, &ip4[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref_locked; } } } } #endif #ifdef INET6 if (ip6s > 0) { if (ppr->pr_flags & PR_IP6) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&ip6[0], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } if (ip6s > 1) { for (ii = ij = 1; ii < ip6s; ii++) { if (IN6_ARE_ADDR_EQUAL(&ip6[ii], &ppr->pr_ip6[0])) continue; for (; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL( &ip6[ii], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) break; } if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } } } /* Check for conflicting IP addresses. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP6_USER)) continue; descend = 0; if (tpr->pr_ip6 == NULL || (ip6s == 1 && tpr->pr_ip6s == 1)) continue; for (ii = 0; ii < ip6s; ii++) { if (_prison_check_ip6(tpr, &ip6[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref_locked; } } } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref_locked; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } } } if (pr_allow & ~ppr->pr_allow) { error = EPERM; goto done_deref_locked; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) { prison_deref(pr, created ? PD_LIST_XLOCKED : PD_DEREF | PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* At this point, all valid parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref_locked; } } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; free(pr->pr_ip4, M_PRISON); pr->pr_ip4s = ip4s; pr->pr_ip4 = ip4; ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; free(pr->pr_ip6, M_PRISON); pr->pr_ip6s = ip6s; pr->pr_ip6 = ip6; ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ if (fullpath_disabled && path[0] == '/' && strcmp(mypr->pr_path, "/")) snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s", mypr->pr_path, path); else strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. */ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } if ((tallow = ch_allow & ~pr_allow)) { /* Clear allow bits in all children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_allow &= ~tallow; } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons * for now, so new ones will remain unseen until after the module * handlers have completed. */ born = pr->pr_uref == 0; if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { if (pr_flags & PR_PERSIST) { pr->pr_ref++; pr->pr_uref++; } else { pr->pr_ref--; pr->pr_uref--; } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_ip4s; ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_ip6s; ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ sx_downgrade(&allprison_lock); if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) { (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) { if (born) (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); error = do_jail_attach(td, pr); if (error) { vfs_opterror(opts, "attach failed"); if (!created) prison_deref(pr, PD_DEREF); goto done_errmsg; } } #ifdef RACCT if (racct_enable && !created) { if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); prison_racct_modify(pr); if (!(flags & JAIL_ATTACH)) sx_slock(&allprison_lock); } #endif td->td_retval[0] = pr->pr_id; /* * Now that it is all there, drop the temporary reference from existing * prisons. Or add a reference to newly created persistent prisons * (which was not done earlier so that the prison would not be publicly * visible). */ if (!created) { prison_deref(pr, (flags & JAIL_ATTACH) ? PD_DEREF : PD_DEREF | PD_LIST_SLOCKED); } else { if (pr_flags & PR_PERSIST) { mtx_lock(&pr->pr_mtx); pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); } goto done_free; done_deref_locked: prison_deref(pr, created ? PD_LOCKED | PD_LIST_XLOCKED : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); goto done_releroot; done_unlock_list: sx_xunlock(&allprison_lock); done_releroot: if (root != NULL) vrele(root); done_errmsg: if (error) { if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: #ifdef INET free(ip4, M_PRISON); #endif #ifdef INET6 free(ip6, M_PRISON); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; /* * Find the prison specified by one of: lastjid, jid, name. */ sx_slock(&allprison_lock); error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0 && (pr->pr_uref > 0 || (flags & JAIL_DYING))) break; mtx_unlock(&pr->pr_mtx); } } if (pr != NULL) goto found_prison; error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } else if (error != ENOENT) goto done_unlock_list; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_unlock_list; } pr = prison_find_name(mypr, name); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done_unlock_list; found_prison: /* Get the parameters of the prison. */ pr->pr_ref++; locked = PD_LOCKED; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done_deref; i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); if (error != 0 && error != ENOENT) goto done_deref; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); if (error != 0 && error != ENOENT) goto done_deref; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done_deref; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done_deref; for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) { if (pr_flag_names[fi] == NULL) continue; i = (pr->pr_flags & (1 << fi)) ? 1 : 0; error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { i = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); i = pr_flag_jailsys[fi].disable && (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { if (pr_allow_names[fi] == NULL) continue; i = (pr->pr_allow & (1 << fi)) ? 1 : 0; error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } i = (pr->pr_uref == 0); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done_deref; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); locked = 0; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done_deref; prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED); /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_errmsg; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } goto done_errmsg; done_deref: prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; done_unlock_list: sx_sunlock(&allprison_lock); done_errmsg: if (error && errmsg_pos >= 0) { vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr, *cpr, *lpr, *tpr; int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } /* Remove all descendants of this prison, then remove this prison. */ pr->pr_ref++; if (!LIST_EMPTY(&pr->pr_children)) { mtx_unlock(&pr->pr_mtx); lpr = NULL; FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { mtx_lock(&cpr->pr_mtx); if (cpr->pr_ref > 0) { tpr = cpr; cpr->pr_ref++; } else { /* Already removed - do not do it again. */ tpr = NULL; } mtx_unlock(&cpr->pr_mtx); if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } lpr = tpr; } if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } mtx_lock(&pr->pr_mtx); } prison_remove_one(pr); return (0); } static void prison_remove_one(struct prison *pr) { struct proc *p; int deuref; /* If the prison was persistent, it is not anymore. */ deuref = 0; if (pr->pr_flags & PR_PERSIST) { pr->pr_ref--; deuref = PD_DEUREF; pr->pr_flags &= ~PR_PERSIST; } /* * jail_remove added a reference. If that's the only one, remove * the prison now. */ KASSERT(pr->pr_ref > 0, ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_ref == 1) { prison_deref(pr, deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); return; } mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); /* * Kill all processes unfortunate enough to be attached to this prison. */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && p->p_ucred->cr_prison == pr) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, deuref | PD_DEREF); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); /* * Start with exclusive hold on allprison_lock to ensure that a possible * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove. * But then immediately downgrade it since we don't need to stop * readers. */ sx_xlock(&allprison_lock); sx_downgrade(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* * Do not allow a process to attach to a prison that is not * considered to be "alive". */ if (pr->pr_uref == 0) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr)); } static int do_jail_attach(struct thread *td, struct prison *pr) { struct proc *p; struct ucred *newcred, *oldcred; int error; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); return (error); } sx_sunlock(&allprison_lock); /* * Reparent the newly attached process to this jail. */ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root, 0); if ((error = change_root(pr->pr_root, td))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; - p->p_ucred = newcred; + proc_set_cred(p, newcred); setsugid(p); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF); crfree(oldcred); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, PD_DEREF | PD_DEUREF); return (error); } /* * Returns a locked prison instance, or NULL on failure. */ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; again: deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) { if (pr->pr_uref > 0) return (pr); deadpr = pr; } mtx_unlock(&pr->pr_mtx); } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto again; } } return (deadpr); } /* * See if a prison has the specific flag set. */ int prison_flag(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_allow & flag); } /* * Remove a prison reference. If that was the last reference, remove the * prison itself - but not in this context in case there are locks held. */ void prison_free_locked(struct prison *pr) { int ref; mtx_assert(&pr->pr_mtx, MA_OWNED); ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); if (ref == 0) taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } void prison_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_free_locked(pr); } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; sx_xlock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_deref(pr, pr->pr_uref ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED : PD_LOCKED | PD_LIST_XLOCKED); } /* * Remove a prison reference (usually). This internal version assumes no * mutexes are held, except perhaps the prison itself. If there are no more * references, release and delist the prison. On completion, the prison lock * and the allprison lock are both unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prison *ppr, *tpr; int ref, lasturef; if (!(flags & PD_LOCKED)) mtx_lock(&pr->pr_mtx); for (;;) { if (flags & PD_DEUREF) { KASSERT(pr->pr_uref > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_uref--; lasturef = pr->pr_uref == 0; if (lasturef) pr->pr_ref++; KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); } else lasturef = 0; if (flags & PD_DEREF) { KASSERT(pr->pr_ref > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_ref--; } ref = pr->pr_ref; mtx_unlock(&pr->pr_mtx); /* * Tell the modules if the last user reference was removed * (even it sticks around in dying state). */ if (lasturef) { if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { sx_xlock(&allprison_lock); flags |= PD_LIST_XLOCKED; } (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); mtx_lock(&pr->pr_mtx); ref = --pr->pr_ref; mtx_unlock(&pr->pr_mtx); } /* If the prison still has references, nothing else to do. */ if (ref > 0) { if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); return; } if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } } else if (!(flags & PD_LIST_XLOCKED)) sx_xlock(&allprison_lock); TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); ppr = pr->pr_parent; for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount--; sx_xunlock(&allprison_lock); #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); mtx_destroy(&pr->pr_mtx); #ifdef INET free(pr->pr_ip4, M_PRISON); #endif #ifdef INET6 free(pr->pr_ip6, M_PRISON); #endif if (pr->pr_cpuset != NULL) cpuset_rel(pr->pr_cpuset); osd_jail_exit(pr); #ifdef RACCT if (racct_enable) prison_racct_detach(pr); #endif free(pr, M_PRISON); /* Removing a prison frees a reference on its parent. */ pr = ppr; mtx_lock(&pr->pr_mtx); flags = PD_DEREF | PD_DEUREF; } } void prison_hold_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); KASSERT(pr->pr_ref > 0, ("Trying to hold dead prison (jid=%d).", pr->pr_id)); pr->pr_ref++; } void prison_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); } void prison_proc_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } void prison_proc_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_uref > 1) pr->pr_uref--; else { /* * Don't remove the last user reference in this context, which * is expected to be a process that is not only locked, but * also half dead. */ pr->pr_ref++; mtx_unlock(&pr->pr_mtx); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; } mtx_unlock(&pr->pr_mtx); } #ifdef INET /* * Restrict a prison's IP address list with its parent's, possibly replacing * it. Return true if the replacement buffer was used (or would have been). */ static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4) { int ii, ij, used; struct prison *ppr; ppr = pr->pr_parent; if (!(pr->pr_flags & PR_IP4_USER)) { /* This has no user settings, so just copy the parent's list. */ if (pr->pr_ip4s < ppr->pr_ip4s) { /* * There's no room for the parent's list. Use the * new list buffer, which is assumed to be big enough * (if it was passed). If there's no buffer, try to * allocate one. */ used = 1; if (newip4 == NULL) { newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4), M_PRISON, M_NOWAIT); if (newip4 != NULL) used = 0; } if (newip4 != NULL) { bcopy(ppr->pr_ip4, newip4, ppr->pr_ip4s * sizeof(*newip4)); free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = newip4; pr->pr_ip4s = ppr->pr_ip4s; } return (used); } pr->pr_ip4s = ppr->pr_ip4s; if (pr->pr_ip4s > 0) bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*newip4)); else if (pr->pr_ip4 != NULL) { free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = NULL; } } else if (pr->pr_ip4s > 0) { /* Remove addresses that aren't in the parent. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij < ppr->pr_ip4s) ii = 1; else { bcopy(pr->pr_ip4 + 1, pr->pr_ip4, --pr->pr_ip4s * sizeof(*pr->pr_ip4)); ii = 0; } for (ij = 1; ii < pr->pr_ip4s; ) { if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) { ii++; continue; } switch (ij >= ppr->pr_ip4s ? -1 : qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) { case -1: bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii, (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4)); break; case 0: ii++; ij++; break; case 1: ij++; break; } } if (pr->pr_ip4s == 0) { pr->pr_flags |= PR_IP4_DISABLE; free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = NULL; } } return (0); } /* * Pass back primary IPv4 address of this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. INADDR_ANY). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address returned in NBO. */ int prison_get_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv4 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv4. */ int prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr lia; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP4_SADDRSEL) return (1); lia.s_addr = INADDR_ANY; error = prison_get_ip4(cred, &lia); if (error) return (error); if (lia.s_addr == INADDR_ANY) return (1); ia->s_addr = lia.s_addr; return (0); } /* * Return true if pr1 and pr2 have the same IPv4 address restrictions. */ int prison_equal_ip4(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); /* * No need to lock since the PR_IP4_USER flag can't be altered for * existing prisons. */ while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP4_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP4_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this * jail. * * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv4. Address passed in in NBO and returned in NBO. */ int prison_local_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr ia0; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia0.s_addr = ntohl(ia->s_addr); if (ia0.s_addr == INADDR_LOOPBACK) { ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } if (ia0.s_addr == INADDR_ANY) { /* * In case there is only 1 IPv4 address, bind directly. */ if (pr->pr_ip4s == 1) ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } error = _prison_check_ip4(pr, ia); mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address passed in in NBO and returned in NBO. */ int prison_remote_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (ntohl(ia->s_addr) == INADDR_LOOPBACK) { ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv4. Address passed in in NBO. */ static int _prison_check_ip4(struct prison *pr, struct in_addr *ia) { int i, a, z, d; /* * Check the primary IP. */ if (pr->pr_ip4[0].s_addr == ia->s_addr) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pr->pr_ip4s - 2; while (a <= z) { i = (a + z) / 2; d = qcmp_v4(&pr->pr_ip4[i+1], ia); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } int prison_check_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = _prison_check_ip4(pr, ia); mtx_unlock(&pr->pr_mtx); return (error); } #endif #ifdef INET6 static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6) { int ii, ij, used; struct prison *ppr; ppr = pr->pr_parent; if (!(pr->pr_flags & PR_IP6_USER)) { /* This has no user settings, so just copy the parent's list. */ if (pr->pr_ip6s < ppr->pr_ip6s) { /* * There's no room for the parent's list. Use the * new list buffer, which is assumed to be big enough * (if it was passed). If there's no buffer, try to * allocate one. */ used = 1; if (newip6 == NULL) { newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6), M_PRISON, M_NOWAIT); if (newip6 != NULL) used = 0; } if (newip6 != NULL) { bcopy(ppr->pr_ip6, newip6, ppr->pr_ip6s * sizeof(*newip6)); free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = newip6; pr->pr_ip6s = ppr->pr_ip6s; } return (used); } pr->pr_ip6s = ppr->pr_ip6s; if (pr->pr_ip6s > 0) bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*newip6)); else if (pr->pr_ip6 != NULL) { free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = NULL; } } else if (pr->pr_ip6s > 0) { /* Remove addresses that aren't in the parent. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], &ppr->pr_ip6[ij])) break; if (ij < ppr->pr_ip6s) ii = 1; else { bcopy(pr->pr_ip6 + 1, pr->pr_ip6, --pr->pr_ip6s * sizeof(*pr->pr_ip6)); ii = 0; } for (ij = 1; ii < pr->pr_ip6s; ) { if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii], &ppr->pr_ip6[0])) { ii++; continue; } switch (ij >= ppr->pr_ip6s ? -1 : qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) { case -1: bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii, (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6)); break; case 0: ii++; ij++; break; case 1: ij++; break; } } if (pr->pr_ip6s == 0) { pr->pr_flags |= PR_IP6_DISABLE; free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = NULL; } } return 0; } /* * Pass back primary IPv6 address for this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_get_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv6 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv6. */ int prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; struct in6_addr lia6; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP6_SADDRSEL) return (1); lia6 = in6addr_any; error = prison_get_ip6(cred, &lia6); if (error) return (error); if (IN6_IS_ADDR_UNSPECIFIED(&lia6)) return (1); bcopy(&lia6, ia6, sizeof(struct in6_addr)); return (0); } /* * Return true if pr1 and pr2 have the same IPv6 address restrictions. */ int prison_equal_ip6(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP6_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP6_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this jail. * * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0) * when needed while binding. * * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv6. */ int prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_LOOPBACK(ia6)) { bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } if (IN6_IS_ADDR_UNSPECIFIED(ia6)) { /* * In case there is only 1 IPv6 address, and v6only is true, * then bind directly. */ if (v6only != 0 && pr->pr_ip6s == 1) bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } error = _prison_check_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_LOOPBACK(ia6)) { bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv6. */ static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6) { int i, a, z, d; /* * Check the primary IP. */ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6)) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pr->pr_ip6s - 2; while (a <= z) { i = (a + z) / 2; d = qcmp_v6(&pr->pr_ip6[i+1], ia6); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } int prison_check_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = _prison_check_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (error); } #endif /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. */ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). * * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. IPv4 Address passed in in NBO. */ int prison_if(struct ucred *cred, struct sockaddr *sa) { #ifdef INET struct sockaddr_in *sai; #endif #ifdef INET6 struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0. */ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* * Return 1 if the passed credential is in a jail, otherwise 0. */ int jailed(struct ucred *cred) { return (cred->cr_prison != &prison0); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 in case the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. * This is ugly check, but this is the only situation when jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have situation where jail's root directory * is "/some/path" and mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check with permission for a specific privilege is granted within jail. We * have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME : case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. */ case PRIV_NET80211_GETKEY: #ifdef notyet case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */ #endif #ifdef notyet /* * AppleTalk privileges. */ case PRIV_NETATALK_RESERVEDPORT: /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. */ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * IPX/SPX privileges. */ case PRIV_NETIPX_RESERVEDPORT: case PRIV_NETIPX_RAW: /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit fall through to next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/phyiscal memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT && cred->cr_prison->pr_enforce_statfs < 2) return (0); else return (EPERM); /* * Allow jailed root to bind reserved ports and reuse in-use * ports. */ case PRIV_NETINET_RESERVEDPORT: case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certian IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges, many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. */ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, "Jails"); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { #if defined(INET) || defined(INET6) again: #endif mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { ip4s = cpr->pr_ip4s; mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip4, ip4, cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 if (cpr->pr_ip6s > 0) { if (ip6s < cpr->pr_ip6s) { ip6s = cpr->pr_ip6s; mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip6, ip6, cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif if (cpr->pr_ref == 0) { mtx_unlock(&cpr->pr_mtx); continue; } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_uref > 0 ? PRISON_STATE_ALIVE : PRISON_STATE_DYING; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 xp->pr_ip6s = cpr->pr_ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns VNET?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatability. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { struct prison *pr; int allow, error, i; pr = req->td->td_ucred->cr_prison; allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow; /* Get the current flag value, and convert it to a boolean. */ i = (allow & arg2) ? 1 : 0; if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. */ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the devfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the fdescfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the nullfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the procfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linprocfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linsysfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the tmpfs file system (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the zfs file system (deprecated)"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. */ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the devfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the fdescfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the nullfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the procfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linprocfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linsysfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the tmpfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the zfs file system"); #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. */ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { int old; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); old = prr->prr_refcount; if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. */ static void prison_racct_modify(struct prison *pr) { struct proc *p; struct ucred *cred; struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); racct_proc_ucred_changed(p, cred, cred); crfree(cred); } sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { int fi; #if defined(INET) || defined(INET6) int ii; #endif unsigned jsf; #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi))) db_printf(" %s", pr_flag_names[fi]); for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { jsf = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name, pr_flag_jailsys[fi].disable && (jsf == pr_flag_jailsys[fi].disable) ? "disable" : (jsf == pr_flag_jailsys[fi].new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi))) db_printf(" %s", pr_allow_names[fi]); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); for (ii = 0; ii < pr->pr_ip4s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa(pr->pr_ip4[ii])); #endif #ifdef INET6 db_printf(" ip6s = %d\n", pr->pr_ip6s); for (ii = 0; ii < pr->pr_ip6s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip6.addr =" : " ", ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ Index: stable/10/sys/kern/kern_loginclass.c =================================================================== --- stable/10/sys/kern/kern_loginclass.c (revision 302228) +++ stable/10/sys/kern/kern_loginclass.c (revision 302229) @@ -1,229 +1,229 @@ /*- * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Processes may set login class name using setloginclass(2). This * is usually done through call to setusercontext(3), by programs * such as login(1), based on information from master.passwd(5). Kernel * uses this information to enforce per-class resource limits. Current * login class can be determined using id(1). Login class is inherited * from the parent process during fork(2). If not set, it defaults * to "default". * * Code in this file implements setloginclass(2) and getloginclass(2) * system calls, and maintains class name storage and retrieval. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures"); LIST_HEAD(, loginclass) loginclasses; /* * Lock protecting loginclasses list. */ static struct mtx loginclasses_lock; MTX_SYSINIT(loginclasses_init, &loginclasses_lock, "loginclasses lock", MTX_DEF); void loginclass_hold(struct loginclass *lc) { refcount_acquire(&lc->lc_refcount); } void loginclass_free(struct loginclass *lc) { int old; old = lc->lc_refcount; if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1)) return; mtx_lock(&loginclasses_lock); if (refcount_release(&lc->lc_refcount)) { racct_destroy(&lc->lc_racct); LIST_REMOVE(lc, lc_next); mtx_unlock(&loginclasses_lock); free(lc, M_LOGINCLASS); return; } mtx_unlock(&loginclasses_lock); } /* * Return loginclass structure with a corresponding name. Not * performance critical, as it's used mainly by setloginclass(2), * which happens once per login session. Caller has to use * loginclass_free() on the returned value when it's no longer * needed. */ struct loginclass * loginclass_find(const char *name) { struct loginclass *lc, *newlc; if (name[0] == '\0' || strlen(name) >= MAXLOGNAME) return (NULL); newlc = malloc(sizeof(*newlc), M_LOGINCLASS, M_ZERO | M_WAITOK); racct_create(&newlc->lc_racct); mtx_lock(&loginclasses_lock); LIST_FOREACH(lc, &loginclasses, lc_next) { if (strcmp(name, lc->lc_name) != 0) continue; /* Found loginclass with a matching name? */ loginclass_hold(lc); mtx_unlock(&loginclasses_lock); racct_destroy(&newlc->lc_racct); free(newlc, M_LOGINCLASS); return (lc); } /* Add new loginclass. */ strcpy(newlc->lc_name, name); refcount_init(&newlc->lc_refcount, 1); LIST_INSERT_HEAD(&loginclasses, newlc, lc_next); mtx_unlock(&loginclasses_lock); return (newlc); } /* * Get login class name. */ #ifndef _SYS_SYSPROTO_H_ struct getloginclass_args { char *namebuf; size_t namelen; }; #endif /* ARGSUSED */ int sys_getloginclass(struct thread *td, struct getloginclass_args *uap) { int error = 0; size_t lcnamelen; struct proc *p; struct loginclass *lc; p = td->td_proc; PROC_LOCK(p); lc = p->p_ucred->cr_loginclass; loginclass_hold(lc); PROC_UNLOCK(p); lcnamelen = strlen(lc->lc_name) + 1; if (lcnamelen > uap->namelen) error = ERANGE; if (error == 0) error = copyout(lc->lc_name, uap->namebuf, lcnamelen); loginclass_free(lc); return (error); } /* * Set login class name. */ #ifndef _SYS_SYSPROTO_H_ struct setloginclass_args { const char *namebuf; }; #endif /* ARGSUSED */ int sys_setloginclass(struct thread *td, struct setloginclass_args *uap) { struct proc *p = td->td_proc; int error; char lcname[MAXLOGNAME]; struct loginclass *newlc; struct ucred *newcred, *oldcred; error = priv_check(td, PRIV_PROC_SETLOGINCLASS); if (error != 0) return (error); error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL); if (error != 0) return (error); newlc = loginclass_find(lcname); if (newlc == NULL) return (EINVAL); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_loginclass = newlc; - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif loginclass_free(oldcred->cr_loginclass); crfree(oldcred); return (0); } void loginclass_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void *arg2, void *arg3) { struct loginclass *lc; mtx_lock(&loginclasses_lock); LIST_FOREACH(lc, &loginclasses, lc_next) (callback)(lc->lc_racct, arg2, arg3); mtx_unlock(&loginclasses_lock); } Index: stable/10/sys/kern/kern_prot.c =================================================================== --- stable/10/sys/kern/kern_prot.c (revision 302228) +++ stable/10/sys/kern/kern_prot.c (revision 302229) @@ -1,2229 +1,2254 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2000-2001 Robert N. M. Watson. * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 */ /* * System calls related to processes and protection */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef REGRESSION FEATURE(regression, "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)"); #endif #if defined(INET) || defined(INET6) #include #include #endif #include #include static MALLOC_DEFINE(M_CRED, "cred", "credentials"); SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy"); static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups); #ifndef _SYS_SYSPROTO_H_ struct getpid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getpid(struct thread *td, struct getpid_args *uap) { struct proc *p = td->td_proc; td->td_retval[0] = p->p_pid; #if defined(COMPAT_43) PROC_LOCK(p); td->td_retval[1] = p->p_pptr->p_pid; PROC_UNLOCK(p); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getppid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getppid(struct thread *td, struct getppid_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pptr->p_pid; PROC_UNLOCK(p); return (0); } /* * Get process group ID; note that POSIX getpgrp takes no parameter. */ #ifndef _SYS_SYSPROTO_H_ struct getpgrp_args { int dummy; }; #endif int sys_getpgrp(struct thread *td, struct getpgrp_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* Get an arbitary pid's process group id */ #ifndef _SYS_SYSPROTO_H_ struct getpgid_args { pid_t pid; }; #endif int sys_getpgid(struct thread *td, struct getpgid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* * Get an arbitary pid's session id. */ #ifndef _SYS_SYSPROTO_H_ struct getsid_args { pid_t pid; }; #endif int sys_getsid(struct thread *td, struct getsid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_session->s_sid; PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct getuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getuid(struct thread *td, struct getuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_ruid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_uid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct geteuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_geteuid(struct thread *td, struct geteuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_uid; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getgid(struct thread *td, struct getgid_args *uap) { td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_groups[0]; #endif return (0); } /* * Get effective group ID. The "egid" is groups[0], and could be obtained * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getegid(struct thread *td, struct getegid_args *uap) { td->td_retval[0] = td->td_ucred->cr_groups[0]; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif int sys_getgroups(struct thread *td, register struct getgroups_args *uap) { gid_t *groups; u_int ngrp; int error; if (uap->gidsetsize < td->td_ucred->cr_ngroups) { if (uap->gidsetsize == 0) ngrp = 0; else return (EINVAL); } else ngrp = td->td_ucred->cr_ngroups; groups = malloc(ngrp * sizeof(*groups), M_TEMP, M_WAITOK); error = kern_getgroups(td, &ngrp, groups); if (error) goto out; if (uap->gidsetsize > 0) error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t)); if (error == 0) td->td_retval[0] = ngrp; out: free(groups, M_TEMP); return (error); } int kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups) { struct ucred *cred; cred = td->td_ucred; if (*ngrp == 0) { *ngrp = cred->cr_ngroups; return (0); } if (*ngrp < cred->cr_ngroups) return (EINVAL); *ngrp = cred->cr_ngroups; bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t)); return (0); } #ifndef _SYS_SYSPROTO_H_ struct setsid_args { int dummy; }; #endif /* ARGSUSED */ int sys_setsid(register struct thread *td, struct setsid_args *uap) { struct pgrp *pgrp; int error; struct proc *p = td->td_proc; struct pgrp *newpgrp; struct session *newsess; error = 0; pgrp = NULL; newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); error = EPERM; } else { (void)enterpgrp(p, p->p_pid, newpgrp, newsess); td->td_retval[0] = p->p_pid; newpgrp = NULL; newsess = NULL; } sx_xunlock(&proctree_lock); if (newpgrp != NULL) free(newpgrp, M_PGRP); if (newsess != NULL) free(newsess, M_SESSION); return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pid must be caller or child of caller (ESRCH) * if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ #ifndef _SYS_SYSPROTO_H_ struct setpgid_args { int pid; /* target process id */ int pgid; /* target pgrp id */ }; #endif /* ARGSUSED */ int sys_setpgid(struct thread *td, register struct setpgid_args *uap) { struct proc *curp = td->td_proc; register struct proc *targp; /* target process */ register struct pgrp *pgrp; /* target pgrp */ int error; struct pgrp *newpgrp; if (uap->pgid < 0) return (EINVAL); error = 0; newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; goto done; } if (!inferior(targp)) { PROC_UNLOCK(targp); error = ESRCH; goto done; } if ((error = p_cansee(td, targp))) { PROC_UNLOCK(targp); goto done; } if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) { PROC_UNLOCK(targp); error = EPERM; goto done; } if (targp->p_flag & P_EXEC) { PROC_UNLOCK(targp); error = EACCES; goto done; } PROC_UNLOCK(targp); } else targp = curp; if (SESS_LEADER(targp)) { error = EPERM; goto done; } if (uap->pgid == 0) uap->pgid = targp->p_pid; if ((pgrp = pgfind(uap->pgid)) == NULL) { if (uap->pgid == targp->p_pid) { error = enterpgrp(targp, uap->pgid, newpgrp, NULL); if (error == 0) newpgrp = NULL; } else error = EPERM; } else { if (pgrp == targp->p_pgrp) { PGRP_UNLOCK(pgrp); goto done; } if (pgrp->pg_id != targp->p_pid && pgrp->pg_session != curp->p_session) { PGRP_UNLOCK(pgrp); error = EPERM; goto done; } PGRP_UNLOCK(pgrp); error = enterthispgrp(targp, pgrp); } done: sx_xunlock(&proctree_lock); KASSERT((error == 0) || (newpgrp != NULL), ("setpgid failed and newpgrp is NULL")); if (newpgrp != NULL) free(newpgrp, M_PGRP); return (error); } /* * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD * compatible. It says that setting the uid/gid to euid/egid is a special * case of "appropriate privilege". Once the rules are expanded out, this * basically means that setuid(nnn) sets all three id's, in all permitted * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) * does not set the saved id - this is dangerous for traditional BSD * programs. For this reason, we *really* do not want to set * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. */ #define POSIX_APPENDIX_B_4_2_2 #ifndef _SYS_SYSPROTO_H_ struct setuid_args { uid_t uid; }; #endif /* ARGSUSED */ int sys_setuid(struct thread *td, struct setuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t uid; struct uidinfo *uip; int error; uid = uap->uid; AUDIT_ARG_UID(uid); newcred = crget(); uip = uifind(uid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setuid(oldcred, uid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setuid(geteuid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setuid(xx)" sets all * three id's (assuming you have privs). * * Notes on the logic. We do things in three steps. * 1: We determine if the euid is going to change, and do EPERM * right away. We unconditionally change the euid later if this * test is satisfied, simplifying that part of the logic. * 2: We determine if the real and/or saved uids are going to * change. Determined by compile options. * 3: Change euid last. (after tests in #2 for "appropriate privs") */ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ #ifdef _POSIX_SAVED_IDS uid != oldcred->cr_svuid && /* allow setuid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETUID, 0)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or uid == euid) * If so, we are changing the real uid and/or saved uid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ uid == oldcred->cr_uid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETUID, 0) == 0) #endif { /* * Set the real uid and transfer proc count to new user. */ if (uid != oldcred->cr_ruid) { change_ruid(newcred, uip); setsugid(p); } /* * Set saved uid * * XXX always set saved uid even if not _POSIX_SAVED_IDS, as * the security of seteuid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (uid != oldcred->cr_svuid) { change_svuid(newcred, uid); setsugid(p); } } /* * In all permitted cases, we are changing the euid. */ if (uid != oldcred->cr_uid) { change_euid(newcred, uip); setsugid(p); } - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif uifree(uip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(uip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct seteuid_args { uid_t euid; }; #endif /* ARGSUSED */ int sys_seteuid(struct thread *td, struct seteuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid; struct uidinfo *euip; int error; euid = uap->euid; AUDIT_ARG_EUID(euid); newcred = crget(); euip = uifind(euid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_seteuid(oldcred, euid); if (error) goto fail; #endif if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, 0)) != 0) goto fail; /* * Everything's okay, do it. */ if (oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgid_args { gid_t gid; }; #endif /* ARGSUSED */ int sys_setgid(struct thread *td, struct setgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t gid; int error; gid = uap->gid; AUDIT_ARG_GID(gid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgid(oldcred, gid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setgid(getegid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setgid(xx)" sets all * three id's (assuming you have privs). * * For notes on the logic here, see setuid() above. */ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ #ifdef _POSIX_SAVED_IDS gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID, 0)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or gid == egid) * If so, we are changing the real uid and saved gid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ gid == oldcred->cr_groups[0] || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID, 0) == 0) #endif { /* * Set real gid */ if (oldcred->cr_rgid != gid) { change_rgid(newcred, gid); setsugid(p); } /* * Set saved gid * * XXX always set saved gid even if not _POSIX_SAVED_IDS, as * the security of setegid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } /* * In all cases permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes. */ if (oldcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setegid_args { gid_t egid; }; #endif /* ARGSUSED */ int sys_setegid(struct thread *td, struct setegid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid; int error; egid = uap->egid; AUDIT_ARG_EGID(egid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setegid(oldcred, egid); if (error) goto fail; #endif if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, 0)) != 0) goto fail; if (oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif /* ARGSUSED */ int sys_setgroups(struct thread *td, struct setgroups_args *uap) { gid_t *groups = NULL; int error; if (uap->gidsetsize > ngroups_max + 1) return (EINVAL); groups = malloc(uap->gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK); error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t)); if (error) goto out; error = kern_setgroups(td, uap->gidsetsize, groups); out: free(groups, M_TEMP); return (error); } int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int error; if (ngrp > ngroups_max + 1) return (EINVAL); AUDIT_ARG_GROUPSET(groups, ngrp); newcred = crget(); crextend(newcred, ngrp); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgroups(oldcred, ngrp, groups); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0); if (error) goto fail; if (ngrp < 1) { /* * setgroups(0, NULL) is a legitimate way of clearing the * groups vector on non-BSD systems (which generally do not * have the egid in the groups[0]). We risk security holes * when running non-BSD software if we do not do the same. */ newcred->cr_ngroups = 1; } else { crsetgroups_locked(newcred, ngrp, groups); } setsugid(p); - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setreuid_args { uid_t ruid; uid_t euid; }; #endif /* ARGSUSED */ int sys_setreuid(register struct thread *td, struct setreuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setreuid(oldcred, ruid, euid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid) || (euid != (uid_t)-1 && euid != oldcred->cr_uid && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, 0)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && newcred->cr_svuid != newcred->cr_uid) { change_svuid(newcred, newcred->cr_uid); setsugid(p); } - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setregid_args { gid_t rgid; gid_t egid; }; #endif /* ARGSUSED */ int sys_setregid(register struct thread *td, struct setregid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid; int error; egid = uap->egid; rgid = uap->rgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setregid(oldcred, rgid, egid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, 0)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && newcred->cr_svgid != newcred->cr_groups[0]) { change_svgid(newcred, newcred->cr_groups[0]); setsugid(p); } - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } /* * setresuid(ruid, euid, suid) is like setreuid except control over the saved * uid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresuid_args { uid_t ruid; uid_t euid; uid_t suid; }; #endif /* ARGSUSED */ int sys_setresuid(register struct thread *td, struct setresuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid, suid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; suid = uap->suid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); AUDIT_ARG_SUID(suid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresuid(oldcred, ruid, euid, suid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid && ruid != oldcred->cr_uid) || (euid != (uid_t)-1 && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid && euid != oldcred->cr_uid) || (suid != (uid_t)-1 && suid != oldcred->cr_ruid && suid != oldcred->cr_svuid && suid != oldcred->cr_uid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, 0)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { change_svuid(newcred, suid); setsugid(p); } - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } /* * setresgid(rgid, egid, sgid) is like setregid except control over the saved * gid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresgid_args { gid_t rgid; gid_t egid; gid_t sgid; }; #endif /* ARGSUSED */ int sys_setresgid(register struct thread *td, struct setresgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid, sgid; int error; egid = uap->egid; rgid = uap->rgid; sgid = uap->sgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); AUDIT_ARG_SGID(sgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && rgid != oldcred->cr_groups[0]) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && egid != oldcred->cr_groups[0]) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && sgid != oldcred->cr_groups[0])) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, 0)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { change_svgid(newcred, sgid); setsugid(p); } - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getresuid_args { uid_t *ruid; uid_t *euid; uid_t *suid; }; #endif /* ARGSUSED */ int sys_getresuid(register struct thread *td, struct getresuid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->ruid) error1 = copyout(&cred->cr_ruid, uap->ruid, sizeof(cred->cr_ruid)); if (uap->euid) error2 = copyout(&cred->cr_uid, uap->euid, sizeof(cred->cr_uid)); if (uap->suid) error3 = copyout(&cred->cr_svuid, uap->suid, sizeof(cred->cr_svuid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct getresgid_args { gid_t *rgid; gid_t *egid; gid_t *sgid; }; #endif /* ARGSUSED */ int sys_getresgid(register struct thread *td, struct getresgid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->rgid) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) error2 = copyout(&cred->cr_groups[0], uap->egid, sizeof(cred->cr_groups[0])); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct issetugid_args { int dummy; }; #endif /* ARGSUSED */ int sys_issetugid(register struct thread *td, struct issetugid_args *uap) { struct proc *p = td->td_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use P_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ PROC_LOCK(p); td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; PROC_UNLOCK(p); return (0); } int sys___setugid(struct thread *td, struct __setugid_args *uap) { #ifdef REGRESSION struct proc *p; p = td->td_proc; switch (uap->flag) { case 0: PROC_LOCK(p); p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); return (0); case 1: PROC_LOCK(p); p->p_flag |= P_SUGID; PROC_UNLOCK(p); return (0); default: return (EINVAL); } #else /* !REGRESSION */ return (ENOSYS); #endif /* REGRESSION */ } /* * Check if gid is a member of the group set. */ int groupmember(gid_t gid, struct ucred *cred) { int l; int h; int m; if (cred->cr_groups[0] == gid) return(1); /* * If gid was not our primary group, perform a binary search * of the supplemental groups. This is possible because we * sort the groups in crsetgroups(). */ l = 1; h = cred->cr_ngroups; while (l < h) { m = l + ((h - l) / 2); if (cred->cr_groups[m] < gid) l = m + 1; else h = m; } if ((l < cred->cr_ngroups) && (cred->cr_groups[l] == gid)) return (1); return (0); } /* * Test the active securelevel against a given level. securelevel_gt() * implements (securelevel > level). securelevel_ge() implements * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * * Due to care taken when setting the securelevel, we know that no jail will * be less secure that its parent (or the physical system), so it is sufficient * to test the current jail only. * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. */ int securelevel_gt(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0); } /* * 'see_other_uids' determines whether or not visibility of processes * and sockets with credentials holding different real uids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_uids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, &see_other_uids, 0, "Unprivileged processes may see subjects/objects with different real uid"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_other_uids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_seeotheruids(struct ucred *u1, struct ucred *u2) { if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, 0) != 0) return (ESRCH); } return (0); } /* * 'see_other_gids' determines whether or not visibility of processes * and sockets with credentials holding different real gids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_gids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, &see_other_gids, 0, "Unprivileged processes may see subjects/objects with different real gid"); /* * Determine if u1 can "see" the subject specified by u2, according to the * 'see_other_gids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_seeothergids(struct ucred *u1, struct ucred *u2) { int i, match; if (!see_other_gids) { match = 0; for (i = 0; i < u1->cr_ngroups; i++) { if (groupmember(u1->cr_groups[i], u2)) match = 1; if (match) break; } if (!match) { if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, 0) != 0) return (ESRCH); } } return (0); } /*- * Determine if u1 "can see" the subject specified by u2. * Returns: 0 for permitted, an errno value otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_cansee(struct ucred *u1, struct ucred *u2) { int error; if ((error = prison_check(u1, u2))) return (error); #ifdef MAC if ((error = mac_cred_check_visible(u1, u2))) return (error); #endif if ((error = cr_seeotheruids(u1, u2))) return (error); if ((error = cr_seeothergids(u1, u2))) return (error); return (0); } /*- * Determine if td "can see" the subject specified by p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect p->p_ucred must be held. td really * should be curthread. * References: td and p must be valid for the lifetime of the call */ int p_cansee(struct thread *td, struct proc *p) { /* Wrap cr_cansee() for all functionality. */ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); return (cr_cansee(td->td_ucred, p->p_ucred)); } /* * 'conservative_signals' prevents the delivery of a broad class of * signals by unprivileged processes to processes that have changed their * credentials since the last invocation of execve(). This can prevent * the leakage of cached information or retained privileges as a result * of a common class of signal-related vulnerabilities. However, this * may interfere with some applications that expect to be able to * deliver these signals to peer processes after having given up * privilege. */ static int conservative_signals = 1; SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW, &conservative_signals, 0, "Unprivileged processes prevented from " "sending certain signals to processes whose credentials have changed"); /*- * Determine whether cred may deliver the specified signal to proc. * Returns: 0 for permitted, an errno value otherwise. * Locks: A lock must be held for proc. * References: cred and proc must be valid for the lifetime of the call. */ int cr_cansignal(struct ucred *cred, struct proc *proc, int signum) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); /* * Jail semantics limit the scope of signalling to proc in the * same jail as cred, if cred is in jail. */ error = prison_check(cred, proc->p_ucred); if (error) return (error); #ifdef MAC if ((error = mac_proc_check_signal(cred, proc, signum))) return (error); #endif if ((error = cr_seeotheruids(cred, proc->p_ucred))) return (error); if ((error = cr_seeothergids(cred, proc->p_ucred))) return (error); /* * UNIX signal semantics depend on the status of the P_SUGID * bit on the target process. If the bit is set, then additional * restrictions are placed on the set of available signals. */ if (conservative_signals && (proc->p_flag & P_SUGID)) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: /* * Generally, permit job and terminal control * signals. */ break; default: /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, 0); if (error) return (error); } } /* * Generally, the target credential's ruid or svuid must match the * subject credential's ruid or euid. */ if (cred->cr_ruid != proc->p_ucred->cr_ruid && cred->cr_ruid != proc->p_ucred->cr_svuid && cred->cr_uid != proc->p_ucred->cr_ruid && cred->cr_uid != proc->p_ucred->cr_svuid) { error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, 0); if (error) return (error); } return (0); } /*- * Determine whether td may deliver the specified signal to p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must be * held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansignal(struct thread *td, struct proc *p, int signum) { KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); /* * UNIX signalling semantics require that processes in the same * session always be able to deliver SIGCONT to one another, * overriding the remaining protections. */ /* XXX: This will require an additional lock of some sort. */ if (signum == SIGCONT && td->td_proc->p_session == p->p_session) return (0); /* * Some compat layers use SIGTHR and higher signals for * communication between different kernel threads of the same * process, so that they expect that it's always possible to * deliver them, even for suid applications where cr_cansignal() can * deny such ability for security consideration. It should be * pretty safe to do since the only way to create two processes * with the same p_leader is via rfork(2). */ if (td->td_proc->p_leader != NULL && signum >= SIGTHR && signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader) return (0); return (cr_cansignal(td->td_ucred, p, signum)); } /*- * Determine whether td may reschedule p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansched(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_sched(td->td_ucred, p))) return (error); #endif if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_seeothergids(td->td_ucred, p->p_ucred))) return (error); if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid && td->td_ucred->cr_uid != p->p_ucred->cr_ruid) { error = priv_check(td, PRIV_SCHED_DIFFCRED); if (error) return (error); } return (0); } /* * The 'unprivileged_proc_debug' flag may be used to disable a variety of * unprivileged inter-process debugging services, including some procfs * functionality, ptrace(), and ktrace(). In the past, inter-process * debugging has been involved in a variety of security problems, and sites * not requiring the service might choose to disable it when hardening * systems. * * XXX: Should modifying and reading this variable require locking? * XXX: data declarations should be together near the beginning of the file. */ static int unprivileged_proc_debug = 1; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW, &unprivileged_proc_debug, 0, "Unprivileged processes may use process debugging facilities"); /*- * Determine whether td may debug p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_candebug(struct thread *td, struct proc *p) { int credentialchanged, error, grpsubset, i, uidsubset; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (!unprivileged_proc_debug) { error = priv_check(td, PRIV_DEBUG_UNPRIV); if (error) return (error); } if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_debug(td->td_ucred, p))) return (error); #endif if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_seeothergids(td->td_ucred, p->p_ucred))) return (error); /* * Is p's group set a subset of td's effective group set? This * includes p's egid, group access list, rgid, and svgid. */ grpsubset = 1; for (i = 0; i < p->p_ucred->cr_ngroups; i++) { if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { grpsubset = 0; break; } } grpsubset = grpsubset && groupmember(p->p_ucred->cr_rgid, td->td_ucred) && groupmember(p->p_ucred->cr_svgid, td->td_ucred); /* * Are the uids present in p's credential equal to td's * effective uid? This includes p's euid, svuid, and ruid. */ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && td->td_ucred->cr_uid == p->p_ucred->cr_svuid && td->td_ucred->cr_uid == p->p_ucred->cr_ruid); /* * Has the credential of the process changed since the last exec()? */ credentialchanged = (p->p_flag & P_SUGID); /* * If p's gids aren't a subset, or the uids aren't a subset, * or the credential has changed, require appropriate privilege * for td to debug p. */ if (!grpsubset || !uidsubset) { error = priv_check(td, PRIV_DEBUG_DIFFCRED); if (error) return (error); } if (credentialchanged) { error = priv_check(td, PRIV_DEBUG_SUGID); if (error) return (error); } /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); } /* * Can't trace a process that's currently exec'ing. * * XXX: Note, this is not a security policy decision, it's a * basic correctness/functionality decision. Therefore, this check * should be moved to the caller's of p_candebug(). */ if ((p->p_flag & P_INEXEC) != 0) return (EBUSY); /* Denied explicitely */ if ((p->p_flag2 & P2_NOTRACE) != 0) { error = priv_check(td, PRIV_DEBUG_DENIED); if (error != 0) return (error); } return (0); } /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseesocket(struct ucred *cred, struct socket *so) { int error; error = prison_check(cred, so->so_cred); if (error) return (ENOENT); #ifdef MAC error = mac_socket_check_visible(cred, so); if (error) return (error); #endif if (cr_seeotheruids(cred, so->so_cred)) return (ENOENT); if (cr_seeothergids(cred, so->so_cred)) return (ENOENT); return (0); } #if defined(INET) || defined(INET6) /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp) { int error; error = prison_check(cred, inp->inp_cred); if (error) return (ENOENT); #ifdef MAC INP_LOCK_ASSERT(inp); error = mac_inpcb_check_visible(cred, inp); if (error) return (error); #endif if (cr_seeotheruids(cred, inp->inp_cred)) return (ENOENT); if (cr_seeothergids(cred, inp->inp_cred)) return (ENOENT); return (0); } #endif /*- * Determine whether td can wait for the exit of p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_canwait(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_wait(td->td_ucred, p))) return (error); #endif #if 0 /* XXXMAC: This could have odd effects on some shells. */ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); #endif return (0); } /* * Allocate a zeroed cred structure. */ struct ucred * crget(void) { register struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); refcount_init(&cr->cr_ref, 1); #ifdef AUDIT audit_cred_init(cr); #endif #ifdef MAC mac_cred_init(cr); #endif crextend(cr, XU_NGROUPS); return (cr); } /* * Claim another reference to a ucred structure. */ struct ucred * crhold(struct ucred *cr) { refcount_acquire(&cr->cr_ref); return (cr); } /* * Free a cred structure. Throws away space when ref count gets to 0. */ void crfree(struct ucred *cr) { KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); if (refcount_release(&cr->cr_ref)) { /* * Some callers of crget(), such as nfs_statfs(), * allocate a temporary credential, but don't * allocate a uidinfo structure. */ if (cr->cr_uidinfo != NULL) uifree(cr->cr_uidinfo); if (cr->cr_ruidinfo != NULL) uifree(cr->cr_ruidinfo); /* * Free a prison, if any. */ if (cr->cr_prison != NULL) prison_free(cr->cr_prison); if (cr->cr_loginclass != NULL) loginclass_free(cr->cr_loginclass); #ifdef AUDIT audit_cred_destroy(cr); #endif #ifdef MAC mac_cred_destroy(cr); #endif free(cr->cr_groups, M_CRED); free(cr, M_CRED); } } /* * Check to see if this ucred is shared. */ int crshared(struct ucred *cr) { return (cr->cr_ref > 1); } /* * Copy a ucred's contents from a template. Does not block. */ void crcopy(struct ucred *dest, struct ucred *src) { KASSERT(crshared(dest) == 0, ("crcopy of shared ucred")); bcopy(&src->cr_startcopy, &dest->cr_startcopy, (unsigned)((caddr_t)&src->cr_endcopy - (caddr_t)&src->cr_startcopy)); crsetgroups(dest, src->cr_ngroups, src->cr_groups); uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); prison_hold(dest->cr_prison); loginclass_hold(dest->cr_loginclass); #ifdef AUDIT audit_cred_copy(src, dest); #endif #ifdef MAC mac_cred_copy(src, dest); #endif } /* * Dup cred struct to a new held one. */ struct ucred * crdup(struct ucred *cr) { struct ucred *newcr; newcr = crget(); crcopy(newcr, cr); return (newcr); } /* * Fill in a struct xucred based on a struct ucred. */ void cru2x(struct ucred *cr, struct xucred *xcr) { int ngroups; bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = cr->cr_uid; ngroups = MIN(cr->cr_ngroups, XU_NGROUPS); xcr->cr_ngroups = ngroups; bcopy(cr->cr_groups, xcr->cr_groups, ngroups * sizeof(*cr->cr_groups)); } /* * small routine to swap a thread's current ucred for the correct one taken * from the process. */ void cred_update_thread(struct thread *td) { struct proc *p; struct ucred *cred; p = td->td_proc; cred = td->td_ucred; PROC_LOCK(p); td->td_ucred = crhold(p->p_ucred); PROC_UNLOCK(p); if (cred != NULL) crfree(cred); +} + +/* + * Change process credentials. + * Callers are responsible for providing the reference for current credentials + * and for freeing old ones. + * + * Process has to be locked except when it does not have credentials (as it + * should not be visible just yet) or when newcred is NULL (as this can be + * only used when the process is about to be freed, at which point it should + * not be visible anymore). + */ +struct ucred * +proc_set_cred(struct proc *p, struct ucred *newcred) +{ + struct ucred *oldcred; + + if (newcred == NULL) + MPASS(p->p_state == PRS_ZOMBIE); + else if (p->p_ucred != NULL) + PROC_LOCK_ASSERT(p, MA_OWNED); + + oldcred = p->p_ucred; + p->p_ucred = newcred; + return (oldcred); } struct ucred * crcopysafe(struct proc *p, struct ucred *cr) { struct ucred *oldcred; int groups; PROC_LOCK_ASSERT(p, MA_OWNED); oldcred = p->p_ucred; while (cr->cr_agroups < oldcred->cr_agroups) { groups = oldcred->cr_agroups; PROC_UNLOCK(p); crextend(cr, groups); PROC_LOCK(p); oldcred = p->p_ucred; } crcopy(cr, oldcred); return (oldcred); } /* * Extend the passed in credential to hold n items. */ void crextend(struct ucred *cr, int n) { int cnt; /* Truncate? */ if (n <= cr->cr_agroups) return; /* * We extend by 2 each time since we're using a power of two * allocator until we need enough groups to fill a page. * Once we're allocating multiple pages, only allocate as many * as we actually need. The case of processes needing a * non-power of two number of pages seems more likely than * a real world process that adds thousands of groups one at a * time. */ if ( n < PAGE_SIZE / sizeof(gid_t) ) { if (cr->cr_agroups == 0) cnt = MINALLOCSIZE / sizeof(gid_t); else cnt = cr->cr_agroups * 2; while (cnt < n) cnt *= 2; } else cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t)); /* Free the old array. */ if (cr->cr_groups) free(cr->cr_groups, M_CRED); cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO); cr->cr_agroups = cnt; } /* * Copy groups in to a credential, preserving any necessary invariants. * Currently this includes the sorting of all supplemental gids. * crextend() must have been called before hand to ensure sufficient * space is available. */ static void crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups) { int i; int j; gid_t g; KASSERT(cr->cr_agroups >= ngrp, ("cr_ngroups is too small")); bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t)); cr->cr_ngroups = ngrp; /* * Sort all groups except cr_groups[0] to allow groupmember to * perform a binary search. * * XXX: If large numbers of groups become common this should * be replaced with shell sort like linux uses or possibly * heap sort. */ for (i = 2; i < ngrp; i++) { g = cr->cr_groups[i]; for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--) cr->cr_groups[j + 1] = cr->cr_groups[j]; cr->cr_groups[j + 1] = g; } } /* * Copy groups in to a credential after expanding it if required. * Truncate the list to (ngroups_max + 1) if it is too large. */ void crsetgroups(struct ucred *cr, int ngrp, gid_t *groups) { if (ngrp > ngroups_max + 1) ngrp = ngroups_max + 1; crextend(cr, ngrp); crsetgroups_locked(cr, ngrp, groups); } /* * Get login name, if available. */ #ifndef _SYS_SYSPROTO_H_ struct getlogin_args { char *namebuf; u_int namelen; }; #endif /* ARGSUSED */ int sys_getlogin(struct thread *td, struct getlogin_args *uap) { char login[MAXLOGNAME]; struct proc *p = td->td_proc; size_t len; if (uap->namelen > MAXLOGNAME) uap->namelen = MAXLOGNAME; PROC_LOCK(p); SESS_LOCK(p->p_session); len = strlcpy(login, p->p_session->s_login, uap->namelen) + 1; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (len > uap->namelen) return (ERANGE); return (copyout(login, uap->namebuf, len)); } /* * Set login name. */ #ifndef _SYS_SYSPROTO_H_ struct setlogin_args { char *namebuf; }; #endif /* ARGSUSED */ int sys_setlogin(struct thread *td, struct setlogin_args *uap) { struct proc *p = td->td_proc; int error; char logintmp[MAXLOGNAME]; CTASSERT(sizeof(p->p_session->s_login) >= sizeof(logintmp)); error = priv_check(td, PRIV_PROC_SETLOGIN); if (error) return (error); error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL); if (error != 0) { if (error == ENAMETOOLONG) error = EINVAL; return (error); } PROC_LOCK(p); SESS_LOCK(p->p_session); strcpy(p->p_session->s_login, logintmp); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); return (0); } void setsugid(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_SUGID; if (!(p->p_pfsflags & PF_ISUGID)) p->p_stops = 0; } /*- * Change a process's effective uid. * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_euid(struct ucred *newcred, struct uidinfo *euip) { newcred->cr_uid = euip->ui_uid; uihold(euip); uifree(newcred->cr_uidinfo); newcred->cr_uidinfo = euip; } /*- * Change a process's effective gid. * Side effects: newcred->cr_gid will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_egid(struct ucred *newcred, gid_t egid) { newcred->cr_groups[0] = egid; } /*- * Change a process's real uid. * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo * will be updated, and the old and new cr_ruidinfo proc * counts will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_ruid(struct ucred *newcred, struct uidinfo *ruip) { (void)chgproccnt(newcred->cr_ruidinfo, -1, 0); newcred->cr_ruid = ruip->ui_uid; uihold(ruip); uifree(newcred->cr_ruidinfo); newcred->cr_ruidinfo = ruip; (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); } /*- * Change a process's real gid. * Side effects: newcred->cr_rgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_rgid(struct ucred *newcred, gid_t rgid) { newcred->cr_rgid = rgid; } /*- * Change a process's saved uid. * Side effects: newcred->cr_svuid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svuid(struct ucred *newcred, uid_t svuid) { newcred->cr_svuid = svuid; } /*- * Change a process's saved gid. * Side effects: newcred->cr_svgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svgid(struct ucred *newcred, gid_t svgid) { newcred->cr_svgid = svgid; } Index: stable/10/sys/kern/sys_capability.c =================================================================== --- stable/10/sys/kern/sys_capability.c (revision 302228) +++ stable/10/sys/kern/sys_capability.c (revision 302229) @@ -1,628 +1,628 @@ /*- * Copyright (c) 2008-2011 Robert N. M. Watson * Copyright (c) 2010-2011 Jonathan Anderson * Copyright (c) 2012 FreeBSD Foundation * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by Pawel Jakub Dawidek under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * FreeBSD kernel capability facility. * * Two kernel features are implemented here: capability mode, a sandboxed mode * of execution for processes, and capabilities, a refinement on file * descriptors that allows fine-grained control over operations on the file * descriptor. Collectively, these allow processes to run in the style of a * historic "capability system" in which they can use only resources * explicitly delegated to them. This model is enforced by restricting access * to global namespaces in capability mode. * * Capabilities wrap other file descriptor types, binding them to a constant * rights mask set when the capability is created. New capabilities may be * derived from existing capabilities, but only if they have the same or a * strict subset of the rights on the original capability. * * System calls permitted in capability mode are defined in capabilities.conf; * calls must be carefully audited for safety to ensure that they don't allow * escape from a sandbox. Some calls permit only a subset of operations in * capability mode -- for example, shm_open(2) is limited to creating * anonymous, rather than named, POSIX shared memory objects. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CAPABILITY_MODE FEATURE(security_capability_mode, "Capsicum Capability Mode"); /* * System call to enter capability mode for the process. */ int sys_cap_enter(struct thread *td, struct cap_enter_args *uap) { struct ucred *newcred, *oldcred; struct proc *p; if (IN_CAPABILITY_MODE(td)) return (0); newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; crcopy(newcred, oldcred); newcred->cr_flags |= CRED_FLAG_CAPMODE; - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); } /* * System call to query whether the process is in capability mode. */ int sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap) { u_int i; i = IN_CAPABILITY_MODE(td) ? 1 : 0; return (copyout(&i, uap->modep, sizeof(i))); } #else /* !CAPABILITY_MODE */ int sys_cap_enter(struct thread *td, struct cap_enter_args *uap) { return (ENOSYS); } int sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap) { return (ENOSYS); } #endif /* CAPABILITY_MODE */ #ifdef CAPABILITIES FEATURE(security_capabilities, "Capsicum Capabilities"); MALLOC_DECLARE(M_FILECAPS); static inline int _cap_check(const cap_rights_t *havep, const cap_rights_t *needp, enum ktr_cap_fail_type type) { int i; for (i = 0; i < nitems(havep->cr_rights); i++) { if (!cap_rights_contains(havep, needp)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(type, needp, havep); #endif return (ENOTCAPABLE); } } return (0); } /* * Test whether a capability grants the requested rights. */ int cap_check(const cap_rights_t *havep, const cap_rights_t *needp) { return (_cap_check(havep, needp, CAPFAIL_NOTCAPABLE)); } /* * Convert capability rights into VM access flags. */ u_char cap_rights_to_vmprot(cap_rights_t *havep) { u_char maxprot; maxprot = VM_PROT_NONE; if (cap_rights_is_set(havep, CAP_MMAP_R)) maxprot |= VM_PROT_READ; if (cap_rights_is_set(havep, CAP_MMAP_W)) maxprot |= VM_PROT_WRITE; if (cap_rights_is_set(havep, CAP_MMAP_X)) maxprot |= VM_PROT_EXECUTE; return (maxprot); } /* * Extract rights from a capability for monitoring purposes -- not for use in * any other way, as we want to keep all capability permission evaluation in * this one file. */ cap_rights_t * cap_rights_fde(struct filedescent *fde) { return (&fde->fde_rights); } cap_rights_t * cap_rights(struct filedesc *fdp, int fd) { return (cap_rights_fde(&fdp->fd_ofiles[fd])); } /* * System call to limit rights of the given capability. */ int sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap) { struct filedesc *fdp; cap_rights_t rights; int error, fd, version; cap_rights_init(&rights); error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0])); if (error != 0) return (error); version = CAPVER(&rights); if (version != CAP_RIGHTS_VERSION_00) return (EINVAL); error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0]) * CAPARSIZE(&rights)); if (error != 0) return (error); /* Check for race. */ if (CAPVER(&rights) != version) return (EINVAL); if (!cap_rights_is_valid(&rights)) return (EINVAL); if (version != CAP_RIGHTS_VERSION) { rights.cr_rights[0] &= ~(0x3ULL << 62); rights.cr_rights[0] |= ((uint64_t)CAP_RIGHTS_VERSION << 62); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrcaprights(&rights); #endif fd = uap->fd; AUDIT_ARG_FD(fd); AUDIT_ARG_RIGHTS(&rights); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = _cap_check(cap_rights(fdp, fd), &rights, CAPFAIL_INCREASE); if (error == 0) { fdp->fd_ofiles[fd].fde_rights = rights; if (!cap_rights_is_set(&rights, CAP_IOCTL)) { free(fdp->fd_ofiles[fd].fde_ioctls, M_FILECAPS); fdp->fd_ofiles[fd].fde_ioctls = NULL; fdp->fd_ofiles[fd].fde_nioctls = 0; } if (!cap_rights_is_set(&rights, CAP_FCNTL)) fdp->fd_ofiles[fd].fde_fcntls = 0; } FILEDESC_XUNLOCK(fdp); return (error); } /* * System call to query the rights mask associated with a capability. */ int sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap) { struct filedesc *fdp; cap_rights_t rights; int error, fd, i, n; if (uap->version != CAP_RIGHTS_VERSION_00) return (EINVAL); fd = uap->fd; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_SUNLOCK(fdp); return (EBADF); } rights = *cap_rights(fdp, fd); FILEDESC_SUNLOCK(fdp); n = uap->version + 2; if (uap->version != CAPVER(&rights)) { /* * For older versions we need to check if the descriptor * doesn't contain rights not understood by the caller. * If it does, we have to return an error. */ for (i = n; i < CAPARSIZE(&rights); i++) { if ((rights.cr_rights[i] & ~(0x7FULL << 57)) != 0) return (EINVAL); } } error = copyout(&rights, uap->rightsp, sizeof(rights.cr_rights[0]) * n); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrcaprights(&rights); #endif return (error); } /* * Test whether a capability grants the given ioctl command. * If descriptor doesn't have CAP_IOCTL, then ioctls list is empty and * ENOTCAPABLE will be returned. */ int cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd) { u_long *cmds; ssize_t ncmds; long i; FILEDESC_LOCK_ASSERT(fdp); KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("%s: invalid fd=%d", __func__, fd)); ncmds = fdp->fd_ofiles[fd].fde_nioctls; if (ncmds == -1) return (0); cmds = fdp->fd_ofiles[fd].fde_ioctls; for (i = 0; i < ncmds; i++) { if (cmds[i] == cmd) return (0); } return (ENOTCAPABLE); } /* * Check if the current ioctls list can be replaced by the new one. */ static int cap_ioctl_limit_check(struct filedesc *fdp, int fd, const u_long *cmds, size_t ncmds) { u_long *ocmds; ssize_t oncmds; u_long i; long j; oncmds = fdp->fd_ofiles[fd].fde_nioctls; if (oncmds == -1) return (0); if (oncmds < (ssize_t)ncmds) return (ENOTCAPABLE); ocmds = fdp->fd_ofiles[fd].fde_ioctls; for (i = 0; i < ncmds; i++) { for (j = 0; j < oncmds; j++) { if (cmds[i] == ocmds[j]) break; } if (j == oncmds) return (ENOTCAPABLE); } return (0); } int kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, size_t ncmds) { struct filedesc *fdp; u_long *ocmds; int error; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { error = EBADF; goto out; } error = cap_ioctl_limit_check(fdp, fd, cmds, ncmds); if (error != 0) goto out; ocmds = fdp->fd_ofiles[fd].fde_ioctls; fdp->fd_ofiles[fd].fde_ioctls = cmds; fdp->fd_ofiles[fd].fde_nioctls = ncmds; cmds = ocmds; error = 0; out: FILEDESC_XUNLOCK(fdp); free(cmds, M_FILECAPS); return (error); } int sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap) { u_long *cmds; size_t ncmds; int error; ncmds = uap->ncmds; if (ncmds > 256) /* XXX: Is 256 sane? */ return (EINVAL); if (ncmds == 0) { cmds = NULL; } else { cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK); error = copyin(uap->cmds, cmds, sizeof(cmds[0]) * ncmds); if (error != 0) { free(cmds, M_FILECAPS); return (error); } } return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds)); } int sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap) { struct filedesc *fdp; struct filedescent *fdep; u_long *cmds; size_t maxcmds; int error, fd; fd = uap->fd; cmds = uap->cmds; maxcmds = uap->maxcmds; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { error = EBADF; goto out; } /* * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL) * the only sane thing we can do is to not populate the given array and * return CAP_IOCTLS_ALL. */ fdep = &fdp->fd_ofiles[fd]; if (cmds != NULL && fdep->fde_ioctls != NULL) { error = copyout(fdep->fde_ioctls, cmds, sizeof(cmds[0]) * MIN(fdep->fde_nioctls, maxcmds)); if (error != 0) goto out; } if (fdep->fde_nioctls == -1) td->td_retval[0] = CAP_IOCTLS_ALL; else td->td_retval[0] = fdep->fde_nioctls; error = 0; out: FILEDESC_SUNLOCK(fdp); return (error); } /* * Test whether a capability grants the given fcntl command. */ int cap_fcntl_check_fde(struct filedescent *fde, int cmd) { uint32_t fcntlcap; fcntlcap = (1 << cmd); KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0, ("Unsupported fcntl=%d.", cmd)); if ((fde->fde_fcntls & fcntlcap) != 0) return (0); return (ENOTCAPABLE); } int cap_fcntl_check(struct filedesc *fdp, int fd, int cmd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("%s: invalid fd=%d", __func__, fd)); return (cap_fcntl_check_fde(&fdp->fd_ofiles[fd], cmd)); } int sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap) { struct filedesc *fdp; uint32_t fcntlrights; int fd; fd = uap->fd; fcntlrights = uap->fcntlrights; AUDIT_ARG_FD(fd); AUDIT_ARG_FCNTL_RIGHTS(fcntlrights); if ((fcntlrights & ~CAP_FCNTL_ALL) != 0) return (EINVAL); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } if ((fcntlrights & ~fdp->fd_ofiles[fd].fde_fcntls) != 0) { FILEDESC_XUNLOCK(fdp); return (ENOTCAPABLE); } fdp->fd_ofiles[fd].fde_fcntls = fcntlrights; FILEDESC_XUNLOCK(fdp); return (0); } int sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap) { struct filedesc *fdp; uint32_t rights; int fd; fd = uap->fd; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_SUNLOCK(fdp); return (EBADF); } rights = fdp->fd_ofiles[fd].fde_fcntls; FILEDESC_SUNLOCK(fdp); return (copyout(&rights, uap->fcntlrightsp, sizeof(rights))); } #else /* !CAPABILITIES */ /* * Stub Capability functions for when options CAPABILITIES isn't compiled * into the kernel. */ int sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap) { return (ENOSYS); } int sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap) { return (ENOSYS); } int sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap) { return (ENOSYS); } int sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap) { return (ENOSYS); } int sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap) { return (ENOSYS); } int sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap) { return (ENOSYS); } #endif /* CAPABILITIES */ Index: stable/10/sys/security/audit/audit_syscalls.c =================================================================== --- stable/10/sys/security/audit/audit_syscalls.c (revision 302228) +++ stable/10/sys/security/audit/audit_syscalls.c (revision 302229) @@ -1,873 +1,873 @@ /*- * Copyright (c) 1999-2009 Apple Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef AUDIT /* * System call to allow a user space application to submit a BSM audit record * to the kernel for inclusion in the audit log. This function does little * verification on the audit record that is submitted. * * XXXAUDIT: Audit preselection for user records does not currently work, * since we pre-select only based on the AUE_audit event type, not the event * type submitted as part of the user audit data. */ /* ARGSUSED */ int sys_audit(struct thread *td, struct audit_args *uap) { int error; void * rec; struct kaudit_record *ar; if (jailed(td->td_ucred)) return (ENOSYS); error = priv_check(td, PRIV_AUDIT_SUBMIT); if (error) return (error); if ((uap->length <= 0) || (uap->length > audit_qctrl.aq_bufsz)) return (EINVAL); ar = currecord(); /* * If there's no current audit record (audit() itself not audited) * commit the user audit record. */ if (ar == NULL) { /* * This is not very efficient; we're required to allocate a * complete kernel audit record just so the user record can * tag along. * * XXXAUDIT: Maybe AUE_AUDIT in the system call context and * special pre-select handling? */ td->td_ar = audit_new(AUE_NULL, td); if (td->td_ar == NULL) return (ENOTSUP); td->td_pflags |= TDP_AUDITREC; ar = td->td_ar; } if (uap->length > MAX_AUDIT_RECORD_SIZE) return (EINVAL); rec = malloc(uap->length, M_AUDITDATA, M_WAITOK); error = copyin(uap->record, rec, uap->length); if (error) goto free_out; /* Verify the record. */ if (bsm_rec_verify(rec) == 0) { error = EINVAL; goto free_out; } #ifdef MAC error = mac_system_check_audit(td->td_ucred, rec, uap->length); if (error) goto free_out; #endif /* * Attach the user audit record to the kernel audit record. Because * this system call is an auditable event, we will write the user * record along with the record for this audit event. * * XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen, * k_ar_commit & AR_COMMIT_USER? */ ar->k_udata = rec; ar->k_ulen = uap->length; ar->k_ar_commit |= AR_COMMIT_USER; /* * Currently we assume that all preselection has been performed in * userspace. We unconditionally set these masks so that the records * get committed both to the trail and pipe. In the future we will * want to setup kernel based preselection. */ ar->k_ar_commit |= (AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE); return (0); free_out: /* * audit_syscall_exit() will free the audit record on the thread even * if we allocated it above. */ free(rec, M_AUDITDATA); return (error); } /* * System call to manipulate auditing. */ /* ARGSUSED */ int sys_auditon(struct thread *td, struct auditon_args *uap) { struct ucred *cred, *newcred, *oldcred; int error; union auditon_udata udata; struct proc *tp; if (jailed(td->td_ucred)) return (ENOSYS); AUDIT_ARG_CMD(uap->cmd); #ifdef MAC error = mac_system_check_auditon(td->td_ucred, uap->cmd); if (error) return (error); #endif error = priv_check(td, PRIV_AUDIT_CONTROL); if (error) return (error); if ((uap->length <= 0) || (uap->length > sizeof(union auditon_udata))) return (EINVAL); memset((void *)&udata, 0, sizeof(udata)); /* * Some of the GET commands use the arguments too. */ switch (uap->cmd) { case A_SETPOLICY: case A_OLDSETPOLICY: case A_SETKMASK: case A_SETQCTRL: case A_OLDSETQCTRL: case A_SETSTAT: case A_SETUMASK: case A_SETSMASK: case A_SETCOND: case A_OLDSETCOND: case A_SETCLASS: case A_SETPMASK: case A_SETFSIZE: case A_SETKAUDIT: case A_GETCLASS: case A_GETPINFO: case A_GETPINFO_ADDR: case A_SENDTRIGGER: error = copyin(uap->data, (void *)&udata, uap->length); if (error) return (error); AUDIT_ARG_AUDITON(&udata); break; } /* * XXXAUDIT: Locking? */ switch (uap->cmd) { case A_OLDGETPOLICY: case A_GETPOLICY: if (uap->length == sizeof(udata.au_policy64)) { if (!audit_fail_stop) udata.au_policy64 |= AUDIT_CNT; if (audit_panic_on_write_fail) udata.au_policy64 |= AUDIT_AHLT; if (audit_argv) udata.au_policy64 |= AUDIT_ARGV; if (audit_arge) udata.au_policy64 |= AUDIT_ARGE; break; } if (uap->length != sizeof(udata.au_policy)) return (EINVAL); if (!audit_fail_stop) udata.au_policy |= AUDIT_CNT; if (audit_panic_on_write_fail) udata.au_policy |= AUDIT_AHLT; if (audit_argv) udata.au_policy |= AUDIT_ARGV; if (audit_arge) udata.au_policy |= AUDIT_ARGE; break; case A_OLDSETPOLICY: case A_SETPOLICY: if (uap->length == sizeof(udata.au_policy64)) { if (udata.au_policy & (~AUDIT_CNT|AUDIT_AHLT| AUDIT_ARGV|AUDIT_ARGE)) return (EINVAL); audit_fail_stop = ((udata.au_policy64 & AUDIT_CNT) == 0); audit_panic_on_write_fail = (udata.au_policy64 & AUDIT_AHLT); audit_argv = (udata.au_policy64 & AUDIT_ARGV); audit_arge = (udata.au_policy64 & AUDIT_ARGE); break; } if (uap->length != sizeof(udata.au_policy)) return (EINVAL); if (udata.au_policy & ~(AUDIT_CNT|AUDIT_AHLT|AUDIT_ARGV| AUDIT_ARGE)) return (EINVAL); /* * XXX - Need to wake up waiters if the policy relaxes? */ audit_fail_stop = ((udata.au_policy & AUDIT_CNT) == 0); audit_panic_on_write_fail = (udata.au_policy & AUDIT_AHLT); audit_argv = (udata.au_policy & AUDIT_ARGV); audit_arge = (udata.au_policy & AUDIT_ARGE); break; case A_GETKMASK: if (uap->length != sizeof(udata.au_mask)) return (EINVAL); udata.au_mask = audit_nae_mask; break; case A_SETKMASK: if (uap->length != sizeof(udata.au_mask)) return (EINVAL); audit_nae_mask = udata.au_mask; break; case A_OLDGETQCTRL: case A_GETQCTRL: if (uap->length == sizeof(udata.au_qctrl64)) { udata.au_qctrl64.aq64_hiwater = (u_int64_t)audit_qctrl.aq_hiwater; udata.au_qctrl64.aq64_lowater = (u_int64_t)audit_qctrl.aq_lowater; udata.au_qctrl64.aq64_bufsz = (u_int64_t)audit_qctrl.aq_bufsz; udata.au_qctrl64.aq64_minfree = (u_int64_t)audit_qctrl.aq_minfree; break; } if (uap->length != sizeof(udata.au_qctrl)) return (EINVAL); udata.au_qctrl = audit_qctrl; break; case A_OLDSETQCTRL: case A_SETQCTRL: if (uap->length == sizeof(udata.au_qctrl64)) { if ((udata.au_qctrl64.aq64_hiwater > AQ_MAXHIGH) || (udata.au_qctrl64.aq64_lowater >= udata.au_qctrl.aq_hiwater) || (udata.au_qctrl64.aq64_bufsz > AQ_MAXBUFSZ) || (udata.au_qctrl64.aq64_minfree < 0) || (udata.au_qctrl64.aq64_minfree > 100)) return (EINVAL); audit_qctrl.aq_hiwater = (int)udata.au_qctrl64.aq64_hiwater; audit_qctrl.aq_lowater = (int)udata.au_qctrl64.aq64_lowater; audit_qctrl.aq_bufsz = (int)udata.au_qctrl64.aq64_bufsz; audit_qctrl.aq_minfree = (int)udata.au_qctrl64.aq64_minfree; audit_qctrl.aq_delay = -1; /* Not used. */ break; } if (uap->length != sizeof(udata.au_qctrl)) return (EINVAL); if ((udata.au_qctrl.aq_hiwater > AQ_MAXHIGH) || (udata.au_qctrl.aq_lowater >= udata.au_qctrl.aq_hiwater) || (udata.au_qctrl.aq_bufsz > AQ_MAXBUFSZ) || (udata.au_qctrl.aq_minfree < 0) || (udata.au_qctrl.aq_minfree > 100)) return (EINVAL); audit_qctrl = udata.au_qctrl; /* XXX The queue delay value isn't used with the kernel. */ audit_qctrl.aq_delay = -1; break; case A_GETCWD: return (ENOSYS); break; case A_GETCAR: return (ENOSYS); break; case A_GETSTAT: return (ENOSYS); break; case A_SETSTAT: return (ENOSYS); break; case A_SETUMASK: return (ENOSYS); break; case A_SETSMASK: return (ENOSYS); break; case A_OLDGETCOND: case A_GETCOND: if (uap->length == sizeof(udata.au_cond64)) { if (audit_enabled && !audit_suspended) udata.au_cond64 = AUC_AUDITING; else udata.au_cond64 = AUC_NOAUDIT; break; } if (uap->length != sizeof(udata.au_cond)) return (EINVAL); if (audit_enabled && !audit_suspended) udata.au_cond = AUC_AUDITING; else udata.au_cond = AUC_NOAUDIT; break; case A_OLDSETCOND: case A_SETCOND: if (uap->length == sizeof(udata.au_cond64)) { if (udata.au_cond64 == AUC_NOAUDIT) audit_suspended = 1; if (udata.au_cond64 == AUC_AUDITING) audit_suspended = 0; if (udata.au_cond64 == AUC_DISABLED) { audit_suspended = 1; audit_shutdown(NULL, 0); } break; } if (uap->length != sizeof(udata.au_cond)) return (EINVAL); if (udata.au_cond == AUC_NOAUDIT) audit_suspended = 1; if (udata.au_cond == AUC_AUDITING) audit_suspended = 0; if (udata.au_cond == AUC_DISABLED) { audit_suspended = 1; audit_shutdown(NULL, 0); } break; case A_GETCLASS: if (uap->length != sizeof(udata.au_evclass)) return (EINVAL); udata.au_evclass.ec_class = au_event_class( udata.au_evclass.ec_number); break; case A_SETCLASS: if (uap->length != sizeof(udata.au_evclass)) return (EINVAL); au_evclassmap_insert(udata.au_evclass.ec_number, udata.au_evclass.ec_class); break; case A_GETPINFO: if (uap->length != sizeof(udata.au_aupinfo)) return (EINVAL); if (udata.au_aupinfo.ap_pid < 1) return (ESRCH); if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL) return (ESRCH); if ((error = p_cansee(td, tp)) != 0) { PROC_UNLOCK(tp); return (error); } cred = tp->p_ucred; if (cred->cr_audit.ai_termid.at_type == AU_IPv6) { PROC_UNLOCK(tp); return (EINVAL); } udata.au_aupinfo.ap_auid = cred->cr_audit.ai_auid; udata.au_aupinfo.ap_mask.am_success = cred->cr_audit.ai_mask.am_success; udata.au_aupinfo.ap_mask.am_failure = cred->cr_audit.ai_mask.am_failure; udata.au_aupinfo.ap_termid.machine = cred->cr_audit.ai_termid.at_addr[0]; udata.au_aupinfo.ap_termid.port = (dev_t)cred->cr_audit.ai_termid.at_port; udata.au_aupinfo.ap_asid = cred->cr_audit.ai_asid; PROC_UNLOCK(tp); break; case A_SETPMASK: if (uap->length != sizeof(udata.au_aupinfo)) return (EINVAL); if (udata.au_aupinfo.ap_pid < 1) return (ESRCH); newcred = crget(); if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL) { crfree(newcred); return (ESRCH); } if ((error = p_cansee(td, tp)) != 0) { PROC_UNLOCK(tp); crfree(newcred); return (error); } oldcred = tp->p_ucred; crcopy(newcred, oldcred); newcred->cr_audit.ai_mask.am_success = udata.au_aupinfo.ap_mask.am_success; newcred->cr_audit.ai_mask.am_failure = udata.au_aupinfo.ap_mask.am_failure; - tp->p_ucred = newcred; + proc_set_cred(tp, newcred); PROC_UNLOCK(tp); crfree(oldcred); break; case A_SETFSIZE: if (uap->length != sizeof(udata.au_fstat)) return (EINVAL); if ((udata.au_fstat.af_filesz != 0) && (udata.au_fstat.af_filesz < MIN_AUDIT_FILE_SIZE)) return (EINVAL); audit_fstat.af_filesz = udata.au_fstat.af_filesz; break; case A_GETFSIZE: if (uap->length != sizeof(udata.au_fstat)) return (EINVAL); udata.au_fstat.af_filesz = audit_fstat.af_filesz; udata.au_fstat.af_currsz = audit_fstat.af_currsz; break; case A_GETPINFO_ADDR: if (uap->length != sizeof(udata.au_aupinfo_addr)) return (EINVAL); if (udata.au_aupinfo_addr.ap_pid < 1) return (ESRCH); if ((tp = pfind(udata.au_aupinfo_addr.ap_pid)) == NULL) return (ESRCH); cred = tp->p_ucred; udata.au_aupinfo_addr.ap_auid = cred->cr_audit.ai_auid; udata.au_aupinfo_addr.ap_mask.am_success = cred->cr_audit.ai_mask.am_success; udata.au_aupinfo_addr.ap_mask.am_failure = cred->cr_audit.ai_mask.am_failure; udata.au_aupinfo_addr.ap_termid = cred->cr_audit.ai_termid; udata.au_aupinfo_addr.ap_asid = cred->cr_audit.ai_asid; PROC_UNLOCK(tp); break; case A_GETKAUDIT: if (uap->length != sizeof(udata.au_kau_info)) return (EINVAL); audit_get_kinfo(&udata.au_kau_info); break; case A_SETKAUDIT: if (uap->length != sizeof(udata.au_kau_info)) return (EINVAL); if (udata.au_kau_info.ai_termid.at_type != AU_IPv4 && udata.au_kau_info.ai_termid.at_type != AU_IPv6) return (EINVAL); audit_set_kinfo(&udata.au_kau_info); break; case A_SENDTRIGGER: if (uap->length != sizeof(udata.au_trigger)) return (EINVAL); if ((udata.au_trigger < AUDIT_TRIGGER_MIN) || (udata.au_trigger > AUDIT_TRIGGER_MAX)) return (EINVAL); return (audit_send_trigger(udata.au_trigger)); default: return (EINVAL); } /* * Copy data back to userspace for the GET comands. */ switch (uap->cmd) { case A_GETPOLICY: case A_OLDGETPOLICY: case A_GETKMASK: case A_GETQCTRL: case A_OLDGETQCTRL: case A_GETCWD: case A_GETCAR: case A_GETSTAT: case A_GETCOND: case A_OLDGETCOND: case A_GETCLASS: case A_GETPINFO: case A_GETFSIZE: case A_GETPINFO_ADDR: case A_GETKAUDIT: error = copyout((void *)&udata, uap->data, uap->length); if (error) return (error); break; } return (0); } /* * System calls to manage the user audit information. */ /* ARGSUSED */ int sys_getauid(struct thread *td, struct getauid_args *uap) { int error; if (jailed(td->td_ucred)) return (ENOSYS); error = priv_check(td, PRIV_AUDIT_GETAUDIT); if (error) return (error); return (copyout(&td->td_ucred->cr_audit.ai_auid, uap->auid, sizeof(td->td_ucred->cr_audit.ai_auid))); } /* ARGSUSED */ int sys_setauid(struct thread *td, struct setauid_args *uap) { struct ucred *newcred, *oldcred; au_id_t id; int error; if (jailed(td->td_ucred)) return (ENOSYS); error = copyin(uap->auid, &id, sizeof(id)); if (error) return (error); audit_arg_auid(id); newcred = crget(); PROC_LOCK(td->td_proc); oldcred = td->td_proc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC error = mac_cred_check_setauid(oldcred, id); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0); if (error) goto fail; newcred->cr_audit.ai_auid = id; - td->td_proc->p_ucred = newcred; + proc_set_cred(td->td_proc, newcred); PROC_UNLOCK(td->td_proc); crfree(oldcred); return (0); fail: PROC_UNLOCK(td->td_proc); crfree(newcred); return (error); } /* * System calls to get and set process audit information. */ /* ARGSUSED */ int sys_getaudit(struct thread *td, struct getaudit_args *uap) { struct auditinfo ai; struct ucred *cred; int error; cred = td->td_ucred; if (jailed(cred)) return (ENOSYS); error = priv_check(td, PRIV_AUDIT_GETAUDIT); if (error) return (error); if (cred->cr_audit.ai_termid.at_type == AU_IPv6) return (E2BIG); bzero(&ai, sizeof(ai)); ai.ai_auid = cred->cr_audit.ai_auid; ai.ai_mask = cred->cr_audit.ai_mask; ai.ai_asid = cred->cr_audit.ai_asid; ai.ai_termid.machine = cred->cr_audit.ai_termid.at_addr[0]; ai.ai_termid.port = cred->cr_audit.ai_termid.at_port; return (copyout(&ai, uap->auditinfo, sizeof(ai))); } /* ARGSUSED */ int sys_setaudit(struct thread *td, struct setaudit_args *uap) { struct ucred *newcred, *oldcred; struct auditinfo ai; int error; if (jailed(td->td_ucred)) return (ENOSYS); error = copyin(uap->auditinfo, &ai, sizeof(ai)); if (error) return (error); audit_arg_auditinfo(&ai); newcred = crget(); PROC_LOCK(td->td_proc); oldcred = td->td_proc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC error = mac_cred_check_setaudit(oldcred, &ai); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0); if (error) goto fail; bzero(&newcred->cr_audit, sizeof(newcred->cr_audit)); newcred->cr_audit.ai_auid = ai.ai_auid; newcred->cr_audit.ai_mask = ai.ai_mask; newcred->cr_audit.ai_asid = ai.ai_asid; newcred->cr_audit.ai_termid.at_addr[0] = ai.ai_termid.machine; newcred->cr_audit.ai_termid.at_port = ai.ai_termid.port; newcred->cr_audit.ai_termid.at_type = AU_IPv4; - td->td_proc->p_ucred = newcred; + proc_set_cred(td->td_proc, newcred); PROC_UNLOCK(td->td_proc); crfree(oldcred); return (0); fail: PROC_UNLOCK(td->td_proc); crfree(newcred); return (error); } /* ARGSUSED */ int sys_getaudit_addr(struct thread *td, struct getaudit_addr_args *uap) { int error; if (jailed(td->td_ucred)) return (ENOSYS); if (uap->length < sizeof(*uap->auditinfo_addr)) return (EOVERFLOW); error = priv_check(td, PRIV_AUDIT_GETAUDIT); if (error) return (error); return (copyout(&td->td_ucred->cr_audit, uap->auditinfo_addr, sizeof(*uap->auditinfo_addr))); } /* ARGSUSED */ int sys_setaudit_addr(struct thread *td, struct setaudit_addr_args *uap) { struct ucred *newcred, *oldcred; struct auditinfo_addr aia; int error; if (jailed(td->td_ucred)) return (ENOSYS); error = copyin(uap->auditinfo_addr, &aia, sizeof(aia)); if (error) return (error); audit_arg_auditinfo_addr(&aia); if (aia.ai_termid.at_type != AU_IPv6 && aia.ai_termid.at_type != AU_IPv4) return (EINVAL); newcred = crget(); PROC_LOCK(td->td_proc); oldcred = td->td_proc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC error = mac_cred_check_setaudit_addr(oldcred, &aia); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0); if (error) goto fail; newcred->cr_audit = aia; - td->td_proc->p_ucred = newcred; + proc_set_cred(td->td_proc, newcred); PROC_UNLOCK(td->td_proc); crfree(oldcred); return (0); fail: PROC_UNLOCK(td->td_proc); crfree(newcred); return (error); } /* * Syscall to manage audit files. */ /* ARGSUSED */ int sys_auditctl(struct thread *td, struct auditctl_args *uap) { struct nameidata nd; struct ucred *cred; struct vnode *vp; int error = 0; int flags; if (jailed(td->td_ucred)) return (ENOSYS); error = priv_check(td, PRIV_AUDIT_CONTROL); if (error) return (error); vp = NULL; cred = NULL; /* * If a path is specified, open the replacement vnode, perform * validity checks, and grab another reference to the current * credential. * * On Darwin, a NULL path argument is also used to disable audit. */ if (uap->path == NULL) return (EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); flags = AUDIT_OPEN_FLAGS; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); vp = nd.ni_vp; #ifdef MAC error = mac_system_check_auditctl(td->td_ucred, vp); VOP_UNLOCK(vp, 0); if (error) { vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td); return (error); } #else VOP_UNLOCK(vp, 0); #endif NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VREG) { vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td); return (EINVAL); } cred = td->td_ucred; crhold(cred); /* * XXXAUDIT: Should audit_suspended actually be cleared by * audit_worker? */ audit_suspended = 0; audit_rotate_vnode(cred, vp); return (error); } #else /* !AUDIT */ int sys_audit(struct thread *td, struct audit_args *uap) { return (ENOSYS); } int sys_auditon(struct thread *td, struct auditon_args *uap) { return (ENOSYS); } int sys_getauid(struct thread *td, struct getauid_args *uap) { return (ENOSYS); } int sys_setauid(struct thread *td, struct setauid_args *uap) { return (ENOSYS); } int sys_getaudit(struct thread *td, struct getaudit_args *uap) { return (ENOSYS); } int sys_setaudit(struct thread *td, struct setaudit_args *uap) { return (ENOSYS); } int sys_getaudit_addr(struct thread *td, struct getaudit_addr_args *uap) { return (ENOSYS); } int sys_setaudit_addr(struct thread *td, struct setaudit_addr_args *uap) { return (ENOSYS); } int sys_auditctl(struct thread *td, struct auditctl_args *uap) { return (ENOSYS); } #endif /* AUDIT */ Index: stable/10/sys/security/mac/mac_syscalls.c =================================================================== --- stable/10/sys/security/mac/mac_syscalls.c (revision 302228) +++ stable/10/sys/security/mac/mac_syscalls.c (revision 302229) @@ -1,733 +1,733 @@ /*- * Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2005 Networks Associates Technology, Inc. * Copyright (c) 2005-2006 SPARTA, Inc. * Copyright (c) 2008 Apple Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MAC FEATURE(security_mac, "Mandatory Access Control Framework support"); int sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { char *elements, *buffer; struct mac mac; struct proc *tproc; struct ucred *tcred; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); tproc = pfind(uap->pid); if (tproc == NULL) return (ESRCH); tcred = NULL; /* Satisfy gcc. */ error = p_cansee(td, tproc); if (error == 0) tcred = crhold(tproc->p_ucred); PROC_UNLOCK(tproc); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); crfree(tcred); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_cred_externalize_label(tcred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); crfree(tcred); return (error); } int sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { char *elements, *buffer; struct mac mac; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_cred_externalize_label(td->td_ucred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { struct ucred *newcred, *oldcred; struct label *intlabel; struct proc *p; struct mac mac; char *buffer; int error; if (!(mac_labeled & MPC_OBJECT_CRED)) return (EINVAL); error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_cred_label_alloc(); error = mac_cred_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; error = mac_cred_check_relabel(oldcred, intlabel); if (error) { PROC_UNLOCK(p); crfree(newcred); goto out; } setsugid(p); crcopy(newcred, oldcred); mac_cred_relabel(newcred, intlabel); - p->p_ucred = newcred; + proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); mac_proc_vm_revoke(td); out: mac_cred_label_free(intlabel); return (error); } int sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { char *elements, *buffer; struct label *intlabel; struct file *fp; struct mac mac; struct vnode *vp; struct pipe *pipe; struct socket *so; cap_rights_t rights; short label_type; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = fget(td, uap->fd, cap_rights_init(&rights, CAP_MAC_GET), &fp); if (error) goto out; label_type = fp->f_type; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: if (!(mac_labeled & MPC_OBJECT_VNODE)) { error = EINVAL; goto out_fdrop; } vp = fp->f_vnode; intlabel = mac_vnode_label_alloc(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mac_vnode_copy_label(vp->v_label, intlabel); VOP_UNLOCK(vp, 0); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: if (!(mac_labeled & MPC_OBJECT_PIPE)) { error = EINVAL; goto out_fdrop; } pipe = fp->f_data; intlabel = mac_pipe_label_alloc(); PIPE_LOCK(pipe); mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel); PIPE_UNLOCK(pipe); error = mac_pipe_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: if (!(mac_labeled & MPC_OBJECT_SOCKET)) { error = EINVAL; goto out_fdrop; } so = fp->f_data; intlabel = mac_socket_label_alloc(M_WAITOK); SOCK_LOCK(so); mac_socket_copy_label(so->so_label, intlabel); SOCK_UNLOCK(so); error = mac_socket_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_socket_label_free(intlabel); break; default: error = EINVAL; } if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out_fdrop: fdrop(fp, td); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { char *elements, *buffer; struct nameidata nd; struct label *intlabel; struct mac mac; int error; if (!(mac_labeled & MPC_OBJECT_VNODE)) return (EINVAL); error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); if (error) goto out; intlabel = mac_vnode_label_alloc(); mac_vnode_copy_label(nd.ni_vp->v_label, intlabel); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); NDFREE(&nd, 0); mac_vnode_label_free(intlabel); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { char *elements, *buffer; struct nameidata nd; struct label *intlabel; struct mac mac; int error; if (!(mac_labeled & MPC_OBJECT_VNODE)) return (EINVAL); error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); NDINIT(&nd, LOOKUP, LOCKLEAF | NOFOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); if (error) goto out; intlabel = mac_vnode_label_alloc(); mac_vnode_copy_label(nd.ni_vp->v_label, intlabel); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); NDFREE(&nd, 0); mac_vnode_label_free(intlabel); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { struct label *intlabel; struct pipe *pipe; struct socket *so; struct file *fp; struct mount *mp; struct vnode *vp; struct mac mac; cap_rights_t rights; char *buffer; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } error = fget(td, uap->fd, cap_rights_init(&rights, CAP_MAC_SET), &fp); if (error) goto out; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: if (!(mac_labeled & MPC_OBJECT_VNODE)) { error = EINVAL; goto out_fdrop; } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); if (error) { mac_vnode_label_free(intlabel); break; } vp = fp->f_vnode; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) { mac_vnode_label_free(intlabel); break; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = vn_setlabel(vp, intlabel, td->td_ucred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: if (!(mac_labeled & MPC_OBJECT_PIPE)) { error = EINVAL; goto out_fdrop; } intlabel = mac_pipe_label_alloc(); error = mac_pipe_internalize_label(intlabel, buffer); if (error == 0) { pipe = fp->f_data; PIPE_LOCK(pipe); error = mac_pipe_label_set(td->td_ucred, pipe->pipe_pair, intlabel); PIPE_UNLOCK(pipe); } mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: if (!(mac_labeled & MPC_OBJECT_SOCKET)) { error = EINVAL; goto out_fdrop; } intlabel = mac_socket_label_alloc(M_WAITOK); error = mac_socket_internalize_label(intlabel, buffer); if (error == 0) { so = fp->f_data; error = mac_socket_label_set(td->td_ucred, so, intlabel); } mac_socket_label_free(intlabel); break; default: error = EINVAL; } out_fdrop: fdrop(fp, td); out: free(buffer, M_MACTEMP); return (error); } int sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { struct label *intlabel; struct nameidata nd; struct mount *mp; struct mac mac; char *buffer; int error; if (!(mac_labeled & MPC_OBJECT_VNODE)) return (EINVAL); error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); if (error == 0) { error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); if (error == 0) { error = vn_setlabel(nd.ni_vp, intlabel, td->td_ucred); vn_finished_write(mp); } } NDFREE(&nd, 0); out: mac_vnode_label_free(intlabel); return (error); } int sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { struct label *intlabel; struct nameidata nd; struct mount *mp; struct mac mac; char *buffer; int error; if (!(mac_labeled & MPC_OBJECT_VNODE)) return (EINVAL); error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; NDINIT(&nd, LOOKUP, LOCKLEAF | NOFOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); if (error == 0) { error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); if (error == 0) { error = vn_setlabel(nd.ni_vp, intlabel, td->td_ucred); vn_finished_write(mp); } } NDFREE(&nd, 0); out: mac_vnode_label_free(intlabel); return (error); } int sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap) { struct mac_policy_conf *mpc; char target[MAC_MAX_POLICY_NAME]; int error; error = copyinstr(uap->policy, target, sizeof(target), NULL); if (error) return (error); error = ENOSYS; LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); goto out; } } if (!LIST_EMPTY(&mac_policy_list)) { mac_policy_slock_sleep(); LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); break; } } mac_policy_sunlock_sleep(); } out: return (error); } #else /* !MAC */ int sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { return (ENOSYS); } int sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { return (ENOSYS); } int sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { return (ENOSYS); } int sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { return (ENOSYS); } int sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { return (ENOSYS); } int sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { return (ENOSYS); } int sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { return (ENOSYS); } int sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { return (ENOSYS); } int sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { return (ENOSYS); } int sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap) { return (ENOSYS); } #endif /* !MAC */ Index: stable/10/sys/security/mac_lomac/mac_lomac.c =================================================================== --- stable/10/sys/security/mac_lomac/mac_lomac.c (revision 302228) +++ stable/10/sys/security/mac_lomac/mac_lomac.c (revision 302229) @@ -1,3087 +1,3087 @@ /*- * Copyright (c) 1999-2002, 2007-2009 Robert N. M. Watson * Copyright (c) 2001-2005 Networks Associates Technology, Inc. * Copyright (c) 2006 SPARTA, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by NAI Labs, * the Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * * Low-watermark floating label mandatory integrity policy. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct mac_lomac_proc { struct mac_lomac mac_lomac; struct mtx mtx; }; SYSCTL_DECL(_security_mac); static SYSCTL_NODE(_security_mac, OID_AUTO, lomac, CTLFLAG_RW, 0, "TrustedBSD mac_lomac policy controls"); static int lomac_label_size = sizeof(struct mac_lomac); SYSCTL_INT(_security_mac_lomac, OID_AUTO, label_size, CTLFLAG_RD, &lomac_label_size, 0, "Size of struct mac_lomac"); static int lomac_enabled = 1; SYSCTL_INT(_security_mac_lomac, OID_AUTO, enabled, CTLFLAG_RW, &lomac_enabled, 0, "Enforce MAC/LOMAC policy"); TUNABLE_INT("security.mac.lomac.enabled", &lomac_enabled); static int destroyed_not_inited; SYSCTL_INT(_security_mac_lomac, OID_AUTO, destroyed_not_inited, CTLFLAG_RD, &destroyed_not_inited, 0, "Count of labels destroyed but not inited"); static int trust_all_interfaces = 0; SYSCTL_INT(_security_mac_lomac, OID_AUTO, trust_all_interfaces, CTLFLAG_RD, &trust_all_interfaces, 0, "Consider all interfaces 'trusted' by MAC/LOMAC"); TUNABLE_INT("security.mac.lomac.trust_all_interfaces", &trust_all_interfaces); static char trusted_interfaces[128]; SYSCTL_STRING(_security_mac_lomac, OID_AUTO, trusted_interfaces, CTLFLAG_RD, trusted_interfaces, 0, "Interfaces considered 'trusted' by MAC/LOMAC"); TUNABLE_STR("security.mac.lomac.trusted_interfaces", trusted_interfaces, sizeof(trusted_interfaces)); static int ptys_equal = 0; SYSCTL_INT(_security_mac_lomac, OID_AUTO, ptys_equal, CTLFLAG_RW, &ptys_equal, 0, "Label pty devices as lomac/equal on create"); TUNABLE_INT("security.mac.lomac.ptys_equal", &ptys_equal); static int revocation_enabled = 1; SYSCTL_INT(_security_mac_lomac, OID_AUTO, revocation_enabled, CTLFLAG_RW, &revocation_enabled, 0, "Revoke access to objects on relabel"); TUNABLE_INT("security.mac.lomac.revocation_enabled", &revocation_enabled); static int lomac_slot; #define SLOT(l) ((struct mac_lomac *)mac_label_get((l), lomac_slot)) #define SLOT_SET(l, val) mac_label_set((l), lomac_slot, (uintptr_t)(val)) #define PSLOT(l) ((struct mac_lomac_proc *) \ mac_label_get((l), lomac_slot)) #define PSLOT_SET(l, val) mac_label_set((l), lomac_slot, (uintptr_t)(val)) static MALLOC_DEFINE(M_LOMAC, "mac_lomac_label", "MAC/LOMAC labels"); static struct mac_lomac * lomac_alloc(int flag) { struct mac_lomac *ml; ml = malloc(sizeof(*ml), M_LOMAC, M_ZERO | flag); return (ml); } static void lomac_free(struct mac_lomac *ml) { if (ml != NULL) free(ml, M_LOMAC); else atomic_add_int(&destroyed_not_inited, 1); } static int lomac_atmostflags(struct mac_lomac *ml, int flags) { if ((ml->ml_flags & flags) != ml->ml_flags) return (EINVAL); return (0); } static int lomac_dominate_element(struct mac_lomac_element *a, struct mac_lomac_element *b) { switch (a->mle_type) { case MAC_LOMAC_TYPE_EQUAL: case MAC_LOMAC_TYPE_HIGH: return (1); case MAC_LOMAC_TYPE_LOW: switch (b->mle_type) { case MAC_LOMAC_TYPE_GRADE: case MAC_LOMAC_TYPE_HIGH: return (0); case MAC_LOMAC_TYPE_EQUAL: case MAC_LOMAC_TYPE_LOW: return (1); default: panic("lomac_dominate_element: b->mle_type invalid"); } case MAC_LOMAC_TYPE_GRADE: switch (b->mle_type) { case MAC_LOMAC_TYPE_EQUAL: case MAC_LOMAC_TYPE_LOW: return (1); case MAC_LOMAC_TYPE_HIGH: return (0); case MAC_LOMAC_TYPE_GRADE: return (a->mle_grade >= b->mle_grade); default: panic("lomac_dominate_element: b->mle_type invalid"); } default: panic("lomac_dominate_element: a->mle_type invalid"); } } static int lomac_range_in_range(struct mac_lomac *rangea, struct mac_lomac *rangeb) { return (lomac_dominate_element(&rangeb->ml_rangehigh, &rangea->ml_rangehigh) && lomac_dominate_element(&rangea->ml_rangelow, &rangeb->ml_rangelow)); } static int lomac_single_in_range(struct mac_lomac *single, struct mac_lomac *range) { KASSERT((single->ml_flags & MAC_LOMAC_FLAG_SINGLE) != 0, ("lomac_single_in_range: a not single")); KASSERT((range->ml_flags & MAC_LOMAC_FLAG_RANGE) != 0, ("lomac_single_in_range: b not range")); return (lomac_dominate_element(&range->ml_rangehigh, &single->ml_single) && lomac_dominate_element(&single->ml_single, &range->ml_rangelow)); } static int lomac_auxsingle_in_range(struct mac_lomac *single, struct mac_lomac *range) { KASSERT((single->ml_flags & MAC_LOMAC_FLAG_AUX) != 0, ("lomac_single_in_range: a not auxsingle")); KASSERT((range->ml_flags & MAC_LOMAC_FLAG_RANGE) != 0, ("lomac_single_in_range: b not range")); return (lomac_dominate_element(&range->ml_rangehigh, &single->ml_auxsingle) && lomac_dominate_element(&single->ml_auxsingle, &range->ml_rangelow)); } static int lomac_dominate_single(struct mac_lomac *a, struct mac_lomac *b) { KASSERT((a->ml_flags & MAC_LOMAC_FLAG_SINGLE) != 0, ("lomac_dominate_single: a not single")); KASSERT((b->ml_flags & MAC_LOMAC_FLAG_SINGLE) != 0, ("lomac_dominate_single: b not single")); return (lomac_dominate_element(&a->ml_single, &b->ml_single)); } static int lomac_subject_dominate(struct mac_lomac *a, struct mac_lomac *b) { KASSERT((~a->ml_flags & (MAC_LOMAC_FLAG_SINGLE | MAC_LOMAC_FLAG_RANGE)) == 0, ("lomac_dominate_single: a not subject")); KASSERT((b->ml_flags & MAC_LOMAC_FLAG_SINGLE) != 0, ("lomac_dominate_single: b not single")); return (lomac_dominate_element(&a->ml_rangehigh, &b->ml_single)); } static int lomac_equal_element(struct mac_lomac_element *a, struct mac_lomac_element *b) { if (a->mle_type == MAC_LOMAC_TYPE_EQUAL || b->mle_type == MAC_LOMAC_TYPE_EQUAL) return (1); return (a->mle_type == b->mle_type && a->mle_grade == b->mle_grade); } static int lomac_equal_single(struct mac_lomac *a, struct mac_lomac *b) { KASSERT((a->ml_flags & MAC_LOMAC_FLAG_SINGLE) != 0, ("lomac_equal_single: a not single")); KASSERT((b->ml_flags & MAC_LOMAC_FLAG_SINGLE) != 0, ("lomac_equal_single: b not single")); return (lomac_equal_element(&a->ml_single, &b->ml_single)); } static int lomac_contains_equal(struct mac_lomac *ml) { if (ml->ml_flags & MAC_LOMAC_FLAG_SINGLE) if (ml->ml_single.mle_type == MAC_LOMAC_TYPE_EQUAL) return (1); if (ml->ml_flags & MAC_LOMAC_FLAG_AUX) if (ml->ml_auxsingle.mle_type == MAC_LOMAC_TYPE_EQUAL) return (1); if (ml->ml_flags & MAC_LOMAC_FLAG_RANGE) { if (ml->ml_rangelow.mle_type == MAC_LOMAC_TYPE_EQUAL) return (1); if (ml->ml_rangehigh.mle_type == MAC_LOMAC_TYPE_EQUAL) return (1); } return (0); } static int lomac_subject_privileged(struct mac_lomac *ml) { KASSERT((ml->ml_flags & MAC_LOMAC_FLAGS_BOTH) == MAC_LOMAC_FLAGS_BOTH, ("lomac_subject_privileged: subject doesn't have both labels")); /* If the single is EQUAL, it's ok. */ if (ml->ml_single.mle_type == MAC_LOMAC_TYPE_EQUAL) return (0); /* If either range endpoint is EQUAL, it's ok. */ if (ml->ml_rangelow.mle_type == MAC_LOMAC_TYPE_EQUAL || ml->ml_rangehigh.mle_type == MAC_LOMAC_TYPE_EQUAL) return (0); /* If the range is low-high, it's ok. */ if (ml->ml_rangelow.mle_type == MAC_LOMAC_TYPE_LOW && ml->ml_rangehigh.mle_type == MAC_LOMAC_TYPE_HIGH) return (0); /* It's not ok. */ return (EPERM); } static int lomac_high_single(struct mac_lomac *ml) { KASSERT((ml->ml_flags & MAC_LOMAC_FLAG_SINGLE) != 0, ("lomac_high_single: mac_lomac not single")); return (ml->ml_single.mle_type == MAC_LOMAC_TYPE_HIGH); } static int lomac_valid(struct mac_lomac *ml) { if (ml->ml_flags & MAC_LOMAC_FLAG_SINGLE) { switch (ml->ml_single.mle_type) { case MAC_LOMAC_TYPE_GRADE: case MAC_LOMAC_TYPE_EQUAL: case MAC_LOMAC_TYPE_HIGH: case MAC_LOMAC_TYPE_LOW: break; default: return (EINVAL); } } else { if (ml->ml_single.mle_type != MAC_LOMAC_TYPE_UNDEF) return (EINVAL); } if (ml->ml_flags & MAC_LOMAC_FLAG_AUX) { switch (ml->ml_auxsingle.mle_type) { case MAC_LOMAC_TYPE_GRADE: case MAC_LOMAC_TYPE_EQUAL: case MAC_LOMAC_TYPE_HIGH: case MAC_LOMAC_TYPE_LOW: break; default: return (EINVAL); } } else { if (ml->ml_auxsingle.mle_type != MAC_LOMAC_TYPE_UNDEF) return (EINVAL); } if (ml->ml_flags & MAC_LOMAC_FLAG_RANGE) { switch (ml->ml_rangelow.mle_type) { case MAC_LOMAC_TYPE_GRADE: case MAC_LOMAC_TYPE_EQUAL: case MAC_LOMAC_TYPE_HIGH: case MAC_LOMAC_TYPE_LOW: break; default: return (EINVAL); } switch (ml->ml_rangehigh.mle_type) { case MAC_LOMAC_TYPE_GRADE: case MAC_LOMAC_TYPE_EQUAL: case MAC_LOMAC_TYPE_HIGH: case MAC_LOMAC_TYPE_LOW: break; default: return (EINVAL); } if (!lomac_dominate_element(&ml->ml_rangehigh, &ml->ml_rangelow)) return (EINVAL); } else { if (ml->ml_rangelow.mle_type != MAC_LOMAC_TYPE_UNDEF || ml->ml_rangehigh.mle_type != MAC_LOMAC_TYPE_UNDEF) return (EINVAL); } return (0); } static void lomac_set_range(struct mac_lomac *ml, u_short typelow, u_short gradelow, u_short typehigh, u_short gradehigh) { ml->ml_rangelow.mle_type = typelow; ml->ml_rangelow.mle_grade = gradelow; ml->ml_rangehigh.mle_type = typehigh; ml->ml_rangehigh.mle_grade = gradehigh; ml->ml_flags |= MAC_LOMAC_FLAG_RANGE; } static void lomac_set_single(struct mac_lomac *ml, u_short type, u_short grade) { ml->ml_single.mle_type = type; ml->ml_single.mle_grade = grade; ml->ml_flags |= MAC_LOMAC_FLAG_SINGLE; } static void lomac_copy_range(struct mac_lomac *labelfrom, struct mac_lomac *labelto) { KASSERT((labelfrom->ml_flags & MAC_LOMAC_FLAG_RANGE) != 0, ("lomac_copy_range: labelfrom not range")); labelto->ml_rangelow = labelfrom->ml_rangelow; labelto->ml_rangehigh = labelfrom->ml_rangehigh; labelto->ml_flags |= MAC_LOMAC_FLAG_RANGE; } static void lomac_copy_single(struct mac_lomac *labelfrom, struct mac_lomac *labelto) { KASSERT((labelfrom->ml_flags & MAC_LOMAC_FLAG_SINGLE) != 0, ("lomac_copy_single: labelfrom not single")); labelto->ml_single = labelfrom->ml_single; labelto->ml_flags |= MAC_LOMAC_FLAG_SINGLE; } static void lomac_copy_auxsingle(struct mac_lomac *labelfrom, struct mac_lomac *labelto) { KASSERT((labelfrom->ml_flags & MAC_LOMAC_FLAG_AUX) != 0, ("lomac_copy_auxsingle: labelfrom not auxsingle")); labelto->ml_auxsingle = labelfrom->ml_auxsingle; labelto->ml_flags |= MAC_LOMAC_FLAG_AUX; } static void lomac_copy(struct mac_lomac *source, struct mac_lomac *dest) { if (source->ml_flags & MAC_LOMAC_FLAG_SINGLE) lomac_copy_single(source, dest); if (source->ml_flags & MAC_LOMAC_FLAG_AUX) lomac_copy_auxsingle(source, dest); if (source->ml_flags & MAC_LOMAC_FLAG_RANGE) lomac_copy_range(source, dest); } static int lomac_to_string(struct sbuf *sb, struct mac_lomac *ml); static int maybe_demote(struct mac_lomac *subjlabel, struct mac_lomac *objlabel, const char *actionname, const char *objname, struct vnode *vp) { struct sbuf subjlabel_sb, subjtext_sb, objlabel_sb; char *subjlabeltext, *objlabeltext, *subjtext; struct mac_lomac cached_subjlabel; struct mac_lomac_proc *subj; struct vattr va; struct proc *p; pid_t pgid; subj = PSLOT(curthread->td_proc->p_label); p = curthread->td_proc; mtx_lock(&subj->mtx); if (subj->mac_lomac.ml_flags & MAC_LOMAC_FLAG_UPDATE) { /* * Check to see if the pending demotion would be more or less * severe than this one, and keep the more severe. This can * only happen for a multi-threaded application. */ if (lomac_dominate_single(objlabel, &subj->mac_lomac)) { mtx_unlock(&subj->mtx); return (0); } } bzero(&subj->mac_lomac, sizeof(subj->mac_lomac)); /* * Always demote the single label. */ lomac_copy_single(objlabel, &subj->mac_lomac); /* * Start with the original range, then minimize each side of the * range to the point of not dominating the object. The high side * will always be demoted, of course. */ lomac_copy_range(subjlabel, &subj->mac_lomac); if (!lomac_dominate_element(&objlabel->ml_single, &subj->mac_lomac.ml_rangelow)) subj->mac_lomac.ml_rangelow = objlabel->ml_single; subj->mac_lomac.ml_rangehigh = objlabel->ml_single; subj->mac_lomac.ml_flags |= MAC_LOMAC_FLAG_UPDATE; thread_lock(curthread); curthread->td_flags |= TDF_ASTPENDING | TDF_MACPEND; thread_unlock(curthread); /* * Avoid memory allocation while holding a mutex; cache the label. */ lomac_copy_single(&subj->mac_lomac, &cached_subjlabel); mtx_unlock(&subj->mtx); sbuf_new(&subjlabel_sb, NULL, 0, SBUF_AUTOEXTEND); lomac_to_string(&subjlabel_sb, subjlabel); sbuf_finish(&subjlabel_sb); subjlabeltext = sbuf_data(&subjlabel_sb); sbuf_new(&subjtext_sb, NULL, 0, SBUF_AUTOEXTEND); lomac_to_string(&subjtext_sb, &subj->mac_lomac); sbuf_finish(&subjtext_sb); subjtext = sbuf_data(&subjtext_sb); sbuf_new(&objlabel_sb, NULL, 0, SBUF_AUTOEXTEND); lomac_to_string(&objlabel_sb, objlabel); sbuf_finish(&objlabel_sb); objlabeltext = sbuf_data(&objlabel_sb); pgid = p->p_pgrp->pg_id; /* XXX could be stale? */ if (vp != NULL && VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { log(LOG_INFO, "LOMAC: level-%s subject p%dg%du%d:%s demoted to" " level %s after %s a level-%s %s (inode=%ld, " "mountpount=%s)\n", subjlabeltext, p->p_pid, pgid, curthread->td_ucred->cr_uid, p->p_comm, subjtext, actionname, objlabeltext, objname, va.va_fileid, vp->v_mount->mnt_stat.f_mntonname); } else { log(LOG_INFO, "LOMAC: level-%s subject p%dg%du%d:%s demoted to" " level %s after %s a level-%s %s\n", subjlabeltext, p->p_pid, pgid, curthread->td_ucred->cr_uid, p->p_comm, subjtext, actionname, objlabeltext, objname); } sbuf_delete(&subjlabel_sb); sbuf_delete(&subjtext_sb); sbuf_delete(&objlabel_sb); return (0); } /* * Relabel "to" to "from" only if "from" is a valid label (contains at least * a single), as for a relabel operation which may or may not involve a * relevant label. */ static void try_relabel(struct mac_lomac *from, struct mac_lomac *to) { if (from->ml_flags & MAC_LOMAC_FLAG_SINGLE) { bzero(to, sizeof(*to)); lomac_copy(from, to); } } /* * Policy module operations. */ static void lomac_init(struct mac_policy_conf *conf) { } /* * Label operations. */ static void lomac_init_label(struct label *label) { SLOT_SET(label, lomac_alloc(M_WAITOK)); } static int lomac_init_label_waitcheck(struct label *label, int flag) { SLOT_SET(label, lomac_alloc(flag)); if (SLOT(label) == NULL) return (ENOMEM); return (0); } static void lomac_destroy_label(struct label *label) { lomac_free(SLOT(label)); SLOT_SET(label, NULL); } static int lomac_element_to_string(struct sbuf *sb, struct mac_lomac_element *element) { switch (element->mle_type) { case MAC_LOMAC_TYPE_HIGH: return (sbuf_printf(sb, "high")); case MAC_LOMAC_TYPE_LOW: return (sbuf_printf(sb, "low")); case MAC_LOMAC_TYPE_EQUAL: return (sbuf_printf(sb, "equal")); case MAC_LOMAC_TYPE_GRADE: return (sbuf_printf(sb, "%d", element->mle_grade)); default: panic("lomac_element_to_string: invalid type (%d)", element->mle_type); } } static int lomac_to_string(struct sbuf *sb, struct mac_lomac *ml) { if (ml->ml_flags & MAC_LOMAC_FLAG_SINGLE) { if (lomac_element_to_string(sb, &ml->ml_single) == -1) return (EINVAL); } if (ml->ml_flags & MAC_LOMAC_FLAG_AUX) { if (sbuf_putc(sb, '[') == -1) return (EINVAL); if (lomac_element_to_string(sb, &ml->ml_auxsingle) == -1) return (EINVAL); if (sbuf_putc(sb, ']') == -1) return (EINVAL); } if (ml->ml_flags & MAC_LOMAC_FLAG_RANGE) { if (sbuf_putc(sb, '(') == -1) return (EINVAL); if (lomac_element_to_string(sb, &ml->ml_rangelow) == -1) return (EINVAL); if (sbuf_putc(sb, '-') == -1) return (EINVAL); if (lomac_element_to_string(sb, &ml->ml_rangehigh) == -1) return (EINVAL); if (sbuf_putc(sb, ')') == -1) return (EINVAL); } return (0); } static int lomac_externalize_label(struct label *label, char *element_name, struct sbuf *sb, int *claimed) { struct mac_lomac *ml; if (strcmp(MAC_LOMAC_LABEL_NAME, element_name) != 0) return (0); (*claimed)++; ml = SLOT(label); return (lomac_to_string(sb, ml)); } static int lomac_parse_element(struct mac_lomac_element *element, char *string) { if (strcmp(string, "high") == 0 || strcmp(string, "hi") == 0) { element->mle_type = MAC_LOMAC_TYPE_HIGH; element->mle_grade = MAC_LOMAC_TYPE_UNDEF; } else if (strcmp(string, "low") == 0 || strcmp(string, "lo") == 0) { element->mle_type = MAC_LOMAC_TYPE_LOW; element->mle_grade = MAC_LOMAC_TYPE_UNDEF; } else if (strcmp(string, "equal") == 0 || strcmp(string, "eq") == 0) { element->mle_type = MAC_LOMAC_TYPE_EQUAL; element->mle_grade = MAC_LOMAC_TYPE_UNDEF; } else { char *p0, *p1; int d; p0 = string; d = strtol(p0, &p1, 10); if (d < 0 || d > 65535) return (EINVAL); element->mle_type = MAC_LOMAC_TYPE_GRADE; element->mle_grade = d; if (p1 == p0 || *p1 != '\0') return (EINVAL); } return (0); } /* * Note: destructively consumes the string, make a local copy before calling * if that's a problem. */ static int lomac_parse(struct mac_lomac *ml, char *string) { char *range, *rangeend, *rangehigh, *rangelow, *single, *auxsingle, *auxsingleend; int error; /* Do we have a range? */ single = string; range = strchr(string, '('); if (range == single) single = NULL; auxsingle = strchr(string, '['); if (auxsingle == single) single = NULL; if (range != NULL && auxsingle != NULL) return (EINVAL); rangelow = rangehigh = NULL; if (range != NULL) { /* Nul terminate the end of the single string. */ *range = '\0'; range++; rangelow = range; rangehigh = strchr(rangelow, '-'); if (rangehigh == NULL) return (EINVAL); rangehigh++; if (*rangelow == '\0' || *rangehigh == '\0') return (EINVAL); rangeend = strchr(rangehigh, ')'); if (rangeend == NULL) return (EINVAL); if (*(rangeend + 1) != '\0') return (EINVAL); /* Nul terminate the ends of the ranges. */ *(rangehigh - 1) = '\0'; *rangeend = '\0'; } KASSERT((rangelow != NULL && rangehigh != NULL) || (rangelow == NULL && rangehigh == NULL), ("lomac_internalize_label: range mismatch")); if (auxsingle != NULL) { /* Nul terminate the end of the single string. */ *auxsingle = '\0'; auxsingle++; auxsingleend = strchr(auxsingle, ']'); if (auxsingleend == NULL) return (EINVAL); if (*(auxsingleend + 1) != '\0') return (EINVAL); /* Nul terminate the end of the auxsingle. */ *auxsingleend = '\0'; } bzero(ml, sizeof(*ml)); if (single != NULL) { error = lomac_parse_element(&ml->ml_single, single); if (error) return (error); ml->ml_flags |= MAC_LOMAC_FLAG_SINGLE; } if (auxsingle != NULL) { error = lomac_parse_element(&ml->ml_auxsingle, auxsingle); if (error) return (error); ml->ml_flags |= MAC_LOMAC_FLAG_AUX; } if (rangelow != NULL) { error = lomac_parse_element(&ml->ml_rangelow, rangelow); if (error) return (error); error = lomac_parse_element(&ml->ml_rangehigh, rangehigh); if (error) return (error); ml->ml_flags |= MAC_LOMAC_FLAG_RANGE; } error = lomac_valid(ml); if (error) return (error); return (0); } static int lomac_internalize_label(struct label *label, char *element_name, char *element_data, int *claimed) { struct mac_lomac *ml, ml_temp; int error; if (strcmp(MAC_LOMAC_LABEL_NAME, element_name) != 0) return (0); (*claimed)++; error = lomac_parse(&ml_temp, element_data); if (error) return (error); ml = SLOT(label); *ml = ml_temp; return (0); } static void lomac_copy_label(struct label *src, struct label *dest) { *SLOT(dest) = *SLOT(src); } /* * Object-specific entry point implementations are sorted alphabetically by * object type name and then by operation. */ static int lomac_bpfdesc_check_receive(struct bpf_d *d, struct label *dlabel, struct ifnet *ifp, struct label *ifplabel) { struct mac_lomac *a, *b; if (!lomac_enabled) return (0); a = SLOT(dlabel); b = SLOT(ifplabel); if (lomac_equal_single(a, b)) return (0); return (EACCES); } static void lomac_bpfdesc_create(struct ucred *cred, struct bpf_d *d, struct label *dlabel) { struct mac_lomac *source, *dest; source = SLOT(cred->cr_label); dest = SLOT(dlabel); lomac_copy_single(source, dest); } static void lomac_bpfdesc_create_mbuf(struct bpf_d *d, struct label *dlabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *source, *dest; source = SLOT(dlabel); dest = SLOT(mlabel); lomac_copy_single(source, dest); } static int lomac_cred_check_relabel(struct ucred *cred, struct label *newlabel) { struct mac_lomac *subj, *new; int error; subj = SLOT(cred->cr_label); new = SLOT(newlabel); /* * If there is a LOMAC label update for the credential, it may be an * update of the single, range, or both. */ error = lomac_atmostflags(new, MAC_LOMAC_FLAGS_BOTH); if (error) return (error); /* * If the LOMAC label is to be changed, authorize as appropriate. */ if (new->ml_flags & MAC_LOMAC_FLAGS_BOTH) { /* * Fill in the missing parts from the previous label. */ if ((new->ml_flags & MAC_LOMAC_FLAG_SINGLE) == 0) lomac_copy_single(subj, new); if ((new->ml_flags & MAC_LOMAC_FLAG_RANGE) == 0) lomac_copy_range(subj, new); /* * To change the LOMAC range on a credential, the new range * label must be in the current range. */ if (!lomac_range_in_range(new, subj)) return (EPERM); /* * To change the LOMAC single label on a credential, the new * single label must be in the new range. Implicitly from * the previous check, the new single is in the old range. */ if (!lomac_single_in_range(new, new)) return (EPERM); /* * To have EQUAL in any component of the new credential LOMAC * label, the subject must already have EQUAL in their label. */ if (lomac_contains_equal(new)) { error = lomac_subject_privileged(subj); if (error) return (error); } /* * XXXMAC: Additional consistency tests regarding the single * and range of the new label might be performed here. */ } return (0); } static int lomac_cred_check_visible(struct ucred *cr1, struct ucred *cr2) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cr1->cr_label); obj = SLOT(cr2->cr_label); /* XXX: range */ if (!lomac_dominate_single(obj, subj)) return (ESRCH); return (0); } static void lomac_cred_create_init(struct ucred *cred) { struct mac_lomac *dest; dest = SLOT(cred->cr_label); lomac_set_single(dest, MAC_LOMAC_TYPE_HIGH, 0); lomac_set_range(dest, MAC_LOMAC_TYPE_LOW, 0, MAC_LOMAC_TYPE_HIGH, 0); } static void lomac_cred_create_swapper(struct ucred *cred) { struct mac_lomac *dest; dest = SLOT(cred->cr_label); lomac_set_single(dest, MAC_LOMAC_TYPE_EQUAL, 0); lomac_set_range(dest, MAC_LOMAC_TYPE_LOW, 0, MAC_LOMAC_TYPE_HIGH, 0); } static void lomac_cred_relabel(struct ucred *cred, struct label *newlabel) { struct mac_lomac *source, *dest; source = SLOT(newlabel); dest = SLOT(cred->cr_label); try_relabel(source, dest); } static void lomac_devfs_create_device(struct ucred *cred, struct mount *mp, struct cdev *dev, struct devfs_dirent *de, struct label *delabel) { struct mac_lomac *ml; const char *dn; int lomac_type; ml = SLOT(delabel); dn = devtoname(dev); if (strcmp(dn, "null") == 0 || strcmp(dn, "zero") == 0 || strcmp(dn, "random") == 0 || strncmp(dn, "fd/", strlen("fd/")) == 0 || strncmp(dn, "ttyv", strlen("ttyv")) == 0) lomac_type = MAC_LOMAC_TYPE_EQUAL; else if (ptys_equal && (strncmp(dn, "ttyp", strlen("ttyp")) == 0 || strncmp(dn, "pts/", strlen("pts/")) == 0 || strncmp(dn, "ptyp", strlen("ptyp")) == 0)) lomac_type = MAC_LOMAC_TYPE_EQUAL; else lomac_type = MAC_LOMAC_TYPE_HIGH; lomac_set_single(ml, lomac_type, 0); } static void lomac_devfs_create_directory(struct mount *mp, char *dirname, int dirnamelen, struct devfs_dirent *de, struct label *delabel) { struct mac_lomac *ml; ml = SLOT(delabel); lomac_set_single(ml, MAC_LOMAC_TYPE_HIGH, 0); } static void lomac_devfs_create_symlink(struct ucred *cred, struct mount *mp, struct devfs_dirent *dd, struct label *ddlabel, struct devfs_dirent *de, struct label *delabel) { struct mac_lomac *source, *dest; source = SLOT(cred->cr_label); dest = SLOT(delabel); lomac_copy_single(source, dest); } static void lomac_devfs_update(struct mount *mp, struct devfs_dirent *de, struct label *delabel, struct vnode *vp, struct label *vplabel) { struct mac_lomac *source, *dest; source = SLOT(vplabel); dest = SLOT(delabel); lomac_copy(source, dest); } static void lomac_devfs_vnode_associate(struct mount *mp, struct label *mplabel, struct devfs_dirent *de, struct label *delabel, struct vnode *vp, struct label *vplabel) { struct mac_lomac *source, *dest; source = SLOT(delabel); dest = SLOT(vplabel); lomac_copy_single(source, dest); } static int lomac_ifnet_check_relabel(struct ucred *cred, struct ifnet *ifp, struct label *ifplabel, struct label *newlabel) { struct mac_lomac *subj, *new; int error; subj = SLOT(cred->cr_label); new = SLOT(newlabel); /* * If there is a LOMAC label update for the interface, it may be an * update of the single, range, or both. */ error = lomac_atmostflags(new, MAC_LOMAC_FLAGS_BOTH); if (error) return (error); /* * Relabling network interfaces requires LOMAC privilege. */ error = lomac_subject_privileged(subj); if (error) return (error); /* * If the LOMAC label is to be changed, authorize as appropriate. */ if (new->ml_flags & MAC_LOMAC_FLAGS_BOTH) { /* * Fill in the missing parts from the previous label. */ if ((new->ml_flags & MAC_LOMAC_FLAG_SINGLE) == 0) lomac_copy_single(subj, new); if ((new->ml_flags & MAC_LOMAC_FLAG_RANGE) == 0) lomac_copy_range(subj, new); /* * Rely on the traditional superuser status for the LOMAC * interface relabel requirements. XXXMAC: This will go * away. * * XXXRW: This is also redundant to a higher layer check. */ error = priv_check_cred(cred, PRIV_NET_SETIFMAC, 0); if (error) return (EPERM); /* * XXXMAC: Additional consistency tests regarding the single * and the range of the new label might be performed here. */ } return (0); } static int lomac_ifnet_check_transmit(struct ifnet *ifp, struct label *ifplabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *p, *i; if (!lomac_enabled) return (0); p = SLOT(mlabel); i = SLOT(ifplabel); return (lomac_single_in_range(p, i) ? 0 : EACCES); } static void lomac_ifnet_create(struct ifnet *ifp, struct label *ifplabel) { char tifname[IFNAMSIZ], *p, *q; char tiflist[sizeof(trusted_interfaces)]; struct mac_lomac *dest; int len, grade; dest = SLOT(ifplabel); if (ifp->if_type == IFT_LOOP) { grade = MAC_LOMAC_TYPE_EQUAL; goto set; } if (trust_all_interfaces) { grade = MAC_LOMAC_TYPE_HIGH; goto set; } grade = MAC_LOMAC_TYPE_LOW; if (trusted_interfaces[0] == '\0' || !strvalid(trusted_interfaces, sizeof(trusted_interfaces))) goto set; bzero(tiflist, sizeof(tiflist)); for (p = trusted_interfaces, q = tiflist; *p != '\0'; p++, q++) if(*p != ' ' && *p != '\t') *q = *p; for (p = q = tiflist;; p++) { if (*p == ',' || *p == '\0') { len = p - q; if (len < IFNAMSIZ) { bzero(tifname, sizeof(tifname)); bcopy(q, tifname, len); if (strcmp(tifname, ifp->if_xname) == 0) { grade = MAC_LOMAC_TYPE_HIGH; break; } } else { *p = '\0'; printf("MAC/LOMAC warning: interface name " "\"%s\" is too long (must be < %d)\n", q, IFNAMSIZ); } if (*p == '\0') break; q = p + 1; } } set: lomac_set_single(dest, grade, 0); lomac_set_range(dest, grade, 0, grade, 0); } static void lomac_ifnet_create_mbuf(struct ifnet *ifp, struct label *ifplabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *source, *dest; source = SLOT(ifplabel); dest = SLOT(mlabel); lomac_copy_single(source, dest); } static void lomac_ifnet_relabel(struct ucred *cred, struct ifnet *ifp, struct label *ifplabel, struct label *newlabel) { struct mac_lomac *source, *dest; source = SLOT(newlabel); dest = SLOT(ifplabel); try_relabel(source, dest); } static int lomac_inpcb_check_deliver(struct inpcb *inp, struct label *inplabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *p, *i; if (!lomac_enabled) return (0); p = SLOT(mlabel); i = SLOT(inplabel); return (lomac_equal_single(p, i) ? 0 : EACCES); } static int lomac_inpcb_check_visible(struct ucred *cred, struct inpcb *inp, struct label *inplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(inplabel); if (!lomac_dominate_single(obj, subj)) return (ENOENT); return (0); } static void lomac_inpcb_create(struct socket *so, struct label *solabel, struct inpcb *inp, struct label *inplabel) { struct mac_lomac *source, *dest; source = SLOT(solabel); dest = SLOT(inplabel); lomac_copy_single(source, dest); } static void lomac_inpcb_create_mbuf(struct inpcb *inp, struct label *inplabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *source, *dest; source = SLOT(inplabel); dest = SLOT(mlabel); lomac_copy_single(source, dest); } static void lomac_inpcb_sosetlabel(struct socket *so, struct label *solabel, struct inpcb *inp, struct label *inplabel) { struct mac_lomac *source, *dest; SOCK_LOCK_ASSERT(so); source = SLOT(solabel); dest = SLOT(inplabel); lomac_copy_single(source, dest); } static void lomac_ip6q_create(struct mbuf *m, struct label *mlabel, struct ip6q *q6, struct label *q6label) { struct mac_lomac *source, *dest; source = SLOT(mlabel); dest = SLOT(q6label); lomac_copy_single(source, dest); } static int lomac_ip6q_match(struct mbuf *m, struct label *mlabel, struct ip6q *q6, struct label *q6label) { struct mac_lomac *a, *b; a = SLOT(q6label); b = SLOT(mlabel); return (lomac_equal_single(a, b)); } static void lomac_ip6q_reassemble(struct ip6q *q6, struct label *q6label, struct mbuf *m, struct label *mlabel) { struct mac_lomac *source, *dest; source = SLOT(q6label); dest = SLOT(mlabel); /* Just use the head, since we require them all to match. */ lomac_copy_single(source, dest); } static void lomac_ip6q_update(struct mbuf *m, struct label *mlabel, struct ip6q *q6, struct label *q6label) { /* NOOP: we only accept matching labels, so no need to update */ } static void lomac_ipq_create(struct mbuf *m, struct label *mlabel, struct ipq *q, struct label *qlabel) { struct mac_lomac *source, *dest; source = SLOT(mlabel); dest = SLOT(qlabel); lomac_copy_single(source, dest); } static int lomac_ipq_match(struct mbuf *m, struct label *mlabel, struct ipq *q, struct label *qlabel) { struct mac_lomac *a, *b; a = SLOT(qlabel); b = SLOT(mlabel); return (lomac_equal_single(a, b)); } static void lomac_ipq_reassemble(struct ipq *q, struct label *qlabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *source, *dest; source = SLOT(qlabel); dest = SLOT(mlabel); /* Just use the head, since we require them all to match. */ lomac_copy_single(source, dest); } static void lomac_ipq_update(struct mbuf *m, struct label *mlabel, struct ipq *q, struct label *qlabel) { /* NOOP: we only accept matching labels, so no need to update */ } static int lomac_kld_check_load(struct ucred *cred, struct vnode *vp, struct label *vplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (lomac_subject_privileged(subj)) return (EPERM); if (!lomac_high_single(obj)) return (EACCES); return (0); } static void lomac_mount_create(struct ucred *cred, struct mount *mp, struct label *mplabel) { struct mac_lomac *source, *dest; source = SLOT(cred->cr_label); dest = SLOT(mplabel); lomac_copy_single(source, dest); } static void lomac_netatalk_aarp_send(struct ifnet *ifp, struct label *ifplabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *dest; dest = SLOT(mlabel); lomac_set_single(dest, MAC_LOMAC_TYPE_EQUAL, 0); } static void lomac_netinet_arp_send(struct ifnet *ifp, struct label *ifplabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *dest; dest = SLOT(mlabel); lomac_set_single(dest, MAC_LOMAC_TYPE_EQUAL, 0); } static void lomac_netinet_firewall_reply(struct mbuf *mrecv, struct label *mrecvlabel, struct mbuf *msend, struct label *msendlabel) { struct mac_lomac *source, *dest; source = SLOT(mrecvlabel); dest = SLOT(msendlabel); lomac_copy_single(source, dest); } static void lomac_netinet_firewall_send(struct mbuf *m, struct label *mlabel) { struct mac_lomac *dest; dest = SLOT(mlabel); /* XXX: where is the label for the firewall really comming from? */ lomac_set_single(dest, MAC_LOMAC_TYPE_EQUAL, 0); } static void lomac_netinet_fragment(struct mbuf *m, struct label *mlabel, struct mbuf *frag, struct label *fraglabel) { struct mac_lomac *source, *dest; source = SLOT(mlabel); dest = SLOT(fraglabel); lomac_copy_single(source, dest); } static void lomac_netinet_icmp_reply(struct mbuf *mrecv, struct label *mrecvlabel, struct mbuf *msend, struct label *msendlabel) { struct mac_lomac *source, *dest; source = SLOT(mrecvlabel); dest = SLOT(msendlabel); lomac_copy_single(source, dest); } static void lomac_netinet_igmp_send(struct ifnet *ifp, struct label *ifplabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *dest; dest = SLOT(mlabel); lomac_set_single(dest, MAC_LOMAC_TYPE_EQUAL, 0); } static void lomac_netinet6_nd6_send(struct ifnet *ifp, struct label *ifplabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *dest; dest = SLOT(mlabel); lomac_set_single(dest, MAC_LOMAC_TYPE_EQUAL, 0); } static int lomac_pipe_check_ioctl(struct ucred *cred, struct pipepair *pp, struct label *pplabel, unsigned long cmd, void /* caddr_t */ *data) { if (!lomac_enabled) return (0); /* XXX: This will be implemented soon... */ return (0); } static int lomac_pipe_check_read(struct ucred *cred, struct pipepair *pp, struct label *pplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(pplabel); if (!lomac_dominate_single(obj, subj)) return (maybe_demote(subj, obj, "reading", "pipe", NULL)); return (0); } static int lomac_pipe_check_relabel(struct ucred *cred, struct pipepair *pp, struct label *pplabel, struct label *newlabel) { struct mac_lomac *subj, *obj, *new; int error; new = SLOT(newlabel); subj = SLOT(cred->cr_label); obj = SLOT(pplabel); /* * If there is a LOMAC label update for a pipe, it must be a single * update. */ error = lomac_atmostflags(new, MAC_LOMAC_FLAG_SINGLE); if (error) return (error); /* * To perform a relabel of a pipe (LOMAC label or not), LOMAC must * authorize the relabel. */ if (!lomac_single_in_range(obj, subj)) return (EPERM); /* * If the LOMAC label is to be changed, authorize as appropriate. */ if (new->ml_flags & MAC_LOMAC_FLAG_SINGLE) { /* * To change the LOMAC label on a pipe, the new pipe label * must be in the subject range. */ if (!lomac_single_in_range(new, subj)) return (EPERM); /* * To change the LOMAC label on a pipe to be EQUAL, the * subject must have appropriate privilege. */ if (lomac_contains_equal(new)) { error = lomac_subject_privileged(subj); if (error) return (error); } } return (0); } static int lomac_pipe_check_write(struct ucred *cred, struct pipepair *pp, struct label *pplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(pplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static void lomac_pipe_create(struct ucred *cred, struct pipepair *pp, struct label *pplabel) { struct mac_lomac *source, *dest; source = SLOT(cred->cr_label); dest = SLOT(pplabel); lomac_copy_single(source, dest); } static void lomac_pipe_relabel(struct ucred *cred, struct pipepair *pp, struct label *pplabel, struct label *newlabel) { struct mac_lomac *source, *dest; source = SLOT(newlabel); dest = SLOT(pplabel); try_relabel(source, dest); } /* * Some system privileges are allowed regardless of integrity grade; others * are allowed only when running with privilege with respect to the LOMAC * policy as they might otherwise allow bypassing of the integrity policy. */ static int lomac_priv_check(struct ucred *cred, int priv) { struct mac_lomac *subj; int error; if (!lomac_enabled) return (0); /* * Exempt only specific privileges from the LOMAC integrity policy. */ switch (priv) { case PRIV_KTRACE: case PRIV_MSGBUF: /* * Allow processes to manipulate basic process audit properties, and * to submit audit records. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: /* * Allow processes to manipulate their regular UNIX credentials. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Allow processes to perform system monitoring. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: break; /* * Allow access to general process debugging facilities. We * separately control debugging based on MAC label. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow manipulating jails. */ case PRIV_JAIL_ATTACH: /* * Allow privilege with respect to the Partition policy, but not the * Privs policy. */ case PRIV_MAC_PARTITION: /* * Allow privilege with respect to process resource limits and login * context. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * Allow System V and POSIX IPC privileges. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Allow certain scheduler manipulations -- possibly this should be * controlled by more fine-grained policy, as potentially low * integrity processes can deny CPU to higher integrity ones. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_SETPRIORITY: case PRIV_SCHED_RTPRIO: case PRIV_SCHED_SETPOLICY: case PRIV_SCHED_SET: case PRIV_SCHED_SETPARAM: /* * More IPC privileges. */ case PRIV_SEM_WRITE: /* * Allow signaling privileges subject to integrity policy. */ case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow access to only limited sysctls from lower integrity levels; * piggy-back on the Jail definition. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow TTY-based privileges, subject to general device access using * labels on TTY device nodes, but not console privilege. */ case PRIV_TTY_DRAINWAIT: case PRIV_TTY_DTRWAIT: case PRIV_TTY_EXCLUSIVE: case PRIV_TTY_STI: case PRIV_TTY_SETA: /* * Grant most VFS privileges, as almost all are in practice bounded * by more specific checks using labels. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_EXCEEDQUOTA: case PRIV_VFS_FCHROOT: case PRIV_VFS_FHOPEN: case PRIV_VFS_FHSTATFS: case PRIV_VFS_GENERATION: case PRIV_VFS_GETFH: case PRIV_VFS_GETQUOTA: case PRIV_VFS_LINK: case PRIV_VFS_MOUNT: case PRIV_VFS_MOUNT_OWNER: case PRIV_VFS_MOUNT_PERM: case PRIV_VFS_MOUNT_SUIDDIR: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_SETGID: case PRIV_VFS_STICKYFILE: case PRIV_VFS_SYSFLAGS: case PRIV_VFS_UNMOUNT: /* * Allow VM privileges; it would be nice if these were subject to * resource limits. */ case PRIV_VM_MADV_PROTECT: case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: case PRIV_VM_SWAP_NOQUOTA: case PRIV_VM_SWAP_NORLIMIT: /* * Allow some but not all network privileges. In general, dont allow * reconfiguring the network stack, just normal use. */ case PRIV_NETATALK_RESERVEDPORT: case PRIV_NETINET_RESERVEDPORT: case PRIV_NETINET_RAW: case PRIV_NETINET_REUSEPORT: case PRIV_NETIPX_RESERVEDPORT: case PRIV_NETIPX_RAW: break; /* * All remaining system privileges are allow only if the process * holds privilege with respect to the LOMAC policy. */ default: subj = SLOT(cred->cr_label); error = lomac_subject_privileged(subj); if (error) return (error); } return (0); } static int lomac_proc_check_debug(struct ucred *cred, struct proc *p) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(p->p_ucred->cr_label); /* XXX: range checks */ if (!lomac_dominate_single(obj, subj)) return (ESRCH); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_proc_check_sched(struct ucred *cred, struct proc *p) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(p->p_ucred->cr_label); /* XXX: range checks */ if (!lomac_dominate_single(obj, subj)) return (ESRCH); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_proc_check_signal(struct ucred *cred, struct proc *p, int signum) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(p->p_ucred->cr_label); /* XXX: range checks */ if (!lomac_dominate_single(obj, subj)) return (ESRCH); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static void lomac_proc_destroy_label(struct label *label) { mtx_destroy(&PSLOT(label)->mtx); free(PSLOT(label), M_LOMAC); PSLOT_SET(label, NULL); } static void lomac_proc_init_label(struct label *label) { PSLOT_SET(label, malloc(sizeof(struct mac_lomac_proc), M_LOMAC, M_ZERO | M_WAITOK)); mtx_init(&PSLOT(label)->mtx, "MAC/Lomac proc lock", NULL, MTX_DEF); } static int lomac_socket_check_deliver(struct socket *so, struct label *solabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *p, *s; int error; if (!lomac_enabled) return (0); p = SLOT(mlabel); s = SLOT(solabel); SOCK_LOCK(so); error = lomac_equal_single(p, s) ? 0 : EACCES; SOCK_UNLOCK(so); return (error); } static int lomac_socket_check_relabel(struct ucred *cred, struct socket *so, struct label *solabel, struct label *newlabel) { struct mac_lomac *subj, *obj, *new; int error; SOCK_LOCK_ASSERT(so); new = SLOT(newlabel); subj = SLOT(cred->cr_label); obj = SLOT(solabel); /* * If there is a LOMAC label update for the socket, it may be an * update of single. */ error = lomac_atmostflags(new, MAC_LOMAC_FLAG_SINGLE); if (error) return (error); /* * To relabel a socket, the old socket single must be in the subject * range. */ if (!lomac_single_in_range(obj, subj)) return (EPERM); /* * If the LOMAC label is to be changed, authorize as appropriate. */ if (new->ml_flags & MAC_LOMAC_FLAG_SINGLE) { /* * To relabel a socket, the new socket single must be in the * subject range. */ if (!lomac_single_in_range(new, subj)) return (EPERM); /* * To change the LOMAC label on the socket to contain EQUAL, * the subject must have appropriate privilege. */ if (lomac_contains_equal(new)) { error = lomac_subject_privileged(subj); if (error) return (error); } } return (0); } static int lomac_socket_check_visible(struct ucred *cred, struct socket *so, struct label *solabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(solabel); SOCK_LOCK(so); if (!lomac_dominate_single(obj, subj)) { SOCK_UNLOCK(so); return (ENOENT); } SOCK_UNLOCK(so); return (0); } static void lomac_socket_create(struct ucred *cred, struct socket *so, struct label *solabel) { struct mac_lomac *source, *dest; source = SLOT(cred->cr_label); dest = SLOT(solabel); lomac_copy_single(source, dest); } static void lomac_socket_create_mbuf(struct socket *so, struct label *solabel, struct mbuf *m, struct label *mlabel) { struct mac_lomac *source, *dest; source = SLOT(solabel); dest = SLOT(mlabel); SOCK_LOCK(so); lomac_copy_single(source, dest); SOCK_UNLOCK(so); } static void lomac_socket_newconn(struct socket *oldso, struct label *oldsolabel, struct socket *newso, struct label *newsolabel) { struct mac_lomac source, *dest; SOCK_LOCK(oldso); source = *SLOT(oldsolabel); SOCK_UNLOCK(oldso); dest = SLOT(newsolabel); SOCK_LOCK(newso); lomac_copy_single(&source, dest); SOCK_UNLOCK(newso); } static void lomac_socket_relabel(struct ucred *cred, struct socket *so, struct label *solabel, struct label *newlabel) { struct mac_lomac *source, *dest; SOCK_LOCK_ASSERT(so); source = SLOT(newlabel); dest = SLOT(solabel); try_relabel(source, dest); } static void lomac_socketpeer_set_from_mbuf(struct mbuf *m, struct label *mlabel, struct socket *so, struct label *sopeerlabel) { struct mac_lomac *source, *dest; source = SLOT(mlabel); dest = SLOT(sopeerlabel); SOCK_LOCK(so); lomac_copy_single(source, dest); SOCK_UNLOCK(so); } static void lomac_socketpeer_set_from_socket(struct socket *oldso, struct label *oldsolabel, struct socket *newso, struct label *newsopeerlabel) { struct mac_lomac source, *dest; SOCK_LOCK(oldso); source = *SLOT(oldsolabel); SOCK_UNLOCK(oldso); dest = SLOT(newsopeerlabel); SOCK_LOCK(newso); lomac_copy_single(&source, dest); SOCK_UNLOCK(newso); } static void lomac_syncache_create(struct label *label, struct inpcb *inp) { struct mac_lomac *source, *dest; source = SLOT(inp->inp_label); dest = SLOT(label); lomac_copy(source, dest); } static void lomac_syncache_create_mbuf(struct label *sc_label, struct mbuf *m, struct label *mlabel) { struct mac_lomac *source, *dest; source = SLOT(sc_label); dest = SLOT(mlabel); lomac_copy(source, dest); } static int lomac_system_check_acct(struct ucred *cred, struct vnode *vp, struct label *vplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (lomac_subject_privileged(subj)) return (EPERM); if (!lomac_high_single(obj)) return (EACCES); return (0); } static int lomac_system_check_auditctl(struct ucred *cred, struct vnode *vp, struct label *vplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (lomac_subject_privileged(subj)) return (EPERM); if (!lomac_high_single(obj)) return (EACCES); return (0); } static int lomac_system_check_swapoff(struct ucred *cred, struct vnode *vp, struct label *vplabel) { struct mac_lomac *subj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); if (lomac_subject_privileged(subj)) return (EPERM); return (0); } static int lomac_system_check_swapon(struct ucred *cred, struct vnode *vp, struct label *vplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (lomac_subject_privileged(subj)) return (EPERM); if (!lomac_high_single(obj)) return (EACCES); return (0); } static int lomac_system_check_sysctl(struct ucred *cred, struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { struct mac_lomac *subj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); /* * Treat sysctl variables without CTLFLAG_ANYBODY flag as lomac/high, * but also require privilege to change them. */ if (req->newptr != NULL && (oidp->oid_kind & CTLFLAG_ANYBODY) == 0) { #ifdef notdef if (!lomac_subject_dominate_high(subj)) return (EACCES); #endif if (lomac_subject_privileged(subj)) return (EPERM); } return (0); } static void lomac_thread_userret(struct thread *td) { struct proc *p = td->td_proc; struct mac_lomac_proc *subj = PSLOT(p->p_label); struct ucred *newcred, *oldcred; int dodrop; mtx_lock(&subj->mtx); if (subj->mac_lomac.ml_flags & MAC_LOMAC_FLAG_UPDATE) { dodrop = 0; mtx_unlock(&subj->mtx); newcred = crget(); /* * Prevent a lock order reversal in mac_proc_vm_revoke; * ideally, the other user of subj->mtx wouldn't be holding * Giant. */ mtx_lock(&Giant); PROC_LOCK(p); mtx_lock(&subj->mtx); /* * Check if we lost the race while allocating the cred. */ if ((subj->mac_lomac.ml_flags & MAC_LOMAC_FLAG_UPDATE) == 0) { crfree(newcred); goto out; } oldcred = p->p_ucred; crcopy(newcred, oldcred); crhold(newcred); lomac_copy(&subj->mac_lomac, SLOT(newcred->cr_label)); - p->p_ucred = newcred; + proc_set_cred(p, newcred); crfree(oldcred); dodrop = 1; out: mtx_unlock(&subj->mtx); PROC_UNLOCK(p); if (dodrop) mac_proc_vm_revoke(curthread); mtx_unlock(&Giant); } else { mtx_unlock(&subj->mtx); } } static int lomac_vnode_associate_extattr(struct mount *mp, struct label *mplabel, struct vnode *vp, struct label *vplabel) { struct mac_lomac ml_temp, *source, *dest; int buflen, error; source = SLOT(mplabel); dest = SLOT(vplabel); buflen = sizeof(ml_temp); bzero(&ml_temp, buflen); error = vn_extattr_get(vp, IO_NODELOCKED, MAC_LOMAC_EXTATTR_NAMESPACE, MAC_LOMAC_EXTATTR_NAME, &buflen, (char *)&ml_temp, curthread); if (error == ENOATTR || error == EOPNOTSUPP) { /* Fall back to the mntlabel. */ lomac_copy_single(source, dest); return (0); } else if (error) return (error); if (buflen != sizeof(ml_temp)) { if (buflen != sizeof(ml_temp) - sizeof(ml_temp.ml_auxsingle)) { printf("lomac_vnode_associate_extattr: bad size %d\n", buflen); return (EPERM); } bzero(&ml_temp.ml_auxsingle, sizeof(ml_temp.ml_auxsingle)); buflen = sizeof(ml_temp); (void)vn_extattr_set(vp, IO_NODELOCKED, MAC_LOMAC_EXTATTR_NAMESPACE, MAC_LOMAC_EXTATTR_NAME, buflen, (char *)&ml_temp, curthread); } if (lomac_valid(&ml_temp) != 0) { printf("lomac_vnode_associate_extattr: invalid\n"); return (EPERM); } if ((ml_temp.ml_flags & MAC_LOMAC_FLAGS_BOTH) != MAC_LOMAC_FLAG_SINGLE) { printf("lomac_vnode_associate_extattr: not single\n"); return (EPERM); } lomac_copy_single(&ml_temp, dest); return (0); } static void lomac_vnode_associate_singlelabel(struct mount *mp, struct label *mplabel, struct vnode *vp, struct label *vplabel) { struct mac_lomac *source, *dest; source = SLOT(mplabel); dest = SLOT(vplabel); lomac_copy_single(source, dest); } static int lomac_vnode_check_create(struct ucred *cred, struct vnode *dvp, struct label *dvplabel, struct componentname *cnp, struct vattr *vap) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(dvplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); if (obj->ml_flags & MAC_LOMAC_FLAG_AUX && !lomac_dominate_element(&subj->ml_single, &obj->ml_auxsingle)) return (EACCES); return (0); } static int lomac_vnode_check_deleteacl(struct ucred *cred, struct vnode *vp, struct label *vplabel, acl_type_t type) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_link(struct ucred *cred, struct vnode *dvp, struct label *dvplabel, struct vnode *vp, struct label *vplabel, struct componentname *cnp) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(dvplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_mmap(struct ucred *cred, struct vnode *vp, struct label *vplabel, int prot, int flags) { struct mac_lomac *subj, *obj; /* * Rely on the use of open()-time protections to handle * non-revocation cases. */ if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (((prot & VM_PROT_WRITE) != 0) && ((flags & MAP_SHARED) != 0)) { if (!lomac_subject_dominate(subj, obj)) return (EACCES); } if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { if (!lomac_dominate_single(obj, subj)) return (maybe_demote(subj, obj, "mapping", "file", vp)); } return (0); } static void lomac_vnode_check_mmap_downgrade(struct ucred *cred, struct vnode *vp, struct label *vplabel, /* XXX vm_prot_t */ int *prot) { struct mac_lomac *subj, *obj; /* * Rely on the use of open()-time protections to handle * non-revocation cases. */ if (!lomac_enabled || !revocation_enabled) return; subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) *prot &= ~VM_PROT_WRITE; } static int lomac_vnode_check_open(struct ucred *cred, struct vnode *vp, struct label *vplabel, accmode_t accmode) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); /* XXX privilege override for admin? */ if (accmode & VMODIFY_PERMS) { if (!lomac_subject_dominate(subj, obj)) return (EACCES); } return (0); } static int lomac_vnode_check_read(struct ucred *active_cred, struct ucred *file_cred, struct vnode *vp, struct label *vplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled || !revocation_enabled) return (0); subj = SLOT(active_cred->cr_label); obj = SLOT(vplabel); if (!lomac_dominate_single(obj, subj)) return (maybe_demote(subj, obj, "reading", "file", vp)); return (0); } static int lomac_vnode_check_relabel(struct ucred *cred, struct vnode *vp, struct label *vplabel, struct label *newlabel) { struct mac_lomac *old, *new, *subj; int error; old = SLOT(vplabel); new = SLOT(newlabel); subj = SLOT(cred->cr_label); /* * If there is a LOMAC label update for the vnode, it must be a * single label, with an optional explicit auxiliary single. */ error = lomac_atmostflags(new, MAC_LOMAC_FLAG_SINGLE | MAC_LOMAC_FLAG_AUX); if (error) return (error); /* * To perform a relabel of the vnode (LOMAC label or not), LOMAC must * authorize the relabel. */ if (!lomac_single_in_range(old, subj)) return (EPERM); /* * If the LOMAC label is to be changed, authorize as appropriate. */ if (new->ml_flags & MAC_LOMAC_FLAG_SINGLE) { /* * To change the LOMAC label on a vnode, the new vnode label * must be in the subject range. */ if (!lomac_single_in_range(new, subj)) return (EPERM); /* * To change the LOMAC label on the vnode to be EQUAL, the * subject must have appropriate privilege. */ if (lomac_contains_equal(new)) { error = lomac_subject_privileged(subj); if (error) return (error); } } if (new->ml_flags & MAC_LOMAC_FLAG_AUX) { /* * Fill in the missing parts from the previous label. */ if ((new->ml_flags & MAC_LOMAC_FLAG_SINGLE) == 0) lomac_copy_single(subj, new); /* * To change the auxiliary LOMAC label on a vnode, the new * vnode label must be in the subject range. */ if (!lomac_auxsingle_in_range(new, subj)) return (EPERM); /* * To change the auxiliary LOMAC label on the vnode to be * EQUAL, the subject must have appropriate privilege. */ if (lomac_contains_equal(new)) { error = lomac_subject_privileged(subj); if (error) return (error); } } return (0); } static int lomac_vnode_check_rename_from(struct ucred *cred, struct vnode *dvp, struct label *dvplabel, struct vnode *vp, struct label *vplabel, struct componentname *cnp) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(dvplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_rename_to(struct ucred *cred, struct vnode *dvp, struct label *dvplabel, struct vnode *vp, struct label *vplabel, int samedir, struct componentname *cnp) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(dvplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); if (vp != NULL) { obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); } return (0); } static int lomac_vnode_check_revoke(struct ucred *cred, struct vnode *vp, struct label *vplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_setacl(struct ucred *cred, struct vnode *vp, struct label *vplabel, acl_type_t type, struct acl *acl) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_setextattr(struct ucred *cred, struct vnode *vp, struct label *vplabel, int attrnamespace, const char *name) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); /* XXX: protect the MAC EA in a special way? */ return (0); } static int lomac_vnode_check_setflags(struct ucred *cred, struct vnode *vp, struct label *vplabel, u_long flags) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_setmode(struct ucred *cred, struct vnode *vp, struct label *vplabel, mode_t mode) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_setowner(struct ucred *cred, struct vnode *vp, struct label *vplabel, uid_t uid, gid_t gid) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_setutimes(struct ucred *cred, struct vnode *vp, struct label *vplabel, struct timespec atime, struct timespec mtime) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_unlink(struct ucred *cred, struct vnode *dvp, struct label *dvplabel, struct vnode *vp, struct label *vplabel, struct componentname *cnp) { struct mac_lomac *subj, *obj; if (!lomac_enabled) return (0); subj = SLOT(cred->cr_label); obj = SLOT(dvplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_check_write(struct ucred *active_cred, struct ucred *file_cred, struct vnode *vp, struct label *vplabel) { struct mac_lomac *subj, *obj; if (!lomac_enabled || !revocation_enabled) return (0); subj = SLOT(active_cred->cr_label); obj = SLOT(vplabel); if (!lomac_subject_dominate(subj, obj)) return (EACCES); return (0); } static int lomac_vnode_create_extattr(struct ucred *cred, struct mount *mp, struct label *mplabel, struct vnode *dvp, struct label *dvplabel, struct vnode *vp, struct label *vplabel, struct componentname *cnp) { struct mac_lomac *source, *dest, *dir, temp; size_t buflen; int error; buflen = sizeof(temp); bzero(&temp, buflen); source = SLOT(cred->cr_label); dest = SLOT(vplabel); dir = SLOT(dvplabel); if (dir->ml_flags & MAC_LOMAC_FLAG_AUX) { lomac_copy_auxsingle(dir, &temp); lomac_set_single(&temp, dir->ml_auxsingle.mle_type, dir->ml_auxsingle.mle_grade); } else { lomac_copy_single(source, &temp); } error = vn_extattr_set(vp, IO_NODELOCKED, MAC_LOMAC_EXTATTR_NAMESPACE, MAC_LOMAC_EXTATTR_NAME, buflen, (char *)&temp, curthread); if (error == 0) lomac_copy(&temp, dest); return (error); } static void lomac_vnode_execve_transition(struct ucred *old, struct ucred *new, struct vnode *vp, struct label *vplabel, struct label *interpvplabel, struct image_params *imgp, struct label *execlabel) { struct mac_lomac *source, *dest, *obj, *robj; source = SLOT(old->cr_label); dest = SLOT(new->cr_label); obj = SLOT(vplabel); robj = interpvplabel != NULL ? SLOT(interpvplabel) : obj; lomac_copy(source, dest); /* * If there's an auxiliary label on the real object, respect it and * assume that this level should be assumed immediately if a higher * level is currently in place. */ if (robj->ml_flags & MAC_LOMAC_FLAG_AUX && !lomac_dominate_element(&robj->ml_auxsingle, &dest->ml_single) && lomac_auxsingle_in_range(robj, dest)) lomac_set_single(dest, robj->ml_auxsingle.mle_type, robj->ml_auxsingle.mle_grade); /* * Restructuring to use the execve transitioning mechanism instead of * the normal demotion mechanism here would be difficult, so just * copy the label over and perform standard demotion. This is also * non-optimal because it will result in the intermediate label "new" * being created and immediately recycled. */ if (lomac_enabled && revocation_enabled && !lomac_dominate_single(obj, source)) (void)maybe_demote(source, obj, "executing", "file", vp); } static int lomac_vnode_execve_will_transition(struct ucred *old, struct vnode *vp, struct label *vplabel, struct label *interpvplabel, struct image_params *imgp, struct label *execlabel) { struct mac_lomac *subj, *obj, *robj; if (!lomac_enabled || !revocation_enabled) return (0); subj = SLOT(old->cr_label); obj = SLOT(vplabel); robj = interpvplabel != NULL ? SLOT(interpvplabel) : obj; return ((robj->ml_flags & MAC_LOMAC_FLAG_AUX && !lomac_dominate_element(&robj->ml_auxsingle, &subj->ml_single) && lomac_auxsingle_in_range(robj, subj)) || !lomac_dominate_single(obj, subj)); } static void lomac_vnode_relabel(struct ucred *cred, struct vnode *vp, struct label *vplabel, struct label *newlabel) { struct mac_lomac *source, *dest; source = SLOT(newlabel); dest = SLOT(vplabel); try_relabel(source, dest); } static int lomac_vnode_setlabel_extattr(struct ucred *cred, struct vnode *vp, struct label *vplabel, struct label *intlabel) { struct mac_lomac *source, temp; size_t buflen; int error; buflen = sizeof(temp); bzero(&temp, buflen); source = SLOT(intlabel); if ((source->ml_flags & MAC_LOMAC_FLAG_SINGLE) == 0) return (0); lomac_copy_single(source, &temp); error = vn_extattr_set(vp, IO_NODELOCKED, MAC_LOMAC_EXTATTR_NAMESPACE, MAC_LOMAC_EXTATTR_NAME, buflen, (char *)&temp, curthread); return (error); } static struct mac_policy_ops lomac_ops = { .mpo_init = lomac_init, .mpo_bpfdesc_check_receive = lomac_bpfdesc_check_receive, .mpo_bpfdesc_create = lomac_bpfdesc_create, .mpo_bpfdesc_create_mbuf = lomac_bpfdesc_create_mbuf, .mpo_bpfdesc_destroy_label = lomac_destroy_label, .mpo_bpfdesc_init_label = lomac_init_label, .mpo_cred_check_relabel = lomac_cred_check_relabel, .mpo_cred_check_visible = lomac_cred_check_visible, .mpo_cred_copy_label = lomac_copy_label, .mpo_cred_create_swapper = lomac_cred_create_swapper, .mpo_cred_create_init = lomac_cred_create_init, .mpo_cred_destroy_label = lomac_destroy_label, .mpo_cred_externalize_label = lomac_externalize_label, .mpo_cred_init_label = lomac_init_label, .mpo_cred_internalize_label = lomac_internalize_label, .mpo_cred_relabel = lomac_cred_relabel, .mpo_devfs_create_device = lomac_devfs_create_device, .mpo_devfs_create_directory = lomac_devfs_create_directory, .mpo_devfs_create_symlink = lomac_devfs_create_symlink, .mpo_devfs_destroy_label = lomac_destroy_label, .mpo_devfs_init_label = lomac_init_label, .mpo_devfs_update = lomac_devfs_update, .mpo_devfs_vnode_associate = lomac_devfs_vnode_associate, .mpo_ifnet_check_relabel = lomac_ifnet_check_relabel, .mpo_ifnet_check_transmit = lomac_ifnet_check_transmit, .mpo_ifnet_copy_label = lomac_copy_label, .mpo_ifnet_create = lomac_ifnet_create, .mpo_ifnet_create_mbuf = lomac_ifnet_create_mbuf, .mpo_ifnet_destroy_label = lomac_destroy_label, .mpo_ifnet_externalize_label = lomac_externalize_label, .mpo_ifnet_init_label = lomac_init_label, .mpo_ifnet_internalize_label = lomac_internalize_label, .mpo_ifnet_relabel = lomac_ifnet_relabel, .mpo_syncache_create = lomac_syncache_create, .mpo_syncache_destroy_label = lomac_destroy_label, .mpo_syncache_init_label = lomac_init_label_waitcheck, .mpo_inpcb_check_deliver = lomac_inpcb_check_deliver, .mpo_inpcb_check_visible = lomac_inpcb_check_visible, .mpo_inpcb_create = lomac_inpcb_create, .mpo_inpcb_create_mbuf = lomac_inpcb_create_mbuf, .mpo_inpcb_destroy_label = lomac_destroy_label, .mpo_inpcb_init_label = lomac_init_label_waitcheck, .mpo_inpcb_sosetlabel = lomac_inpcb_sosetlabel, .mpo_ip6q_create = lomac_ip6q_create, .mpo_ip6q_destroy_label = lomac_destroy_label, .mpo_ip6q_init_label = lomac_init_label_waitcheck, .mpo_ip6q_match = lomac_ip6q_match, .mpo_ip6q_reassemble = lomac_ip6q_reassemble, .mpo_ip6q_update = lomac_ip6q_update, .mpo_ipq_create = lomac_ipq_create, .mpo_ipq_destroy_label = lomac_destroy_label, .mpo_ipq_init_label = lomac_init_label_waitcheck, .mpo_ipq_match = lomac_ipq_match, .mpo_ipq_reassemble = lomac_ipq_reassemble, .mpo_ipq_update = lomac_ipq_update, .mpo_kld_check_load = lomac_kld_check_load, .mpo_mbuf_copy_label = lomac_copy_label, .mpo_mbuf_destroy_label = lomac_destroy_label, .mpo_mbuf_init_label = lomac_init_label_waitcheck, .mpo_mount_create = lomac_mount_create, .mpo_mount_destroy_label = lomac_destroy_label, .mpo_mount_init_label = lomac_init_label, .mpo_netatalk_aarp_send = lomac_netatalk_aarp_send, .mpo_netinet_arp_send = lomac_netinet_arp_send, .mpo_netinet_firewall_reply = lomac_netinet_firewall_reply, .mpo_netinet_firewall_send = lomac_netinet_firewall_send, .mpo_netinet_fragment = lomac_netinet_fragment, .mpo_netinet_icmp_reply = lomac_netinet_icmp_reply, .mpo_netinet_igmp_send = lomac_netinet_igmp_send, .mpo_netinet6_nd6_send = lomac_netinet6_nd6_send, .mpo_pipe_check_ioctl = lomac_pipe_check_ioctl, .mpo_pipe_check_read = lomac_pipe_check_read, .mpo_pipe_check_relabel = lomac_pipe_check_relabel, .mpo_pipe_check_write = lomac_pipe_check_write, .mpo_pipe_copy_label = lomac_copy_label, .mpo_pipe_create = lomac_pipe_create, .mpo_pipe_destroy_label = lomac_destroy_label, .mpo_pipe_externalize_label = lomac_externalize_label, .mpo_pipe_init_label = lomac_init_label, .mpo_pipe_internalize_label = lomac_internalize_label, .mpo_pipe_relabel = lomac_pipe_relabel, .mpo_priv_check = lomac_priv_check, .mpo_proc_check_debug = lomac_proc_check_debug, .mpo_proc_check_sched = lomac_proc_check_sched, .mpo_proc_check_signal = lomac_proc_check_signal, .mpo_proc_destroy_label = lomac_proc_destroy_label, .mpo_proc_init_label = lomac_proc_init_label, .mpo_socket_check_deliver = lomac_socket_check_deliver, .mpo_socket_check_relabel = lomac_socket_check_relabel, .mpo_socket_check_visible = lomac_socket_check_visible, .mpo_socket_copy_label = lomac_copy_label, .mpo_socket_create = lomac_socket_create, .mpo_socket_create_mbuf = lomac_socket_create_mbuf, .mpo_socket_destroy_label = lomac_destroy_label, .mpo_socket_externalize_label = lomac_externalize_label, .mpo_socket_init_label = lomac_init_label_waitcheck, .mpo_socket_internalize_label = lomac_internalize_label, .mpo_socket_newconn = lomac_socket_newconn, .mpo_socket_relabel = lomac_socket_relabel, .mpo_socketpeer_destroy_label = lomac_destroy_label, .mpo_socketpeer_externalize_label = lomac_externalize_label, .mpo_socketpeer_init_label = lomac_init_label_waitcheck, .mpo_socketpeer_set_from_mbuf = lomac_socketpeer_set_from_mbuf, .mpo_socketpeer_set_from_socket = lomac_socketpeer_set_from_socket, .mpo_syncache_create_mbuf = lomac_syncache_create_mbuf, .mpo_system_check_acct = lomac_system_check_acct, .mpo_system_check_auditctl = lomac_system_check_auditctl, .mpo_system_check_swapoff = lomac_system_check_swapoff, .mpo_system_check_swapon = lomac_system_check_swapon, .mpo_system_check_sysctl = lomac_system_check_sysctl, .mpo_thread_userret = lomac_thread_userret, .mpo_vnode_associate_extattr = lomac_vnode_associate_extattr, .mpo_vnode_associate_singlelabel = lomac_vnode_associate_singlelabel, .mpo_vnode_check_access = lomac_vnode_check_open, .mpo_vnode_check_create = lomac_vnode_check_create, .mpo_vnode_check_deleteacl = lomac_vnode_check_deleteacl, .mpo_vnode_check_link = lomac_vnode_check_link, .mpo_vnode_check_mmap = lomac_vnode_check_mmap, .mpo_vnode_check_mmap_downgrade = lomac_vnode_check_mmap_downgrade, .mpo_vnode_check_open = lomac_vnode_check_open, .mpo_vnode_check_read = lomac_vnode_check_read, .mpo_vnode_check_relabel = lomac_vnode_check_relabel, .mpo_vnode_check_rename_from = lomac_vnode_check_rename_from, .mpo_vnode_check_rename_to = lomac_vnode_check_rename_to, .mpo_vnode_check_revoke = lomac_vnode_check_revoke, .mpo_vnode_check_setacl = lomac_vnode_check_setacl, .mpo_vnode_check_setextattr = lomac_vnode_check_setextattr, .mpo_vnode_check_setflags = lomac_vnode_check_setflags, .mpo_vnode_check_setmode = lomac_vnode_check_setmode, .mpo_vnode_check_setowner = lomac_vnode_check_setowner, .mpo_vnode_check_setutimes = lomac_vnode_check_setutimes, .mpo_vnode_check_unlink = lomac_vnode_check_unlink, .mpo_vnode_check_write = lomac_vnode_check_write, .mpo_vnode_copy_label = lomac_copy_label, .mpo_vnode_create_extattr = lomac_vnode_create_extattr, .mpo_vnode_destroy_label = lomac_destroy_label, .mpo_vnode_execve_transition = lomac_vnode_execve_transition, .mpo_vnode_execve_will_transition = lomac_vnode_execve_will_transition, .mpo_vnode_externalize_label = lomac_externalize_label, .mpo_vnode_init_label = lomac_init_label, .mpo_vnode_internalize_label = lomac_internalize_label, .mpo_vnode_relabel = lomac_vnode_relabel, .mpo_vnode_setlabel_extattr = lomac_vnode_setlabel_extattr, }; MAC_POLICY_SET(&lomac_ops, mac_lomac, "TrustedBSD MAC/LOMAC", MPC_LOADTIME_FLAG_NOTLATE, &lomac_slot); Index: stable/10/sys/sys/ucred.h =================================================================== --- stable/10/sys/sys/ucred.h (revision 302228) +++ stable/10/sys/sys/ucred.h (revision 302229) @@ -1,118 +1,119 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ucred.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_UCRED_H_ #define _SYS_UCRED_H_ #include struct loginclass; /* * Credentials. * * Please do not inspect cr_uid directly to determine superuserness. The * priv(9) interface should be used to check for privilege. */ #if defined(_KERNEL) || defined(_WANT_UCRED) struct ucred { u_int cr_ref; /* reference count */ #define cr_startcopy cr_uid uid_t cr_uid; /* effective user id */ uid_t cr_ruid; /* real user id */ uid_t cr_svuid; /* saved user id */ int cr_ngroups; /* number of groups */ gid_t cr_rgid; /* real group id */ gid_t cr_svgid; /* saved group id */ struct uidinfo *cr_uidinfo; /* per euid resource consumption */ struct uidinfo *cr_ruidinfo; /* per ruid resource consumption */ struct prison *cr_prison; /* jail(2) */ struct loginclass *cr_loginclass; /* login class */ u_int cr_flags; /* credential flags */ void *cr_pspare2[2]; /* general use 2 */ #define cr_endcopy cr_label struct label *cr_label; /* MAC label */ struct auditinfo_addr cr_audit; /* Audit properties. */ gid_t *cr_groups; /* groups */ int cr_agroups; /* Available groups */ }; #define NOCRED ((struct ucred *)0) /* no credential available */ #define FSCRED ((struct ucred *)-1) /* filesystem credential */ #endif /* _KERNEL || _WANT_UCRED */ #define XU_NGROUPS 16 /* * Flags for cr_flags. */ #define CRED_FLAG_CAPMODE 0x00000001 /* In capability mode. */ /* * This is the external representation of struct ucred. */ struct xucred { u_int cr_version; /* structure layout version */ uid_t cr_uid; /* effective user id */ short cr_ngroups; /* number of groups */ gid_t cr_groups[XU_NGROUPS]; /* groups */ void *_cr_unused1; /* compatibility with old ucred */ }; #define XUCRED_VERSION 0 /* This can be used for both ucred and xucred structures. */ #define cr_gid cr_groups[0] #ifdef _KERNEL struct proc; struct thread; void change_egid(struct ucred *newcred, gid_t egid); void change_euid(struct ucred *newcred, struct uidinfo *euip); void change_rgid(struct ucred *newcred, gid_t rgid); void change_ruid(struct ucred *newcred, struct uidinfo *ruip); void change_svgid(struct ucred *newcred, gid_t svgid); void change_svuid(struct ucred *newcred, uid_t svuid); void crcopy(struct ucred *dest, struct ucred *src); struct ucred *crcopysafe(struct proc *p, struct ucred *cr); struct ucred *crdup(struct ucred *cr); void crextend(struct ucred *cr, int n); void cred_update_thread(struct thread *td); +struct ucred *proc_set_cred(struct proc *p, struct ucred *cr); void crfree(struct ucred *cr); struct ucred *crget(void); struct ucred *crhold(struct ucred *cr); int crshared(struct ucred *cr); void cru2x(struct ucred *cr, struct xucred *xcr); void crsetgroups(struct ucred *cr, int n, gid_t *groups); int groupmember(gid_t gid, struct ucred *cred); #endif /* _KERNEL */ #endif /* !_SYS_UCRED_H_ */ Index: stable/10 =================================================================== --- stable/10 (revision 302228) +++ stable/10 (revision 302229) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r280130