diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 26a994ef0c32..d1149dd4fb3b 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -1,5525 +1,5530 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_nfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define PRISON0_HOSTUUID_MODULE "hostuuid" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_state = PRISON_STATE_ALIVE, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_PRISON0, }; _Static_assert((PR_ALLOW_PRISON0 & ~PR_ALLOW_ALL_STATIC) == 0, "Bits enabled in PR_ALLOW_PRISON0 that are not statically reserved"); MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); struct bool_flags { const char *name; const char *noname; volatile u_int flag; }; struct jailsys_flags { const char *name; unsigned disable; unsigned new; }; /* * Handle jail teardown in a dedicated thread to avoid deadlocks from * vnet_destroy(). 
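 * The last reference on a prison can be dropped from contexts where a
 * blocking vnet_destroy() would be unsafe, so the final teardown is
 * deferred to this thread, roughly (a sketch of the deferral done by
 * prison_deref() below):
 *
 *	TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 *	taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task);
 *
 * (TASKQUEUE_DEFINE_THREAD(jail_remove) provides taskqueue_jail_remove.)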
*/ TASKQUEUE_DEFINE_THREAD(jail_remove); /* allprison, allprison_racct and lastprid are protected by allprison_lock. */ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; int lastdeadid = 0; static int get_next_prid(struct prison **insprp); static int get_next_deadid(struct prison **insprp); static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); static int prison_lock_xlock(struct prison *pr, int flags); static void prison_cleanup_locked(struct prison *pr); static void prison_cleanup_unlocked(struct prison *pr); static void prison_free_not_last(struct prison *pr); static void prison_proc_free_not_last(struct prison *pr); static void prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p); static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable); static char *prison_path(struct prison *pr1, struct prison *pr2); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif static void prison_knote(struct prison *pr, long hint); /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ #define PD_DEUREF 0x02 /* Decrement pr_uref */ #define PD_KILL 0x04 /* Remove jail, kill processes, etc */ #define PD_LOCKED 0x10 /* pr_mtx is held */ #define PD_LIST_SLOCKED 0x20 /* allprison_lock is held shared */ #define PD_LIST_XLOCKED 0x40 /* allprison_lock is held exclusive */ #define PD_OP_FLAGS 0x07 /* Operation flags */ #define PD_LOCK_FLAGS 0x70 /* Lock status flags */ /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static struct bool_flags pr_flag_bool[] = { {"persist", "nopersist", PR_PERSIST}, #ifdef INET {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL}, #endif #ifdef INET6 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL}, #endif }; const size_t pr_flag_bool_size = sizeof(pr_flag_bool); static struct jailsys_flags pr_flag_jailsys[] = { {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, #endif #ifdef INET6 {"ip6", PR_IP6_USER, PR_IP6_USER}, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); /* * Make this array full-size so dynamic parameters can be added. * It is protected by prison0.mtx, but lockless reading is allowed * with an atomic check of the flag values. 
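 * Readers scan until the first slot whose flag is still zero, e.g. the
 * pattern used by kern_jail() and kern_jail_set() below:
 *
 *	for (bf = pr_flag_allow;
 *	    bf < pr_flag_allow + nitems(pr_flag_allow) &&
 *	    atomic_load_int(&bf->flag) != 0;
 *	    bf++)
 *		...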
*/ static struct bool_flags pr_flag_allow[NBBY * NBPW] = { {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME}, {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC}, {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS}, {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS}, {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT}, {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS}, {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF}, {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK}, {"allow.reserved_ports", "allow.noreserved_ports", PR_ALLOW_RESERVED_PORTS}, {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF}, {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug", PR_ALLOW_UNPRIV_DEBUG}, {"allow.suser", "allow.nosuser", PR_ALLOW_SUSER}, #ifdef VIMAGE {"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD}, #endif {"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR}, {"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME}, {"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME}, {"allow.routing", "allow.norouting", PR_ALLOW_ROUTING}, {"allow.unprivileged_parent_tampering", "allow.nounprivileged_parent_tampering", PR_ALLOW_UNPRIV_PARENT_TAMPER}, #ifdef AUDIT {"allow.setaudit", "allow.nosetaudit", PR_ALLOW_SETAUDIT}, #endif }; static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC; const size_t pr_flag_allow_size = sizeof(pr_flag_allow); #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \ PR_ALLOW_RESERVED_PORTS | \ PR_ALLOW_UNPRIV_DEBUG | \ PR_ALLOW_SUSER) #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { uint8_t *file, *data; size_t size; char buf[sizeof(prison0.pr_hostuuid)]; bool valid; prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); /* If we have a preloaded hostuuid, use it. */ file = preload_search_by_type(PRISON0_HOSTUUID_MODULE); if (file != NULL) { data = preload_fetch_addr(file); size = preload_fetch_size(file); if (data != NULL) { /* * The preloaded data may include trailing whitespace, almost * certainly a newline; skip over any whitespace or * non-printable characters to be safe. */ while (size > 0 && data[size - 1] <= 0x20) { size--; } valid = false; /* * Not NUL-terminated when passed from loader, but * validate_uuid requires that due to using sscanf (as * does the subsequent strlcpy, since it still reads * past the given size to return the true length); * bounce to a temporary buffer to fix. 
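 * E.g. a loader-supplied "hostuuid" module containing
 * "00000000-0000-0000-0000-000000000000\n" (a hypothetical value) has the
 * trailing newline stripped above, leaving size < sizeof(buf), so it is
 * copied and NUL-terminated below before validate_uuid() parses it.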
*/ if (size >= sizeof(buf)) goto done; memcpy(buf, data, size); buf[size] = '\0'; if (validate_uuid(buf, size, NULL, 0) != 0) goto done; valid = true; (void)strlcpy(prison0.pr_hostuuid, buf, sizeof(prison0.pr_hostuuid)); done: if (bootverbose && !valid) { printf("hostuuid: preload data malformed: '%.*s'\n", (int)size, data); } } } if (bootverbose) printf("hostuuid: using %s\n", prison0.pr_hostuuid); } /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + nitems(pr_flag_allow) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; struct bool_flags *bf; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { optiov[opt.uio_iovcnt].iov_base = __DECONST(char *, (jail_default_allow & bf->flag) ? bf->name : bf->noname); optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 
1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= nitems(optiov), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. 
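 * Parameters arrive as name/value iovec pairs, hence the even count.
 * A minimal userland sketch (names and values are hypothetical; boolean
 * parameters like "persist" take a NULL value of length zero):
 *
 *	struct iovec iov[4] = {
 *		{ .iov_base = "name", .iov_len = sizeof("name") },
 *		{ .iov_base = "myjail", .iov_len = sizeof("myjail") },
 *		{ .iov_base = "persist", .iov_len = sizeof("persist") },
 *		{ .iov_base = NULL, .iov_len = 0 },
 *	};
 *	jid = jail_set(iov, 4, JAIL_CREATE);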
 */
	if (uap->iovcnt & 1)
		return (EINVAL);

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_jail_set(td, auio, uap->flags);
	freeuio(auio);
	return (error);
}

#if defined(INET) || defined(INET6)
typedef int prison_addr_cmp_t(const void *, const void *);
typedef bool prison_addr_valid_t(const void *);
static const struct pr_family {
	size_t			size;
	prison_addr_cmp_t	*cmp;
	prison_addr_valid_t	*valid;
	int			ip_flag;
} pr_families[PR_FAMILY_MAX] = {
#ifdef INET
	[PR_INET] = {
		.size = sizeof(struct in_addr),
		.cmp = prison_qcmp_v4,
		.valid = prison_valid_v4,
		.ip_flag = PR_IP4_USER,
	},
#endif
#ifdef INET6
	[PR_INET6] = {
		.size = sizeof(struct in6_addr),
		.cmp = prison_qcmp_v6,
		.valid = prison_valid_v6,
		.ip_flag = PR_IP6_USER,
	},
#endif
};

/*
 * Network address lists (pr_addrs) allocation for jails.  The addresses
 * are accessed locklessly by the network stack, thus they need to be
 * protected by the network epoch.
 */
struct prison_ip {
	struct epoch_context	ctx;
	uint32_t		ips;
#ifdef FUTURE_C
	/*
	 * XXX Variable-length automatic arrays in union may be
	 * supported in future C.
	 */
	union {
		char pr_ip[];
		struct in_addr pr_ip4[];
		struct in6_addr pr_ip6[];
	};
#else /* No future C :( */
	char pr_ip[];
#endif
};

static char *
PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
{
	MPASS(pip);
	MPASS(af < PR_FAMILY_MAX);
	MPASS(idx >= 0 && idx < pip->ips);

	return (pip->pr_ip + pr_families[af].size * idx);
}

static struct prison_ip *
prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
{
	struct prison_ip *pip;

	pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size,
	    M_PRISON, flags);
	if (pip != NULL)
		pip->ips = cnt;
	return (pip);
}

/*
 * Allocate and copyin a user-supplied address list, sorting and validating it.
 * kern_jail_set() helper.
 */
static struct prison_ip *
prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
{
	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
	const size_t size = pr_families[af].size;
	struct prison_ip *pip;

	pip = prison_ip_alloc(af, cnt, M_WAITOK);
	bcopy(op, pip->pr_ip, cnt * size);

	/*
	 * All IP addresses but ip[0] are sorted, to preserve
	 * the primary IP address as given from userland.
	 * This special IP is used for unbound outgoing
	 * connections as well as for "loopback" traffic in case
	 * source address selection cannot find any more fitting
	 * address to connect from.
	 */
	if (cnt > 1)
		qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp);

	/*
	 * Check for duplicate addresses and do some simple
	 * zero and broadcast checks.  If users give other bogus
	 * addresses it is their problem.
	 */
	for (int i = 0; i < cnt; i++) {
		if (!pr_families[af].valid(PR_IP(pip, af, i))) {
			free(pip, M_PRISON);
			return (NULL);
		}
		if (i + 1 < cnt &&
		    (cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 ||
		     cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)) {
			free(pip, M_PRISON);
			return (NULL);
		}
	}

	return (pip);
}

/*
 * Allocate and dup the parent prison's address list.
 * kern_jail_set() helper.
 */
static void
prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
{
	const struct prison_ip *ppip = ppr->pr_addrs[af];
	struct prison_ip *pip;

	if (ppip != NULL) {
		pip = prison_ip_alloc(af, ppip->ips, M_WAITOK);
		bcopy(ppip->pr_ip, pip->pr_ip,
		    pip->ips * pr_families[af].size);
		pr->pr_addrs[af] = pip;
	}
}

/*
 * Make sure the new set of IP addresses is a subset of the parent's list.
 * Don't worry about the parent being unlocked, as any setting is done with
 * allprison_lock held.
 * kern_jail_set() helper.
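 * E.g. with a parent list of { 10.0.0.1, 10.0.0.2, 10.0.0.3 }, a child
 * request of { 10.0.0.2, 10.0.0.1 } matches while one containing
 * 192.0.2.1 does not (addresses hypothetical).  Entries past index 0
 * are sorted on both sides, so the loop below can advance a single
 * cursor through each list.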
 */
static bool
prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
    const pr_family_t af)
{
	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
	int i, j;

	if (ppip == NULL)
		return (false);

	for (i = 0; i < ppip->ips; i++)
		if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0)
			break;

	if (i == ppip->ips)
		/* Main address not present in parent. */
		return (false);

	if (pip->ips > 1) {
		for (i = j = 1; i < pip->ips; i++) {
			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
				/* Equal to the parent's primary address. */
				continue;

			for (; j < ppip->ips; j++)
				if (cmp(PR_IP(pip, af, i),
				    PR_IP(ppip, af, j)) == 0)
					break;

			if (j == ppip->ips)
				break;
		}

		if (j == ppip->ips)
			/* Address not present in parent. */
			return (false);
	}

	return (true);
}

/*
 * Check for conflicting IP addresses.  We permit them if there is no more
 * than one IP on each jail.  If there is a duplicate on a jail with more
 * than one IP, stop checking and return an error.
 * kern_jail_set() helper.
 */
static bool
prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
    struct prison_ip *pip, pr_family_t af)
{
	const struct prison *tppr, *tpr;
	int descend;

#ifdef VIMAGE
	for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
		if (tppr->pr_flags & PR_VNET)
			break;
#else
	tppr = &prison0;
#endif
	FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
		if (tpr == pr ||
#ifdef VIMAGE
		    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
#endif
		    !prison_isalive(tpr)) {
			descend = 0;
			continue;
		}
		if (!(tpr->pr_flags & pr_families[af].ip_flag))
			continue;
		descend = 0;
		if (tpr->pr_addrs[af] == NULL ||
		    (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
			continue;
		for (int i = 0; i < pip->ips; i++)
			if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0)
				return (false);
	}

	return (true);
}

_Static_assert(offsetof(struct prison_ip, ctx) == 0,
    "prison must start with epoch context");
static void
prison_ip_free_deferred(epoch_context_t ctx)
{

	free(ctx, M_PRISON);
}

static void
prison_ip_free(struct prison_ip *pip)
{

	if (pip != NULL)
		NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
}

static void
prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
{
	struct prison_ip **mem, *old;

	mtx_assert(&pr->pr_mtx, MA_OWNED);

	mem = &pr->pr_addrs[af];

	old = *mem;
	atomic_store_ptr(mem, new);
	prison_ip_free(old);
}

/*
 * Restrict a prison's IP address list with its parent's, possibly replacing
 * it.  Return true on success, or false if the caller should redo the
 * operation with preallocated replacement storage.
 * kern_jail_set() helper.
 */
static bool
prison_ip_restrict(struct prison *pr, const pr_family_t af,
    struct prison_ip **newp)
{
	struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
	struct prison_ip *pip = pr->pr_addrs[af];
	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
	const size_t size = pr_families[af].size;
	struct prison_ip *new = newp != NULL ? *newp : NULL;
	uint32_t ips;

	mtx_assert(&pr->pr_mtx, MA_OWNED);

	/*
	 * Due to epoch-synchronized access to the IP address lists we always
	 * allocate a new list even if the old one has enough space.  We could
	 * atomically update an IPv4 address inside a list, but that would
	 * screw up sorting, and in case of IPv6 we can't even atomically write
	 * one.
	 */
	if (ppip == NULL) {
		if (pip != NULL)
			prison_ip_set(pr, af, NULL);
		return (true);
	}

	if (!(pr->pr_flags & pr_families[af].ip_flag)) {
		if (new == NULL) {
			new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
			if (new == NULL)
				return (false);	/* Redo */
		}
		/* This has no user settings, so just copy the parent's list.
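 * (The M_NOWAIT allocation above can fail with the prison mutex held;
 * returning false makes kern_jail_set() drop the lock, preallocate a
 * replacement list with M_WAITOK, and redo the restriction with that
 * list passed back in through *newp.  See the redo_ip4/redo_ip6 loops
 * there.)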
 */
		MPASS(new->ips == ppip->ips);
		bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
		prison_ip_set(pr, af, new);
		if (newp != NULL)
			*newp = NULL;	/* Used */
	} else if (pip != NULL) {
		/* Remove addresses that aren't in the parent. */
		int i;

		i = 0;		/* index in pip */
		ips = 0;	/* index in new */

		if (new == NULL) {
			new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
			if (new == NULL)
				return (false);	/* Redo */
		}

		for (int pi = 0; pi < ppip->ips; pi++)
			if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
				/* Found our primary address in parent. */
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				ips++;
				break;
			}
		for (int pi = 1; i < pip->ips; ) {
			/* Check against primary, which is unsorted. */
			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
				/* Matches parent's primary address. */
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				ips++;
				continue;
			}
			/* The rest are sorted. */
			switch (pi >= ppip->ips ? -1 :
				cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
			case -1:
				i++;
				break;
			case 0:
				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
				    size);
				i++;
				pi++;
				ips++;
				break;
			case 1:
				pi++;
				break;
			}
		}
		if (ips == 0) {
			if (newp == NULL || *newp == NULL)
				prison_ip_free(new);
			new = NULL;
		} else {
			/* Shrink to real size. */
			KASSERT((new->ips >= ips),
			    ("Out-of-bounds write to prison_ip %p", new));
			new->ips = ips;
		}
		prison_ip_set(pr, af, new);
		if (newp != NULL)
			*newp = NULL;	/* Used */
	}
	return (true);
}

/*
 * Fast-path check if an address belongs to a prison.
 */
int
prison_ip_check(const struct prison *pr, const pr_family_t af,
    const void *addr)
{
	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
	struct prison_ip *pip;
	int i, a, z, d;

	MPASS(mtx_owned(&pr->pr_mtx) ||
	    in_epoch(net_epoch_preempt) ||
	    sx_xlocked(&allprison_lock));

	pip = atomic_load_ptr(&pr->pr_addrs[af]);
	if (__predict_false(pip == NULL))
		return (EAFNOSUPPORT);

	/* Check the primary IP. */
	if (cmp(PR_IP(pip, af, 0), addr) == 0)
		return (0);

	/*
	 * All the other IPs are sorted so we can do a binary search.
	 */
	a = 0;
	z = pip->ips - 2;
	while (a <= z) {
		i = (a + z) / 2;
		d = cmp(PR_IP(pip, af, i + 1), addr);
		if (d > 0)
			z = i - 1;
		else if (d < 0)
			a = i + 1;
		else
			return (0);
	}

	return (EADDRNOTAVAIL);
}

/*
 * Grab the primary IP.  This historically required the prison mutex, but
 * nothing prevents us from supporting epoch-protected access.  Is it used
 * in a fast path?
* in{6}_jail.c helper */ const void * prison_ip_get0(const struct prison *pr, const pr_family_t af) { const struct prison_ip *pip = pr->pr_addrs[af]; mtx_assert(&pr->pr_mtx, MA_OWNED); MPASS(pip); return (pip->pr_ip); } u_int prison_ip_cnt(const struct prison *pr, const pr_family_t af) { return (pr->pr_addrs[af]->ips); } #endif /* defined(INET) || defined(INET6) */ int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct file *jfp_out; struct nameidata nd; #ifdef INET struct prison_ip *ip4; #endif #ifdef INET6 struct prison_ip *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr; struct ucred *jdcred; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; #ifdef INET int ip4s; bool redo_ip4; #endif #ifdef INET6 int ip6s; bool redo_ip6; #endif bool maybe_changed; uint64_t pr_allow, ch_allow, pr_flags, ch_flags; uint64_t pr_allow_diff; unsigned tallow; char numbuf[12]; mypr = td->td_ucred->cr_prison; if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) == (JAIL_USE_DESC | JAIL_AT_DESC)) return (EINVAL); prison_hold(mypr); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; jfp_out = NULL; jfd_out = -1; /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. */ error = vfs_buildopts(optuio, &opts); if (error) { opts = NULL; goto done_free; } cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); if (error == ENOENT) { if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) { vfs_opterror(opts, "missing desc"); goto done_errmsg; } jfd_in = -1; } else if (error != 0) goto done_free; else { if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC))) { error = EINVAL; vfs_opterror(opts, "unexpected desc"); goto done_errmsg; } if (flags & JAIL_AT_DESC) { /* * Look up and create jails based on the * descriptor's prison. */ prison_free(mypr); error = jaildesc_find(td, jfd_in, &mypr, NULL); if (error != 0) { vfs_opterror(opts, error == ENOENT ? "descriptor to dead jail" : "not a jail descriptor"); goto done_errmsg; } if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) { error = EPERM; goto done_free; } } if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { /* Allocate a jail descriptor to return later. 
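 * (The descriptor is allocated this early so that later option
 * processing can fail cleanly; if this call ultimately errors out, the
 * done_free path at the end of the function closes jfd_out again.)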
*/ error = jaildesc_alloc(td, &jfp_out, &jfd_out, flags & JAIL_OWN_DESC); if (error) goto done_free; } } /* * Delay the permission check if using a jail descriptor, * until we get the descriptor's credentials. */ if (!(flags & JAIL_USE_DESC)) { error = priv_check(td, PRIV_JAIL_SET); if (error == 0 && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) goto done_free; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { vfs_flagopt(opts, bf->name, &pr_flags, bf->flag); vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag); } ch_flags |= pr_flags; for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!jsf->disable) { error = EINVAL; goto done_free; } pr_flags |= jsf->disable; break; case JAIL_SYS_NEW: pr_flags |= jsf->new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= jsf->new | jsf->disable; } if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { vfs_flagopt(opts, bf->name, &pr_allow, bf->flag); vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, 
"host.domainname", (void **)&domain, &len); if (error == ENOENT) domain = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(struct in_addr) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { ip4s /= sizeof(struct in_addr); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = prison_ip_copyin(PR_INET, op, ip4s); if (ip4 == NULL) { error = EINVAL; goto done_free; } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(struct in6_addr) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { ip6s /= sizeof(struct in6_addr); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = prison_ip_copyin(PR_INET6, op, ip6s); if (ip6 == NULL) { error = EINVAL; goto done_free; } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || osrelstr[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len >= OSRELEASELEN) { error = ENAMETOOLONG; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto 
done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE_PNBUF(&nd); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) { path = g_path; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root); } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. */ pr = NULL; inspr = NULL; deadpr = NULL; maybe_changed = false; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); drflags = PD_LIST_XLOCKED; ppr = mypr; if (!prison_isalive(ppr)) { /* This jail is dying. This process will surely follow. */ error = EAGAIN; goto done_deref; } if (flags & JAIL_USE_DESC) { /* Get the jail from its descriptor. */ error = jaildesc_find(td, jfd_in, &pr, &jdcred); if (error) { vfs_opterror(opts, error == ENOENT ? "descriptor to dead jail" : "not a jail descriptor"); goto done_deref; } drflags |= PD_DEREF; error = priv_check_cred(jdcred, PRIV_JAIL_SET); if (error == 0 && (flags & JAIL_ATTACH)) error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); crfree(jdcred); if (error) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; if (cuflags == JAIL_CREATE) { error = EEXIST; vfs_opterror(opts, "jail %d already exists", pr->pr_id); goto done_deref; } if (!prison_isalive(pr)) { /* While a jid can be resurrected, the prison * itself cannot. */ error = ENOENT; vfs_opterror(opts, "jail %d is dying", pr->pr_id); goto done_deref; } if (jid != 0 && jid != pr->pr_id) { error = EINVAL; vfs_opterror(opts, "cannot change jid"); goto done_deref; } jid = pr->pr_id; } else if (jid != 0) { if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_deref; } /* * See if a requested jid already exists. Keep track of * where it can be inserted later. */ TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; if (inspr->pr_id > jid) break; if (prison_isalive(inspr)) { pr = inspr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else { /* Note a dying jail to handle later. */ deadpr = inspr; } inspr = NULL; break; } if (cuflags == JAIL_CREATE && pr != NULL) { /* * Even creators that cannot see the jail will * get EEXIST. */ error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_deref; } if ((pr == NULL) ? cuflags == JAIL_UPDATE : !prison_ischild(mypr, pr)) { /* * Updaters get ENOENT for nonexistent jails, * or for jails they cannot see. The latter * case is true even for CREATE | UPDATE, * which normally cannot give this error. */ error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_deref; } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. 
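 * E.g. a request for "foo.bar" splits into parent "foo" and leaf "bar":
 * "foo" must name an existing, alive jail (or match the parent of a
 * jail already located by jid above), and "bar" becomes namelc below.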
*/ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_deref; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } mtx_unlock(&ppr->pr_mtx); if (!prison_isalive(ppr)) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_deref; } *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr == pr || !prison_isalive(tpr) || strcmp(tpr->pr_name + pnamelen, namelc)) continue; if (cuflags == JAIL_CREATE || pr != NULL) { /* * Create, or update(jid): name must * not exist in an active sibling jail. */ error = EEXIST; vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_deref; } /* Use this jail for updates. */ pr = tpr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; break; } /* * Update: name must exist if no jid is specified. * As with the jid case, the jail must be currently * visible, or else even CREATE | UPDATE will get * an error. */ if ((pr == NULL) ? cuflags == JAIL_UPDATE : !prison_isalive(pr)) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } } } /* Update: must provide a desc, jid, or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_deref; } /* If there's no prison to update, create a new one and link it in. */ created = pr == NULL; if (created) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_deref; } if (deadpr != NULL) { /* * The prison being created has the same ID as a dying * one. Handle this by giving the dying jail a new ID. * This may cause some confusion to user space, but * only to those listing dying jails. */ deadid = get_next_deadid(&dinspr); if (deadid == 0) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); goto done_deref; } mtx_lock(&deadpr->pr_mtx); deadpr->pr_id = deadid; mtx_unlock(&deadpr->pr_mtx); if (dinspr == deadpr) inspr = deadpr; else { inspr = TAILQ_NEXT(deadpr, pr_list); TAILQ_REMOVE(&allprison, deadpr, pr_list); if (dinspr != NULL) TAILQ_INSERT_AFTER(&allprison, dinspr, deadpr, pr_list); else TAILQ_INSERT_HEAD(&allprison, deadpr, pr_list); } } if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); goto done_deref; } pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); pr->pr_state = PRISON_STATE_INVALID; refcount_init(&pr->pr_ref, 1); refcount_init(&pr->pr_uref, 0); drflags |= PD_DEREF; LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); pr->pr_id = jid; if (inspr != NULL) TAILQ_INSERT_BEFORE(inspr, pr, pr_list); else TAILQ_INSERT_TAIL(&allprison, pr, pr_list); pr->pr_parent = ppr; prison_hold(ppr); prison_proc_hold(ppr); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; pr->pr_klist = knlist_alloc(&pr->pr_mtx); /* Set some default values, and inherit some from the parent. 
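 * (E.g. a new jail created with no "path" shares the creator's root
 * vnode, and with no explicit ip4/ip6 settings it inherits the
 * parent's address lists via prison_ip_dup() below.)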
*/ if (namelc == NULL) namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; prison_ip_dup(ppr, pr, PR_INET); } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; prison_ip_dup(ppr, pr, PR_INET6); } #endif } #endif /* Source address selection is always on by default. */ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = jail_default_enforce_statfs; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strlcpy(pr->pr_osrelease, ppr->pr_osrelease, sizeof(pr->pr_osrelease)); else strlcpy(pr->pr_osrelease, osrelstr, sizeof(pr->pr_osrelease)); #ifdef VIMAGE /* * Allocate a new vnet if specified. * * Set PR_VNET now if so, so that the vnet is disposed of * properly when the jail is destroyed. */ if (pr_flags & PR_VNET) { pr->pr_flags |= PR_VNET; pr->pr_vnet = vnet_alloc(); } else { pr->pr_vnet = ppr->pr_vnet; } #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an error. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else { /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ if (!(drflags & PD_DEREF)) { prison_hold(pr); drflags |= PD_DEREF; } #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref; } #endif } /* Do final error checking before setting anything. 
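 * The recurring rule here: a child may only tighten its parent's
 * settings.  E.g. a securelevel below the parent's, or a children.max
 * at or above the parent's, fails with EPERM.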
*/ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if ((ppr->pr_flags & PR_IP4) && !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4, PR_INET)) { error = EPERM; goto done_deref; } if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref; } } #endif #ifdef INET6 if (ip6s > 0) { if ((ppr->pr_flags & PR_IP6) && !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6, PR_INET6)) { error = EPERM; goto done_deref; } if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref; } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } } } pr_allow_diff = pr_allow & ~ppr->pr_allow; if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) { error = EPERM; goto done_deref; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; /* At this point, all valid parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref; } } maybe_changed = true; /* Set the parameters of the prison. 
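 * From here on the prison is modified in place.  Failures past this
 * point cannot simply be unwound, which is why a newly created jail
 * picks up PD_KILL below and why maybe_changed reports NOTE_JAIL_SET
 * to kevent even when the call ultimately fails.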
*/ #ifdef INET redo_ip4 = false; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; prison_ip_set(pr, PR_INET, ip4); ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (!prison_ip_restrict(tpr, PR_INET, NULL)) { redo_ip4 = true; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = false; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; prison_ip_set(pr, PR_INET6, ip6); ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (!prison_ip_restrict(tpr, PR_INET6, NULL)) { redo_ip6 = true; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; root = NULL; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. 
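 * (Descendants that manage their own host info, i.e. have PR_HOST set,
 * clear descend and thereby prune their whole subtree from the update.)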
*/ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; if ((tallow = ch_allow & ~pr_allow)) prison_set_allow_locked(pr, tallow, 0); /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. */ if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) { if (pr_flags & PR_PERSIST) { prison_hold(pr); /* * This may be a new prison's first user reference, * but wait to call it alive until after OSD calls * have had a chance to run (and perhaps to fail). */ refcount_acquire(&pr->pr_uref); } else { drflags |= PD_DEUREF; prison_free_not_last(pr); } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; /* * Any errors past this point will need to de-persist newly created * prisons, as well as call remove methods. */ if (created) drflags |= PD_KILL; #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_addrs[PR_INET]->ips; MPASS(ip4 == NULL); ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = false; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (!prison_ip_restrict(tpr, PR_INET, &ip4)) redo_ip4 = true; } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_addrs[PR_INET6]->ips; MPASS(ip6 == NULL); ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = false; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (!prison_ip_restrict(tpr, PR_INET6, &ip6)) redo_ip6 = true; } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ if (created) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) goto done_deref; } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) goto done_deref; /* * A new prison is now ready to be seen; either it has gained a user * reference via persistence, or is about to gain one via attachment. */ if (created) { sx_assert(&allprison_lock, SX_XLOCKED); prison_knote(ppr, NOTE_JAIL_CHILD | pr->pr_id); mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; pr->pr_state = PRISON_STATE_ALIVE; } /* Attach this process to the prison if requested. 
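 * (For JAIL_CREATE | JAIL_ATTACH this attach supplies the user
 * reference that a non-persistent jail needs to stay alive; if it
 * fails, the PD_KILL set above tears the new jail down again.)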
*/ if (flags & JAIL_ATTACH) { error = do_jail_attach(td, pr, prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS)); drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); if (error) { vfs_opterror(opts, "attach failed"); goto done_deref; } } #ifdef RACCT if (racct_enable && !created) { if (drflags & PD_LOCKED) { mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; } if (drflags & PD_LIST_XLOCKED) { sx_xunlock(&allprison_lock); drflags &= ~PD_LIST_XLOCKED; } prison_racct_modify(pr); } #endif if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 && (pr->pr_root->v_vflag & VV_ROOT) == 0) printf("Warning jail jid=%d: mountd/nfsd requires a separate" " file system\n", pr->pr_id); /* * Now that the prison is fully created without error, set the * jail descriptor if one was requested. This is the only * parameter that is returned to the caller (except the error * message). */ if (jfd_out >= 0) { if (!(drflags & PD_LOCKED)) { mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out; else (void)copyout(&jfd_out, optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out)); jaildesc_set_prison(jfp_out, pr); } drflags &= ~PD_KILL; td->td_retval[0] = pr->pr_id; done_deref: /* * Report changes to kevent. This can happen even if the * system call fails, as changes might have been made before * the failure. */ if (maybe_changed && !created) prison_knote(pr, NOTE_JAIL_SET); /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (drflags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); if (root != NULL) vrele(root); done_errmsg: if (error) { /* Write the error message back to userspace. */ if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else (void)copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: /* Clean up other resources. */ #ifdef INET prison_ip_free(ip4); #endif #ifdef INET6 prison_ip_free(ip6); #endif if (jfp_out != NULL) fdrop(jfp_out, td); if (error && jfd_out >= 0) (void)kern_close(td, jfd_out); if (g_path != NULL) free(g_path, M_TEMP); if (opts != NULL) vfs_freeopts(opts); prison_free(mypr); return (error); } /* * Find the next available prison ID. Return the ID on success, or zero * on failure. Also set a pointer to the allprison list entry the prison * should be inserted before. */ static int get_next_prid(struct prison **insprp) { struct prison *inspr; int jid, maxid; jid = lastprid % JAIL_MAX + 1; if (TAILQ_EMPTY(&allprison) || TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) { /* * A common case is for all jails to be implicitly numbered, * which means they'll go on the end of the list, at least * for the first JAIL_MAX times. */ inspr = NULL; } else { /* * Take two passes through the allprison list: first starting * with the proposed jid, then ending with it. */ for (maxid = JAIL_MAX; maxid != 0; ) { TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; if (inspr->pr_id > jid) { /* Found an opening. */ maxid = 0; break; } if (++jid > maxid) { if (lastprid == maxid || lastprid == 0) { /* * The entire legal range * has been traversed */ return 0; } /* Try again from the start. 
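 * (Having run past JAIL_MAX, restart the scan at jid 1 and search only
 * up to the original lastprid; if that range is exhausted as well,
 * every legal ID is in use and 0 is returned above.)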
*/ jid = 1; maxid = lastprid; break; } } if (inspr == NULL) { /* Found room at the end of the list. */ break; } } } *insprp = inspr; lastprid = jid; return (jid); } /* * Find the next available ID for a renumbered dead prison. This is the same * as get_next_prid, but counting backward from the end of the range. */ static int get_next_deadid(struct prison **dinsprp) { struct prison *dinspr; int deadid, minid; deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX; /* * Take two reverse passes through the allprison list: first * starting with the proposed deadid, then ending with it. */ for (minid = 1; minid != 0; ) { TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) { if (dinspr->pr_id > deadid) continue; if (dinspr->pr_id < deadid) { /* Found an opening. */ minid = 0; break; } if (--deadid < minid) { if (lastdeadid == minid || lastdeadid == 0) { /* * The entire legal range * has been traversed */ return 0; } /* Try again from the end. */ deadid = JAIL_MAX; minid = lastdeadid; break; } } if (dinspr == NULL) { /* Found room at the beginning of the list. */ break; } } *dinsprp = dinspr; lastdeadid = deadid; return (deadid); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof(struct iovec)); freeuio(auio); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct bool_flags *bf; struct file *jfp_out; struct jailsys_flags *jsf; struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; int jfd_in, jfd_out; unsigned f; if (flags & ~JAIL_GET_MASK) return (EINVAL); if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) == (JAIL_USE_DESC | JAIL_AT_DESC)) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; prison_hold(mypr); pr = NULL; jfp_out = NULL; jfd_out = -1; /* * Find the prison specified by one of: desc, lastjid, jid, name. */ sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); if (error == ENOENT) { if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) { vfs_opterror(opts, "missing desc"); goto done; } } else if (error == 0) { if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC))) { error = EINVAL; vfs_opterror(opts, "unexpected desc"); goto done; } if (flags & JAIL_USE_DESC) { /* Get the jail from its descriptor. */ error = jaildesc_find(td, jfd_in, &pr, NULL); if (error) { vfs_opterror(opts, error == ENOENT ? "descriptor to dead jail" : "not a jail descriptor"); goto done; } drflags |= PD_DEREF; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", pr->pr_id); goto done; } goto found_prison; } if (flags & JAIL_AT_DESC) { /* Look up jails based on the descriptor's prison. */ prison_free(mypr); error = jaildesc_find(td, jfd_in, &mypr, NULL); if (error != 0) { vfs_opterror(opts, error == ENOENT ? 
"descriptor to dead jail" : "not a jail descriptor"); goto done; } } if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { /* Allocate a jail descriptor to return later. */ error = jaildesc_alloc(td, &jfp_out, &jfd_out, flags & JAIL_OWN_DESC); if (error) goto done; } } else goto done; error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && ((flags & JAIL_DYING) || prison_isalive(pr)) && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; goto found_prison; } } error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done; } else if (error != ENOENT) goto done; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done; } } else if (error != ENOENT) goto done; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done; } pr = prison_find_name(mypr, name); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done; } else if (error != ENOENT) goto done; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done; found_prison: /* Get the parameters of the prison. */ if (!(drflags & PD_DEREF)) { prison_hold(pr); drflags |= PD_DEREF; } td->td_retval[0] = pr->pr_id; if (jfd_out >= 0) { error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out)); if (error != 0 && error != ENOENT) goto done; jaildesc_set_prison(jfp_out, pr); } error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done; i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip, pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips * pr_families[PR_INET].size : 0 ); if (error != 0 && error != ENOENT) goto done; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip, pr->pr_addrs[PR_INET6] ? 
pr->pr_addrs[PR_INET6]->ips * pr_families[PR_INET6].size : 0 ); if (error != 0 && error != ENOENT) goto done; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { i = (pr->pr_flags & bf->flag) ? 1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE : (f == jsf->new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, jsf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { i = (pr->pr_allow & bf->flag) ? 1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } i = !prison_isalive(pr); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done; prison_deref(pr, drflags); pr = NULL; drflags = 0; /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && (strstr(opt->name, JAIL_META_PRIVATE ".") == opt->name || strstr(opt->name, JAIL_META_SHARED ".") == opt->name)) { /* Communicate back a missing key. 
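* (The key is returned with a zero-length value, which lets the caller tell an unset metadata key apart from the unknown-parameter error applied to other unseen options just below.)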
*/ free(opt->value, M_MOUNT); opt->value = NULL; opt->len = 0; continue; } if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } done: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (drflags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); /* Clean up other resources. */ if (jfp_out != NULL) (void)fdrop(jfp_out, td); if (error && jfd_out >= 0) (void)kern_close(td, jfd_out); if (error && errmsg_pos >= 0) { /* Write the error message back to userspace. */ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else (void)copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); prison_free(mypr); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } prison_hold(pr); prison_remove(pr); return (0); } /* * struct jail_remove_jd_args { * int fd; * }; */ int sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap) { struct prison *pr; struct ucred *jdcred; int error; error = jaildesc_find(td, uap->fd, &pr, &jdcred); if (error) return (error); error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE); crfree(jdcred); if (error) { prison_free(pr); return (error); } sx_xlock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_remove(pr); return (0); } /* * Begin the removal process for a prison. The allprison lock should * be held exclusively, and the prison should be both locked and held. */ void prison_remove(struct prison *pr) { sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_OWNED); prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* Do not allow a process to attach to a prison that is not alive. 
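* (prison_find_child() matches dying prisons as well, so aliveness must be checked explicitly before the attach proceeds.)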
*/ if (!prison_isalive(pr)) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } /* * struct jail_attach_jd_args { * int fd; * }; */ int sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap) { struct prison *pr; struct ucred *jdcred; int drflags, error; sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; error = jaildesc_find(td, uap->fd, &pr, &jdcred); if (error) goto fail; drflags |= PD_DEREF; error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); crfree(jdcred); if (error) goto fail; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; /* Do not allow a process to attach to a prison that is not alive. */ if (!prison_isalive(pr)) { error = EINVAL; goto fail; } return (do_jail_attach(td, pr, drflags)); fail: prison_deref(pr, drflags); return (error); } static int do_jail_attach(struct thread *td, struct prison *pr, int drflags) { struct proc *p; struct ucred *newcred, *oldcred; int error; mtx_assert(&pr->pr_mtx, MA_OWNED); sx_assert(&allprison_lock, SX_LOCKED); drflags &= PD_LOCK_FLAGS; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ if (!(drflags & PD_DEREF)) { prison_hold(pr); drflags |= PD_DEREF; } refcount_acquire(&pr->pr_uref); drflags |= PD_DEUREF; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, drflags); return (error); } sx_unlock(&allprison_lock); drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED); /* * Reparent the newly attached process to this jail. */ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root); if ((error = pwd_chroot_chdir(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; - proc_set_cred(p, newcred); - setsugid(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif #ifdef RCTL crhold(newcred); #endif + /* + * Takes over 'newcred''s reference, so 'newcred' must not be used + * past this point, except in the RCTL case where we took an additional + * reference above. + */ + proc_set_cred(p, newcred); + setsugid(p); PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif prison_proc_relink(oldcred->cr_prison, pr, p); prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid); /* * If the prison was killed while changing credentials, die along * with it. */ if (!prison_isalive(pr)) { PROC_LOCK(p); kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } return (0); e_unlock: VOP_UNLOCK(pr->pr_root); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ sx_slock(&allprison_lock); drflags |= PD_LIST_SLOCKED; (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, drflags); return (error); } /* * Returns a locked prison instance, or NULL on failure. 
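* The allprison list is kept sorted by pr_id, which is what allows the early break below once a larger ID is seen.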
*/ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id < prid) continue; if (pr->pr_id > prid) break; KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); return (pr); } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); return (pr); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); if (prison_isalive(pr)) { mtx_lock(&pr->pr_mtx); return (pr); } deadpr = pr; } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) mtx_lock(&deadpr->pr_mtx); return (deadpr); } /* * See if a prison has the specific flag set. The prison should be locked, * unless checking for flags that are only set at jail creation (such as * PR_IP4 and PR_IP6), or only the single bit is examined, without regard * to any other prison data. */ bool prison_flag(struct ucred *cred, unsigned flag) { return ((cred->cr_prison->pr_flags & flag) != 0); } /* * See if a prison has the specific allow flag set. * The prison *should* be locked, or only a single bit is examined, without * regard to any other prison data. */ bool prison_allow(struct ucred *cred, unsigned flag) { return ((cred->cr_prison->pr_allow & flag) != 0); } /* * Hold a prison reference, by incrementing pr_ref. It is generally * an error to hold a prison that does not already have a reference. * A prison record will remain valid as long as it has at least one * reference, and will not be removed as long as either the prison * mutex or the allprison lock is held (allprison_lock may be shared). */ void prison_hold_locked(struct prison *pr) { /* Locking is no longer required. */ prison_hold(pr); } void prison_hold(struct prison *pr) { #ifdef INVARIANTS int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref); KASSERT(was_valid, ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_acquire(&pr->pr_ref); #endif } /* * Remove a prison reference. If that was the last reference, the * prison will be removed (at a later time). */ void prison_free_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); /* * Locking is no longer required, but unlock because the caller * expects it. */ mtx_unlock(&pr->pr_mtx); prison_free(pr); } void prison_free(struct prison *pr) { KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_ref)) { /* * Don't remove the last reference in this context, * in case there are locks held. 
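* The deferred release runs from the jail_remove taskqueue thread, a sleepable context where allprison_lock can safely be taken.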
*/ taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task); } } static void prison_free_not_last(struct prison *pr) { #ifdef INVARIANTS int lastref; KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); lastref = refcount_release(&pr->pr_ref); KASSERT(!lastref, ("prison_free_not_last freed last ref on prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_release(&pr->pr_ref); #endif } /* * Hold a prison for user visibility, by incrementing pr_uref. * It is generally an error to hold a prison that isn't already * user-visible, except through the jail system calls. It is also * an error to hold an invalid prison. A prison record will remain * alive as long as it has at least one user reference, and will not * be set to the dying state until the prison mutex and allprison_lock * are both freed. */ void prison_proc_hold(struct prison *pr) { #ifdef INVARIANTS int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref); KASSERT(was_alive, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); #else refcount_acquire(&pr->pr_uref); #endif } /* * Remove a prison user reference. If it was the last reference, the * prison will be considered "dying", and may be removed once all of * its references are dropped. */ void prison_proc_free(struct prison *pr) { /* * Locking is only required when releasing the last reference. * This allows assurance that a locked prison will remain alive * until it is unlocked. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { /* * Don't remove the last user reference in this context, * which is expected to be a process that is not only locked, * but also half dead. Add a reference so any calls to * prison_free() won't re-submit the task. */ prison_hold(pr); mtx_lock(&pr->pr_mtx); KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC), ("Redundant last reference in prison_proc_free (jid=%d)", pr->pr_id)); pr->pr_flags |= PR_COMPLETE_PROC; mtx_unlock(&pr->pr_mtx); taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task); } } static void prison_proc_free_not_last(struct prison *pr) { #ifdef INVARIANTS int lastref; KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); lastref = refcount_release(&pr->pr_uref); KASSERT(!lastref, ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_release(&pr->pr_uref); #endif } void prison_proc_link(struct prison *pr, struct proc *p) { sx_assert(&allproc_lock, SA_XLOCKED); LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist); } void prison_proc_unlink(struct prison *pr, struct proc *p) { sx_assert(&allproc_lock, SA_XLOCKED); LIST_REMOVE(p, p_jaillist); } static void prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p) { sx_xlock(&allproc_lock); prison_proc_unlink(opr, p); prison_proc_link(npr, p); sx_xunlock(&allproc_lock); } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; int drflags; /* * This could be called to release the last reference, or the last * user reference (plus the reference held in prison_proc_free). 
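* The PR_COMPLETE_PROC flag, set by prison_proc_free(), distinguishes the two cases; when it is present, a user reference is dropped as well.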
*/ drflags = prison_lock_xlock(pr, PD_DEREF); if (pr->pr_flags & PR_COMPLETE_PROC) { pr->pr_flags &= ~PR_COMPLETE_PROC; drflags |= PD_DEUREF; } prison_deref(pr, drflags); } static void prison_kill_processes_cb(struct proc *p, void *arg __unused) { kern_psignal(p, SIGKILL); } /* * Note the iteration does not guarantee acting on all processes. * Most notably, there may be a fork or jail_attach in progress. */ void prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *), void *cbarg) { struct prison *ppr; struct proc *p; if (atomic_load_int(&pr->pr_childcount) == 0) { sx_slock(&allproc_lock); LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) { if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); cb(p, cbarg); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); if (atomic_load_int(&pr->pr_childcount) == 0) return; /* * Some jails popped up during the iteration, fall through to a * system-wide search. */ } sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred != NULL) { for (ppr = p->p_ucred->cr_prison; ppr != NULL; ppr = ppr->pr_parent) { if (ppr == pr) { cb(p, cbarg); break; } } } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } /* * Remove a prison reference and/or user reference (usually). * This assumes context that allows sleeping (for allprison_lock), * with no non-sleeping locks held, except perhaps the prison itself. * If there are no more references, release and delist the prison. * On completion, the prison lock and the allprison lock are both * unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prisonlist freeprison; struct prison *killpr, *rpr, *ppr, *tpr; killpr = NULL; TAILQ_INIT(&freeprison); /* * Release this prison as requested, which may cause its parent * to be released, and then maybe its grandparent, etc. */ for (;;) { if (flags & PD_KILL) { /* Kill the prison and its descendants. */ KASSERT(pr != &prison0, ("prison_deref trying to kill prison0")); if (!prison_isalive(pr)) { /* Silently ignore already-dying prisons. */ flags &= ~PD_KILL; } else { if (!(flags & PD_DEREF)) { prison_hold(pr); flags |= PD_DEREF; } flags = prison_lock_xlock(pr, flags); prison_deref_kill(pr, &freeprison); } } if (flags & PD_DEUREF) { /* Drop a user reference. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { if (!(flags & PD_DEREF)) { prison_hold(pr); flags |= PD_DEREF; } flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_uref) && pr->pr_state == PRISON_STATE_ALIVE) { /* * When the last user reference goes, * this becomes a dying prison. */ KASSERT( refcount_load(&prison0.pr_uref) > 0, ("prison0 pr_uref=0")); pr->pr_state = PRISON_STATE_DYING; prison_cleanup_locked(pr); mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; prison_cleanup_unlocked(pr); } } } if (flags & PD_KILL) { /* * Any remaining user references are probably processes * that need to be killed, either in this prison or its * descendants. */ if (refcount_load(&pr->pr_uref) > 0) killpr = pr; /* Make sure the parent prison doesn't get killed. */ flags &= ~PD_KILL; } if (flags & PD_DEREF) { /* Drop a reference. */ KASSERT(refcount_load(&pr->pr_ref) > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_ref)) { flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_ref)) { /* * When the last reference goes, * unlink the prison and set it aside. 
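* The loop then continues with the parent, which inherits a PD_DEREF | PD_DEUREF pass of its own, since each child holds both reference types on its parent.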
*/ KASSERT( refcount_load(&pr->pr_uref) == 0, ("prison_deref: last ref, " "but still has %d urefs (jid=%d)", pr->pr_uref, pr->pr_id)); KASSERT( refcount_load(&prison0.pr_ref) != 0, ("prison0 pr_ref=0")); pr->pr_state = PRISON_STATE_INVALID; TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); TAILQ_INSERT_TAIL(&freeprison, pr, pr_list); for (ppr = pr->pr_parent; ppr != NULL; ppr = ppr->pr_parent) ppr->pr_childcount--; /* * Removing a prison frees references * from its parent. */ ppr = pr->pr_parent; pr->pr_parent = NULL; mtx_unlock(&pr->pr_mtx); pr = ppr; flags &= ~PD_LOCKED; flags |= PD_DEREF | PD_DEUREF; continue; } } } break; } /* Release all the prison locks. */ if (flags & PD_LOCKED) mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); /* Kill any processes attached to a killed prison. */ if (killpr != NULL) prison_proc_iterate(killpr, prison_kill_processes_cb, NULL); /* * Finish removing any unreferenced prisons, which couldn't happen * while allprison_lock was held (to avoid a LOR on vrele). */ TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { #ifdef VIMAGE if (rpr->pr_flags & PR_VNET) vnet_destroy(rpr->pr_vnet); #endif if (rpr->pr_root != NULL) vrele(rpr->pr_root); mtx_destroy(&rpr->pr_mtx); #ifdef INET prison_ip_free(rpr->pr_addrs[PR_INET]); #endif #ifdef INET6 prison_ip_free(rpr->pr_addrs[PR_INET6]); #endif if (rpr->pr_cpuset != NULL) cpuset_rel(rpr->pr_cpuset); osd_jail_exit(rpr); #ifdef RACCT if (racct_enable) prison_racct_detach(rpr); #endif TAILQ_REMOVE(&freeprison, rpr, pr_list); free(rpr, M_PRISON); } } /* * Kill the prison and its descendants. Mark them as dying, clear the * persist flag, and call module remove methods. */ static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) { struct prison *cpr, *ppr, *rpr; bool descend; /* * Unlike the descendants, the target prison can be killed * even if it is currently dying. This is useful for failed * creation in jail_set(2). */ KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to kill dead prison %p (jid=%d).", pr, pr->pr_id)); refcount_acquire(&pr->pr_uref); pr->pr_state = PRISON_STATE_DYING; mtx_unlock(&pr->pr_mtx); rpr = NULL; FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) { if (descend) { if (!prison_isalive(cpr)) { descend = false; continue; } prison_hold(cpr); prison_proc_hold(cpr); mtx_lock(&cpr->pr_mtx); cpr->pr_state = PRISON_STATE_DYING; cpr->pr_flags |= PR_REMOVE; mtx_unlock(&cpr->pr_mtx); continue; } if (!(cpr->pr_flags & PR_REMOVE)) continue; prison_cleanup_unlocked(cpr); mtx_lock(&cpr->pr_mtx); prison_cleanup_locked(cpr); cpr->pr_flags &= ~PR_REMOVE; if (cpr->pr_flags & PR_PERSIST) { cpr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(cpr); prison_free_not_last(cpr); } (void)refcount_release(&cpr->pr_uref); if (refcount_release(&cpr->pr_ref)) { /* * When the last reference goes, unlink the prison * and set it aside for prison_deref() to handle. * Delay unlinking the sibling list to keep the loop * safe. */ if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); rpr = cpr; rpr->pr_state = PRISON_STATE_INVALID; TAILQ_REMOVE(&allprison, rpr, pr_list); TAILQ_INSERT_TAIL(freeprison, rpr, pr_list); /* * Removing a prison frees references from its parent. 
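* Both a ref and a uref are dropped on the parent, matching the pair the child held on it.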
*/ ppr = rpr->pr_parent; prison_proc_free_not_last(ppr); prison_free_not_last(ppr); for (; ppr != NULL; ppr = ppr->pr_parent) ppr->pr_childcount--; } mtx_unlock(&cpr->pr_mtx); } if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); prison_cleanup_unlocked(pr); mtx_lock(&pr->pr_mtx); prison_cleanup_locked(pr); if (pr->pr_flags & PR_PERSIST) { pr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(pr); prison_free_not_last(pr); } (void)refcount_release(&pr->pr_uref); } /* * Given the current locking state in the flags, make sure allprison_lock * is held exclusive, and the prison is locked. Return flags indicating * the new state. */ static int prison_lock_xlock(struct prison *pr, int flags) { if (!(flags & PD_LIST_XLOCKED)) { /* * Get allprison_lock, which may be an upgrade, * and may require unlocking the prison. */ if (flags & PD_LOCKED) { mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; } if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } flags &= ~PD_LIST_SLOCKED; } else sx_xlock(&allprison_lock); flags |= PD_LIST_XLOCKED; } if (!(flags & PD_LOCKED)) { /* Lock the prison mutex. */ mtx_lock(&pr->pr_mtx); flags |= PD_LOCKED; } return flags; } /* * Release a prison's resources when it starts dying (when the last user * reference is dropped, or when it is killed). Two functions are called, * for work that requires a locked prison or an unlocked one. */ static void prison_cleanup_locked(struct prison *pr) { sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_OWNED); prison_knote(pr, NOTE_JAIL_REMOVE); knlist_detach(pr->pr_klist); jaildesc_prison_cleanup(pr); pr->pr_klist = NULL; } static void prison_cleanup_unlocked(struct prison *pr) { sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_NOTOWNED); vfs_exjail_delete(pr); shm_remove_prison(pr); (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); } /* * Set or clear a permission bit in the pr_allow field, passing restrictions * (cleared permission) down to child jails. */ void prison_set_allow(struct ucred *cred, unsigned flag, int enable) { struct prison *pr; pr = cred->cr_prison; sx_slock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_set_allow_locked(pr, flag, enable); mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); } static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable) { struct prison *cpr; int descend; if (enable != 0) pr->pr_allow |= flag; else { pr->pr_allow &= ~flag; FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) cpr->pr_allow &= ~flag; } } /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. 
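* A VNET prison has its own interfaces and addresses, so the per-family PR_IP4/PR_IP6 restrictions below do not apply to it.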
*/ if (prison_owns_vnet(pr)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_addrs[PR_INET] == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_addrs[PR_INET6] == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: case AF_NETLINK: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if the given address belongs to the jail referenced by cred (wrapper * around prison_check_ip[46]). * * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. The IPv4 address is passed in * NBO (network byte order). */ int prison_if(struct ucred *cred, const struct sockaddr *sa) { #ifdef INET const struct sockaddr_in *sai; #endif #ifdef INET6 const struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred->cr_prison)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (const struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (const struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * For mountd/nfsd to run within a prison, it must be: * - A vnet prison. * - PR_ALLOW_NFSD must be set on it. * - The root directory (pr_root) of the prison must be * a file system mount point, so mountd(8) can hang * export information on it. * - The prison's enforce_statfs cannot be 0, so that * mountd(8) can do exports. */ bool prison_check_nfsd(struct ucred *cred) { if (jailed_without_vnet(cred)) return (false); if (!prison_allow(cred, PR_ALLOW_NFSD)) return (false); if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0) return (false); if (cred->cr_prison->pr_enforce_statfs == 0) return (false); return (true); } /* * Return true if p2 is a child of p1, otherwise false. */ bool prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (true); return (false); } /* * Return true if the prison is currently alive. A prison is alive if it * holds user references and it isn't being removed. */ bool prison_isalive(const struct prison *pr) { if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE)) return (false); return (true); } /* * Return true if the prison is currently valid. A prison is valid if it has * been fully created, and is not being destroyed. Note that dying prisons * are still considered valid. Invalid prisons won't be found under normal * circumstances, as they're only put in that state by functions that have * an exclusive hold on allprison_lock. 
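* Holding allprison_lock, even shared, therefore keeps a valid prison from becoming invalid underneath the caller.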
*/ bool prison_isvalid(struct prison *pr) { if (__predict_false(pr->pr_state == PRISON_STATE_INVALID)) return (false); if (__predict_false(refcount_load(&pr->pr_ref) == 0)) return (false); return (true); } /* * Return true if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise false. */ bool jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (false); #ifdef VIMAGE if (prison_owns_vnet(cred->cr_prison)) return (false); #endif return (true); } /* * Return the correct hostname (domainname, et al.) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } void getjailname(struct ucred *cred, char *name, size_t len) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(name, cred->cr_prison->pr_name, len); mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison owns its VNET. */ bool prison_owns_vnet(struct prison *pr) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return ((pr->pr_flags & PR_VNET) != 0); } #endif /* * Determine whether the subject represented by cred can "see" the * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If the jail's chroot directory is set to "/", we should be able to * see all mount-points from inside a jail. * This is an ugly check, but it is the only situation in which the * jail's directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have a situation where the jail's root * directory is "/some/path" and the mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. 
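* (The mount point is then reported as just "/", which is what a process chrooted into the jail would see.)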
*/ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If the jail's chroot directory is set to "/", we should be able to * see all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check whether permission for a specific privilege is granted within jail. * We have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { struct prison *pr; int error; /* * Some policies have custom handlers. This routine should not be * called for them. See priv_check_cred(). */ switch (priv) { case PRIV_VFS_LOOKUP: case PRIV_VFS_GENERATION: KASSERT(0, ("prison_priv_check instead of a custom handler " "called for %d\n", priv)); } if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_VFS_GETFH: case PRIV_VFS_MOUNT_EXPORTED: if (!prison_check_nfsd(cred)) return (EPERM); #ifdef notyet case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME: case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_SETLANPCP: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: case PRIV_NET_OVPN: case PRIV_NET_ME: case PRIV_NET_WG: /* * 802.11-related privileges. */ case PRIV_NET80211_VAP_GETKEY: case PRIV_NET80211_VAP_MANAGE: #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. */ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * If no permission was granted, fall through to the next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. 
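* (ktrace writes through the process's own file system view, which is confined to the jail root.)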
*/ case PRIV_KTRACE: /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: if (cred->cr_prison->pr_allow & PR_ALLOW_SETAUDIT) return (0); else return (EPERM); #if 0 case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETCRED: case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: case PRIV_SEEJAILPROC: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: case PRIV_DEBUG_DIFFJAIL: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * Debuggers should work in jails. */ case PRIV_PROC_MEM_WRITE: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SCHED_DIFFJAIL: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: case PRIV_SIGNAL_DIFFJAIL: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. 
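* (Both allow.mount and an enforce_statfs level below 2 are required, as checked below.)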
*/ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: pr = cred->cr_prison; prison_lock(pr); if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2) error = 0; else error = EPERM; prison_unlock(pr); return (error); /* * Jails should hold no disposition on the PRIV_VFS_READ_DIR * policy. priv_check_cred will not specifically allow it, and * we may want a MAC policy to allow it. */ case PRIV_VFS_READ_DIR: return (0); /* * Conditionally allow a privileged process in the jail to * manipulate filesystem extended attributes in the system * namespace. */ case PRIV_VFS_EXTATTR_SYSTEM: if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0) return (0); else return (EPERM); /* * Conditionally allow locking (unlocking) physical pages * in memory. */ case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK) return (0); else return (EPERM); /* * Conditionally allow jailed root to bind reserved ports. */ case PRIV_NETINET_RESERVEDPORT: if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) return (0); else return (EPERM); /* * Allow jailed root to reuse in-use ports. */ case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); /* * Do not allow a process inside a jail to read the kernel * message buffer unless explicitly permitted. */ case PRIV_MSGBUF: if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF) return (0); return (EPERM); /* * Conditionally allow a privileged process in the jail to * adjust machine time. */ case PRIV_ADJTIME: case PRIV_NTP_ADJTIME: if (cred->cr_prison->pr_allow & (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) { return (0); } return (EPERM); /* * Conditionally allow a privileged process in the jail to * set machine time. */ case PRIV_SETTIMEOFDAY: case PRIV_CLOCK_SETTIME: if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME) return (0); else return (EPERM); /* * Conditionally allow a privileged process in the jail to * modify the routing table. */ case PRIV_NET_ROUTE: if (cred->cr_prison->pr_allow & PR_ALLOW_ROUTING) return (0); else return (EPERM); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges and many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. 
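* For example, with pr1 at "/jails/a" and pr2 at "/jails/a/b", the result is "/b"; identical paths yield "/".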
*/ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. */ SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jails"); #if defined(INET) || defined(INET6) /* * Copy address array to memory that would be then SYSCTL_OUT-ed. * sysctl_jail_list() helper. */ static void prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len) { const struct prison_ip *pip; const size_t size = pr_families[af].size; again: mtx_assert(&pr->pr_mtx, MA_OWNED); if ((pip = pr->pr_addrs[af]) != NULL) { if (*len < pip->ips) { *len = pip->ips; mtx_unlock(&pr->pr_mtx); *out = realloc(*out, *len * size, M_TEMP, M_WAITOK); mtx_lock(&pr->pr_mtx); goto again; } bcopy(pip->pr_ip, *out, pip->ips * size); } } #endif static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { mtx_lock(&cpr->pr_mtx); #ifdef INET prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s); #endif #ifdef INET6 prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s); #endif bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_state; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = ip4s; #endif #ifdef INET6 xp->pr_ip6s = ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred->cr_prison); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns vnet?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses 
a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { int error, i; /* Get the current flag value, and convert it to a boolean. */ if (req->td->td_ucred->cr_prison == &prison0) { mtx_lock(&prison0.pr_mtx); i = (jail_default_allow & arg2) != 0; mtx_unlock(&prison0.pr_mtx); } else i = prison_allow(req->td->td_ucred, arg2); if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. */ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I", "Processes in jail can lock/unlock physical pages in memory"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? 
*(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Limits and stats of child jails"); static int sysctl_jail_children(SYSCTL_HANDLER_ARGS) { struct prison *pr; int i; pr = req->td->td_ucred->cr_prison; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: i = *(int *)((char *)pr + arg2); return (SYSCTL_OUT(req, &i, sizeof(i))); } return (0); } SYSCTL_PROC(_security_jail_children, OID_AUTO, max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children, "I", "Maximum number of child jails"); SYSCTL_PROC(_security_jail_children, OID_AUTO, cur, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children, "I", "Current number of child jails"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. 
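* The CTLFLAG_RW parameters, by contrast, may still be changed in a running jail via jail_set(2) with JAIL_UPDATE.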
*/ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may lock (unlock) physical pages in memory"); SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail 
may bind sockets to reserved ports"); SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may read the kernel message buffer"); SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, "B", "Unprivileged processes may use process debugging facilities"); SYSCTL_JAIL_PARAM(_allow, unprivileged_parent_tampering, CTLTYPE_INT | CTLFLAG_RW, "B", "Unprivileged parent jail processes may tamper with same-uid processes" " (signal/debug/cpuset)"); SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, "B", "Processes in jail with uid 0 have privilege"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW, "B", "Mountd/nfsd may run in the jail"); #endif SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set system-level filesystem extended attributes"); SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may adjust system time"); SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set system time"); SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may modify routing table"); #ifdef AUDIT SYSCTL_JAIL_PARAM(_allow, setaudit, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set and get audit session state"); #endif SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); /* * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return * its associated bit in the pr_allow bitmask, or zero if the parameter was * not created. */ unsigned prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, const char *descr) { struct bool_flags *bf; struct sysctl_oid *parent; char *allow_name, *allow_noname, *allowed; #ifndef NO_SYSCTL_DESCR char *descr_deprecated; #endif u_int allow_flag; if (prefix ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name) < 0 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) { free(allow_name, M_PRISON); return 0; } /* * See if this parameter has already been added, i.e. a module was * previously loaded/unloaded. */ mtx_lock(&prison0.pr_mtx); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { if (strcmp(bf->name, allow_name) == 0) { allow_flag = bf->flag; goto no_add; } } /* * Find a free bit in pr_allow_all, failing if there are none * (which shouldn't happen as long as we keep track of how many * potential dynamic flags exist). */ for (allow_flag = 1;; allow_flag <<= 1) { if (allow_flag == 0) goto no_add; if ((pr_allow_all & allow_flag) == 0) break; } /* Note the parameter in the next open slot in pr_flag_allow. */ for (bf = pr_flag_allow; ; bf++) { if (bf == pr_flag_allow + nitems(pr_flag_allow)) { /* This should never happen, but is not fatal. */ allow_flag = 0; goto no_add; } if (atomic_load_int(&bf->flag) == 0) break; } bf->name = allow_name; bf->noname = allow_noname; pr_allow_all |= allow_flag; /* * prison0 always has permission for the new parameter. * Other jails must have it granted to them. */ prison0.pr_allow |= allow_flag; /* The flag indicates a valid entry, so make sure it is set last. */ atomic_store_rel_int(&bf->flag, allow_flag); mtx_unlock(&prison0.pr_mtx); /* * Create sysctls for the parameter, and the back-compat global * permission.
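[Editorial note: the following is an illustrative, hedged sketch, not part of this change. It shows how a hypothetical kernel module might register a dynamic allow.* parameter with prison_add_allow() and later consult the returned bit with prison_allow(); the module name "example", the init/check functions, and the header set are assumptions, while prison_add_allow() and prison_allow() are the interfaces used in this file.]

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/ucred.h>
#include <sys/jail.h>

static unsigned example_allow_flag;

/* Hypothetical module init: register a dynamic "allow.example" parameter. */
static void
example_init(void)
{
	example_allow_flag = prison_add_allow(NULL, "example", NULL,
	    "Jail may use the example facility");
	/* A zero return means no free pr_allow bit was available. */
}

/* Hypothetical permission check against the jail's granted bits. */
static int
example_check(struct ucred *cred)
{
	if (example_allow_flag == 0 ||
	    !prison_allow(cred, example_allow_flag))
		return (EPERM);
	return (0);
}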
*/ parent = prefix ? SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(&sysctl___security_jail_param_allow), OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr) : &sysctl___security_jail_param_allow; (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, "B", descr); if ((prefix ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name) : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) { #ifndef NO_SYSCTL_DESCR (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)", descr); #endif (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag, sysctl_jail_default_allow, "I", descr_deprecated); #ifndef NO_SYSCTL_DESCR free(descr_deprecated, M_TEMP); #endif free(allowed, M_TEMP); } return allow_flag; no_add: mtx_unlock(&prison0.pr_mtx); free(allow_name, M_PRISON); free(allow_noname, M_PRISON); return allow_flag; } /* * The VFS system will register jail-aware filesystems here. They each get * a parameter allow.mount.xxxfs and a flag to check when a jailed user * attempts to mount. */ void prison_add_vfs(struct vfsconf *vfsp) { #ifdef NO_SYSCTL_DESCR vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, NULL); #else char *descr; (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system", vfsp->vfc_name); vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, descr); free(descr, M_TEMP); #endif } #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); if (pre != NULL) (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); if (post != NULL) (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. 
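[Editorial note: the find-or-create step that follows works only because the caller holds allprison_lock exclusively across both the lookup and the insertion, so no concurrent caller can slip in a duplicate. A hedged userland analogue of the same pattern, with a pthread mutex standing in for the sx lock; all names here are hypothetical.]

#include <sys/queue.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct obj {
	LIST_ENTRY(obj) link;
	char name[64];
	int refcount;
};

static LIST_HEAD(, obj) objs = LIST_HEAD_INITIALIZER(objs);
static pthread_mutex_t objs_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *
obj_find(const char *name)
{
	struct obj *o;

	pthread_mutex_lock(&objs_lock);
	LIST_FOREACH(o, &objs, link) {
		if (strcmp(o->name, name) == 0) {
			o->refcount++;	/* Found: take a reference. */
			goto out;
		}
	}
	/* Not found: create it while still holding the lock. */
	o = calloc(1, sizeof(*o));
	if (o != NULL) {
		strlcpy(o->name, name, sizeof(o->name));
		o->refcount = 1;
		LIST_INSERT_HEAD(&objs, o, link);
	}
out:
	pthread_mutex_unlock(&objs_lock);
	return (o);
}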
*/ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (refcount_release_if_not_last(&prr->prr_refcount)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. */ static void prison_racct_modify(struct prison *pr) { #ifdef RCTL struct proc *p; struct ucred *cred; #endif struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); #ifdef RCTL /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); rctl_proc_ucred_changed(p, cred); crfree(cred); } #endif sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ /* * Submit a knote for a prison, locking if necessary. */ static void prison_knote(struct prison *pr, long hint) { int locked; locked = mtx_owned(&pr->pr_mtx); if (!locked) mtx_lock(&pr->pr_mtx); KNOTE_LOCKED(pr->pr_klist, hint); jaildesc_knote(pr, hint); if (!locked) mtx_unlock(&pr->pr_mtx); } #ifdef DDB static void db_show_prison(struct prison *pr) { struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) int ii; struct prison_ip *pip; #endif unsigned f; #ifdef INET char ip4buf[INET_ADDRSTRLEN]; #endif #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" state = %s\n", pr->pr_state == PRISON_STATE_ALIVE ? "alive" : pr->pr_state == PRISON_STATE_DYING ? 
"dying" : "invalid"); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) if (pr->pr_flags & bf->flag) db_printf(" %s", bf->name); for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); db_printf(" %-16s= %s\n", jsf->name, (f != 0 && f == jsf->disable) ? "disable" : (f == jsf->new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) if (pr->pr_allow & bf->flag) db_printf(" %s", bf->name); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET if ((pip = pr->pr_addrs[PR_INET]) != NULL) { db_printf(" ip4s = %d\n", pip->ips); for (ii = 0; ii < pip->ips; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa_r( *(const struct in_addr *)PR_IP(pip, PR_INET, ii), ip4buf)); } #endif #ifdef INET6 if ((pip = pr->pr_addrs[PR_INET6]) != NULL) { db_printf(" ip6s = %d\n", pip->ips); for (ii = 0; ii < pip->ips; ii++) db_printf(" %s %s\n", ii == 0 ? "ip6.addr =" : " ", ip6_sprintf(ip6buf, (const struct in6_addr *)PR_IP(pip, PR_INET6, ii))); } #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ diff --git a/sys/kern/kern_loginclass.c b/sys/kern/kern_loginclass.c index 0c111c4f78d8..07d388f18f8d 100644 --- a/sys/kern/kern_loginclass.c +++ b/sys/kern/kern_loginclass.c @@ -1,258 +1,263 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Processes may set login class name using setloginclass(2). This * is usually done through call to setusercontext(3), by programs * such as login(1), based on information from master.passwd(5). Kernel * uses this information to enforce per-class resource limits. Current * login class can be determined using id(1). Login class is inherited * from the parent process during fork(2). If not set, it defaults * to "default". * * Code in this file implements setloginclass(2) and getloginclass(2) * system calls, and maintains class name storage and retrieval. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures"); LIST_HEAD(, loginclass) loginclasses; /* * Lock protecting loginclasses list. */ static struct rwlock loginclasses_lock; RW_SYSINIT(loginclasses_init, &loginclasses_lock, "loginclasses lock"); void loginclass_hold(struct loginclass *lc) { refcount_acquire(&lc->lc_refcount); } void loginclass_free(struct loginclass *lc) { if (refcount_release_if_not_last(&lc->lc_refcount)) return; rw_wlock(&loginclasses_lock); if (!refcount_release(&lc->lc_refcount)) { rw_wunlock(&loginclasses_lock); return; } racct_destroy(&lc->lc_racct); LIST_REMOVE(lc, lc_next); rw_wunlock(&loginclasses_lock); free(lc, M_LOGINCLASS); } /* * Look up a loginclass struct for the parameter name. * loginclasses_lock must be locked. * Increase refcount on loginclass struct returned. */ static struct loginclass * loginclass_lookup(const char *name) { struct loginclass *lc; rw_assert(&loginclasses_lock, RA_LOCKED); LIST_FOREACH(lc, &loginclasses, lc_next) if (strcmp(name, lc->lc_name) == 0) { loginclass_hold(lc); break; } return (lc); } /* * Return loginclass structure with a corresponding name. Not * performance critical, as it's used mainly by setloginclass(2), * which happens once per login session. Caller has to use * loginclass_free() on the returned value when it's no longer * needed. 
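[Editorial note: as a usage illustration of the contract just described, a hedged sketch; the surrounding function and header choices are hypothetical, only loginclass_find() and loginclass_free() come from this file.]

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/loginclass.h>

/* Hypothetical consumer: look up a class, use it, drop the reference. */
static int
example_use_class(const char *name)
{
	struct loginclass *lc;

	lc = loginclass_find(name);	/* Returns with a reference held. */
	if (lc == NULL)
		return (EINVAL);	/* Empty or over-long name. */
	/* ... consult lc->lc_name or lc->lc_racct here ... */
	loginclass_free(lc);		/* Pairs with loginclass_find(). */
	return (0);
}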
*/ struct loginclass * loginclass_find(const char *name) { struct loginclass *lc, *new_lc; if (name[0] == '\0' || strlen(name) >= MAXLOGNAME) return (NULL); lc = curthread->td_ucred->cr_loginclass; if (strcmp(name, lc->lc_name) == 0) { loginclass_hold(lc); return (lc); } rw_rlock(&loginclasses_lock); lc = loginclass_lookup(name); rw_runlock(&loginclasses_lock); if (lc != NULL) return (lc); new_lc = malloc(sizeof(*new_lc), M_LOGINCLASS, M_ZERO | M_WAITOK); racct_create(&new_lc->lc_racct); refcount_init(&new_lc->lc_refcount, 1); strcpy(new_lc->lc_name, name); rw_wlock(&loginclasses_lock); /* * There's a chance someone created our loginclass while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate loginclass. */ if ((lc = loginclass_lookup(name)) == NULL) { LIST_INSERT_HEAD(&loginclasses, new_lc, lc_next); rw_wunlock(&loginclasses_lock); lc = new_lc; } else { rw_wunlock(&loginclasses_lock); racct_destroy(&new_lc->lc_racct); free(new_lc, M_LOGINCLASS); } return (lc); } /* * Get login class name. */ #ifndef _SYS_SYSPROTO_H_ struct getloginclass_args { char *namebuf; size_t namelen; }; #endif /* ARGSUSED */ int sys_getloginclass(struct thread *td, struct getloginclass_args *uap) { struct loginclass *lc; size_t lcnamelen; lc = td->td_ucred->cr_loginclass; lcnamelen = strlen(lc->lc_name) + 1; if (lcnamelen > uap->namelen) return (ERANGE); return (copyout(lc->lc_name, uap->namebuf, lcnamelen)); } /* * Set login class name. */ #ifndef _SYS_SYSPROTO_H_ struct setloginclass_args { const char *namebuf; }; #endif /* ARGSUSED */ int sys_setloginclass(struct thread *td, struct setloginclass_args *uap) { struct proc *p = td->td_proc; int error; char lcname[MAXLOGNAME]; struct loginclass *newlc; struct ucred *newcred, *oldcred; error = priv_check(td, PRIV_PROC_SETLOGINCLASS); if (error != 0) return (error); error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL); if (error != 0) return (error); newlc = loginclass_find(lcname); if (newlc == NULL) return (EINVAL); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_loginclass = newlc; - proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif #ifdef RCTL crhold(newcred); #endif + /* + * Takes over 'newcred''s reference, so 'newcred' must not be used + * besides this point except on RCTL where we took an additional + * reference above. + */ + proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif loginclass_free(oldcred->cr_loginclass); crfree(oldcred); return (0); } void loginclass_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct loginclass *lc; rw_rlock(&loginclasses_lock); if (pre != NULL) (pre)(); LIST_FOREACH(lc, &loginclasses, lc_next) (callback)(lc->lc_racct, arg2, arg3); if (post != NULL) (post)(); rw_runlock(&loginclasses_lock); } diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index 47f9937e49aa..06696612f8c4 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -1,3213 +1,3237 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2000-2001 Robert N. M. Watson. * All rights reserved. 
* * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * System calls related to processes and protection */ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_43 #include #endif #include #include #include #include #include #include #include #include #include #ifdef MAC #include #endif #include #ifdef REGRESSION FEATURE(regression, "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)"); #endif #include #include static MALLOC_DEFINE(M_CRED, "cred", "credentials"); SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "BSD security policy"); static void crfree_final(struct ucred *cr); static inline void groups_check_positive_len(int ngrp) { MPASS2(ngrp >= 0, "negative number of groups"); } static inline void groups_check_max_len(int ngrp) { MPASS2(ngrp <= ngroups_max, "too many supplementary groups"); } static void groups_normalize(int *ngrp, gid_t *groups); static void crsetgroups_internal(struct ucred *cr, int ngrp, const gid_t *groups); static int cr_canseeotheruids(struct ucred *u1, struct ucred *u2); static int cr_canseeothergids(struct ucred *u1, struct ucred *u2); static int cr_canseejailproc(struct ucred *u1, struct ucred *u2); #ifndef _SYS_SYSPROTO_H_ struct getpid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getpid(struct thread *td, struct getpid_args *uap) { struct proc *p = td->td_proc; td->td_retval[0] = p->p_pid; #if defined(COMPAT_43) if (SV_PROC_FLAG(p, SV_AOUT)) td->td_retval[1] = kern_getppid(td); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getppid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getppid(struct thread *td, struct getppid_args *uap) { td->td_retval[0] = kern_getppid(td); return (0); } int 
kern_getppid(struct thread *td) { struct proc *p = td->td_proc; return (p->p_oppid); } /* * Get process group ID; note that POSIX getpgrp takes no parameter. */ #ifndef _SYS_SYSPROTO_H_ struct getpgrp_args { int dummy; }; #endif int sys_getpgrp(struct thread *td, struct getpgrp_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* Get an arbitrary pid's process group id */ #ifndef _SYS_SYSPROTO_H_ struct getpgid_args { pid_t pid; }; #endif int sys_getpgid(struct thread *td, struct getpgid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* * Get an arbitrary pid's session id. */ #ifndef _SYS_SYSPROTO_H_ struct getsid_args { pid_t pid; }; #endif int sys_getsid(struct thread *td, struct getsid_args *uap) { return (kern_getsid(td, uap->pid)); } int kern_getsid(struct thread *td, pid_t pid) { struct proc *p; int error; if (pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_session->s_sid; PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct getuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getuid(struct thread *td, struct getuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_ruid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_uid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct geteuid_args { int dummy; }; #endif /* ARGSUSED */ int sys_geteuid(struct thread *td, struct geteuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_uid; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getgid(struct thread *td, struct getgid_args *uap) { td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_gid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; }; #endif /* ARGSUSED */ int sys_getegid(struct thread *td, struct getegid_args *uap) { td->td_retval[0] = td->td_ucred->cr_gid; return (0); } #ifdef COMPAT_FREEBSD14 int freebsd14_getgroups(struct thread *td, struct freebsd14_getgroups_args *uap) { struct ucred *cred; int ngrp, error; cred = td->td_ucred; /* * For FreeBSD < 15.0, we account for the egid being placed at the * beginning of the group list prior to all supplementary groups. 
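[Editorial note: to make the layout difference concrete, a hedged userland example of what a pre-15.0 binary sees through this compat shim: the first slot is the effective GID, the supplementary groups follow. Example program only.]

#include <sys/param.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	gid_t gidset[NGROUPS_MAX + 1];
	int i, n;

	n = getgroups(nitems(gidset), gidset);
	if (n == -1)
		return (1);
	/* Under the old ABI, gidset[0] was the effective GID. */
	for (i = 0; i < n; i++)
		printf("%s %u\n", i == 0 ? "egid:" : "supp:",
		    (unsigned)gidset[i]);
	return (0);
}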
*/ ngrp = cred->cr_ngroups + 1; if (uap->gidsetsize == 0) { error = 0; goto out; } else if (uap->gidsetsize < ngrp) { return (EINVAL); } error = copyout(&cred->cr_gid, uap->gidset, sizeof(gid_t)); if (error == 0) error = copyout(cred->cr_groups, uap->gidset + 1, (ngrp - 1) * sizeof(gid_t)); out: td->td_retval[0] = ngrp; return (error); } #endif /* COMPAT_FREEBSD14 */ #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { int gidsetsize; gid_t *gidset; }; #endif int sys_getgroups(struct thread *td, struct getgroups_args *uap) { struct ucred *cred; int ngrp, error; cred = td->td_ucred; ngrp = cred->cr_ngroups; if (uap->gidsetsize == 0) { error = 0; goto out; } if (uap->gidsetsize < ngrp) return (EINVAL); error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t)); out: td->td_retval[0] = ngrp; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setsid_args { int dummy; }; #endif /* ARGSUSED */ int sys_setsid(struct thread *td, struct setsid_args *uap) { struct pgrp *pgrp; int error; struct proc *p = td->td_proc; struct pgrp *newpgrp; struct session *newsess; pgrp = NULL; newpgrp = uma_zalloc(pgrp_zone, M_WAITOK); newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); again: error = 0; sx_xlock(&proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); error = EPERM; } else { error = enterpgrp(p, p->p_pid, newpgrp, newsess); if (error == ERESTART) goto again; MPASS(error == 0); td->td_retval[0] = p->p_pid; newpgrp = NULL; newsess = NULL; } sx_xunlock(&proctree_lock); uma_zfree(pgrp_zone, newpgrp); free(newsess, M_SESSION); return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pid must be caller or child of caller (ESRCH) * if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ #ifndef _SYS_SYSPROTO_H_ struct setpgid_args { int pid; /* target process id */ int pgid; /* target pgrp id */ }; #endif /* ARGSUSED */ int sys_setpgid(struct thread *td, struct setpgid_args *uap) { struct proc *curp = td->td_proc; struct proc *targp; /* target process */ struct pgrp *pgrp; /* target pgrp */ int error; struct pgrp *newpgrp; if (uap->pgid < 0) return (EINVAL); newpgrp = uma_zalloc(pgrp_zone, M_WAITOK); again: error = 0; sx_xlock(&proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; goto done; } if (!inferior(targp)) { PROC_UNLOCK(targp); error = ESRCH; goto done; } if ((error = p_cansee(td, targp))) { PROC_UNLOCK(targp); goto done; } if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) { PROC_UNLOCK(targp); error = EPERM; goto done; } if (targp->p_flag & P_EXEC) { PROC_UNLOCK(targp); error = EACCES; goto done; } PROC_UNLOCK(targp); } else targp = curp; if (SESS_LEADER(targp)) { error = EPERM; goto done; } if (uap->pgid == 0) uap->pgid = targp->p_pid; if ((pgrp = pgfind(uap->pgid)) == NULL) { if (uap->pgid == targp->p_pid) { error = enterpgrp(targp, uap->pgid, newpgrp, NULL); if (error == 0) newpgrp = NULL; } else error = EPERM; } else { if (pgrp == targp->p_pgrp) { PGRP_UNLOCK(pgrp); goto done; } if (pgrp->pg_id != targp->p_pid && pgrp->pg_session != curp->p_session) { PGRP_UNLOCK(pgrp); error = EPERM; goto done; } PGRP_UNLOCK(pgrp); error = enterthispgrp(targp, pgrp); } done: KASSERT(error == 0 || newpgrp != NULL, ("setpgid failed and 
newpgrp is NULL")); if (error == ERESTART) goto again; sx_xunlock(&proctree_lock); uma_zfree(pgrp_zone, newpgrp); return (error); } static int gidp_cmp(const void *p1, const void *p2) { const gid_t g1 = *(const gid_t *)p1; const gid_t g2 = *(const gid_t *)p2; return ((g1 > g2) - (g1 < g2)); } /* * Final storage for supplementary groups will be returned via 'groups'. * '*groups' must be NULL on input, and if not equal to 'smallgroups' * on output, must be freed (M_TEMP) *even if* an error is returned. */ static int kern_setcred_copyin_supp_groups(struct setcred *const wcred, const u_int flags, gid_t *const smallgroups, gid_t **const groups) { MPASS(*groups == NULL); if (flags & SETCREDF_SUPP_GROUPS) { int error; /* * Check for the limit for number of groups right now in order * to limit the amount of bytes to copy. */ if (wcred->sc_supp_groups_nb > ngroups_max) return (EINVAL); /* * Since we are going to be copying the supplementary groups * from userland, make room also for the effective GID right * now, to avoid having to allocate and copy again the * supplementary groups. */ *groups = wcred->sc_supp_groups_nb <= CRED_SMALLGROUPS_NB ? smallgroups : malloc(wcred->sc_supp_groups_nb * sizeof(**groups), M_TEMP, M_WAITOK); error = copyin(wcred->sc_supp_groups, *groups, wcred->sc_supp_groups_nb * sizeof(**groups)); if (error != 0) return (error); wcred->sc_supp_groups = *groups; } else { wcred->sc_supp_groups_nb = 0; wcred->sc_supp_groups = NULL; } return (0); } int user_setcred(struct thread *td, const u_int flags, const void *const uwcred, const size_t size, bool is_32bit) { struct setcred wcred; #ifdef MAC struct mac mac; /* Pointer to 'struct mac' or 'struct mac32'. */ void *umac; #endif gid_t smallgroups[CRED_SMALLGROUPS_NB]; gid_t *groups = NULL; int error; /* * As the only point of this wrapper function is to copyin() from * userland, we only interpret the data pieces we need to perform this * operation and defer further sanity checks to kern_setcred(), except * that we redundantly check here that no unknown flags have been * passed. */ if ((flags & ~SETCREDF_MASK) != 0) return (EINVAL); #ifdef COMPAT_FREEBSD32 if (is_32bit) { struct setcred32 wcred32; if (size != sizeof(wcred32)) return (EINVAL); error = copyin(uwcred, &wcred32, sizeof(wcred32)); if (error != 0) return (error); /* These fields have exactly the same sizes and positions. */ memcpy(&wcred, &wcred32, &wcred32.setcred32_copy_end - &wcred32.setcred32_copy_start); /* Remaining fields are pointers and need PTRIN*(). */ PTRIN_CP(wcred32, wcred, sc_supp_groups); PTRIN_CP(wcred32, wcred, sc_label); } else #endif /* COMPAT_FREEBSD32 */ { if (size != sizeof(wcred)) return (EINVAL); error = copyin(uwcred, &wcred, sizeof(wcred)); if (error != 0) return (error); } #ifdef MAC umac = wcred.sc_label; #endif /* Also done on !MAC as a defensive measure. */ wcred.sc_label = NULL; /* * Copy supplementary groups as needed. There is no specific * alternative for 32-bit compatibility as 'gid_t' has the same size * everywhere.
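[Editorial note: for context, a hedged sketch of the userland side that feeds this wrapper. The SETCREDF_* flags and the struct setcred field names are the ones used in this file; the userland prototype, the <sys/ucred.h> header choice, and the concrete IDs are assumptions.]

#include <sys/param.h>
#include <sys/ucred.h>
#include <err.h>

int
main(void)
{
	gid_t supp[] = { 5, 920 };	/* Made-up supplementary groups. */
	struct setcred wcred = {
		.sc_uid = 1001, .sc_ruid = 1001, .sc_svuid = 1001,
		.sc_gid = 1001, .sc_rgid = 1001, .sc_svgid = 1001,
		.sc_supp_groups = supp,
		.sc_supp_groups_nb = nitems(supp),
	};

	/* Change every ID in one atomic call, as kern_setcred() promises. */
	if (setcred(SETCREDF_UID | SETCREDF_RUID | SETCREDF_SVUID |
	    SETCREDF_GID | SETCREDF_RGID | SETCREDF_SVGID |
	    SETCREDF_SUPP_GROUPS, &wcred, sizeof(wcred)) != 0)
		err(1, "setcred");
	return (0);
}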
*/ error = kern_setcred_copyin_supp_groups(&wcred, flags, smallgroups, &groups); if (error != 0) goto free_groups; #ifdef MAC if ((flags & SETCREDF_MAC_LABEL) != 0) { #ifdef COMPAT_FREEBSD32 if (is_32bit) error = mac_label_copyin32(umac, &mac, NULL); else #endif error = mac_label_copyin(umac, &mac, NULL); if (error != 0) goto free_groups; wcred.sc_label = &mac; } #endif error = kern_setcred(td, flags, &wcred, groups); #ifdef MAC if (wcred.sc_label != NULL) free_copied_label(wcred.sc_label); #endif free_groups: if (groups != smallgroups) free(groups, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setcred_args { u_int flags; /* Flags. */ const struct setcred *wcred; size_t size; /* Passed 'setcred' structure length. */ }; #endif /* ARGSUSED */ int sys_setcred(struct thread *td, struct setcred_args *uap) { return (user_setcred(td, uap->flags, uap->wcred, uap->size, false)); } /* * CAUTION: This function normalizes groups in 'wcred'. * * If 'preallocated_groups' is non-NULL, it must be an already allocated array * of size 'wcred->sc_supp_groups_nb' containing the supplementary groups, and * 'wcred->sc_supp_groups' then must point to it. */ int kern_setcred(struct thread *const td, const u_int flags, struct setcred *const wcred, gid_t *preallocated_groups) { struct proc *const p = td->td_proc; struct ucred *new_cred, *old_cred, *to_free_cred; struct uidinfo *uip = NULL, *ruip = NULL; #ifdef MAC void *mac_set_proc_data = NULL; bool proc_label_set = false; #endif gid_t *groups = NULL; gid_t smallgroups[CRED_SMALLGROUPS_NB]; int error; bool cred_set = false; /* Bail out on unrecognized flags. */ if (flags & ~SETCREDF_MASK) return (EINVAL); /* * Part 1: We allocate and perform preparatory operations with no locks. */ if (flags & SETCREDF_SUPP_GROUPS) { if (wcred->sc_supp_groups_nb > ngroups_max) return (EINVAL); if (preallocated_groups != NULL) { groups = preallocated_groups; MPASS(preallocated_groups == wcred->sc_supp_groups); } else { if (wcred->sc_supp_groups_nb <= CRED_SMALLGROUPS_NB) groups = smallgroups; else groups = malloc(wcred->sc_supp_groups_nb * sizeof(*groups), M_TEMP, M_WAITOK); memcpy(groups, wcred->sc_supp_groups, wcred->sc_supp_groups_nb * sizeof(*groups)); } } if (flags & SETCREDF_MAC_LABEL) { #ifdef MAC error = mac_set_proc_prepare(td, wcred->sc_label, &mac_set_proc_data); if (error != 0) goto free_groups; #else error = ENOTSUP; goto free_groups; #endif } if (flags & SETCREDF_UID) { AUDIT_ARG_EUID(wcred->sc_uid); uip = uifind(wcred->sc_uid); } if (flags & SETCREDF_RUID) { AUDIT_ARG_RUID(wcred->sc_ruid); ruip = uifind(wcred->sc_ruid); } if (flags & SETCREDF_SVUID) AUDIT_ARG_SUID(wcred->sc_svuid); if (flags & SETCREDF_GID) AUDIT_ARG_EGID(wcred->sc_gid); if (flags & SETCREDF_RGID) AUDIT_ARG_RGID(wcred->sc_rgid); if (flags & SETCREDF_SVGID) AUDIT_ARG_SGID(wcred->sc_svgid); if (flags & SETCREDF_SUPP_GROUPS) { /* * Output the raw supplementary groups array for better * traceability. */ AUDIT_ARG_GROUPSET(groups, wcred->sc_supp_groups_nb); groups_normalize(&wcred->sc_supp_groups_nb, groups); } /* * We first completely build the new credentials and only then pass them * to MAC along with the old ones so that modules can check whether the * requested transition is allowed. 
*/ new_cred = crget(); to_free_cred = new_cred; if (flags & SETCREDF_SUPP_GROUPS) crextend(new_cred, wcred->sc_supp_groups_nb); #ifdef MAC mac_cred_setcred_enter(); #endif /* * Part 2: We grab the process lock as to have a stable view of its * current credentials, and prepare a copy of them with the requested * changes applied under that lock. */ PROC_LOCK(p); old_cred = crcopysafe(p, new_cred); /* * Change user IDs. */ if (flags & SETCREDF_UID) change_euid(new_cred, uip); if (flags & SETCREDF_RUID) change_ruid(new_cred, ruip); if (flags & SETCREDF_SVUID) change_svuid(new_cred, wcred->sc_svuid); /* * Change groups. */ if (flags & SETCREDF_SUPP_GROUPS) crsetgroups_internal(new_cred, wcred->sc_supp_groups_nb, groups); if (flags & SETCREDF_GID) change_egid(new_cred, wcred->sc_gid); if (flags & SETCREDF_RGID) change_rgid(new_cred, wcred->sc_rgid); if (flags & SETCREDF_SVGID) change_svgid(new_cred, wcred->sc_svgid); #ifdef MAC /* * Change the MAC label. */ if (flags & SETCREDF_MAC_LABEL) { error = mac_set_proc_core(td, new_cred, mac_set_proc_data); if (error != 0) goto unlock_finish; proc_label_set = true; } /* * MAC security modules checks. */ error = mac_cred_check_setcred(flags, old_cred, new_cred); if (error != 0) goto unlock_finish; #endif /* * Privilege check. */ error = priv_check_cred(old_cred, PRIV_CRED_SETCRED); if (error != 0) goto unlock_finish; +#ifdef RACCT /* - * Set the new credentials, noting that they have changed. + * Hold a reference to 'new_cred', as we need to call some functions on + * it after proc_set_cred_enforce_proc_lim(). */ + crhold(new_cred); +#endif + + /* Set the new credentials. */ cred_set = proc_set_cred_enforce_proc_lim(p, new_cred); if (cred_set) { setsugid(p); - to_free_cred = old_cred; #ifdef RACCT + /* Adjust RACCT counters. */ racct_proc_ucred_changed(p, old_cred, new_cred); #endif -#ifdef RCTL - crhold(new_cred); -#endif + to_free_cred = old_cred; MPASS(error == 0); - } else + } else { +#ifdef RACCT + /* Matches the crhold() just before the containing 'if'. */ + crfree(new_cred); +#endif error = EAGAIN; + } unlock_finish: PROC_UNLOCK(p); /* * Part 3: After releasing the process lock, we perform cleanups and * finishing operations. */ -#ifdef RCTL +#ifdef RACCT if (cred_set) { +#ifdef RCTL rctl_proc_ucred_changed(p, new_cred); - /* Paired with the crhold() just above. */ +#endif + /* Paired with the crhold() above. */ crfree(new_cred); } #endif #ifdef MAC if (mac_set_proc_data != NULL) mac_set_proc_finish(td, proc_label_set, mac_set_proc_data); mac_cred_setcred_exit(); #endif crfree(to_free_cred); if (uip != NULL) uifree(uip); if (ruip != NULL) uifree(ruip); free_groups: if (groups != preallocated_groups && groups != smallgroups) free(groups, M_TEMP); /* Deals with 'groups' being NULL. */ return (error); } /* * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD * compatible. It says that setting the uid/gid to euid/egid is a special * case of "appropriate privilege". Once the rules are expanded out, this * basically means that setuid(nnn) sets all three id's, in all permitted * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) * does not set the saved id - this is dangerous for traditional BSD * programs. For this reason, we *really* do not want to set * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. 
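[Editorial note: a hedged userland demonstration of the behavior the B.4.2.2 choice produces in the default build: without privilege, setuid(getuid()) is a permitted case, and on success all three UIDs collapse to that value. Example program only.]

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	uid_t r, e, s;

	if (setuid(getuid()) == -1)	/* Permitted: target == real UID. */
		return (1);
	if (getresuid(&r, &e, &s) == -1)
		return (1);
	/* All three are now equal. */
	printf("ruid=%u euid=%u svuid=%u\n", (unsigned)r, (unsigned)e,
	    (unsigned)s);
	return (0);
}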
*/ #define POSIX_APPENDIX_B_4_2_2 #ifndef _SYS_SYSPROTO_H_ struct setuid_args { uid_t uid; }; #endif /* ARGSUSED */ int sys_setuid(struct thread *td, struct setuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t uid; struct uidinfo *uip; int error; uid = uap->uid; AUDIT_ARG_UID(uid); newcred = crget(); uip = uifind(uid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. */ oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setuid(oldcred, uid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setuid(geteuid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setuid(xx)" sets all * three id's (assuming you have privs). * * Notes on the logic. We do things in three steps. * 1: We determine if the euid is going to change, and do EPERM * right away. We unconditionally change the euid later if this * test is satisfied, simplifying that part of the logic. * 2: We determine if the real and/or saved uids are going to * change. Determined by compile options. * 3: Change euid last. (after tests in #2 for "appropriate privs") */ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ #ifdef _POSIX_SAVED_IDS uid != oldcred->cr_svuid && /* allow setuid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETUID)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or uid == euid) * If so, we are changing the real uid and/or saved uid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ uid == oldcred->cr_uid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETUID) == 0) #endif { /* * Set the real uid. */ if (uid != oldcred->cr_ruid) { change_ruid(newcred, uip); setsugid(p); } /* * Set saved uid * * XXX always set saved uid even if not _POSIX_SAVED_IDS, as * the security of seteuid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (uid != oldcred->cr_svuid) { change_svuid(newcred, uid); setsugid(p); } } /* * In all permitted cases, we are changing the euid. */ if (uid != oldcred->cr_uid) { change_euid(newcred, uip); setsugid(p); } - /* - * This also transfers the proc count to the new user. - */ - proc_set_cred(p, newcred); + #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif #ifdef RCTL crhold(newcred); #endif + /* + * Takes over 'newcred''s reference, so 'newcred' must not be used + * besides this point except on RCTL where we took an additional + * reference above. + */ + proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(uip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(uip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct seteuid_args { uid_t euid; }; #endif /* ARGSUSED */ int sys_seteuid(struct thread *td, struct seteuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid; struct uidinfo *euip; int error; euid = uap->euid; AUDIT_ARG_EUID(euid); newcred = crget(); euip = uifind(euid); PROC_LOCK(p); /* * Copy credentials so other references do not see our changes. 
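[Editorial note: a hedged illustration of the idiom enabled by the seteuid() checks just below, which limit unprivileged targets to the real and saved UIDs: a set-user-ID program can bracket risky work by dropping and later regaining its effective UID. The helper is hypothetical; return values are deliberately ignored for brevity.]

#include <unistd.h>

static void
with_reduced_privilege(void (*fn)(void))
{
	uid_t saved_euid = geteuid();

	(void)seteuid(getuid());	/* Drop to the real UID. */
	fn();
	(void)seteuid(saved_euid);	/* Regain: the saved UID keeps it. */
}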
oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_seteuid(oldcred, euid); if (error) goto fail; #endif if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID)) != 0) goto fail; /* * Everything's okay, do it. */ if (oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgid_args { gid_t gid; }; #endif /* ARGSUSED */ int sys_setgid(struct thread *td, struct setgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t gid; int error; gid = uap->gid; AUDIT_ARG_GID(gid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setgid(oldcred, gid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setgid(getegid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setgid(xx)" sets all * three id's (assuming you have privs). * * For notes on the logic here, see setuid() above. */ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ #ifdef _POSIX_SAVED_IDS gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ gid != oldcred->cr_gid && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0) goto fail; #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or gid == egid) * If so, we are changing the real gid and saved gid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ gid == oldcred->cr_gid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0) #endif { /* * Set real gid */ if (oldcred->cr_rgid != gid) { change_rgid(newcred, gid); setsugid(p); } /* * Set saved gid * * XXX always set saved gid even if not _POSIX_SAVED_IDS, as * the security of setegid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } /* * In all permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes.
*/ if (oldcred->cr_gid != gid) { change_egid(newcred, gid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setegid_args { gid_t egid; }; #endif /* ARGSUSED */ int sys_setegid(struct thread *td, struct setegid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid; int error; egid = uap->egid; AUDIT_ARG_EGID(egid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setegid(oldcred, egid); if (error) goto fail; #endif if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0) goto fail; if (oldcred->cr_gid != egid) { change_egid(newcred, egid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifdef COMPAT_FREEBSD14 int freebsd14_setgroups(struct thread *td, struct freebsd14_setgroups_args *uap) { gid_t smallgroups[CRED_SMALLGROUPS_NB]; gid_t *groups; int gidsetsize, error; /* * Before FreeBSD 15.0, we allow one more group to be supplied to * account for the egid appearing before the supplementary groups. This * may technically allow one more supplementary group for systems that * did use the default NGROUPS_MAX if we round it back up to 1024. */ gidsetsize = uap->gidsetsize; if (gidsetsize > ngroups_max + 1 || gidsetsize < 0) return (EINVAL); if (gidsetsize > CRED_SMALLGROUPS_NB) groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK); else groups = smallgroups; error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t)); if (error == 0) { int ngroups = gidsetsize > 0 ? gidsetsize - 1 /* egid */ : 0; error = kern_setgroups(td, &ngroups, groups + 1); if (error == 0 && gidsetsize > 0) td->td_proc->p_ucred->cr_gid = groups[0]; } if (groups != smallgroups) free(groups, M_TEMP); return (error); } #endif /* COMPAT_FREEBSD14 */ #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { int gidsetsize; gid_t *gidset; }; #endif /* ARGSUSED */ int sys_setgroups(struct thread *td, struct setgroups_args *uap) { gid_t smallgroups[CRED_SMALLGROUPS_NB]; gid_t *groups; int gidsetsize, error; /* * Sanity check size now to avoid passing too big a value to copyin(), * even if kern_setgroups() will do it again. * * Ideally, the 'gidsetsize' argument should have been a 'u_int' (and it * was, in this implementation, for a long time), but POSIX standardized * getgroups() to take an 'int' and it would be quite entrapping to have * setgroups() differ. */ gidsetsize = uap->gidsetsize; if (gidsetsize > ngroups_max || gidsetsize < 0) return (EINVAL); if (gidsetsize > CRED_SMALLGROUPS_NB) groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK); else groups = smallgroups; error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t)); if (error == 0) error = kern_setgroups(td, &gidsetsize, groups); if (groups != smallgroups) free(groups, M_TEMP); return (error); } /* * CAUTION: This function normalizes 'groups', possibly also changing the value * of '*ngrpp' as a consequence. */ int kern_setgroups(struct thread *td, int *ngrpp, gid_t *groups) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int ngrp, error; ngrp = *ngrpp; /* Sanity check size. 
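[Editorial note: kern_setgroups() below normalizes the incoming array; per the INVARIANTS check further down, groups_normalize() leaves it strictly increasing, i.e. sorted and deduplicated, which is what lets group_is_supplementary() binary-search it. A hedged stand-alone rendition of such a normalization step, with a comparator mirroring gidp_cmp() above:]

#include <sys/types.h>
#include <stdlib.h>

static int
gid_cmp(const void *p1, const void *p2)
{
	const gid_t g1 = *(const gid_t *)p1, g2 = *(const gid_t *)p2;

	return ((g1 > g2) - (g1 < g2));
}

/* Sort ascending, then compact duplicates in place. */
static void
normalize_groups(int *ngrp, gid_t *groups)
{
	int i, j;

	qsort(groups, *ngrp, sizeof(*groups), gid_cmp);
	for (i = 1, j = 0; i < *ngrp; i++)
		if (groups[i] != groups[j])
			groups[++j] = groups[i];
	*ngrp = *ngrp > 0 ? j + 1 : 0;
}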
*/ if (ngrp < 0 || ngrp > ngroups_max) return (EINVAL); AUDIT_ARG_GROUPSET(groups, ngrp); groups_normalize(&ngrp, groups); *ngrpp = ngrp; newcred = crget(); crextend(newcred, ngrp); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC /* * We pass NULL here explicitly if we don't have any supplementary * groups mostly for the sake of normalization, but also to avoid/detect * a situation where a MAC module has some assumption about the layout * of `groups` matching historical behavior. */ error = mac_cred_check_setgroups(oldcred, ngrp, ngrp == 0 ? NULL : groups); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS); if (error) goto fail; crsetgroups_internal(newcred, ngrp, groups); setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setreuid_args { uid_t ruid; uid_t euid; }; #endif /* ARGSUSED */ int sys_setreuid(struct thread *td, struct setreuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setreuid(oldcred, ruid, euid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid) || (euid != (uid_t)-1 && euid != oldcred->cr_uid && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && newcred->cr_svuid != newcred->cr_uid) { change_svuid(newcred, newcred->cr_uid); setsugid(p); } - proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif #ifdef RCTL crhold(newcred); #endif + /* + * Takes over 'newcred''s reference, so 'newcred' must not be used + * besides this point except on RCTL where we took an additional + * reference above. 
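[Editorial note: the hunks in this change all follow the same ownership rule, distilled in this hedged kernel-context sketch; the function name is hypothetical, RCTL is assumed compiled in, and the header set is an assumption.]

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/ucred.h>
#include <sys/rctl.h>

static void
example_commit_cred(struct proc *p, struct ucred *newcred)
{
	PROC_LOCK_ASSERT(p, MA_OWNED);
	crhold(newcred);		/* Extra ref: used after hand-off. */
	proc_set_cred(p, newcred);	/* Consumes the caller's reference. */
	PROC_UNLOCK(p);
	rctl_proc_ucred_changed(p, newcred);	/* Safe: extra ref held. */
	crfree(newcred);		/* Release the extra reference. */
}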
+ */ + proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setregid_args { gid_t rgid; gid_t egid; }; #endif /* ARGSUSED */ int sys_setregid(struct thread *td, struct setregid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid; int error; egid = uap->egid; rgid = uap->rgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setregid(oldcred, rgid, egid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || (egid != (gid_t)-1 && egid != oldcred->cr_gid && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_gid != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if ((rgid != (gid_t)-1 || newcred->cr_gid != newcred->cr_rgid) && newcred->cr_svgid != newcred->cr_gid) { change_svgid(newcred, newcred->cr_gid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } /* * setresuid(ruid, euid, suid) is like setreuid except control over the saved * uid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresuid_args { uid_t ruid; uid_t euid; uid_t suid; }; #endif /* ARGSUSED */ int sys_setresuid(struct thread *td, struct setresuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid, suid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; suid = uap->suid; AUDIT_ARG_EUID(euid); AUDIT_ARG_RUID(ruid); AUDIT_ARG_SUID(suid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresuid(oldcred, ruid, euid, suid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid && ruid != oldcred->cr_uid) || (euid != (uid_t)-1 && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid && euid != oldcred->cr_uid) || (suid != (uid_t)-1 && suid != oldcred->cr_ruid && suid != oldcred->cr_svuid && suid != oldcred->cr_uid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID)) != 0) goto fail; if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { change_svuid(newcred, suid); setsugid(p); } - proc_set_cred(p, newcred); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif #ifdef RCTL crhold(newcred); #endif + /* + * Takes over 'newcred''s reference, so 'newcred' must not be used + * besides this point except on RCTL where we took an additional + * reference above. 
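[Editorial note: as a usage note for the explicit-control point made above, a hedged userland example of the classic idiom setresuid() enables, an irrevocable privilege drop; the helper name is hypothetical.]

#include <unistd.h>
#include <err.h>

static void
drop_privileges_permanently(uid_t uid)
{
	if (setresuid(uid, uid, uid) == -1)
		err(1, "setresuid");
	/* No way back: real, effective and saved UIDs all changed. */
}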
+ */ + proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } /* * setresgid(rgid, egid, sgid) is like setregid except control over the saved * gid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresgid_args { gid_t rgid; gid_t egid; gid_t sgid; }; #endif /* ARGSUSED */ int sys_setresgid(struct thread *td, struct setresgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid, sgid; int error; egid = uap->egid; rgid = uap->rgid; sgid = uap->sgid; AUDIT_ARG_EGID(egid); AUDIT_ARG_RGID(rgid); AUDIT_ARG_SGID(sgid); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); #ifdef MAC error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && rgid != oldcred->cr_gid) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && egid != oldcred->cr_gid) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && sgid != oldcred->cr_gid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0) goto fail; if (egid != (gid_t)-1 && oldcred->cr_gid != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { change_svgid(newcred, sgid); setsugid(p); } proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getresuid_args { uid_t *ruid; uid_t *euid; uid_t *suid; }; #endif /* ARGSUSED */ int sys_getresuid(struct thread *td, struct getresuid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->ruid) error1 = copyout(&cred->cr_ruid, uap->ruid, sizeof(cred->cr_ruid)); if (uap->euid) error2 = copyout(&cred->cr_uid, uap->euid, sizeof(cred->cr_uid)); if (uap->suid) error3 = copyout(&cred->cr_svuid, uap->suid, sizeof(cred->cr_svuid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct getresgid_args { gid_t *rgid; gid_t *egid; gid_t *sgid; }; #endif /* ARGSUSED */ int sys_getresgid(struct thread *td, struct getresgid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->rgid) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) error2 = copyout(&cred->cr_gid, uap->egid, sizeof(cred->cr_gid)); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct issetugid_args { int dummy; }; #endif /* ARGSUSED */ int sys_issetugid(struct thread *td, struct issetugid_args *uap) { struct proc *p = td->td_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use P_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ td->td_retval[0] = (p->p_flag & P_SUGID) ? 
1 : 0; return (0); } int sys___setugid(struct thread *td, struct __setugid_args *uap) { #ifdef REGRESSION struct proc *p; p = td->td_proc; switch (uap->flag) { case 0: PROC_LOCK(p); p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); return (0); case 1: PROC_LOCK(p); p->p_flag |= P_SUGID; PROC_UNLOCK(p); return (0); default: return (EINVAL); } #else /* !REGRESSION */ return (ENOSYS); #endif /* REGRESSION */ } #ifdef INVARIANTS static void groups_check_normalized(int ngrp, const gid_t *groups) { gid_t prev_g; groups_check_positive_len(ngrp); groups_check_max_len(ngrp); if (ngrp <= 1) return; prev_g = groups[0]; for (int i = 1; i < ngrp; ++i) { const gid_t g = groups[i]; if (prev_g >= g) panic("%s: groups[%d] (%u) >= groups[%d] (%u)", __func__, i - 1, prev_g, i, g); prev_g = g; } } #else #define groups_check_normalized(...) #endif /* * Returns whether gid designates a supplementary group in cred. */ bool group_is_supplementary(const gid_t gid, const struct ucred *const cred) { groups_check_normalized(cred->cr_ngroups, cred->cr_groups); /* * Perform a binary search of the supplementary groups. This is * possible because we sort the groups in crsetgroups(). */ return (bsearch(&gid, cred->cr_groups, cred->cr_ngroups, sizeof(gid), gidp_cmp) != NULL); } /* * Check if gid is a member of the (effective) group set (i.e., effective and * supplementary groups). */ bool groupmember(gid_t gid, const struct ucred *cred) { groups_check_positive_len(cred->cr_ngroups); if (gid == cred->cr_gid) return (true); return (group_is_supplementary(gid, cred)); } /* * Check if gid is a member of the real group set (i.e., real and supplementary * groups). */ bool realgroupmember(gid_t gid, const struct ucred *cred) { groups_check_positive_len(cred->cr_ngroups); if (gid == cred->cr_rgid) return (true); return (group_is_supplementary(gid, cred)); } /* * Test the active securelevel against a given level. securelevel_gt() * implements (securelevel > level). securelevel_ge() implements * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * * Due to care taken when setting the securelevel, we know that no jail will * be less secure than its parent (or the physical system), so it is sufficient * to test the current jail only. * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. */ int securelevel_gt(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0); } /* * 'see_other_uids' determines whether or not visibility of processes * and sockets with credentials holding different real uids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_uids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, &see_other_uids, 0, "Unprivileged processes may see subjects/objects with different real uid"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_other_uids' policy.
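 *
 * For example, engaging the restriction with
 *
 *	sysctl security.bsd.see_other_uids=0
 *
 * leaves unprivileged callers of the various MIBs able to see only
 * subjects/objects sharing their real uid.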
* Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_canseeotheruids(struct ucred *u1, struct ucred *u2) { if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { if (priv_check_cred(u1, PRIV_SEEOTHERUIDS) != 0) return (ESRCH); } return (0); } /* * 'see_other_gids' determines whether or not visibility of processes * and sockets with credentials holding different real gids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_gids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, &see_other_gids, 0, "Unprivileged processes may see subjects/objects with different real gid"); /* * Determine if u1 can "see" the subject specified by u2, according to the * 'see_other_gids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_canseeothergids(struct ucred *u1, struct ucred *u2) { if (see_other_gids) return (0); /* Restriction in force. */ if (realgroupmember(u1->cr_rgid, u2)) return (0); for (int i = 0; i < u1->cr_ngroups; i++) if (realgroupmember(u1->cr_groups[i], u2)) return (0); if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) == 0) return (0); return (ESRCH); } /* * 'see_jail_proc' determines whether or not visibility of processes and * sockets with credentials holding different jail ids is possible using a * variety of system MIBs. * * XXX: data declarations should be together near the beginning of the file. */ static int see_jail_proc = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_jail_proc, CTLFLAG_RW, &see_jail_proc, 0, "Unprivileged processes may see subjects/objects with different jail ids"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_jail_proc' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_canseejailproc(struct ucred *u1, struct ucred *u2) { if (see_jail_proc || /* Policy deactivated. */ u1->cr_prison == u2->cr_prison || /* Same jail. */ priv_check_cred(u1, PRIV_SEEJAILPROC) == 0) /* Privileged. */ return (0); return (ESRCH); } /* * Determine if u1 can tamper with the subject specified by u2, if they are in * different jails and the 'unprivileged_parent_tampering' jail policy allows * it. * * May be called if u1 and u2 are in the same jail, but it is expected that the * caller has already done a prison_check() prior to calling it. * * Returns: 0 for permitted, EPERM otherwise */ static int cr_can_tamper_with_subjail(struct ucred *u1, struct ucred *u2, int priv) { MPASS(prison_check(u1, u2) == 0); if (u1->cr_prison == u2->cr_prison) return (0); if (priv_check_cred(u1, priv) == 0) return (0); /* * Jails do not maintain a distinct UID space, so by default process * visibility is all that would control an unprivileged process' * ability to tamper with a process in a subjail; the * allow.unprivileged_parent_tampering knob exists to restrict that. */ if (prison_allow(u2, PR_ALLOW_UNPRIV_PARENT_TAMPER)) return (0); return (EPERM); } /* * Helper for cr_cansee*() functions to abide by system-wide security.bsd.see_* * policies. Determines if u1 "can see" u2 according to these policies.
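 *
 * A hardened host can engage all three policies at once, e.g.:
 *
 *	sysctl security.bsd.see_other_uids=0 \
 *	    security.bsd.see_other_gids=0 \
 *	    security.bsd.see_jail_proc=0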
* Returns: 0 for permitted, ESRCH otherwise */ int cr_bsd_visible(struct ucred *u1, struct ucred *u2) { int error; error = cr_canseeotheruids(u1, u2); if (error != 0) return (error); error = cr_canseeothergids(u1, u2); if (error != 0) return (error); error = cr_canseejailproc(u1, u2); if (error != 0) return (error); return (0); } /*- * Determine if u1 "can see" the subject specified by u2. * Returns: 0 for permitted, an errno value otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_cansee(struct ucred *u1, struct ucred *u2) { int error; if ((error = prison_check(u1, u2))) return (error); #ifdef MAC if ((error = mac_cred_check_visible(u1, u2))) return (error); #endif if ((error = cr_bsd_visible(u1, u2))) return (error); return (0); } /*- * Determine if td "can see" the subject specified by p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect p->p_ucred must be held. td really * should be curthread. * References: td and p must be valid for the lifetime of the call */ int p_cansee(struct thread *td, struct proc *p) { /* Wrap cr_cansee() for all functionality. */ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); return (cr_cansee(td->td_ucred, p->p_ucred)); } /* * 'conservative_signals' prevents the delivery of a broad class of * signals by unprivileged processes to processes that have changed their * credentials since the last invocation of execve(). This can prevent * the leakage of cached information or retained privileges as a result * of a common class of signal-related vulnerabilities. However, this * may interfere with some applications that expect to be able to * deliver these signals to peer processes after having given up * privilege. */ static int conservative_signals = 1; SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW, &conservative_signals, 0, "Unprivileged processes prevented from " "sending certain signals to processes whose credentials have changed"); /*- * Determine whether cred may deliver the specified signal to proc. * Returns: 0 for permitted, an errno value otherwise. * Locks: A lock must be held for proc. * References: cred and proc must be valid for the lifetime of the call. */ int cr_cansignal(struct ucred *cred, struct proc *proc, int signum) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); /* * Jail semantics limit the scope of signalling to proc in the * same jail as cred, if cred is in jail. */ error = prison_check(cred, proc->p_ucred); if (error) return (error); #ifdef MAC if ((error = mac_proc_check_signal(cred, proc, signum))) return (error); #endif if ((error = cr_bsd_visible(cred, proc->p_ucred))) return (error); /* * UNIX signal semantics depend on the status of the P_SUGID * bit on the target process. If the bit is set, then additional * restrictions are placed on the set of available signals. */ if (conservative_signals && (proc->p_flag & P_SUGID)) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: /* * Generally, permit job and terminal control * signals. */ break; default: /* Not permitted without privilege. 
*/ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID); if (error) return (error); } } /* * Generally, the target credential's ruid or svuid must match the * subject credential's ruid or euid. */ if (cred->cr_ruid != proc->p_ucred->cr_ruid && cred->cr_ruid != proc->p_ucred->cr_svuid && cred->cr_uid != proc->p_ucred->cr_ruid && cred->cr_uid != proc->p_ucred->cr_svuid) { error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED); if (error) return (error); } /* * At this point, the target may be in a different jail than the * subject -- the subject must be in a parent jail to the target, * whether it is prison0 or a subordinate of prison0 that has * children. Additional privileges are required to allow this, as * whether the creds are truly equivalent or not must be determined on * a case-by-case basis. */ error = cr_can_tamper_with_subjail(cred, proc->p_ucred, PRIV_SIGNAL_DIFFJAIL); if (error) return (error); return (0); } /*- * Determine whether td may deliver the specified signal to p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must be * held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansignal(struct thread *td, struct proc *p, int signum) { KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); /* * UNIX signalling semantics require that processes in the same * session always be able to deliver SIGCONT to one another, * overriding the remaining protections. */ /* XXX: This will require an additional lock of some sort. */ if (signum == SIGCONT && td->td_proc->p_session == p->p_session) return (0); /* * Some compat layers use SIGTHR and higher signals for * communication between different kernel threads of the same * process, so that they expect that it's always possible to * deliver them, even for suid applications where cr_cansignal() can * deny such ability for security consideration. It should be * pretty safe to do since the only way to create two processes * with the same p_leader is via rfork(2). */ if (td->td_proc->p_leader != NULL && signum >= SIGTHR && signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader) return (0); return (cr_cansignal(td->td_ucred, p, signum)); } /*- * Determine whether td may reschedule p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansched(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_sched(td->td_ucred, p))) return (error); #endif if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred))) return (error); if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid && td->td_ucred->cr_uid != p->p_ucred->cr_ruid) { error = priv_check(td, PRIV_SCHED_DIFFCRED); if (error) return (error); } error = cr_can_tamper_with_subjail(td->td_ucred, p->p_ucred, PRIV_SCHED_DIFFJAIL); if (error) return (error); return (0); } /* * Handle getting or setting the prison's unprivileged_proc_debug * value. 
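 * The value is backed by the per-jail PR_ALLOW_UNPRIV_DEBUG flag (note
 * CTLFLAG_PRISON below), so e.g. running
 *
 *	sysctl security.bsd.unprivileged_proc_debug=0
 *
 * inside a jail restricts that jail without affecting its parent.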
*/ static int sysctl_unprivileged_proc_debug(SYSCTL_HANDLER_ARGS) { int error, val; val = prison_allow(req->td->td_ucred, PR_ALLOW_UNPRIV_DEBUG); error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && val != 1) return (EINVAL); prison_set_allow(req->td->td_ucred, PR_ALLOW_UNPRIV_DEBUG, val); return (0); } /* * The 'unprivileged_proc_debug' flag may be used to disable a variety of * unprivileged inter-process debugging services, including some procfs * functionality, ptrace(), and ktrace(). In the past, inter-process * debugging has been involved in a variety of security problems, and sites * not requiring the service might choose to disable it when hardening * systems. */ SYSCTL_PROC(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_SECURE | CTLFLAG_MPSAFE, 0, 0, sysctl_unprivileged_proc_debug, "I", "Unprivileged processes may use process debugging facilities"); /* * Return true if the object owner/group ids are a subset of the active * credentials. */ bool cr_xids_subset(struct ucred *active_cred, struct ucred *obj_cred) { int i; bool grpsubset, uidsubset; /* * Is obj_cred's group set a subset of active_cred's effective group * set? This includes obj_cred's egid, group access list, rgid, and * svgid. */ grpsubset = true; for (i = 0; i < obj_cred->cr_ngroups; i++) { if (!groupmember(obj_cred->cr_groups[i], active_cred)) { grpsubset = false; break; } } grpsubset = grpsubset && groupmember(obj_cred->cr_gid, active_cred) && groupmember(obj_cred->cr_rgid, active_cred) && groupmember(obj_cred->cr_svgid, active_cred); /* * Are the uids present in obj_cred's credential equal to * active_cred's effective uid? This includes obj_cred's * euid, svuid, and ruid. */ uidsubset = (active_cred->cr_uid == obj_cred->cr_uid && active_cred->cr_uid == obj_cred->cr_svuid && active_cred->cr_uid == obj_cred->cr_ruid); return (uidsubset && grpsubset); } /*- * Determine whether td may debug p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_candebug(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = priv_check(td, PRIV_DEBUG_UNPRIV))) return (error); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_debug(td->td_ucred, p))) return (error); #endif if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred))) return (error); /* * If p's gids aren't a subset, or the uids aren't a subset, * or the credential has changed, require appropriate privilege * for td to debug p. */ if (!cr_xids_subset(td->td_ucred, p->p_ucred)) { error = priv_check(td, PRIV_DEBUG_DIFFCRED); if (error) return (error); } /* * Has the credential of the process changed since the last exec()? */ if ((p->p_flag & P_SUGID) != 0) { error = priv_check(td, PRIV_DEBUG_SUGID); if (error) return (error); } error = cr_can_tamper_with_subjail(td->td_ucred, p->p_ucred, PRIV_DEBUG_DIFFJAIL); if (error) return (error); /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); } /* * Can't trace a process that's currently exec'ing.
* * XXX: Note, this is not a security policy decision, it's a * basic correctness/functionality decision. Therefore, this check * should be moved to the caller's of p_candebug(). */ if ((p->p_flag & P_INEXEC) != 0) return (EBUSY); /* Denied explicitly */ if ((p->p_flag2 & P2_NOTRACE) != 0) { error = priv_check(td, PRIV_DEBUG_DENIED); if (error != 0) return (error); } return (0); } /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseesocket(struct ucred *cred, struct socket *so) { int error; error = prison_check(cred, so->so_cred); if (error) return (ENOENT); #ifdef MAC error = mac_socket_check_visible(cred, so); if (error) return (error); #endif if (cr_bsd_visible(cred, so->so_cred)) return (ENOENT); return (0); } /*- * Determine whether td can wait for the exit of p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_canwait(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_proc_check_wait(td->td_ucred, p))) return (error); #endif #if 0 /* XXXMAC: This could have odd effects on some shells. */ if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred))) return (error); #endif return (0); } /* * Credential management. * * struct ucred objects are rarely allocated but gain and lose references all * the time (e.g., on struct file alloc/dealloc) turning refcount updates into * a significant source of cache-line ping ponging. Common cases are worked * around by modifying thread-local counter instead if the cred to operate on * matches td_realucred. * * The counter is split into 2 parts: * - cr_users -- total count of all struct proc and struct thread objects * which have given cred in p_ucred and td_ucred respectively * - cr_ref -- the actual ref count, only valid if cr_users == 0 * * If users == 0 then cr_ref behaves similarly to refcount(9), in particular if * the count reaches 0 the object is freeable. * If users > 0 and curthread->td_realucred == cred, then updates are performed * against td_ucredref. * In other cases updates are performed against cr_ref. * * Changing td_realucred into something else decrements cr_users and transfers * accumulated updates. 
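 *
 * For example, a process with two threads sharing its cred contributes
 * cr_users == 3 (one for p_ucred plus one per td_ucred); each thread
 * then accumulates reference count deltas in its private td_ucredref,
 * and crunuse() folds them back into cr_ref when td_realucred changes.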
*/ struct ucred * crcowget(struct ucred *cr) { mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users++; cr->cr_ref++; mtx_unlock(&cr->cr_mtx); return (cr); } static struct ucred * crunuse(struct thread *td) { struct ucred *cr, *crold; MPASS(td->td_realucred == td->td_ucred); cr = td->td_realucred; mtx_lock(&cr->cr_mtx); cr->cr_ref += td->td_ucredref; td->td_ucredref = 0; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_users--; if (cr->cr_users == 0) { KASSERT(cr->cr_ref > 0, ("%s: ref %ld not > 0 on cred %p", __func__, cr->cr_ref, cr)); crold = cr; } else { cr->cr_ref--; crold = NULL; } mtx_unlock(&cr->cr_mtx); td->td_realucred = NULL; return (crold); } static void crunusebatch(struct ucred *cr, u_int users, long ref) { KASSERT(users > 0, ("%s: passed users %d not > 0 ; cred %p", __func__, users, cr)); mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users >= users, ("%s: users %d not > %d on cred %p", __func__, cr->cr_users, users, cr)); cr->cr_users -= users; cr->cr_ref += ref; cr->cr_ref -= users; if (cr->cr_users > 0) { mtx_unlock(&cr->cr_mtx); return; } KASSERT(cr->cr_ref >= 0, ("%s: ref %ld not >= 0 on cred %p", __func__, cr->cr_ref, cr)); if (cr->cr_ref > 0) { mtx_unlock(&cr->cr_mtx); return; } crfree_final(cr); } void crcowfree(struct thread *td) { struct ucred *cr; cr = crunuse(td); if (cr != NULL) crfree(cr); } struct ucred * crcowsync(void) { struct thread *td; struct proc *p; struct ucred *crnew, *crold; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(td->td_realucred == td->td_ucred); if (td->td_realucred == p->p_ucred) return (NULL); crnew = crcowget(p->p_ucred); crold = crunuse(td); td->td_realucred = crnew; td->td_ucred = td->td_realucred; return (crold); } /* * Batching. */ void credbatch_add(struct credbatch *crb, struct thread *td) { struct ucred *cr; MPASS(td->td_realucred != NULL); MPASS(td->td_realucred == td->td_ucred); MPASS(TD_GET_STATE(td) == TDS_INACTIVE); cr = td->td_realucred; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); if (crb->cred != cr) { if (crb->users > 0) { MPASS(crb->cred != NULL); crunusebatch(crb->cred, crb->users, crb->ref); crb->users = 0; crb->ref = 0; } } crb->cred = cr; crb->users++; crb->ref += td->td_ucredref; td->td_ucredref = 0; td->td_realucred = NULL; } void credbatch_final(struct credbatch *crb) { MPASS(crb->cred != NULL); MPASS(crb->users > 0); crunusebatch(crb->cred, crb->users, crb->ref); } /* * Allocate a zeroed cred structure. */ struct ucred * crget(void) { struct ucred *cr; cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); mtx_init(&cr->cr_mtx, "cred", NULL, MTX_DEF); cr->cr_ref = 1; #ifdef AUDIT audit_cred_init(cr); #endif #ifdef MAC mac_cred_init(cr); #endif cr->cr_groups = cr->cr_smallgroups; cr->cr_agroups = nitems(cr->cr_smallgroups); return (cr); } /* * Claim another reference to a ucred structure. */ struct ucred * crhold(struct ucred *cr) { struct thread *td; td = curthread; if (__predict_true(td->td_realucred == cr)) { KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); td->td_ucredref++; return (cr); } mtx_lock(&cr->cr_mtx); cr->cr_ref++; mtx_unlock(&cr->cr_mtx); return (cr); } /* * Free a cred structure. Throws away space when ref count gets to 0. 
*/ void crfree(struct ucred *cr) { struct thread *td; td = curthread; if (__predict_true(td->td_realucred == cr)) { KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); td->td_ucredref--; return; } mtx_lock(&cr->cr_mtx); KASSERT(cr->cr_users >= 0, ("%s: users %d not >= 0 on cred %p", __func__, cr->cr_users, cr)); cr->cr_ref--; if (cr->cr_users > 0) { mtx_unlock(&cr->cr_mtx); return; } KASSERT(cr->cr_ref >= 0, ("%s: ref %ld not >= 0 on cred %p", __func__, cr->cr_ref, cr)); if (cr->cr_ref > 0) { mtx_unlock(&cr->cr_mtx); return; } crfree_final(cr); } static void crfree_final(struct ucred *cr) { KASSERT(cr->cr_users == 0, ("%s: users %d not == 0 on cred %p", __func__, cr->cr_users, cr)); KASSERT(cr->cr_ref == 0, ("%s: ref %ld not == 0 on cred %p", __func__, cr->cr_ref, cr)); /* * Some callers of crget(), such as nfs_statfs(), allocate a temporary * credential, but don't allocate a uidinfo structure. */ if (cr->cr_uidinfo != NULL) uifree(cr->cr_uidinfo); if (cr->cr_ruidinfo != NULL) uifree(cr->cr_ruidinfo); if (cr->cr_prison != NULL) prison_free(cr->cr_prison); if (cr->cr_loginclass != NULL) loginclass_free(cr->cr_loginclass); #ifdef AUDIT audit_cred_destroy(cr); #endif #ifdef MAC mac_cred_destroy(cr); #endif mtx_destroy(&cr->cr_mtx); if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); free(cr, M_CRED); } /* * Copy a ucred's contents from a template. Does not block. */ void crcopy(struct ucred *dest, struct ucred *src) { bcopy(&src->cr_startcopy, &dest->cr_startcopy, (unsigned)((caddr_t)&src->cr_endcopy - (caddr_t)&src->cr_startcopy)); dest->cr_flags = src->cr_flags; crsetgroups(dest, src->cr_ngroups, src->cr_groups); uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); prison_hold(dest->cr_prison); loginclass_hold(dest->cr_loginclass); #ifdef AUDIT audit_cred_copy(src, dest); #endif #ifdef MAC mac_cred_copy(src, dest); #endif } /* * Dup cred struct to a new held one. */ struct ucred * crdup(struct ucred *cr) { struct ucred *newcr; newcr = crget(); crcopy(newcr, cr); return (newcr); } /* * Fill in a struct xucred based on a struct ucred. */ void cru2x(struct ucred *cr, struct xucred *xcr) { int ngroups; bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = cr->cr_uid; xcr->cr_gid = cr->cr_gid; /* * We use a union to alias cr_gid to cr_groups[0] in the xucred, so * this is kind of ugly; cr_ngroups still includes the egid for our * purposes to avoid bumping the xucred version. */ ngroups = MIN(cr->cr_ngroups + 1, nitems(xcr->cr_groups)); xcr->cr_ngroups = ngroups; bcopy(cr->cr_groups, xcr->cr_sgroups, (ngroups - 1) * sizeof(*cr->cr_groups)); } void cru2xt(struct thread *td, struct xucred *xcr) { cru2x(td->td_ucred, xcr); xcr->cr_pid = td->td_proc->p_pid; } /* * Change process credentials. * * Callers are responsible for providing the reference for passed credentials * and for freeing old ones. Calls chgproccnt() to correctly account the * current process to the proper real UID, if the latter has changed. Returns * whether the operation was successful. Failure can happen only on * 'enforce_proc_lim' being true and if no new process can be accounted to the * new real UID because of the current limit (see the inner comment for more * details) and the caller does not have privilege (PRIV_PROC_LIMIT) to override - * that. + * that. In this case, the reference to 'newcred' is not taken over. 
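 *
 * Typical caller pattern (sketch only; cf. sys_setresuid() above):
 *
 *	newcred = crget();
 *	PROC_LOCK(p);
 *	oldcred = crcopysafe(p, newcred);
 *	... adjust newcred ...
 *	proc_set_cred(p, newcred);	<- consumes the reference
 *	PROC_UNLOCK(p);
 *	crfree(oldcred);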
*/ static bool _proc_set_cred(struct proc *p, struct ucred *newcred, bool enforce_proc_lim) { struct ucred *const oldcred = p->p_ucred; MPASS(oldcred != NULL); PROC_LOCK_ASSERT(p, MA_OWNED); if (newcred->cr_ruidinfo != oldcred->cr_ruidinfo) { /* * XXXOC: This check is flawed but nonetheless the best we can * currently do as we don't really track limits per UID contrary * to what we pretend in setrlimit(2). Until this is reworked, * we just check here that the number of processes for our new * real UID doesn't exceed this process' process number limit * (which is meant to be associated with the current real UID). */ const int proccnt_changed = chgproccnt(newcred->cr_ruidinfo, 1, enforce_proc_lim ? lim_cur_proc(p, RLIMIT_NPROC) : 0); if (!proccnt_changed) { if (priv_check_cred(oldcred, PRIV_PROC_LIMIT) != 0) return (false); (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); } } mtx_lock(&oldcred->cr_mtx); KASSERT(oldcred->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, oldcred->cr_users, oldcred)); oldcred->cr_users--; mtx_unlock(&oldcred->cr_mtx); mtx_lock(&newcred->cr_mtx); newcred->cr_users++; mtx_unlock(&newcred->cr_mtx); p->p_ucred = newcred; PROC_UPDATE_COW(p); if (newcred->cr_ruidinfo != oldcred->cr_ruidinfo) (void)chgproccnt(oldcred->cr_ruidinfo, -1, 0); return (true); } void proc_set_cred(struct proc *p, struct ucred *newcred) { bool success __diagused = _proc_set_cred(p, newcred, false); MPASS(success); } bool proc_set_cred_enforce_proc_lim(struct proc *p, struct ucred *newcred) { return (_proc_set_cred(p, newcred, true)); } void proc_unset_cred(struct proc *p, bool decrement_proc_count) { struct ucred *cr; MPASS(p->p_state == PRS_ZOMBIE || p->p_state == PRS_NEW); cr = p->p_ucred; p->p_ucred = NULL; KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p", __func__, cr->cr_users, cr)); mtx_lock(&cr->cr_mtx); cr->cr_users--; if (cr->cr_users == 0) KASSERT(cr->cr_ref > 0, ("%s: ref %ld not > 0 on cred %p", __func__, cr->cr_ref, cr)); mtx_unlock(&cr->cr_mtx); if (decrement_proc_count) (void)chgproccnt(cr->cr_ruidinfo, -1, 0); crfree(cr); } struct ucred * crcopysafe(struct proc *p, struct ucred *cr) { struct ucred *oldcred; int groups; PROC_LOCK_ASSERT(p, MA_OWNED); oldcred = p->p_ucred; while (cr->cr_agroups < oldcred->cr_ngroups) { groups = oldcred->cr_ngroups; PROC_UNLOCK(p); crextend(cr, groups); PROC_LOCK(p); oldcred = p->p_ucred; } crcopy(cr, oldcred); return (oldcred); } /* * Extend the passed-in credentials to hold n groups. * * Must not be called after groups have been set. */ void crextend(struct ucred *cr, int n) { size_t nbytes; MPASS2(cr->cr_ref == 1, "'cr_ref' must be 1 (referenced, unshared)"); MPASS2((cr->cr_flags & CRED_FLAG_GROUPSET) == 0, "groups on 'cr' already set!"); groups_check_positive_len(n); groups_check_max_len(n); if (n <= cr->cr_agroups) return; nbytes = n * sizeof(gid_t); if (nbytes < n) panic("Too many groups (memory size overflow)! " "Computation of 'kern.ngroups' should have prevented this, " "please fix it. In the meantime, reduce 'kern.ngroups'."); /* * We allocate a power of 2 larger than 'nbytes', except when that * exceeds PAGE_SIZE, in which case we allocate the right multiple of * pages. We assume PAGE_SIZE is a power of 2 (the call to roundup2() * below) but do not need to for sizeof(gid_t). */ if (nbytes < PAGE_SIZE) { if (!powerof2(nbytes)) /* fls*() return a bit index starting at 1. */ nbytes = 1 << flsl(nbytes); } else nbytes = roundup2(nbytes, PAGE_SIZE); /* Free the old array. 
*/ if (cr->cr_groups != cr->cr_smallgroups) free(cr->cr_groups, M_CRED); cr->cr_groups = malloc(nbytes, M_CRED, M_WAITOK | M_ZERO); cr->cr_agroups = nbytes / sizeof(gid_t); } /* * Normalizes a set of groups to be applied to a 'struct ucred'. * * Normalization ensures that the supplementary groups are sorted in ascending * order and do not contain duplicates. This allows group_is_supplementary() to * do a binary search. */ static void groups_normalize(int *ngrp, gid_t *groups) { gid_t prev_g; int ins_idx; groups_check_positive_len(*ngrp); groups_check_max_len(*ngrp); if (*ngrp <= 1) return; qsort(groups, *ngrp, sizeof(*groups), gidp_cmp); /* Remove duplicates. */ prev_g = groups[0]; ins_idx = 1; for (int i = ins_idx; i < *ngrp; ++i) { const gid_t g = groups[i]; if (g != prev_g) { if (i != ins_idx) groups[ins_idx] = g; ++ins_idx; prev_g = g; } } *ngrp = ins_idx; groups_check_normalized(*ngrp, groups); } /* * Internal function copying groups into a credential. * * 'ngrp' must be strictly positive. Either the passed 'groups' array must have * been normalized in advance (see groups_normalize()), else it must be so * before the structure is to be used again. * * This function is suitable to be used under any lock (it doesn't take any lock * itself nor sleep, and in particular doesn't allocate memory). crextend() * must have been called beforehand to ensure sufficient space is available. * See also crsetgroups(), which handles that. */ static void crsetgroups_internal(struct ucred *cr, int ngrp, const gid_t *groups) { MPASS2(cr->cr_ref == 1, "'cr_ref' must be 1 (referenced, unshared)"); MPASS2(cr->cr_agroups >= ngrp, "'cr_agroups' too small"); groups_check_positive_len(ngrp); bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t)); cr->cr_ngroups = ngrp; cr->cr_flags |= CRED_FLAG_GROUPSET; } /* * Copy groups in to a credential after expanding it if required. * * May sleep in order to allocate memory (except if, e.g., crextend() was called * before with 'ngrp' or greater). Truncates the list to 'ngroups_max' if * it is too large. Array 'groups' doesn't need to be sorted. 'ngrp' must be * positive. */ void crsetgroups(struct ucred *cr, int ngrp, const gid_t *groups) { if (ngrp > ngroups_max) ngrp = ngroups_max; cr->cr_ngroups = 0; if (ngrp == 0) { cr->cr_flags |= CRED_FLAG_GROUPSET; return; } /* * crextend() asserts that groups are not set, as it may allocate a new * backing storage without copying the content of the old one. Since we * are going to install a completely new set anyway, signal that we * consider the old ones thrown away. */ cr->cr_flags &= ~CRED_FLAG_GROUPSET; crextend(cr, ngrp); crsetgroups_internal(cr, ngrp, groups); groups_normalize(&cr->cr_ngroups, cr->cr_groups); } /* * Same as crsetgroups() but sets the effective GID as well. * * This function ensures that an effective GID is always present in credentials. * An empty array will only set the effective GID to 'default_egid', while * a non-empty array will peel off groups[0] to set as the effective GID and use * the remainder, if any, as supplementary groups. */ void crsetgroups_and_egid(struct ucred *cr, int ngrp, const gid_t *groups, const gid_t default_egid) { if (ngrp == 0) { cr->cr_gid = default_egid; cr->cr_ngroups = 0; cr->cr_flags |= CRED_FLAG_GROUPSET; return; } crsetgroups(cr, ngrp - 1, groups + 1); cr->cr_gid = groups[0]; } /* * Get login name, if available. 
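 *
 * Userland reaches this through getlogin(2), usually via the
 * getlogin_r(3) wrapper, e.g.:
 *
 *	char name[MAXLOGNAME];
 *	if (getlogin_r(name, sizeof(name)) == 0)
 *		printf("%s\n", name);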
*/ #ifndef _SYS_SYSPROTO_H_ struct getlogin_args { char *namebuf; u_int namelen; }; #endif /* ARGSUSED */ int sys_getlogin(struct thread *td, struct getlogin_args *uap) { char login[MAXLOGNAME]; struct proc *p = td->td_proc; size_t len; if (uap->namelen > MAXLOGNAME) uap->namelen = MAXLOGNAME; PROC_LOCK(p); SESS_LOCK(p->p_session); len = strlcpy(login, p->p_session->s_login, uap->namelen) + 1; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (len > uap->namelen) return (ERANGE); return (copyout(login, uap->namebuf, len)); } /* * Set login name. */ #ifndef _SYS_SYSPROTO_H_ struct setlogin_args { char *namebuf; }; #endif /* ARGSUSED */ int sys_setlogin(struct thread *td, struct setlogin_args *uap) { struct proc *p = td->td_proc; int error; char logintmp[MAXLOGNAME]; CTASSERT(sizeof(p->p_session->s_login) >= sizeof(logintmp)); error = priv_check(td, PRIV_PROC_SETLOGIN); if (error) return (error); error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL); if (error != 0) { if (error == ENAMETOOLONG) error = EINVAL; return (error); } AUDIT_ARG_LOGIN(logintmp); PROC_LOCK(p); SESS_LOCK(p->p_session); strcpy(p->p_session->s_login, logintmp); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); return (0); } void setsugid(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_SUGID; } /*- * Change a process's effective uid. * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_euid(struct ucred *newcred, struct uidinfo *euip) { newcred->cr_uid = euip->ui_uid; uihold(euip); uifree(newcred->cr_uidinfo); newcred->cr_uidinfo = euip; } /*- * Change a process's effective gid. * Side effects: newcred->cr_gid will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_egid(struct ucred *newcred, gid_t egid) { newcred->cr_gid = egid; } /*- * Change a process's real uid. * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo * will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_ruid(struct ucred *newcred, struct uidinfo *ruip) { newcred->cr_ruid = ruip->ui_uid; uihold(ruip); uifree(newcred->cr_ruidinfo); newcred->cr_ruidinfo = ruip; } /*- * Change a process's real gid. * Side effects: newcred->cr_rgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_rgid(struct ucred *newcred, gid_t rgid) { newcred->cr_rgid = rgid; } /*- * Change a process's saved uid. * Side effects: newcred->cr_svuid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svuid(struct ucred *newcred, uid_t svuid) { newcred->cr_svuid = svuid; } /*- * Change a process's saved gid. * Side effects: newcred->cr_svgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. 
*/ void change_svgid(struct ucred *newcred, gid_t svgid) { newcred->cr_svgid = svgid; } bool allow_ptrace = true; SYSCTL_BOOL(_security_bsd, OID_AUTO, allow_ptrace, CTLFLAG_RWTUN, &allow_ptrace, 0, "Deny ptrace(2) use by returning ENOSYS"); diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c index 17b64ad00bb5..d1324935bdc3 100644 --- a/sys/kern/kern_racct.c +++ b/sys/kern/kern_racct.c @@ -1,1338 +1,1340 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2010 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #include #endif FEATURE(racct, "Resource Accounting"); /* * Do not block processes that have their %cpu usage <= pcpu_threshold. */ static int pcpu_threshold = 1; #ifdef RACCT_DEFAULT_TO_DISABLED bool __read_frequently racct_enable = false; #else bool __read_frequently racct_enable = true; #endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Resource Accounting"); SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); /* * How many seconds it takes to use the scheduler %cpu calculations. When a * process starts, we compute its %cpu usage by dividing its runtime by the * process wall clock time. After RACCT_PCPU_SECS pass, we use the value * provided by the scheduler. 
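 *
 * That is, for roughly the first RACCT_PCPU_SECS seconds of a process'
 * life the estimate is simply
 *
 *	%cpu = 100 * runtime / (now - start time)
 *
 * and only afterwards do we switch to the scheduler's decayed figure.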
*/ #define RACCT_PCPU_SECS 3 struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; static void racct_sub_racct(struct racct *dest, const struct racct *src); static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); static int racct_set_locked(struct proc *p, int resource, uint64_t amount, int force); static void racct_updatepcpu_locked(struct proc *p); static void racct_updatepcpu_racct_locked(struct racct *racct); static void racct_updatepcpu_containers(void); static void racct_settime_locked(struct proc *p, bool exit); static void racct_zeropcpu_locked(struct proc *p); SDT_PROVIDER_DEFINE(racct); SDT_PROBE_DEFINE3(racct, , rusage, add, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__buf, "struct proc *", "const struct buf *", "int"); SDT_PROBE_DEFINE3(racct, , rusage, add__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, add__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, set__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, , rusage, sub__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE1(racct, , racct, create, "struct racct *"); SDT_PROBE_DEFINE1(racct, , racct, destroy, "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, join__failure, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, , racct, leave, "struct racct *", "struct racct *"); int racct_types[] = { [RACCT_CPU] = RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_CORE] = RACCT_DENIABLE, [RACCT_RSS] = RACCT_RECLAIMABLE, [RACCT_MEMLOCK] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NPROC] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NOFILE] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_VMEM] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NPTS] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SWAP] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NTHR] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_MSGQQUEUED] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_MSGQSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NMSGQ] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEMOP] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NSHM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS, [RACCT_READBPS] = RACCT_DECAYING, [RACCT_WRITEBPS] = RACCT_DECAYING, [RACCT_READIOPS] = RACCT_DECAYING, [RACCT_WRITEIOPS] = RACCT_DECAYING }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; #ifdef SCHED_4BSD /* * Contains intermediate values for %cpu calculations to avoid using 
floating * point in the kernel. * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to * zero so the calculations are more straightforward. */ fixpt_t ccpu_exp[] = { [0] = FSCALE * 1, [1] = FSCALE * 0.95122942450071400909, [2] = FSCALE * 0.90483741803595957316, [3] = FSCALE * 0.86070797642505780722, [4] = FSCALE * 0.81873075307798185866, [5] = FSCALE * 0.77880078307140486824, [6] = FSCALE * 0.74081822068171786606, [7] = FSCALE * 0.70468808971871343435, [8] = FSCALE * 0.67032004603563930074, [9] = FSCALE * 0.63762815162177329314, [10] = FSCALE * 0.60653065971263342360, [11] = FSCALE * 0.57694981038048669531, [12] = FSCALE * 0.54881163609402643262, [13] = FSCALE * 0.52204577676101604789, [14] = FSCALE * 0.49658530379140951470, [15] = FSCALE * 0.47236655274101470713, [16] = FSCALE * 0.44932896411722159143, [17] = FSCALE * 0.42741493194872666992, [18] = FSCALE * 0.40656965974059911188, [19] = FSCALE * 0.38674102345450120691, [20] = FSCALE * 0.36787944117144232159, [21] = FSCALE * 0.34993774911115535467, [22] = FSCALE * 0.33287108369807955328, [23] = FSCALE * 0.31663676937905321821, [24] = FSCALE * 0.30119421191220209664, [25] = FSCALE * 0.28650479686019010032, [26] = FSCALE * 0.27253179303401260312, [27] = FSCALE * 0.25924026064589150757, [28] = FSCALE * 0.24659696394160647693, [29] = FSCALE * 0.23457028809379765313, [30] = FSCALE * 0.22313016014842982893, [31] = FSCALE * 0.21224797382674305771, [32] = FSCALE * 0.20189651799465540848, [33] = FSCALE * 0.19204990862075411423, [34] = FSCALE * 0.18268352405273465022, [35] = FSCALE * 0.17377394345044512668, [36] = FSCALE * 0.16529888822158653829, [37] = FSCALE * 0.15723716631362761621, [38] = FSCALE * 0.14956861922263505264, [39] = FSCALE * 0.14227407158651357185, [40] = FSCALE * 0.13533528323661269189, [41] = FSCALE * 0.12873490358780421886, [42] = FSCALE * 0.12245642825298191021, [43] = FSCALE * 0.11648415777349695786, [44] = FSCALE * 0.11080315836233388333, [45] = FSCALE * 0.10539922456186433678, [46] = FSCALE * 0.10025884372280373372, [47] = FSCALE * 0.09536916221554961888, [48] = FSCALE * 0.09071795328941250337, [49] = FSCALE * 0.08629358649937051097, [50] = FSCALE * 0.08208499862389879516, [51] = FSCALE * 0.07808166600115315231, [52] = FSCALE * 0.07427357821433388042, [53] = FSCALE * 0.07065121306042958674, [54] = FSCALE * 0.06720551273974976512, [55] = FSCALE * 0.06392786120670757270, [56] = FSCALE * 0.06081006262521796499, [57] = FSCALE * 0.05784432087483846296, [58] = FSCALE * 0.05502322005640722902, [59] = FSCALE * 0.05233970594843239308, [60] = FSCALE * 0.04978706836786394297, [61] = FSCALE * 0.04735892439114092119, [62] = FSCALE * 0.04504920239355780606, [63] = FSCALE * 0.04285212686704017991, [64] = FSCALE * 0.04076220397836621516, [65] = FSCALE * 0.03877420783172200988, [66] = FSCALE * 0.03688316740124000544, [67] = FSCALE * 0.03508435410084502588, [68] = FSCALE * 0.03337326996032607948, [69] = FSCALE * 0.03174563637806794323, [70] = FSCALE * 0.03019738342231850073, [71] = FSCALE * 0.02872463965423942912, [72] = FSCALE * 0.02732372244729256080, [73] = FSCALE * 0.02599112877875534358, [74] = FSCALE * 0.02472352647033939120, [75] = FSCALE * 0.02351774585600910823, [76] = FSCALE * 0.02237077185616559577, [77] = FSCALE * 0.02127973643837716938, [78] = FSCALE * 0.02024191144580438847, [79] = FSCALE * 0.01925470177538692429, [80] = FSCALE * 0.01831563888873418029, [81] = FSCALE * 0.01742237463949351138, [82] = FSCALE * 0.01657267540176124754, [83] = 
FSCALE * 0.01576441648485449082, [84] = FSCALE * 0.01499557682047770621, [85] = FSCALE * 0.01426423390899925527, [86] = FSCALE * 0.01356855901220093175, [87] = FSCALE * 0.01290681258047986886, [88] = FSCALE * 0.01227733990306844117, [89] = FSCALE * 0.01167856697039544521, [90] = FSCALE * 0.01110899653824230649, [91] = FSCALE * 0.01056720438385265337, [92] = FSCALE * 0.01005183574463358164, [93] = FSCALE * 0.00956160193054350793, [94] = FSCALE * 0.00909527710169581709, [95] = FSCALE * 0.00865169520312063417, [96] = FSCALE * 0.00822974704902002884, [97] = FSCALE * 0.00782837754922577143, [98] = FSCALE * 0.00744658307092434051, [99] = FSCALE * 0.00708340892905212004, [100] = FSCALE * 0.00673794699908546709, [101] = FSCALE * 0.00640933344625638184, [102] = FSCALE * 0.00609674656551563610, [103] = FSCALE * 0.00579940472684214321, [104] = FSCALE * 0.00551656442076077241, [105] = FSCALE * 0.00524751839918138427, [106] = FSCALE * 0.00499159390691021621, [107] = FSCALE * 0.00474815099941147558, [108] = FSCALE * 0.00451658094261266798, [109] = FSCALE * 0.00429630469075234057, [110] = FSCALE * 0.00408677143846406699, }; #endif #define CCPU_EXP_MAX 110 static void racct_add_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); dest->r_resources[i] += src->r_resources[i]; } } static void racct_sub_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); KASSERT(src->r_resources[i] <= dest->r_resources[i], ("%s: resource %d propagation meltdown: src > dest", __func__, i)); } if (RACCT_CAN_DROP(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) dest->r_resources[i] = 0; } } } void racct_create(struct racct **racctp) { if (!racct_enable) return; SDT_PROBE1(racct, , racct, create, racctp); KASSERT(*racctp == NULL, ("racct already allocated")); *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); } static void racct_destroy_locked(struct racct **racctp) { struct racct *racct; int i; ASSERT_RACCT_ENABLED(); SDT_PROBE1(racct, , racct, destroy, racctp); RACCT_LOCK_ASSERT(); KASSERT(racctp != NULL, ("NULL racctp")); KASSERT(*racctp != NULL, ("NULL racct")); racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " "%ju allocated for resource %d\n", racct->r_resources[i], i)); } uma_zfree(racct_zone, racct); *racctp = NULL; } void racct_destroy(struct racct **racct) { if (!racct_enable) return; RACCT_LOCK(); racct_destroy_locked(racct); RACCT_UNLOCK(); } /* * Increase consumption of 'resource' by 'amount' for 'racct', * but not its parents. Differently from other cases, 'amount' here * may be less than zero. 
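 *
 * Propagation to the containing raccts (real uid, login class and the
 * jail hierarchy) is handled separately, e.g. by the loop in
 * racct_add_cred_locked() below:
 *
 *	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
 *		racct_adjust_resource(pr->pr_prison_racct->prr_racct,
 *		    resource, amount);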
*/ static void racct_adjust_resource(struct racct *racct, int resource, int64_t amount) { ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(racct != NULL, ("NULL racct")); racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } } static int racct_add_locked(struct proc *p, int resource, uint64_t amount, int force) { #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef RCTL error = rctl_enforce(p, resource, amount); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount); return (error); } #endif racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); return (0); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. */ int racct_add(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, add, p, resource, amount); RACCT_LOCK(); error = racct_add_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Doesn't check for limits and never fails. */ void racct_add_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__force, p, resource, amount); RACCT_LOCK(); racct_add_locked(p, resource, amount, 1); RACCT_UNLOCK(); } static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); } /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount); RACCT_LOCK(); racct_add_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Account for disk IO resource consumption. Checks for limits, * but never fails, due to disk limits being undeniable. */ void racct_add_buf(struct proc *p, const struct buf *bp, int is_write) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write); RACCT_LOCK(); if (is_write) { racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1); } else { racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1); racct_add_locked(curproc, RACCT_READIOPS, 1, 1); } RACCT_UNLOCK(); } static void racct_settime_locked(struct proc *p, bool exit) { struct thread *td; struct timeval wallclock; uint64_t runtime; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); PROC_LOCK_ASSERT(p, MA_OWNED); if (exit) { /* * proc_reap() has already calculated rux * and added crux to rux. 
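 * Subtracting p_crux's rux_runtime below therefore removes the
 * children's aggregated runtime again, leaving only this process' own.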
*/ runtime = cputick2usec(p->p_rux.rux_runtime - p->p_crux.rux_runtime); } else { PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) ruxagg(p, td); PROC_STATUNLOCK(p); runtime = cputick2usec(p->p_rux.rux_runtime); } microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); racct_set_locked(p, RACCT_CPU, runtime, 0); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec, 0); } static int racct_set_locked(struct proc *p, int resource, uint64_t amount, int force) { int64_t old_amount, diff_proc, diff_cred; #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, resource)); #endif #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); if (error && !force && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, , rusage, set__failure, p, resource, amount); return (error); } } #endif racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); } /* * Set allocation of 'resource' to 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. * * Note that decreasing the allocation always returns 0, * even if it's above the limit. */ int racct_set_unlocked(struct proc *p, int resource, uint64_t amount) { int error; ASSERT_RACCT_ENABLED(); PROC_LOCK(p); error = racct_set(p, resource, amount); PROC_UNLOCK(p); return (error); } int racct_set(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); SDT_PROBE3(racct, , rusage, set__force, p, resource, amount); RACCT_LOCK(); error = racct_set_locked(p, resource, amount, 0); RACCT_UNLOCK(); return (error); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, set, p, resource, amount); RACCT_LOCK(); racct_set_locked(p, resource, amount, 1); RACCT_UNLOCK(); } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * not matter. */ uint64_t racct_get_limit(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_limit(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * matter. */ uint64_t racct_get_available(struct proc *p, int resource) { #ifdef RCTL uint64_t available; if (!racct_enable) return (UINT64_MAX); RACCT_LOCK(); available = rctl_get_available(p, resource); RACCT_UNLOCK(); return (available); #else return (UINT64_MAX); #endif } /* * Returns amount of the %cpu resource that process 'p' can add to its %cpu * utilization. Adding more than that would lead to the process being * throttled. 
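 * The returned headroom is in RACCT_PCTCPU units, i.e. percent of a
 * single CPU scaled by one million (see RACCT_IN_MILLIONS above), so
 * e.g. 100000000 corresponds to one whole idle CPU.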
*/ static int64_t racct_pcpu_available(struct proc *p) { #ifdef RCTL uint64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK(); available = rctl_pcpu_available(p); RACCT_UNLOCK(); return (available); #else return (INT64_MAX); #endif } /* * Decrease allocation of 'resource' by 'amount' for process 'p'. */ void racct_sub(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(RACCT_CAN_DROP(resource), ("%s: called for non-droppable resource %d", __func__, resource)); RACCT_LOCK(); KASSERT(amount <= p->p_racct->r_resources[resource], ("%s: freeing %ju of resource %d, which is more " "than allocated %jd for %s (pid %d)", __func__, amount, resource, (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_adjust_resource(p->p_racct, resource, -amount); racct_sub_cred_locked(p->p_ucred, resource, amount); RACCT_UNLOCK(); } static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, -amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); } /* * Decrease allocation of 'resource' by 'amount' for credential 'cred'. */ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount); #ifdef notyet KASSERT(RACCT_CAN_DROP(resource), ("%s: called for resource %d which can not drop", __func__, resource)); #endif RACCT_LOCK(); racct_sub_cred_locked(cred, resource, amount); RACCT_UNLOCK(); } /* * Inherit resource usage information from the parent process. */ int racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; if (!racct_enable) return (0); /* * Create racct for the child process. */ racct_create(&child->p_racct); PROC_LOCK(parent); PROC_LOCK(child); RACCT_LOCK(); #ifdef RCTL error = rctl_proc_fork(parent, child); if (error != 0) goto out; #endif child->p_throttled = 0; /* * Inherit resource usage. */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, parent->p_racct->r_resources[i], 0); if (error != 0) goto out; } error = racct_add_locked(child, RACCT_NPROC, 1, 0); error += racct_add_locked(child, RACCT_NTHR, 1, 0); out: RACCT_UNLOCK(); PROC_UNLOCK(child); PROC_UNLOCK(parent); if (error != 0) racct_proc_exit(child); return (error); } /* * Called at the end of fork1(), to handle rules that require the process * to be fully initialized. 
*/ void racct_proc_fork_done(struct proc *child) { if (!racct_enable) return; #ifdef RCTL PROC_LOCK(child); RACCT_LOCK(); rctl_enforce(child, RACCT_NPROC, 0); rctl_enforce(child, RACCT_NTHR, 0); RACCT_UNLOCK(); PROC_UNLOCK(child); #endif } void racct_proc_exit(struct proc *p) { int i; if (!racct_enable) return; PROC_LOCK(p); RACCT_LOCK(); racct_settime_locked(p, true); racct_zeropcpu_locked(p); KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0, ("process reaped with %ju allocated for RSS\n", p->p_racct->r_resources[RACCT_RSS])); for (i = 0; i <= RACCT_MAX; i++) { if (p->p_racct->r_resources[i] == 0) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; racct_set_locked(p, i, 0, 0); } #ifdef RCTL rctl_racct_release(p->p_racct); #endif racct_destroy_locked(&p->p_racct); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* - * Called after credentials change, to move resource utilisation - * between raccts. + * Called to signal credentials change, to move resource utilisation + * between raccts. Must be called with the proc lock held, in the same span as + * the credentials change itself (i.e., without the proc lock being unlocked + * between the two), but the order does not matter. */ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred) { struct uidinfo *olduip, *newuip; struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_OWNED); newuip = newcred->cr_ruidinfo; olduip = oldcred->cr_ruidinfo; newlc = newcred->cr_loginclass; oldlc = oldcred->cr_loginclass; newpr = newcred->cr_prison; oldpr = oldcred->cr_prison; RACCT_LOCK(); if (newuip != olduip) { racct_sub_racct(olduip->ui_racct, p->p_racct); racct_add_racct(newuip->ui_racct, p->p_racct); } if (newlc != oldlc) { racct_sub_racct(oldlc->lc_racct, p->p_racct); racct_add_racct(newlc->lc_racct, p->p_racct); } if (newpr != oldpr) { for (pr = oldpr; pr != NULL; pr = pr->pr_parent) racct_sub_racct(pr->pr_prison_racct->prr_racct, p->p_racct); for (pr = newpr; pr != NULL; pr = pr->pr_parent) racct_add_racct(pr->pr_prison_racct->prr_racct, p->p_racct); } RACCT_UNLOCK(); } void racct_move(struct racct *dest, struct racct *src) { ASSERT_RACCT_ENABLED(); RACCT_LOCK(); racct_add_racct(dest, src); racct_sub_racct(src, src); dest->r_runtime = src->r_runtime; dest->r_time = src->r_time; src->r_runtime = 0; timevalsub(&src->r_time, &src->r_time); RACCT_UNLOCK(); } static void ast_racct(struct thread *td, int tda __unused) { struct proc *p; ASSERT_RACCT_ENABLED(); p = td->td_proc; if (p->p_throttled == 0) return; PROC_LOCK(p); while (p->p_throttled != 0) { msleep(p->p_racct, &p->p_mtx, 0, "racct", p->p_throttled < 0 ? 0 : p->p_throttled); if (p->p_throttled > 0) p->p_throttled = 0; } PROC_UNLOCK(p); } /* * Make the process sleep in userret() for 'timeout' ticks. Setting * timeout to -1 makes it sleep until woken up by racct_proc_wakeup(). */ void racct_proc_throttle(struct proc *p, int timeout) { struct thread *td; #ifdef SMP int cpuid; #endif KASSERT(timeout != 0, ("timeout %d", timeout)); ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. 
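 *
 * Note that a throttled process does not stop instantly: each of its
 * threads is flagged below and parks itself in ast_racct() above on
 * its next return to user mode; TDA_SCHED (plus an IPI for a thread
 * currently running on another CPU) only makes that trip through
 * userret() happen promptly.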
*/ if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) return; if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout)) return; p->p_throttled = timeout; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); ast_sched_locked(td, TDA_RACCT); switch (TD_GET_STATE(td)) { case TDS_RUNQ: /* * If the thread is on the scheduler run-queue, we can * not just remove it from there. So we set the flag * TDA_SCHED for the thread, so that once it is * running, it is taken off the cpu as soon as possible. */ ast_sched_locked(td, TDA_SCHED); break; case TDS_RUNNING: /* * If the thread is running, we request a context * switch for it by setting the TDA_SCHED flag. */ ast_sched_locked(td, TDA_SCHED); #ifdef SMP cpuid = td->td_oncpu; if ((cpuid != NOCPU) && (td != curthread)) ipi_cpu(cpuid, IPI_AST); #endif break; default: break; } thread_unlock(td); } } static void racct_proc_wakeup(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled != 0) { p->p_throttled = 0; wakeup(p->p_racct); } } static void racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2) { ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); #ifdef RCTL rctl_throttle_decay(racct, RACCT_READBPS); rctl_throttle_decay(racct, RACCT_WRITEBPS); rctl_throttle_decay(racct, RACCT_READIOPS); rctl_throttle_decay(racct, RACCT_WRITEIOPS); #endif } static void racct_decay_pre(void) { RACCT_LOCK(); } static void racct_decay_post(void) { RACCT_UNLOCK(); } static void racct_decay(void) { ASSERT_RACCT_ENABLED(); ui_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); loginclass_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); prison_racct_foreach(racct_decay_callback, racct_decay_pre, racct_decay_post, NULL, NULL); } static void racct_updatepcpu_racct_locked(struct racct *racct) { struct timeval diff; uint64_t elapsed; uint64_t runtime; uint64_t newpcpu; uint64_t oldpcpu; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* Difference between now and previously-recorded time. */ microuptime(&diff); timevalsub(&diff, &racct->r_time); elapsed = (uint64_t)diff.tv_sec * 1000000 + diff.tv_usec; /* Difference between current and previously-recorded runtime. */ runtime = racct->r_resources[RACCT_CPU] - racct->r_runtime; newpcpu = runtime * 100 * 1000000 / elapsed; oldpcpu = racct->r_resources[RACCT_PCTCPU]; /* * This calculation is equivalent to * (1 - 0.3) * newpcpu + 0.3 * oldpcpu * where RACCT_DECAY_FACTOR = 0.3 * FSCALE. */ racct->r_resources[RACCT_PCTCPU] = ((FSCALE - RACCT_DECAY_FACTOR) * newpcpu + RACCT_DECAY_FACTOR * oldpcpu) / FSCALE; if (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (uint64_t)mp_ncpus) racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (uint64_t)mp_ncpus; /* Record current times. 
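 *
 * (Worked example for the decay computed above, under the stated
 * assumption RACCT_DECAY_FACTOR = 0.3 * FSCALE, i.e. ~614 with the
 * usual FSCALE of 2048: starting from oldpcpu = 80 * 10^6 (80%) with
 * a fresh sample newpcpu = 20 * 10^6 (20%), the update yields
 * (1434 * 20 + 614 * 80) * 10^6 / 2048, about 38 * 10^6, so the
 * estimate moves 70% of the way toward the newest sample each pass.)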
*/ racct->r_runtime = racct->r_resources[RACCT_CPU]; timevaladd(&racct->r_time, &diff); } static void racct_zeropcpu_locked(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); p->p_racct->r_resources[RACCT_PCTCPU] = 0; } static void racct_updatepcpu_locked(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); racct_updatepcpu_racct_locked(p->p_racct); } static void racct_updatepcpu_pre(void) { RACCT_LOCK(); } static void racct_updatepcpu_post(void) { RACCT_UNLOCK(); } static void racct_updatepcpu_racct_callback(struct racct *racct, void *dummy1, void *dummy2) { racct_updatepcpu_racct_locked(racct); } static void racct_updatepcpu_containers(void) { ASSERT_RACCT_ENABLED(); ui_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre, racct_updatepcpu_post, NULL, NULL); loginclass_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre, racct_updatepcpu_post, NULL, NULL); prison_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre, racct_updatepcpu_post, NULL, NULL); } static bool racct_proc_to_skip(const struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); return (p->p_state != PRS_NORMAL || (p->p_flag & P_IDLEPROC) != 0); } static void racctd(void) { struct proc *p; ASSERT_RACCT_ENABLED(); for (;;) { racct_decay(); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (racct_proc_to_skip(p)) { PROC_UNLOCK(p); continue; } RACCT_LOCK(); #ifdef RCTL rctl_throttle_decay(p->p_racct, RACCT_READBPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); rctl_throttle_decay(p->p_racct, RACCT_READIOPS); rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); #endif racct_settime_locked(p, false); racct_updatepcpu_locked(p); RACCT_UNLOCK(); PROC_UNLOCK(p); } /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits * for %cpu resource only after ucred racct containers have been * properly filled. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (racct_proc_to_skip(p)) { PROC_UNLOCK(p); continue; } if (racct_pcpu_available(p) <= 0) { if (p->p_racct->r_resources[RACCT_PCTCPU] > pcpu_threshold) racct_proc_throttle(p, -1); } else if (p->p_throttled == -1) { racct_proc_wakeup(p); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); racct_updatepcpu_containers(); pause("-", hz); } } static struct kproc_desc racctd_kp = { "racctd", racctd, NULL }; static void racctd_init(void *dummy __unused) { if (!racct_enable) return; kproc_start(&racctd_kp); } SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void racct_init(void *dummy __unused) { if (!racct_enable) return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); ast_register(TDA_RACCT, ASTR_ASTF_REQUIRED, 0, ast_racct); /* * XXX: Move this somewhere. */ prison0.pr_prison_racct = prison_racct_find("0"); } SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
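
/*
 * Usage note (illustrative, not from the original source): the racctd
 * loop above samples and decays %cpu roughly once per second, via
 * pause("-", hz).  It is the mechanism behind %cpu rules such as:
 *
 *	rctl -a process:1234:pctcpu:deny=50
 *
 * Once the decayed RACCT_PCTCPU estimate exceeds such a limit (and the
 * pcpu_threshold floor), racct_proc_throttle(p, -1) parks the
 * process's threads in ast_racct() until racct_proc_wakeup() releases
 * them.
 */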