Index: head/sys/kern/kern_jail.c =================================================================== --- head/sys/kern/kern_jail.c (revision 290856) +++ head/sys/kern/kern_jail.c (revision 290857) @@ -1,4769 +1,4774 @@ /*- * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); /* allprison, allprison_racct and lastprid are protected by allprison_lock. 
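 *
 * A minimal sketch of the expected discipline when walking the list, modeled
 * on prison_find() further down (illustrative only; how a found prison is
 * used afterwards depends on the caller):
 *
 *	sx_slock(&allprison_lock);
 *	TAILQ_FOREACH(pr, &allprison, pr_list) {
 *		mtx_lock(&pr->pr_mtx);
 *		if (pr->pr_ref > 0) {
 *			... pr is valid here; take a reference before dropping
 *			    the locks if it is needed later ...
 *		}
 *		mtx_unlock(&pr->pr_mtx);
 *	}
 *	sx_sunlock(&allprison_lock);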
*/ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int do_jail_attach(struct thread *td, struct prison *pr); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static char *prison_path(struct prison *pr1, struct prison *pr2); static void prison_remove_one(struct prison *pr); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif #ifdef INET static int _prison_check_ip4(const struct prison *, const struct in_addr *); static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4); #endif #ifdef INET6 static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6); static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 #define PD_DEUREF 0x02 #define PD_LOCKED 0x04 #define PD_LIST_SLOCKED 0x08 #define PD_LIST_XLOCKED 0x10 /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static char *pr_flag_names[] = { [0] = "persist", #ifdef INET [7] = "ip4.saddrsel", #endif #ifdef INET6 [8] = "ip6.saddrsel", #endif }; const size_t pr_flag_names_size = sizeof(pr_flag_names); static char *pr_flag_nonames[] = { [0] = "nopersist", #ifdef INET [7] = "ip4.nosaddrsel", #endif #ifdef INET6 [8] = "ip6.nosaddrsel", #endif }; const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames); struct jailsys_flags { const char *name; unsigned disable; unsigned new; } pr_flag_jailsys[] = { { "host", 0, PR_HOST }, #ifdef VIMAGE { "vnet", 0, PR_VNET }, #endif #ifdef INET { "ip4", PR_IP4_USER, PR_IP4_USER }, #endif #ifdef INET6 { "ip6", PR_IP6_USER, PR_IP6_USER }, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); static char *pr_allow_names[] = { "allow.set_hostname", "allow.sysvipc", "allow.raw_sockets", "allow.chflags", "allow.mount", "allow.quotas", "allow.socket_af", "allow.mount.devfs", "allow.mount.nullfs", "allow.mount.zfs", "allow.mount.procfs", "allow.mount.tmpfs", "allow.mount.fdescfs", "allow.mount.linprocfs", "allow.mount.linsysfs", }; const size_t pr_allow_names_size = sizeof(pr_allow_names); static char *pr_allow_nonames[] = { "allow.noset_hostname", "allow.nosysvipc", "allow.noraw_sockets", "allow.nochflags", "allow.nomount", "allow.noquotas", "allow.nosocket_af", "allow.mount.nodevfs", "allow.mount.nonullfs", "allow.mount.nozfs", "allow.mount.noprocfs", "allow.mount.notmpfs", "allow.mount.nofdescfs", "allow.mount.nolinprocfs", "allow.mount.nolinsysfs", }; const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames); #define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. 
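 * (Specifically a reference to thread0's cpuset and the running kernel's
 * osreldate/osrelease; the latter become the defaults that child jails
 * inherit unless overridden at creation time.)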
*/ void prison0_init(void) { prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); } #ifdef INET static int qcmp_v4(const void *ip1, const void *ip2) { in_addr_t iaa, iab; /* * We need to compare in HBO here to get the list sorted as expected * by the result of the code. Sorting NBO addresses gives you * interesting results. If you do not understand, do not try. */ iaa = ntohl(((const struct in_addr *)ip1)->s_addr); iab = ntohl(((const struct in_addr *)ip2)->s_addr); /* * Do not simply return the difference of the two numbers, the int is * not wide enough. */ if (iaa > iab) return (1); else if (iaa < iab) return (-1); else return (0); } #endif #ifdef INET6 static int qcmp_v6(const void *ip1, const void *ip2) { const struct in6_addr *ia6a, *ia6b; int i, rc; ia6a = (const struct in6_addr *)ip1; ia6b = (const struct in6_addr *)ip2; rc = 0; for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) { if (ia6a->s6_addr[i] > ia6b->s6_addr[i]) rc = 1; else if (ia6a->s6_addr[i] < ia6b->s6_addr[i]) rc = -1; } return (rc); } #endif /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + sizeof(pr_allow_names) / sizeof(pr_allow_names[0]) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs, fi; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { optiov[opt.uio_iovcnt].iov_base = (jail_default_allow & (1 << fi)) ? pr_allow_names[fi] : pr_allow_nonames[fi]; optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 
1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. 
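 * Parameters are passed as (name, value) iovec pairs, which is why the count
 * must be even.  A hedged userland sketch of a matching jail_set(2) call
 * (parameter names as used elsewhere in this file; boolean parameters such
 * as "persist" take an empty value, error handling is omitted, and the
 * headers <sys/param.h>, <sys/uio.h> and <sys/jail.h> are assumed):
 *
 *	struct iovec iov[4];
 *	int jid;
 *
 *	iov[0].iov_base = "name";	iov[0].iov_len = sizeof("name");
 *	iov[1].iov_base = "myjail";	iov[1].iov_len = sizeof("myjail");
 *	iov[2].iov_base = "persist";	iov[2].iov_len = sizeof("persist");
 *	iov[3].iov_base = NULL;		iov[3].iov_len = 0;
 *	jid = jail_set(iov, 4, JAIL_CREATE);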
*/ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct in_addr *ip4; #endif #ifdef INET6 struct in6_addr *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; #if defined(INET) || defined(INET6) struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen; int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int fi, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; int fullpath_disabled; #if defined(INET) || defined(INET6) int ii, ij; #endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = ppr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. 
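 *
 * Each parameter below follows the same triage: vfs_copyopt()/vfs_getopt()
 * returning ENOENT means it simply was not supplied, zero means it was
 * supplied and copied out, and any other error aborts the whole operation.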
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) { if (pr_flag_names[fi] == NULL) continue; vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi); vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi); } ch_flags |= pr_flags; for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!pr_flag_jailsys[fi].disable) { error = EINVAL; goto done_free; } pr_flags |= pr_flag_jailsys[fi].disable; break; case JAIL_SYS_NEW: pr_flags |= pr_flag_jailsys[fi].new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable; } if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi); vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); if (error == ENOENT) domain = NULL; else if (error != 0) goto 
done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { ip4s /= sizeof(*ip4); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); bcopy(op, ip4, ip4s * sizeof(*ip4)); /* * IP addresses are all sorted but ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well for "loopback" traffic in case * source address selection cannot find any more fitting * address to connect from. */ if (ip4s > 1) qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses it is their problem. * * We do not have to care about byte order for these * checks so we will do them in NBO. 
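 *
 * Example with illustrative addresses: {192.0.2.5, 203.0.113.9, 192.0.2.5}
 * keeps ip4[0] in place and sorts only the tail.  Duplicates within the tail
 * end up adjacent, and a duplicate of ip4[0] is caught by comparing ip4[0]
 * against each tail entry below.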
*/ for (ii = 0; ii < ip4s; ii++) { if (ip4[ii].s_addr == INADDR_ANY || ip4[ii].s_addr == INADDR_BROADCAST) { error = EINVAL; goto done_free; } if ((ii+1) < ip4s && (ip4[0].s_addr == ip4[ii+1].s_addr || ip4[ii].s_addr == ip4[ii+1].s_addr)) { error = EINVAL; goto done_free; } } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { ip6s /= sizeof(*ip6); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); bcopy(op, ip6, ip6s * sizeof(*ip6)); if (ip6s > 1) qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6); for (ii = 0; ii < ip6s; ii++) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { error = EINVAL; goto done_free; } if ((ii+1) < ip6s && (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) { error = EINVAL; goto done_free; } } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif fullpath_disabled = 0; root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) path = g_path; else if (error == ENODEV) { /* proceed if sysctl debug.disablefullpath == 1 */ fullpath_disabled = 1; if (len < 2 || (len == 2 && path[0] == '/')) path = NULL; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root, 0); if (fullpath_disabled) { /* Leave room for a real-root full pathname. */ if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/") ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) { error = ENAMETOOLONG; goto done_free; } } } error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || len >= OSRELEASELEN) { error = EINVAL; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } /* * Grab the allprison lock before letting modules check their * parameters. 
Once we have it, do not let go so we'll have a * consistent view of the OSD list. */ sx_xlock(&allprison_lock); error = osd_jail_call(NULL, PR_METHOD_CHECK, opts); if (error) goto done_unlock_list; /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_unlock_list; } } /* * See if we are creating a new record or updating an existing one. * This abuses the file error codes ENOENT and EEXIST. */ cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_unlock_list; } pr = NULL; namelc = NULL; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } if (jid != 0) { /* * See if a requested jid already exists. There is an * information leak here if the jid exists but is not within * the caller's jail hierarchy. Jail creators will get EEXIST * even though they cannot see the jail, and CREATE | UPDATE * will return ENOENT which is not normally a valid error. */ if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_unlock_list; } pr = prison_find(jid); if (pr != NULL) { ppr = pr->pr_parent; /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { mtx_unlock(&pr->pr_mtx); error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_unlock_list; } if (!prison_ischild(mypr, pr)) { mtx_unlock(&pr->pr_mtx); pr = NULL; } else if (pr->pr_uref == 0) { if (!(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } else if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } if (pr == NULL) { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ *namelc = '\0'; if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { mtx_unlock(&pr->pr_mtx); error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_unlock_list; } } else { ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } mtx_unlock(&ppr->pr_mtx); } name = ++namelc; } if (name[0] != '\0') { namelen = (ppr == &prison0) ? 
0 : strlen(ppr->pr_name) + 1; name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && tpr->pr_ref > 0 && !strcmp(tpr->pr_name + namelen, name)) { if (pr == NULL && cuflags != JAIL_CREATE) { mtx_lock(&tpr->pr_mtx); if (tpr->pr_ref > 0) { /* * Use this jail * for updates. */ if (tpr->pr_uref > 0) { pr = tpr; break; } deadpr = tpr; } mtx_unlock(&tpr->pr_mtx); } else if (tpr->pr_uref > 0) { /* * Create, or update(jid): * name must not exist in an * active sibling jail. */ error = EEXIST; if (pr != NULL) mtx_unlock(&pr->pr_mtx); vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_unlock_list; } } } /* If no active jail is found, use a dying one. */ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto name_again; } pr = deadpr; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_unlock_list; } /* If there's no prison to update, create a new one and link it in. */ if (pr == NULL) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_unlock_list; } created = 1; mtx_lock(&ppr->pr_mtx); if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "parent jail went away!"); goto done_unlock_list; } ppr->pr_ref++; ppr->pr_uref++; mtx_unlock(&ppr->pr_mtx); pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); if (jid == 0) { /* Find the next free jid. */ jid = lastprid + 1; findnext: if (jid == JAIL_MAX) jid = 1; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id < jid) continue; if (tpr->pr_id > jid || tpr->pr_ref == 0) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } if (jid == lastprid) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); free(pr, M_PRISON); prison_deref(ppr, PD_DEREF | PD_DEUREF | PD_LIST_XLOCKED); goto done_releroot; } jid++; goto findnext; } lastprid = jid; } else { /* * The jail already has a jid (that did not yet exist), * so just find where to insert it. */ TAILQ_FOREACH(tpr, &allprison, pr_list) if (tpr->pr_id >= jid) { TAILQ_INSERT_BEFORE(tpr, pr, pr_list); break; } } if (tpr == NULL) TAILQ_INSERT_TAIL(&allprison, pr, pr_list); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; pr->pr_parent = ppr; pr->pr_id = jid; /* Set some default values, and inherit some from the parent. 
*/ if (name == NULL) name = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; if (ppr->pr_ip4 != NULL) { pr->pr_ip4s = ppr->pr_ip4s; pr->pr_ip4 = malloc(pr->pr_ip4s * sizeof(struct in_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); } } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; if (ppr->pr_ip6 != NULL) { pr->pr_ip6s = ppr->pr_ip6s; pr->pr_ip6 = malloc(pr->pr_ip6s * sizeof(struct in6_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); } } #endif } #endif /* Source address selection is always on by default. */ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = jail_default_enforce_statfs; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strcpy(pr->pr_osrelease, ppr->pr_osrelease); else strcpy(pr->pr_osrelease, osrelstr); LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); #ifdef VIMAGE /* Allocate a new vnet if specified. */ pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet; #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an erorr. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) { prison_deref(pr, PD_LIST_XLOCKED); goto done_releroot; } mtx_lock(&pr->pr_mtx); /* * New prisons do not yet have a reference, because we do not * want other to see the incomplete prison once the * allprison_lock is downgraded. */ } else { created = 0; /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ pr->pr_ref++; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref_locked; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref_locked; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref_locked; } #endif } /* Do final error checking before setting anything. 
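 * A child may only tighten its parent's restrictions: securelevel and
 * enforce_statfs may not drop below the parent's values, children.max must
 * be strictly smaller than the parent's, devfs_ruleset is inherited by
 * nested jails, and any IP addresses given must be a subset of the parent's.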
*/ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref_locked; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref_locked; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref_locked; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref_locked; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref_locked; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if (ppr->pr_flags & PR_IP4) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. Don't worry * about the parent being unlocked, as any * setting is done with allprison_lock held. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } if (ip4s > 1) { for (ii = ij = 1; ii < ip4s; ii++) { if (ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) continue; for (; ij < ppr->pr_ip4s; ij++) if (ip4[ii].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) break; } if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref_locked; } } } /* * Check for conflicting IP addresses. We permit them * if there is no more than one IP on each jail. If * there is a duplicate on a jail with more than one * IP stop checking and return error. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP4_USER)) continue; descend = 0; if (tpr->pr_ip4 == NULL || (ip4s == 1 && tpr->pr_ip4s == 1)) continue; for (ii = 0; ii < ip4s; ii++) { if (_prison_check_ip4(tpr, &ip4[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref_locked; } } } } #endif #ifdef INET6 if (ip6s > 0) { if (ppr->pr_flags & PR_IP6) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&ip6[0], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } if (ip6s > 1) { for (ii = ij = 1; ii < ip6s; ii++) { if (IN6_ARE_ADDR_EQUAL(&ip6[ii], &ppr->pr_ip6[0])) continue; for (; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL( &ip6[ii], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) break; } if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref_locked; } } } /* Check for conflicting IP addresses. */ tppr = ppr; #ifdef VIMAGE for (; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif tpr->pr_uref == 0) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP6_USER)) continue; descend = 0; if (tpr->pr_ip6 == NULL || (ip6s == 1 && tpr->pr_ip6s == 1)) continue; for (ii = 0; ii < ip6s; ii++) { if (_prison_check_ip6(tpr, &ip6[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref_locked; } } } } #endif onamelen = namelen = 0; if (name != NULL) { /* Give a default name of the jid. 
*/ if (name[0] == '\0') snprintf(name = numbuf, sizeof(numbuf), "%d", jid); else if (*namelc == '0' || (strtoul(namelc, &p, 10) != jid && *p == '\0')) { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref_locked; } /* * Make sure the name isn't too long for the prison or its * children. */ onamelen = strlen(pr->pr_name); namelen = strlen(name); if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref_locked; } } } if (pr_allow & ~ppr->pr_allow) { error = EPERM; goto done_deref_locked; } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; free(pr->pr_ip4, M_PRISON); pr->pr_ip4s = ip4s; pr->pr_ip4 = ip4; ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; free(pr->pr_ip6, M_PRISON); pr->pr_ip6s = ip6s; pr->pr_ip6 = ip6; ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (name != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, name, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, name); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ if (fullpath_disabled && path[0] == '/' && strcmp(mypr->pr_path, "/")) snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s", mypr->pr_path, path); else strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. 
*/ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. */ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } if ((tallow = ch_allow & ~pr_allow)) { /* Clear allow bits in all children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_allow &= ~tallow; } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons * for now, so new ones will remain unseen until after the module * handlers have completed. */ if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { if (pr_flags & PR_PERSIST) { pr->pr_ref++; pr->pr_uref++; } else { pr->pr_ref--; pr->pr_uref--; } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_ip4s; ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_ip6s; ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ sx_downgrade(&allprison_lock); if (created) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) { prison_deref(pr, PD_LIST_SLOCKED); goto done_errmsg; } } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) { prison_deref(pr, created ? PD_LIST_SLOCKED : PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; } /* Attach this process to the prison if requested. 
*/ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); error = do_jail_attach(td, pr); if (error) { vfs_opterror(opts, "attach failed"); if (!created) prison_deref(pr, PD_DEREF); goto done_errmsg; } } #ifdef RACCT if (racct_enable && !created) { if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); prison_racct_modify(pr); if (!(flags & JAIL_ATTACH)) sx_slock(&allprison_lock); } #endif td->td_retval[0] = pr->pr_id; /* * Now that it is all there, drop the temporary reference from existing * prisons. Or add a reference to newly created persistent prisons * (which was not done earlier so that the prison would not be publicly * visible). */ if (!created) { prison_deref(pr, (flags & JAIL_ATTACH) ? PD_DEREF : PD_DEREF | PD_LIST_SLOCKED); } else { if (pr_flags & PR_PERSIST) { mtx_lock(&pr->pr_mtx); pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } if (!(flags & JAIL_ATTACH)) sx_sunlock(&allprison_lock); } goto done_errmsg; done_deref_locked: prison_deref(pr, created ? PD_LOCKED | PD_LIST_XLOCKED : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); goto done_releroot; done_unlock_list: sx_xunlock(&allprison_lock); done_releroot: if (root != NULL) vrele(root); done_errmsg: if (error) { vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); if (errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (errmsg_pos > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } } done_free: #ifdef INET free(ip4, M_PRISON); #endif #ifdef INET6 free(ip6, M_PRISON); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; /* * Find the prison specified by one of: lastjid, jid, name. 
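 *
 * "lastjid" exists so userland can enumerate jails: each call returns the
 * first visible prison with an ID greater than the one passed in.  A hedged
 * sketch of that loop (illustrative only; error handling omitted):
 *
 *	struct iovec iov[2];
 *	int jid = 0;
 *
 *	for (;;) {
 *		iov[0].iov_base = "lastjid";
 *		iov[0].iov_len = sizeof("lastjid");
 *		iov[1].iov_base = &jid;
 *		iov[1].iov_len = sizeof(jid);
 *		jid = jail_get(iov, 2, 0);
 *		if (jid == -1)
 *			break;
 *	}
 *
 * jail_get(2) returns the matching jail's ID; ENOENT ("no jail after N")
 * ends the walk.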
*/ sx_slock(&allprison_lock); error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0 && (pr->pr_uref > 0 || (flags & JAIL_DYING))) break; mtx_unlock(&pr->pr_mtx); } } if (pr != NULL) goto found_prison; error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_unlock_list; } } else if (error != ENOENT) goto done_unlock_list; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_unlock_list; } pr = prison_find_name(mypr, name); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_unlock_list; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_unlock_list; } else if (error != ENOENT) goto done_unlock_list; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done_unlock_list; found_prison: /* Get the parameters of the prison. */ pr->pr_ref++; locked = PD_LOCKED; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done_deref; i = (pr->pr_parent == mypr) ? 
0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); if (error != 0 && error != ENOENT) goto done_deref; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); if (error != 0 && error != ENOENT) goto done_deref; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done_deref; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done_deref; for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) { if (pr_flag_names[fi] == NULL) continue; i = (pr->pr_flags & (1 << fi)) ? 1 : 0; error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { i = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); i = pr_flag_jailsys[fi].disable && (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) { if (pr_allow_names[fi] == NULL) continue; i = (pr->pr_allow & (1 << fi)) ? 
1 : 0; error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; } i = (pr->pr_uref == 0); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done_deref; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done_deref; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); locked = 0; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done_deref; prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED); /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_errmsg; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } goto done_errmsg; done_deref: prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED); goto done_errmsg; done_unlock_list: sx_sunlock(&allprison_lock); done_errmsg: if (error && errmsg_pos >= 0) { vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr, *cpr, *lpr, *tpr; int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } /* Remove all descendants of this prison, then remove this prison. */ pr->pr_ref++; pr->pr_flags |= PR_REMOVE; if (!LIST_EMPTY(&pr->pr_children)) { mtx_unlock(&pr->pr_mtx); lpr = NULL; FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { mtx_lock(&cpr->pr_mtx); if (cpr->pr_ref > 0) { tpr = cpr; cpr->pr_ref++; cpr->pr_flags |= PR_REMOVE; } else { /* Already removed - do not do it again. */ tpr = NULL; } mtx_unlock(&cpr->pr_mtx); if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } lpr = tpr; } if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } mtx_lock(&pr->pr_mtx); } prison_remove_one(pr); return (0); } static void prison_remove_one(struct prison *pr) { struct proc *p; int deuref; /* If the prison was persistent, it is not anymore. */ deuref = 0; if (pr->pr_flags & PR_PERSIST) { pr->pr_ref--; deuref = PD_DEUREF; pr->pr_flags &= ~PR_PERSIST; } /* * jail_remove added a reference. If that's the only one, remove * the prison now. 
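 *
 * (pr_ref counts references to the prison structure itself, while pr_uref
 * counts "user" references such as attached processes and the persist flag;
 * a prison whose pr_uref has reached zero is dying, but the structure stays
 * around until pr_ref also drops to zero.)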
*/ KASSERT(pr->pr_ref > 0, ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); if (pr->pr_ref == 1) { prison_deref(pr, deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); return; } mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); /* * Kill all processes unfortunate enough to be attached to this prison. */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && p->p_ucred->cr_prison == pr) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, deuref | PD_DEREF); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* * Do not allow a process to attach to a prison that is not * considered to be "alive". */ if (pr->pr_uref == 0) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr)); } static int do_jail_attach(struct thread *td, struct prison *pr) { struct prison *ppr; struct proc *p; struct ucred *newcred, *oldcred; int error; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ pr->pr_ref++; pr->pr_uref++; mtx_unlock(&pr->pr_mtx); /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); return (error); } sx_sunlock(&allprison_lock); /* * Reparent the newly attached process to this jail. */ ppr = td->td_ucred->cr_prison; p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root, 0); if ((error = pwd_chroot(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; setsugid(p); crcopy(newcred, oldcred); newcred->cr_prison = pr; proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif crfree(oldcred); prison_deref(ppr, PD_DEREF | PD_DEUREF); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ (void)osd_jail_call(ppr, PR_METHOD_ATTACH, td); prison_deref(pr, PD_DEREF | PD_DEUREF); return (error); } /* * Returns a locked prison instance, or NULL on failure. */ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. 
*/ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) return (pr); mtx_unlock(&pr->pr_mtx); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; again: deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) { if (pr->pr_uref > 0) return (pr); deadpr = pr; } mtx_unlock(&pr->pr_mtx); } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { mtx_unlock(&deadpr->pr_mtx); goto again; } } return (deadpr); } /* * See if a prison has the specific flag set. */ int prison_flag(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { /* This is an atomic read, so no locking is necessary. */ return (cred->cr_prison->pr_allow & flag); } /* * Remove a prison reference. If that was the last reference, remove the * prison itself - but not in this context in case there are locks held. */ void prison_free_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); pr->pr_ref--; if (pr->pr_ref == 0) { mtx_unlock(&pr->pr_mtx); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; } mtx_unlock(&pr->pr_mtx); } void prison_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_free_locked(pr); } static void prison_complete(void *context, int pending) { prison_deref((struct prison *)context, 0); } /* * Remove a prison reference (usually). This internal version assumes no * mutexes are held, except perhaps the prison itself. If there are no more * references, release and delist the prison. On completion, the prison lock * and the allprison lock are both unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prison *ppr, *tpr; if (!(flags & PD_LOCKED)) mtx_lock(&pr->pr_mtx); for (;;) { if (flags & PD_DEUREF) { pr->pr_uref--; KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); } if (flags & PD_DEREF) pr->pr_ref--; /* If the prison still has references, nothing else to do. 
*/ if (pr->pr_ref > 0) { mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); return; } mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } } else if (!(flags & PD_LIST_XLOCKED)) sx_xlock(&allprison_lock); TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); ppr = pr->pr_parent; for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount--; sx_xunlock(&allprison_lock); #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); mtx_destroy(&pr->pr_mtx); #ifdef INET free(pr->pr_ip4, M_PRISON); #endif #ifdef INET6 free(pr->pr_ip6, M_PRISON); #endif if (pr->pr_cpuset != NULL) cpuset_rel(pr->pr_cpuset); osd_jail_exit(pr); #ifdef RACCT if (racct_enable) prison_racct_detach(pr); #endif free(pr, M_PRISON); /* Removing a prison frees a reference on its parent. */ pr = ppr; mtx_lock(&pr->pr_mtx); flags = PD_DEREF | PD_DEUREF; } } void prison_hold_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); KASSERT(pr->pr_ref > 0, ("Trying to hold dead prison (jid=%d).", pr->pr_id)); pr->pr_ref++; } void prison_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); } void prison_proc_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); pr->pr_uref++; mtx_unlock(&pr->pr_mtx); } void prison_proc_free(struct prison *pr) { mtx_lock(&pr->pr_mtx); KASSERT(pr->pr_uref > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); prison_deref(pr, PD_DEUREF | PD_LOCKED); } #ifdef INET /* * Restrict a prison's IP address list with its parent's, possibly replacing * it. Return true if the replacement buffer was used (or would have been). */ static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4) { int ii, ij, used; struct prison *ppr; ppr = pr->pr_parent; if (!(pr->pr_flags & PR_IP4_USER)) { /* This has no user settings, so just copy the parent's list. */ if (pr->pr_ip4s < ppr->pr_ip4s) { /* * There's no room for the parent's list. Use the * new list buffer, which is assumed to be big enough * (if it was passed). If there's no buffer, try to * allocate one. */ used = 1; if (newip4 == NULL) { newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4), M_PRISON, M_NOWAIT); if (newip4 != NULL) used = 0; } if (newip4 != NULL) { bcopy(ppr->pr_ip4, newip4, ppr->pr_ip4s * sizeof(*newip4)); free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = newip4; pr->pr_ip4s = ppr->pr_ip4s; } return (used); } pr->pr_ip4s = ppr->pr_ip4s; if (pr->pr_ip4s > 0) bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*newip4)); else if (pr->pr_ip4 != NULL) { free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = NULL; } } else if (pr->pr_ip4s > 0) { /* Remove addresses that aren't in the parent. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij < ppr->pr_ip4s) ii = 1; else { bcopy(pr->pr_ip4 + 1, pr->pr_ip4, --pr->pr_ip4s * sizeof(*pr->pr_ip4)); ii = 0; } for (ij = 1; ii < pr->pr_ip4s; ) { if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) { ii++; continue; } switch (ij >= ppr->pr_ip4s ? 
-1 : qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) { case -1: bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii, (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4)); break; case 0: ii++; ij++; break; case 1: ij++; break; } } if (pr->pr_ip4s == 0) { free(pr->pr_ip4, M_PRISON); pr->pr_ip4 = NULL; } } return (0); } /* * Pass back primary IPv4 address of this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. INADDR_ANY). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address returned in NBO. */ int prison_get_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv4 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv4. */ int prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr lia; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP4_SADDRSEL) return (1); lia.s_addr = INADDR_ANY; error = prison_get_ip4(cred, &lia); if (error) return (error); if (lia.s_addr == INADDR_ANY) return (1); ia->s_addr = lia.s_addr; return (0); } /* * Return true if pr1 and pr2 have the same IPv4 address restrictions. */ int prison_equal_ip4(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); /* * No need to lock since the PR_IP4_USER flag can't be altered for * existing prisons. */ while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP4_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP4_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this * jail. * * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv4. Address passed in in NBO and returned in NBO. */ int prison_local_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr ia0; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia0.s_addr = ntohl(ia->s_addr); if (ia0.s_addr == INADDR_LOOPBACK) { ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } if (ia0.s_addr == INADDR_ANY) { /* * In case there is only 1 IPv4 address, bind directly. 
*/ if (pr->pr_ip4s == 1) ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } error = _prison_check_ip4(pr, ia); mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address passed in in NBO and returned in NBO. */ int prison_remote_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (ntohl(ia->s_addr) == INADDR_LOOPBACK) { ia->s_addr = pr->pr_ip4[0].s_addr; mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv4. Address passed in in NBO. */ static int _prison_check_ip4(const struct prison *pr, const struct in_addr *ia) { int i, a, z, d; /* * Check the primary IP. */ if (pr->pr_ip4[0].s_addr == ia->s_addr) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pr->pr_ip4s - 2; while (a <= z) { i = (a + z) / 2; d = qcmp_v4(&pr->pr_ip4[i+1], ia); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } int prison_check_ip4(const struct ucred *cred, const struct in_addr *ia) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = _prison_check_ip4(pr, ia); mtx_unlock(&pr->pr_mtx); return (error); } #endif #ifdef INET6 static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6) { int ii, ij, used; struct prison *ppr; ppr = pr->pr_parent; if (!(pr->pr_flags & PR_IP6_USER)) { /* This has no user settings, so just copy the parent's list. */ if (pr->pr_ip6s < ppr->pr_ip6s) { /* * There's no room for the parent's list. Use the * new list buffer, which is assumed to be big enough * (if it was passed). If there's no buffer, try to * allocate one. */ used = 1; if (newip6 == NULL) { newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6), M_PRISON, M_NOWAIT); if (newip6 != NULL) used = 0; } if (newip6 != NULL) { bcopy(ppr->pr_ip6, newip6, ppr->pr_ip6s * sizeof(*newip6)); free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = newip6; pr->pr_ip6s = ppr->pr_ip6s; } return (used); } pr->pr_ip6s = ppr->pr_ip6s; if (pr->pr_ip6s > 0) bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*newip6)); else if (pr->pr_ip6 != NULL) { free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = NULL; } } else if (pr->pr_ip6s > 0) { /* Remove addresses that aren't in the parent. 
*/ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], &ppr->pr_ip6[ij])) break; if (ij < ppr->pr_ip6s) ii = 1; else { bcopy(pr->pr_ip6 + 1, pr->pr_ip6, --pr->pr_ip6s * sizeof(*pr->pr_ip6)); ii = 0; } for (ij = 1; ii < pr->pr_ip6s; ) { if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii], &ppr->pr_ip6[0])) { ii++; continue; } switch (ij >= ppr->pr_ip6s ? -1 : qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) { case -1: bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii, (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6)); break; case 0: ii++; ij++; break; case 1: ij++; break; } } if (pr->pr_ip6s == 0) { free(pr->pr_ip6, M_PRISON); pr->pr_ip6 = NULL; } } return 0; } /* * Pass back primary IPv6 address for this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_get_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv6 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv6. */ int prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; struct in6_addr lia6; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP6_SADDRSEL) return (1); lia6 = in6addr_any; error = prison_get_ip6(cred, &lia6); if (error) return (error); if (IN6_IS_ADDR_UNSPECIFIED(&lia6)) return (1); bcopy(&lia6, ia6, sizeof(struct in6_addr)); return (0); } /* * Return true if pr1 and pr2 have the same IPv6 address restrictions. */ int prison_equal_ip6(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP6_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP6_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this jail. * * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0) * when needed while binding. * * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv6. 
*/ int prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_LOOPBACK(ia6)) { bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } if (IN6_IS_ADDR_UNSPECIFIED(ia6)) { /* * In case there is only 1 IPv6 address, and v6only is true, * then bind directly. */ if (v6only != 0 && pr->pr_ip6s == 1) bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } error = _prison_check_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_LOOPBACK(ia6)) { bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv6. */ static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6) { int i, a, z, d; /* * Check the primary IP. */ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6)) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pr->pr_ip6s - 2; while (a <= z) { i = (a + z) / 2; d = qcmp_v6(&pr->pr_ip6[i+1], ia6); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } int prison_check_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = _prison_check_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (error); } #endif /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. 
*/ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). * * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. IPv4 Address passed in in NBO. */ int prison_if(struct ucred *cred, struct sockaddr *sa) { #ifdef INET struct sockaddr_in *sai; #endif #ifdef INET6 struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0. */ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* * Return 1 if the passed credential is in a jail, otherwise 0. */ int jailed(struct ucred *cred) { return (cred->cr_prison != &prison0); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? 
cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 in case the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" the * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. * This is an ugly check, but it is the only situation in which the * jail's directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have a situation where the jail's root * directory is "/some/path" and the mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check whether a specific privilege is granted within the jail. We * have a specific list of accepted privileges; the rest are denied.
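 *
 * Illustrative sketch, not from the original source: callers normally reach
 * this function through priv_check(9), which consults the jail for jailed
 * credentials, e.g.
 *
 *	error = priv_check(td, PRIV_VFS_MOUNT);
 *	if (error != 0)
 *		return (error);
 *
 * For PRIV_VFS_MOUNT the outcome below depends on the jail's allow.mount
 * setting and its enforce_statfs level.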
*/ int prison_priv_check(struct ucred *cred, int priv) { if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME : case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. */ case PRIV_NET80211_GETKEY: #ifdef notyet case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */ #endif #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. */ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit fall through to next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. 
*/ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT && cred->cr_prison->pr_enforce_statfs < 2) return (0); else return (EPERM); /* * Allow jailed root to bind reserved ports and reuse in-use * ports. */ case PRIV_NETINET_RESERVEDPORT: case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges and many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted.
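 *
 * For example, if pr1 is the jail named "foo" and pr2 is its
 * grandchild "foo.bar.baz", the name returned to pr1 is "bar.baz".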
*/ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. */ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, "Jails"); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { #if defined(INET) || defined(INET6) again: #endif mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { ip4s = cpr->pr_ip4s; mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip4, ip4, cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 if (cpr->pr_ip6s > 0) { if (ip6s < cpr->pr_ip6s) { ip6s = cpr->pr_ip6s; mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip6, ip6, cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif if (cpr->pr_ref == 0) { mtx_unlock(&cpr->pr_mtx); continue; } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_uref > 0 ? 
PRISON_STATE_ALIVE : PRISON_STATE_DYING; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 xp->pr_ip6s = cpr->pr_ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns VNET?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { struct prison *pr; int allow, error, i; pr = req->td->td_ucred->cr_prison; allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow; /* Get the current flag value, and convert it to a boolean. */ i = (allow & arg2) ? 1 : 0; if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing.
*/ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the devfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the fdescfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the nullfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the procfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linprocfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the linsysfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the tmpfs file system"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I", "Processes in jail can mount the zfs file system"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? 
*(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. */ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); 
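/*
 * Illustrative sketch, not from the original source: userland code such as
 * jail(8) and libjail discover these nodes via sysctl(3) to learn each
 * parameter's name and type, then pass name/value pairs to jail_set(2), e.g.
 *
 *	struct iovec params[2];
 *	params[0].iov_base = "persist";
 *	params[0].iov_len = sizeof("persist");
 *	params[1].iov_base = NULL;
 *	params[1].iov_len = 0;
 *	jid = jail_set(params, 2, JAIL_CREATE);
 *
 * The parameter name and flags above are examples only.
 */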
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the devfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the fdescfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the nullfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the procfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linprocfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the linsysfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the tmpfs file system"); SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount the zfs file system"); #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, - void *arg2, void *arg3), void *arg2, void *arg3) + void *arg2, void *arg3), void (*pre)(void), void (*post)(void), + void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); + if (pre != NULL) + (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); + if (post != NULL) + (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. 
*/ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { int old; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); old = prr->prr_refcount; if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. */ static void prison_racct_modify(struct prison *pr) { struct proc *p; struct ucred *cred; struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); racct_proc_ucred_changed(p, cred, cred); crfree(cred); } sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { int fi; #if defined(INET) || defined(INET6) int ii; #endif unsigned jsf; #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? 
pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); fi++) if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi))) db_printf(" %s", pr_flag_names[fi]); for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]); fi++) { jsf = pr->pr_flags & (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new); db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name, pr_flag_jailsys[fi].disable && (jsf == pr_flag_jailsys[fi].disable) ? "disable" : (jsf == pr_flag_jailsys[fi].new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); fi++) if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi))) db_printf(" %s", pr_allow_names[fi]); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); for (ii = 0; ii < pr->pr_ip4s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa(pr->pr_ip4[ii])); #endif #ifdef INET6 db_printf(" ip6s = %d\n", pr->pr_ip6s); for (ii = 0; ii < pr->pr_ip6s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip6.addr =" : " ", ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ Index: head/sys/kern/kern_loginclass.c =================================================================== --- head/sys/kern/kern_loginclass.c (revision 290856) +++ head/sys/kern/kern_loginclass.c (revision 290857) @@ -1,245 +1,250 @@ /*- * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Processes may set login class name using setloginclass(2). This * is usually done through call to setusercontext(3), by programs * such as login(1), based on information from master.passwd(5). Kernel * uses this information to enforce per-class resource limits. Current * login class can be determined using id(1). Login class is inherited * from the parent process during fork(2). If not set, it defaults * to "default". * * Code in this file implements setloginclass(2) and getloginclass(2) * system calls, and maintains class name storage and retrieval. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures"); LIST_HEAD(, loginclass) loginclasses; /* * Lock protecting loginclasses list. */ static struct rwlock loginclasses_lock; RW_SYSINIT(loginclasses_init, &loginclasses_lock, "loginclasses lock"); void loginclass_hold(struct loginclass *lc) { refcount_acquire(&lc->lc_refcount); } void loginclass_free(struct loginclass *lc) { int old; old = lc->lc_refcount; if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1)) return; rw_wlock(&loginclasses_lock); if (!refcount_release(&lc->lc_refcount)) { rw_wunlock(&loginclasses_lock); return; } racct_destroy(&lc->lc_racct); LIST_REMOVE(lc, lc_next); rw_wunlock(&loginclasses_lock); free(lc, M_LOGINCLASS); } /* * Look up a loginclass struct for the parameter name. * loginclasses_lock must be locked. * Increase refcount on loginclass struct returned. */ static struct loginclass * loginclass_lookup(const char *name) { struct loginclass *lc; rw_assert(&loginclasses_lock, RA_LOCKED); LIST_FOREACH(lc, &loginclasses, lc_next) if (strcmp(name, lc->lc_name) == 0) { loginclass_hold(lc); break; } return (lc); } /* * Return loginclass structure with a corresponding name. Not * performance critical, as it's used mainly by setloginclass(2), * which happens once per login session. Caller has to use * loginclass_free() on the returned value when it's no longer * needed. 
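 *
 * Illustrative usage, not from the original source:
 *
 *	lc = loginclass_find("daemon");
 *	if (lc == NULL)
 *		return (EINVAL);
 *	...
 *	loginclass_free(lc);
 *
 * sys_setloginclass() below instead hands the reference over to the new
 * credential rather than dropping it.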
*/ struct loginclass * loginclass_find(const char *name) { struct loginclass *lc, *new_lc; if (name[0] == '\0' || strlen(name) >= MAXLOGNAME) return (NULL); rw_rlock(&loginclasses_lock); lc = loginclass_lookup(name); rw_runlock(&loginclasses_lock); if (lc != NULL) return (lc); new_lc = malloc(sizeof(*new_lc), M_LOGINCLASS, M_ZERO | M_WAITOK); racct_create(&new_lc->lc_racct); refcount_init(&new_lc->lc_refcount, 1); strcpy(new_lc->lc_name, name); rw_wlock(&loginclasses_lock); /* * There's a chance someone created our loginclass while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate loginclass. */ if ((lc = loginclass_lookup(name)) == NULL) { LIST_INSERT_HEAD(&loginclasses, new_lc, lc_next); rw_wunlock(&loginclasses_lock); lc = new_lc; } else { rw_wunlock(&loginclasses_lock); racct_destroy(&new_lc->lc_racct); free(new_lc, M_LOGINCLASS); } return (lc); } /* * Get login class name. */ #ifndef _SYS_SYSPROTO_H_ struct getloginclass_args { char *namebuf; size_t namelen; }; #endif /* ARGSUSED */ int sys_getloginclass(struct thread *td, struct getloginclass_args *uap) { struct loginclass *lc; size_t lcnamelen; lc = td->td_ucred->cr_loginclass; lcnamelen = strlen(lc->lc_name) + 1; if (lcnamelen > uap->namelen) return (ERANGE); return (copyout(lc->lc_name, uap->namebuf, lcnamelen)); } /* * Set login class name. */ #ifndef _SYS_SYSPROTO_H_ struct setloginclass_args { const char *namebuf; }; #endif /* ARGSUSED */ int sys_setloginclass(struct thread *td, struct setloginclass_args *uap) { struct proc *p = td->td_proc; int error; char lcname[MAXLOGNAME]; struct loginclass *newlc; struct ucred *newcred, *oldcred; error = priv_check(td, PRIV_PROC_SETLOGINCLASS); if (error != 0) return (error); error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL); if (error != 0) return (error); newlc = loginclass_find(lcname); if (newlc == NULL) return (EINVAL); newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_loginclass = newlc; proc_set_cred(p, newcred); PROC_UNLOCK(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); #endif loginclass_free(oldcred->cr_loginclass); crfree(oldcred); return (0); } void loginclass_racct_foreach(void (*callback)(struct racct *racct, - void *arg2, void *arg3), void *arg2, void *arg3) + void *arg2, void *arg3), void (*pre)(void), void (*post)(void), + void *arg2, void *arg3) { struct loginclass *lc; rw_rlock(&loginclasses_lock); + if (pre != NULL) + (pre)(); LIST_FOREACH(lc, &loginclasses, lc_next) (callback)(lc->lc_racct, arg2, arg3); + if (post != NULL) + (post)(); rw_runlock(&loginclasses_lock); } Index: head/sys/kern/kern_racct.c =================================================================== --- head/sys/kern/kern_racct.c (revision 290856) +++ head/sys/kern/kern_racct.c (revision 290857) @@ -1,1300 +1,1316 @@ /*- * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #include #endif #ifdef RACCT FEATURE(racct, "Resource Accounting"); /* * Do not block processes that have their %cpu usage <= pcpu_threshold. */ static int pcpu_threshold = 1; #ifdef RACCT_DEFAULT_TO_DISABLED int racct_enable = 0; #else int racct_enable = 1; #endif SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting"); SYSCTL_UINT(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable, 0, "Enable RACCT/RCTL"); SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold, 0, "Processes with higher %cpu usage than this value can be throttled."); /* * How many seconds it takes to use the scheduler %cpu calculations. When a * process starts, we compute its %cpu usage by dividing its runtime by the * process wall clock time. After RACCT_PCPU_SECS pass, we use the value * provided by the scheduler. 
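For the first RACCT_PCPU_SECS seconds of a process's life the kernel estimates %cpu as runtime divided by wall-clock time, scaled into the millionths-of-a-percent units used for RACCT_IN_MILLIONS resources; only after that does it trust the scheduler's value. A userland sketch of that arithmetic (the sample numbers are made up):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Both times in microseconds, as in the kernel code below. */
	uint64_t runtime = 500000;	/* 0.5 s of cpu time consumed */
	uint64_t wallclock = 1000000;	/* over 1 s of wall-clock time */
	uint64_t pct_estimate;

	/* Same formula racct_proc_exit() and racctd() use further down. */
	pct_estimate = (1000000 * runtime * 100) / wallclock;
	printf("estimate: %ju millionths of a percent (%ju%%)\n",
	    (uintmax_t)pct_estimate, (uintmax_t)(pct_estimate / 1000000));
	return (0);
}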
*/ #define RACCT_PCPU_SECS 3 static struct mtx racct_lock; MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF); static uma_zone_t racct_zone; static void racct_sub_racct(struct racct *dest, const struct racct *src); static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount); static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount); SDT_PROVIDER_DEFINE(racct); SDT_PROBE_DEFINE3(racct, kernel, rusage, add, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, add__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, add__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, add__force, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, set, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, set__failure, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, "struct proc *", "int", "uint64_t"); SDT_PROBE_DEFINE3(racct, kernel, rusage, sub__cred, "struct ucred *", "int", "uint64_t"); SDT_PROBE_DEFINE1(racct, kernel, racct, create, "struct racct *"); SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, "struct racct *"); SDT_PROBE_DEFINE2(racct, kernel, racct, join, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, kernel, racct, join__failure, "struct racct *", "struct racct *"); SDT_PROBE_DEFINE2(racct, kernel, racct, leave, "struct racct *", "struct racct *"); int racct_types[] = { [RACCT_CPU] = RACCT_IN_MILLIONS, [RACCT_DATA] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_STACK] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_CORE] = RACCT_DENIABLE, [RACCT_RSS] = RACCT_RECLAIMABLE, [RACCT_MEMLOCK] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NPROC] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_NOFILE] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_VMEM] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NPTS] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SWAP] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NTHR] = RACCT_RECLAIMABLE | RACCT_DENIABLE, [RACCT_MSGQQUEUED] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_MSGQSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NMSGQ] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_NSEMOP] = RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE, [RACCT_NSHM] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_SHMSIZE] = RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY, [RACCT_WALLCLOCK] = RACCT_IN_MILLIONS, [RACCT_PCTCPU] = RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS }; static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE; #ifdef SCHED_4BSD /* * Contains intermediate values for %cpu calculations to avoid using floating * point in the kernel. * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20) * It is needed only for the 4BSD scheduler, because in ULE, the ccpu equals to * zero so the calculations are more straightforward. 
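The table below is simply FSCALE * exp(-k/20) precomputed for k = 0..110 so the kernel can avoid floating point. A quick userland check of the first few entries (assuming FSCALE is 1 << 11, its value in <sys/param.h>):

#include <math.h>
#include <stdio.h>

#define FSHIFT	11			/* matches sys/param.h */
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	int k;

	/* ccpu_exp[k] = FSCALE * exp(-k/20); compare with the table below. */
	for (k = 0; k <= 5; k++)
		printf("ccpu_exp[%d] = %.2f\n", k, FSCALE * exp(-k / 20.0));
	return (0);
}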
*/ fixpt_t ccpu_exp[] = { [0] = FSCALE * 1, [1] = FSCALE * 0.95122942450071400909, [2] = FSCALE * 0.90483741803595957316, [3] = FSCALE * 0.86070797642505780722, [4] = FSCALE * 0.81873075307798185866, [5] = FSCALE * 0.77880078307140486824, [6] = FSCALE * 0.74081822068171786606, [7] = FSCALE * 0.70468808971871343435, [8] = FSCALE * 0.67032004603563930074, [9] = FSCALE * 0.63762815162177329314, [10] = FSCALE * 0.60653065971263342360, [11] = FSCALE * 0.57694981038048669531, [12] = FSCALE * 0.54881163609402643262, [13] = FSCALE * 0.52204577676101604789, [14] = FSCALE * 0.49658530379140951470, [15] = FSCALE * 0.47236655274101470713, [16] = FSCALE * 0.44932896411722159143, [17] = FSCALE * 0.42741493194872666992, [18] = FSCALE * 0.40656965974059911188, [19] = FSCALE * 0.38674102345450120691, [20] = FSCALE * 0.36787944117144232159, [21] = FSCALE * 0.34993774911115535467, [22] = FSCALE * 0.33287108369807955328, [23] = FSCALE * 0.31663676937905321821, [24] = FSCALE * 0.30119421191220209664, [25] = FSCALE * 0.28650479686019010032, [26] = FSCALE * 0.27253179303401260312, [27] = FSCALE * 0.25924026064589150757, [28] = FSCALE * 0.24659696394160647693, [29] = FSCALE * 0.23457028809379765313, [30] = FSCALE * 0.22313016014842982893, [31] = FSCALE * 0.21224797382674305771, [32] = FSCALE * 0.20189651799465540848, [33] = FSCALE * 0.19204990862075411423, [34] = FSCALE * 0.18268352405273465022, [35] = FSCALE * 0.17377394345044512668, [36] = FSCALE * 0.16529888822158653829, [37] = FSCALE * 0.15723716631362761621, [38] = FSCALE * 0.14956861922263505264, [39] = FSCALE * 0.14227407158651357185, [40] = FSCALE * 0.13533528323661269189, [41] = FSCALE * 0.12873490358780421886, [42] = FSCALE * 0.12245642825298191021, [43] = FSCALE * 0.11648415777349695786, [44] = FSCALE * 0.11080315836233388333, [45] = FSCALE * 0.10539922456186433678, [46] = FSCALE * 0.10025884372280373372, [47] = FSCALE * 0.09536916221554961888, [48] = FSCALE * 0.09071795328941250337, [49] = FSCALE * 0.08629358649937051097, [50] = FSCALE * 0.08208499862389879516, [51] = FSCALE * 0.07808166600115315231, [52] = FSCALE * 0.07427357821433388042, [53] = FSCALE * 0.07065121306042958674, [54] = FSCALE * 0.06720551273974976512, [55] = FSCALE * 0.06392786120670757270, [56] = FSCALE * 0.06081006262521796499, [57] = FSCALE * 0.05784432087483846296, [58] = FSCALE * 0.05502322005640722902, [59] = FSCALE * 0.05233970594843239308, [60] = FSCALE * 0.04978706836786394297, [61] = FSCALE * 0.04735892439114092119, [62] = FSCALE * 0.04504920239355780606, [63] = FSCALE * 0.04285212686704017991, [64] = FSCALE * 0.04076220397836621516, [65] = FSCALE * 0.03877420783172200988, [66] = FSCALE * 0.03688316740124000544, [67] = FSCALE * 0.03508435410084502588, [68] = FSCALE * 0.03337326996032607948, [69] = FSCALE * 0.03174563637806794323, [70] = FSCALE * 0.03019738342231850073, [71] = FSCALE * 0.02872463965423942912, [72] = FSCALE * 0.02732372244729256080, [73] = FSCALE * 0.02599112877875534358, [74] = FSCALE * 0.02472352647033939120, [75] = FSCALE * 0.02351774585600910823, [76] = FSCALE * 0.02237077185616559577, [77] = FSCALE * 0.02127973643837716938, [78] = FSCALE * 0.02024191144580438847, [79] = FSCALE * 0.01925470177538692429, [80] = FSCALE * 0.01831563888873418029, [81] = FSCALE * 0.01742237463949351138, [82] = FSCALE * 0.01657267540176124754, [83] = FSCALE * 0.01576441648485449082, [84] = FSCALE * 0.01499557682047770621, [85] = FSCALE * 0.01426423390899925527, [86] = FSCALE * 0.01356855901220093175, [87] = FSCALE * 0.01290681258047986886, [88] = FSCALE * 0.01227733990306844117, 
[89] = FSCALE * 0.01167856697039544521, [90] = FSCALE * 0.01110899653824230649, [91] = FSCALE * 0.01056720438385265337, [92] = FSCALE * 0.01005183574463358164, [93] = FSCALE * 0.00956160193054350793, [94] = FSCALE * 0.00909527710169581709, [95] = FSCALE * 0.00865169520312063417, [96] = FSCALE * 0.00822974704902002884, [97] = FSCALE * 0.00782837754922577143, [98] = FSCALE * 0.00744658307092434051, [99] = FSCALE * 0.00708340892905212004, [100] = FSCALE * 0.00673794699908546709, [101] = FSCALE * 0.00640933344625638184, [102] = FSCALE * 0.00609674656551563610, [103] = FSCALE * 0.00579940472684214321, [104] = FSCALE * 0.00551656442076077241, [105] = FSCALE * 0.00524751839918138427, [106] = FSCALE * 0.00499159390691021621, [107] = FSCALE * 0.00474815099941147558, [108] = FSCALE * 0.00451658094261266798, [109] = FSCALE * 0.00429630469075234057, [110] = FSCALE * 0.00408677143846406699, }; #endif #define CCPU_EXP_MAX 110 /* * This function is analogical to the getpcpu() function in the ps(1) command. * They should both calculate in the same way so that the racct %cpu * calculations are consistent with the values showed by the ps(1) tool. * The calculations are more complex in the 4BSD scheduler because of the value * of the ccpu variable. In ULE it is defined to be zero which saves us some * work. */ static uint64_t racct_getpcpu(struct proc *p, u_int pcpu) { u_int swtime; #ifdef SCHED_4BSD fixpt_t pctcpu, pctcpu_next; #endif #ifdef SMP struct pcpu *pc; int found; #endif fixpt_t p_pctcpu; struct thread *td; ASSERT_RACCT_ENABLED(); /* * If the process is swapped out, we count its %cpu usage as zero. * This behaviour is consistent with the userland ps(1) tool. */ if ((p->p_flag & P_INMEM) == 0) return (0); swtime = (ticks - p->p_swtick) / hz; /* * For short-lived processes, the sched_pctcpu() returns small * values even for cpu intensive processes. Therefore we use * our own estimate in this case. */ if (swtime < RACCT_PCPU_SECS) return (pcpu); p_pctcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td == PCPU_GET(idlethread)) continue; #ifdef SMP found = 0; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (td == pc->pc_idlethread) { found = 1; break; } } if (found) continue; #endif thread_lock(td); #ifdef SCHED_4BSD pctcpu = sched_pctcpu(td); /* Count also the yet unfinished second. */ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT; pctcpu_next += sched_pctcpu_delta(td); p_pctcpu += max(pctcpu, pctcpu_next); #else /* * In ULE the %cpu statistics are updated on every * sched_pctcpu() call. So special calculations to * account for the latest (unfinished) second are * not needed. */ p_pctcpu += sched_pctcpu(td); #endif thread_unlock(td); } #ifdef SCHED_4BSD if (swtime <= CCPU_EXP_MAX) return ((100 * (uint64_t)p_pctcpu * 1000000) / (FSCALE - ccpu_exp[swtime])); #endif return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE); } static void racct_add_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); /* * Update resource usage in dest. */ for (i = 0; i <= RACCT_MAX; i++) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); dest->r_resources[i] += src->r_resources[i]; } } static void racct_sub_racct(struct racct *dest, const struct racct *src) { int i; ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); /* * Update resource usage in dest. 
*/ for (i = 0; i <= RACCT_MAX; i++) { if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) { KASSERT(dest->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: dest < 0", __func__, i)); KASSERT(src->r_resources[i] >= 0, ("%s: resource %d propagation meltdown: src < 0", __func__, i)); KASSERT(src->r_resources[i] <= dest->r_resources[i], ("%s: resource %d propagation meltdown: src > dest", __func__, i)); } if (RACCT_CAN_DROP(i)) { dest->r_resources[i] -= src->r_resources[i]; if (dest->r_resources[i] < 0) { KASSERT(RACCT_IS_SLOPPY(i) || RACCT_IS_DECAYING(i), ("%s: resource %d usage < 0", __func__, i)); dest->r_resources[i] = 0; } } } } void racct_create(struct racct **racctp) { if (!racct_enable) return; SDT_PROBE1(racct, kernel, racct, create, racctp); KASSERT(*racctp == NULL, ("racct already allocated")); *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO); } static void racct_destroy_locked(struct racct **racctp) { int i; struct racct *racct; ASSERT_RACCT_ENABLED(); SDT_PROBE1(racct, kernel, racct, destroy, racctp); mtx_assert(&racct_lock, MA_OWNED); KASSERT(racctp != NULL, ("NULL racctp")); KASSERT(*racctp != NULL, ("NULL racct")); racct = *racctp; for (i = 0; i <= RACCT_MAX; i++) { if (RACCT_IS_SLOPPY(i)) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; KASSERT(racct->r_resources[i] == 0, ("destroying non-empty racct: " "%ju allocated for resource %d\n", racct->r_resources[i], i)); } uma_zfree(racct_zone, racct); *racctp = NULL; } void racct_destroy(struct racct **racct) { if (!racct_enable) return; mtx_lock(&racct_lock); racct_destroy_locked(racct); mtx_unlock(&racct_lock); } /* * Increase consumption of 'resource' by 'amount' for 'racct' * and all its parents. Differently from other cases, 'amount' here * may be less than zero. */ static void racct_adjust_resource(struct racct *racct, int resource, uint64_t amount) { ASSERT_RACCT_ENABLED(); mtx_assert(&racct_lock, MA_OWNED); KASSERT(racct != NULL, ("NULL racct")); racct->r_resources[resource] += amount; if (racct->r_resources[resource] < 0) { KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource), ("%s: resource %d usage < 0", __func__, resource)); racct->r_resources[resource] = 0; } /* * There are some cases where the racct %cpu resource would grow * beyond 100% per core. For example in racct_proc_exit() we add * the process %cpu usage to the ucred racct containers. If too * many processes terminated in a short time span, the ucred %cpu * resource could grow too much. Also, the 4BSD scheduler sometimes * returns for a thread more than 100% cpu usage. So we set a sane * boundary here to 100% * the maxumum number of CPUs. */ if ((resource == RACCT_PCTCPU) && (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU)) racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU; } static int racct_add_locked(struct proc *p, int resource, uint64_t amount) { #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); SDT_PROBE3(racct, kernel, rusage, add, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); #ifdef RCTL error = rctl_enforce(p, resource, amount); if (error && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, kernel, rusage, add__failure, p, resource, amount); return (error); } #endif racct_adjust_resource(p->p_racct, resource, amount); racct_add_cred_locked(p->p_ucred, resource, amount); return (0); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. 
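The contract above is what callers elsewhere in the kernel rely on: charge the resource, and if racct/rctl denies it, back out and translate the generic error into whatever errno fits the call site. A hypothetical caller (the function name and errno choice are illustrative, not taken from this file):

static int
example_charge_descriptor(struct proc *p)
{
	int error;

	/* racct_add() needs the proc lock to reach p->p_ucred. */
	PROC_LOCK(p);
	error = racct_add(p, RACCT_NOFILE, 1);
	PROC_UNLOCK(p);
	if (error != 0)
		return (EMFILE);	/* allocation denied by a "deny" rule */
	return (0);
}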
*/ int racct_add(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); mtx_lock(&racct_lock); error = racct_add_locked(p, resource, amount); mtx_unlock(&racct_lock); return (error); } static void racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); SDT_PROBE3(racct, kernel, rusage, add__cred, cred, resource, amount); racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount); } /* * Increase allocation of 'resource' by 'amount' for credential 'cred'. * Doesn't check for limits and never fails. * * XXX: Shouldn't this ever return an error? */ void racct_add_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; mtx_lock(&racct_lock); racct_add_cred_locked(cred, resource, amount); mtx_unlock(&racct_lock); } /* * Increase allocation of 'resource' by 'amount' for process 'p'. * Doesn't check for limits and never fails. */ void racct_add_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, kernel, rusage, add__force, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); mtx_lock(&racct_lock); racct_adjust_resource(p->p_racct, resource, amount); mtx_unlock(&racct_lock); racct_add_cred(p->p_ucred, resource, amount); } static int racct_set_locked(struct proc *p, int resource, uint64_t amount) { int64_t old_amount, decayed_amount; int64_t diff_proc, diff_cred; #ifdef RCTL int error; #endif ASSERT_RACCT_ENABLED(); SDT_PROBE3(racct, kernel, rusage, set, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (RACCT_IS_DECAYING(resource)) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; #ifdef notyet KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource), ("%s: usage of non-droppable resource %d dropping", __func__, resource)); #endif #ifdef RCTL if (diff_proc > 0) { error = rctl_enforce(p, resource, diff_proc); if (error && RACCT_IS_DENIABLE(resource)) { SDT_PROBE3(racct, kernel, rusage, set__failure, p, resource, amount); return (error); } } #endif racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); return (0); } /* * Set allocation of 'resource' to 'amount' for process 'p'. * Return 0 if it's below limits, or errno, if it's not. * * Note that decreasing the allocation always returns 0, * even if it's above the limit. 
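For decaying resources the per-credential containers only ever saw the decayed share of the old value, so racct_set_locked() above computes the credential-side delta against old_amount * RACCT_DECAY_FACTOR / FSCALE rather than against old_amount itself. A small worked example of that arithmetic (userland sketch; FSCALE of 1 << 11 and the 0.3 decay factor are the values used by this file and <sys/param.h>):

#include <stdint.h>
#include <stdio.h>

#define FSCALE			(1 << 11)
#define RACCT_DECAY_FACTOR	((int64_t)(0.3 * FSCALE))

int
main(void)
{
	int64_t old_amount = 80 * 1000000;	/* previous %cpu, in millionths */
	int64_t amount = 50 * 1000000;		/* new per-process %cpu */
	int64_t decayed_amount, diff_cred;

	/* The credential racct only holds the decayed part of old_amount. */
	decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
	diff_cred = amount - decayed_amount;
	printf("decayed=%jd diff_cred=%jd\n",
	    (intmax_t)decayed_amount, (intmax_t)diff_cred);
	return (0);
}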
*/ int racct_set(struct proc *p, int resource, uint64_t amount) { int error; if (!racct_enable) return (0); mtx_lock(&racct_lock); error = racct_set_locked(p, resource, amount); mtx_unlock(&racct_lock); return (error); } static void racct_set_force_locked(struct proc *p, int resource, uint64_t amount) { int64_t old_amount, decayed_amount; int64_t diff_proc, diff_cred; ASSERT_RACCT_ENABLED(); SDT_PROBE3(racct, kernel, rusage, set, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. */ PROC_LOCK_ASSERT(p, MA_OWNED); old_amount = p->p_racct->r_resources[resource]; /* * The diffs may be negative. */ diff_proc = amount - old_amount; if (RACCT_IS_DECAYING(resource)) { /* * Resources in per-credential racct containers may decay. * If this is the case, we need to calculate the difference * between the new amount and the proportional value of the * old amount that has decayed in the ucred racct containers. */ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE; diff_cred = amount - decayed_amount; } else diff_cred = diff_proc; racct_adjust_resource(p->p_racct, resource, diff_proc); if (diff_cred > 0) racct_add_cred_locked(p->p_ucred, resource, diff_cred); else if (diff_cred < 0) racct_sub_cred_locked(p->p_ucred, resource, -diff_cred); } void racct_set_force(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; mtx_lock(&racct_lock); racct_set_force_locked(p, resource, amount); mtx_unlock(&racct_lock); } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * not matter. */ uint64_t racct_get_limit(struct proc *p, int resource) { if (!racct_enable) return (UINT64_MAX); #ifdef RCTL return (rctl_get_limit(p, resource)); #else return (UINT64_MAX); #endif } /* * Returns amount of 'resource' the process 'p' can keep allocated. * Allocating more than that would be denied, unless the resource * is marked undeniable. Amount of already allocated resource does * matter. */ uint64_t racct_get_available(struct proc *p, int resource) { if (!racct_enable) return (UINT64_MAX); #ifdef RCTL return (rctl_get_available(p, resource)); #else return (UINT64_MAX); #endif } /* * Returns amount of the %cpu resource that process 'p' can add to its %cpu * utilization. Adding more than that would lead to the process being * throttled. */ static int64_t racct_pcpu_available(struct proc *p) { ASSERT_RACCT_ENABLED(); #ifdef RCTL return (rctl_pcpu_available(p)); #else return (INT64_MAX); #endif } /* * Decrease allocation of 'resource' by 'amount' for process 'p'. */ void racct_sub(struct proc *p, int resource, uint64_t amount) { if (!racct_enable) return; SDT_PROBE3(racct, kernel, rusage, sub, p, resource, amount); /* * We need proc lock to dereference p->p_ucred. 
*/ PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(RACCT_CAN_DROP(resource), ("%s: called for non-droppable resource %d", __func__, resource)); mtx_lock(&racct_lock); KASSERT(amount <= p->p_racct->r_resources[resource], ("%s: freeing %ju of resource %d, which is more " "than allocated %jd for %s (pid %d)", __func__, amount, resource, (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid)); racct_adjust_resource(p->p_racct, resource, -amount); racct_sub_cred_locked(p->p_ucred, resource, amount); mtx_unlock(&racct_lock); } static void racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount) { struct prison *pr; ASSERT_RACCT_ENABLED(); SDT_PROBE3(racct, kernel, rusage, sub__cred, cred, resource, amount); #ifdef notyet KASSERT(RACCT_CAN_DROP(resource), ("%s: called for resource %d which can not drop", __func__, resource)); #endif racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount); for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource, -amount); racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount); } /* * Decrease allocation of 'resource' by 'amount' for credential 'cred'. */ void racct_sub_cred(struct ucred *cred, int resource, uint64_t amount) { if (!racct_enable) return; mtx_lock(&racct_lock); racct_sub_cred_locked(cred, resource, amount); mtx_unlock(&racct_lock); } /* * Inherit resource usage information from the parent process. */ int racct_proc_fork(struct proc *parent, struct proc *child) { int i, error = 0; if (!racct_enable) return (0); /* * Create racct for the child process. */ racct_create(&child->p_racct); PROC_LOCK(parent); PROC_LOCK(child); mtx_lock(&racct_lock); #ifdef RCTL error = rctl_proc_fork(parent, child); if (error != 0) goto out; #endif /* Init process cpu time. */ child->p_prev_runtime = 0; child->p_throttled = 0; /* * Inherit resource usage. */ for (i = 0; i <= RACCT_MAX; i++) { if (parent->p_racct->r_resources[i] == 0 || !RACCT_IS_INHERITABLE(i)) continue; error = racct_set_locked(child, i, parent->p_racct->r_resources[i]); if (error != 0) goto out; } error = racct_add_locked(child, RACCT_NPROC, 1); error += racct_add_locked(child, RACCT_NTHR, 1); out: mtx_unlock(&racct_lock); PROC_UNLOCK(child); PROC_UNLOCK(parent); if (error != 0) racct_proc_exit(child); return (error); } /* * Called at the end of fork1(), to handle rules that require the process * to be fully initialized. */ void racct_proc_fork_done(struct proc *child) { #ifdef RCTL if (!racct_enable) return; PROC_LOCK(child); mtx_lock(&racct_lock); rctl_enforce(child, RACCT_NPROC, 0); rctl_enforce(child, RACCT_NTHR, 0); mtx_unlock(&racct_lock); PROC_UNLOCK(child); #endif } void racct_proc_exit(struct proc *p) { int i; uint64_t runtime; struct timeval wallclock; uint64_t pct_estimate, pct; if (!racct_enable) return; PROC_LOCK(p); /* * We don't need to calculate rux, proc_reap() has already done this. 
*/ runtime = cputick2usec(p->p_rux.rux_runtime); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); mtx_lock(&racct_lock); racct_set_locked(p, RACCT_CPU, runtime); racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct); for (i = 0; i <= RACCT_MAX; i++) { if (p->p_racct->r_resources[i] == 0) continue; if (!RACCT_IS_RECLAIMABLE(i)) continue; racct_set_locked(p, i, 0); } mtx_unlock(&racct_lock); PROC_UNLOCK(p); #ifdef RCTL rctl_racct_release(p->p_racct); #endif racct_destroy(&p->p_racct); } /* * Called after credentials change, to move resource utilisation * between raccts. */ void racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred, struct ucred *newcred) { struct uidinfo *olduip, *newuip; struct loginclass *oldlc, *newlc; struct prison *oldpr, *newpr, *pr; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; olduip = oldcred->cr_ruidinfo; newlc = newcred->cr_loginclass; oldlc = oldcred->cr_loginclass; newpr = newcred->cr_prison; oldpr = oldcred->cr_prison; mtx_lock(&racct_lock); if (newuip != olduip) { racct_sub_racct(olduip->ui_racct, p->p_racct); racct_add_racct(newuip->ui_racct, p->p_racct); } if (newlc != oldlc) { racct_sub_racct(oldlc->lc_racct, p->p_racct); racct_add_racct(newlc->lc_racct, p->p_racct); } if (newpr != oldpr) { for (pr = oldpr; pr != NULL; pr = pr->pr_parent) racct_sub_racct(pr->pr_prison_racct->prr_racct, p->p_racct); for (pr = newpr; pr != NULL; pr = pr->pr_parent) racct_add_racct(pr->pr_prison_racct->prr_racct, p->p_racct); } mtx_unlock(&racct_lock); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); #endif } void racct_move(struct racct *dest, struct racct *src) { ASSERT_RACCT_ENABLED(); mtx_lock(&racct_lock); racct_add_racct(dest, src); racct_sub_racct(src, src); mtx_unlock(&racct_lock); } static void racct_proc_throttle(struct proc *p) { struct thread *td; #ifdef SMP int cpuid; #endif ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); /* * Do not block kernel processes. Also do not block processes with * low %cpu utilization to improve interactivity. */ if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) || (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold)) return; p->p_throttled = 1; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); switch (td->td_state) { case TDS_RUNQ: /* * If the thread is on the scheduler run-queue, we can * not just remove it from there. So we set the flag * TDF_NEEDRESCHED for the thread, so that once it is * running, it is taken off the cpu as soon as possible. */ td->td_flags |= TDF_NEEDRESCHED; break; case TDS_RUNNING: /* * If the thread is running, we request a context * switch for it by setting the TDF_NEEDRESCHED flag. 
*/ td->td_flags |= TDF_NEEDRESCHED; #ifdef SMP cpuid = td->td_oncpu; if ((cpuid != NOCPU) && (td != curthread)) ipi_cpu(cpuid, IPI_AST); #endif break; default: break; } thread_unlock(td); } } static void racct_proc_wakeup(struct proc *p) { ASSERT_RACCT_ENABLED(); PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_throttled) { p->p_throttled = 0; wakeup(p->p_racct); } } static void racct_decay_resource(struct racct *racct, void * res, void* dummy) { int resource; int64_t r_old, r_new; ASSERT_RACCT_ENABLED(); + mtx_assert(&racct_lock, MA_OWNED); resource = *(int *)res; r_old = racct->r_resources[resource]; /* If there is nothing to decay, just exit. */ if (r_old <= 0) return; - mtx_lock(&racct_lock); r_new = r_old * RACCT_DECAY_FACTOR / FSCALE; racct->r_resources[resource] = r_new; +} + +static void +racct_decay_pre(void) +{ + + mtx_lock(&racct_lock); +} + +static void +racct_decay_post(void) +{ + mtx_unlock(&racct_lock); } static void racct_decay(int resource) { ASSERT_RACCT_ENABLED(); - ui_racct_foreach(racct_decay_resource, &resource, NULL); - loginclass_racct_foreach(racct_decay_resource, &resource, NULL); - prison_racct_foreach(racct_decay_resource, &resource, NULL); + ui_racct_foreach(racct_decay_resource, racct_decay_pre, + racct_decay_post, &resource, NULL); + loginclass_racct_foreach(racct_decay_resource, racct_decay_pre, + racct_decay_post, &resource, NULL); + prison_racct_foreach(racct_decay_resource, racct_decay_pre, + racct_decay_post, &resource, NULL); } static void racctd(void) { struct thread *td; struct proc *p; struct timeval wallclock; uint64_t runtime; uint64_t pct, pct_estimate; ASSERT_RACCT_ENABLED(); for (;;) { racct_decay(RACCT_PCTCPU); sx_slock(&allproc_lock); LIST_FOREACH(p, &zombproc, p_list) { PROC_LOCK(p); racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); } FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } microuptime(&wallclock); timevalsub(&wallclock, &p->p_stats->p_start); PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) ruxagg(p, td); runtime = cputick2usec(p->p_rux.rux_runtime); PROC_STATUNLOCK(p); #ifdef notyet KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime")); #else if (runtime < p->p_prev_runtime) runtime = p->p_prev_runtime; #endif p->p_prev_runtime = runtime; if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { pct_estimate = (1000000 * runtime * 100) / ((uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); } else pct_estimate = 0; pct = racct_getpcpu(p, pct_estimate); mtx_lock(&racct_lock); racct_set_force_locked(p, RACCT_PCTCPU, pct); racct_set_locked(p, RACCT_CPU, runtime); racct_set_locked(p, RACCT_WALLCLOCK, (uint64_t)wallclock.tv_sec * 1000000 + wallclock.tv_usec); mtx_unlock(&racct_lock); PROC_UNLOCK(p); } /* * To ensure that processes are throttled in a fair way, we need * to iterate over all processes again and check the limits * for %cpu resource only after ucred racct containers have been * properly filled. 
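The change above is the point of this revision: the *_racct_foreach() routines now take pre and post hooks, so a caller such as racct_decay() can take racct_lock once around the whole walk instead of once per racct. A hypothetical consumer written in the same style (all names below are made up; it assumes the caller, like this file, can see racct_lock):

static void
example_sum_pre(void)
{

	mtx_lock(&racct_lock);
}

static void
example_sum_post(void)
{

	mtx_unlock(&racct_lock);
}

static void
example_sum_callback(struct racct *racct, void *arg2, void *arg3)
{
	int resource = *(int *)arg2;
	uint64_t *totalp = arg3;

	*totalp += racct->r_resources[resource];
}

static uint64_t
example_sum_loginclasses(int resource)
{
	uint64_t total = 0;

	/* Callback order: pre, then one call per login class, then post. */
	loginclass_racct_foreach(example_sum_callback, example_sum_pre,
	    example_sum_post, &resource, &total);
	return (total);
}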
*/ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); continue; } if (racct_pcpu_available(p) <= 0) racct_proc_throttle(p); else if (p->p_throttled) racct_proc_wakeup(p); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pause("-", hz); } } static struct kproc_desc racctd_kp = { "racctd", racctd, NULL }; static void racctd_init(void) { if (!racct_enable) return; kproc_start(&racctd_kp); } SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL); static void racct_init(void) { if (!racct_enable) return; racct_zone = uma_zcreate("racct", sizeof(struct racct), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* * XXX: Move this somewhere. */ prison0.pr_prison_racct = prison_racct_find("0"); } SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL); #endif /* !RACCT */ Index: head/sys/kern/kern_rctl.c =================================================================== --- head/sys/kern/kern_rctl.c (revision 290856) +++ head/sys/kern/kern_rctl.c (revision 290857) @@ -1,1951 +1,1972 @@ /*- * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 #define RCTL_MAX_INBUFSIZE 4 * 1024 #define RCTL_MAX_OUTBUFSIZE 16 * 1024 * 1024 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); /* * 'rctl_rule_link' connects a rule with every racct it's related to. 
* For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. */ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_link_zone; static uma_zone_t rctl_rule_zone; static struct rwlock rctl_lock; RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock"); static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } 
panic("rctl_resource_name: unknown resource %d", resource); } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { int resource; int64_t available = INT64_MAX; struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_LOCKED); resource = rule->rr_resource; switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: available = rule->rr_amount - p->p_racct->r_resources[resource]; break; case RCTL_SUBJECT_TYPE_USER: available = rule->rr_amount - cred->cr_ruidinfo->ui_racct->r_resources[resource]; break; case RCTL_SUBJECT_TYPE_LOGINCLASS: available = rule->rr_amount - cred->cr_loginclass->lc_racct->r_resources[resource]; break; case RCTL_SUBJECT_TYPE_JAIL: available = rule->rr_amount - cred->cr_prison->pr_prison_racct->prr_racct-> r_resources[resource]; break; default: panic("rctl_compute_available: unknown per %d", rule->rr_per); } return (available); } /* * Return non-zero if allocating 'amount' by proc 'p' would exceed * resource limit specified by 'rule'. */ static int rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule, int64_t amount) { int64_t available; ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_LOCKED); available = rctl_available_resource(p, rule); if (available >= amount) return (0); return (1); } /* * Special version of rctl_available() function for the %cpu resource. * We slightly cheat here and return less than we normally would. */ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; ASSERT_RACCT_ENABLED(); minavailable = INT64_MAX; limit = 0; rw_rlock(&rctl_lock); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } rw_runlock(&rctl_lock); /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more agressive * and lets us act sooner than the limits are already exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; int should_deny = 0; char *buf; static int curtime = 0; static struct timeval lasttime; ASSERT_RACCT_ENABLED(); rw_rlock(&rctl_lock); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (!rctl_would_exceed(p, rule, amount)) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. 
*/ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done(). */ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&lasttime, &curtime, 10)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); sbuf_printf(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify_f("RCTL", "rule", "matched", sbuf_data(&sb), M_NOWAIT); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } rw_runlock(&rctl_lock); if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFSIZ, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; ASSERT_RACCT_ENABLED(); rw_rlock(&rctl_lock); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } rw_runlock(&rctl_lock); return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; ASSERT_RACCT_ENABLED(); rw_rlock(&rctl_lock); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } rw_runlock(&rctl_lock); /* * XXX: Think about this _hard_. 
*/ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; rw_wlock(&rctl_lock); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); rw_wunlock(&rctl_lock); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rw_assert(&rctl_lock, RA_WLOCKED); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for a rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed. 
*/ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { int removed = 0; struct rctl_rule_link *link, *linktmp; ASSERT_RACCT_ENABLED(); rw_assert(&rctl_lock, RA_WLOCKED); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; ASSERT_RACCT_ENABLED(); rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; ASSERT_RACCT_ENABLED(); copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(©->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to be inaccessible. 
*/ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { int error = 0; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; struct rctl_rule *rule; id_t id; ASSERT_RACCT_ENABLED(); rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) goto out; } if (actionstr == 
NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) rule->rr_amount *= 1000000; } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense. Note that the one below * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU, * for example, is not deniable in the racct sense, but the * limit is enforced in a different way, so "deny" rules for %CPU * do make sense. */ if (rule->rr_action == RCTL_ACTION_DENY && (rule->rr_resource == RACCT_CPU || rule->rr_resource == RACCT_WALLCLOCK)) return (EOPNOTSUPP); if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) return (EOPNOTSUPP); /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In case of per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to. 
*/ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void +rctl_rule_pre_callback(void) +{ + + rw_wlock(&rctl_lock); +} + +static void +rctl_rule_post_callback(void) +{ + + rw_wunlock(&rctl_lock); +} + +static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; ASSERT_RACCT_ENABLED(); + rw_assert(&rctl_lock, RA_WLOCKED); - rw_wlock(&rctl_lock); found += rctl_racct_remove_rules(racct, filter); - rw_wunlock(&rctl_lock); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { int found = 0; struct proc *p; ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; rw_wlock(&rctl_lock); found = rctl_racct_remove_rules(p->p_racct, filter); rw_wunlock(&rctl_lock); if (found) return (0); return (ESRCH); } - loginclass_racct_foreach(rctl_rule_remove_callback, filter, - (void *)&found); - ui_racct_foreach(rctl_rule_remove_callback, filter, - (void *)&found); - prison_racct_foreach(rctl_rule_remove_callback, filter, - (void *)&found); + loginclass_racct_foreach(rctl_rule_remove_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, (void *)&found); + ui_racct_foreach(rctl_rule_remove_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, (void *)&found); + prison_racct_foreach(rctl_rule_remove_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); rw_wlock(&rctl_lock); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } rw_wunlock(&rctl_lock); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. 
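
The pre/post callback pair added above is the substance of the change in this file: instead of each per-racct callback taking and dropping rctl_lock itself, the foreach walkers are handed rctl_rule_pre_callback()/rctl_rule_post_callback() and invoke them once around the whole traversal (the updated ui_racct_foreach() further down shows the walker side). A stripped-down sketch of that shape, with placeholder names and no real locking, just to make the call ordering explicit:

#include <stdio.h>

/*
 * Illustrative only; "item", foreach_item() and the callbacks below are
 * placeholders, not kernel interfaces.  The point is the call ordering
 * introduced here: pre() once, the per-element callback for every element,
 * then post() once, so a single lock/unlock can bracket the whole walk
 * instead of being repeated inside the callback.
 */
struct item {
	struct item	*next;
	int		 value;
};

static void
foreach_item(struct item *head, void (*callback)(struct item *, void *),
    void (*pre)(void), void (*post)(void), void *arg)
{
	struct item *it;

	if (pre != NULL)
		(pre)();
	for (it = head; it != NULL; it = it->next)
		(callback)(it, arg);
	if (post != NULL)
		(post)();
}

static void
count_cb(struct item *it, void *arg)
{

	*(int *)arg += it->value;
}

static void
pre_cb(void)
{

	printf("lock once\n");
}

static void
post_cb(void)
{

	printf("unlock once\n");
}

int
main(void)
{
	struct item b = { NULL, 2 }, a = { &b, 1 };
	int sum = 0;

	foreach_item(&a, count_cb, pre_cb, post_cb, &sum);
	printf("sum = %d\n", sum);
	return (0);
}
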
*/ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; ASSERT_RACCT_ENABLED(); sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. */ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { int error; char *str; ASSERT_RACCT_ENABLED(); if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFSIZE) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. 
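
The serialized form produced by rctl_rule_to_sbuf() above mirrors the input grammar: subject:subject-id:resource:action=amount, with an optional /per suffix when rr_per differs from the subject type, and amounts that the kernel keeps in millionths scaled back down for display. A hedged userspace sketch of the same formatting step (the resource and action strings are sample values, not taken from the kernel name tables):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Sample rule: a login class "users", %CPU kept in millionths. */
	const char *subject = "loginclass", *subject_id = "users";
	const char *resource = "pcpu", *action = "deny";
	int64_t amount = 50000000;	/* raw value kept by the kernel */
	int in_millions = 1;		/* stands in for RACCT_IS_IN_MILLIONS() */
	char buf[128];

	if (in_millions)
		amount /= 1000000;
	snprintf(buf, sizeof(buf), "%s:%s:%s:%s=%jd", subject, subject_id,
	    resource, action, (intmax_t)amount);
	printf("%s\n", buf);		/* loginclass:users:pcpu:deny=50 */
	return (0);
}
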
*/ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; ASSERT_RACCT_ENABLED(); if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { int i; int64_t amount; struct sbuf *sb; ASSERT_RACCT_ENABLED(); sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; amount = racct->r_resources[i]; if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { int error; char *inputstr; struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; ASSERT_RACCT_ENABLED(); + rw_assert(&rctl_lock, RA_LOCKED); - rw_rlock(&rctl_lock); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } - rw_runlock(&rctl_lock); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { int error; size_t bufsize; char *inputstr, *buf; struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { sx_sunlock(&allproc_lock); return (E2BIG); } buf = 
malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); FOREACH_PROC_IN_SYSTEM(p) { rw_rlock(&rctl_lock); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } rw_runlock(&rctl_lock); } - loginclass_racct_foreach(rctl_get_rules_callback, filter, sb); - ui_racct_foreach(rctl_get_rules_callback, filter, sb); - prison_racct_foreach(rctl_get_rules_callback, filter, sb); + loginclass_racct_foreach(rctl_get_rules_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, sb); + ui_racct_foreach(rctl_get_rules_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, sb); + prison_racct_foreach(rctl_get_rules_callback, + rctl_rule_pre_callback, rctl_rule_post_callback, + filter, sb); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { int error; size_t bufsize; char *inputstr, *buf; struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); rw_rlock(&rctl_lock); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } rw_runlock(&rctl_lock); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". 
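
These handlers copy a filter string in and a comma-separated rule list out through caller-supplied buffers, returning ERANGE when the output does not fit and E2BIG when the caller asks for more than rctl_maxbufsize. Assuming the userland rctl_get_rules(2) wrapper declared in <sys/rctl.h> with the same in/out buffer signature as the handler above, a minimal caller might look like this (error handling trimmed; the filter string is just an example):

#include <sys/types.h>
#include <sys/rctl.h>

#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *filter = "user:1001";	/* sample filter */
	char out[16384];

	/*
	 * Assumed prototype, matching the kernel side above:
	 * rctl_get_rules(inbufp, inbuflen, outbufp, outbuflen).
	 * The input length covers the terminating NUL consumed by
	 * copyinstr(); an undersized output buffer yields ERANGE.
	 */
	if (rctl_get_rules(filter, strlen(filter) + 1, out, sizeof(out)) != 0)
		err(1, "rctl_get_rules");
	printf("%s\n", out);
	return (0);
}
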
*/ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { int error; struct rctl_rule *rule; char *inputstr; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. */ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { int error; struct rctl_rule *filter; char *inputstr; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { int rulecnt, i; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; LIST_HEAD(, rctl_rule_link) newrules; ASSERT_RACCT_ENABLED(); newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; rw_rlock(&rctl_lock); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; rw_runlock(&rctl_lock); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. 
*/ rw_wlock(&rctl_lock); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. */ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } rw_wunlock(&rctl_lock); return; } goaround: rw_wunlock(&rctl_lock); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { int error; struct rctl_rule_link *link; struct rctl_rule *rule; LIST_INIT(&child->p_racct->r_rule_links); ASSERT_RACCT_ENABLED(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); rw_wlock(&rctl_lock); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } rw_wunlock(&rctl_lock); return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } rw_wunlock(&rctl_lock); return (EAGAIN); } /* * Release rules attached to the racct. 
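
rctl_proc_ucred_changed() above follows a count, allocate, verify pattern: count the needed links under the read lock, drop the lock so the allocations can sleep (M_WAITOK), then re-take the write lock and start over via the goaround label if the rule lists changed in the meantime. A simplified standalone sketch of that pattern (pthread locking and a plain array stand in for rctl_lock and the link lists; not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int nitems = 3;			/* stands in for the rule lists */
static int items[16] = { 10, 20, 30 };

int
main(void)
{
	int *copy;
	int count, i;

again:
	/* 1. Count under the (read) lock. */
	pthread_rwlock_rdlock(&lock);
	count = nitems;
	pthread_rwlock_unlock(&lock);

	/* 2. Allocate with the lock dropped, so the allocation may block. */
	copy = malloc(count * sizeof(*copy));
	if (copy == NULL)
		abort();

	/* 3. Re-take the lock and verify nothing changed; otherwise retry. */
	pthread_rwlock_wrlock(&lock);
	if (count != nitems) {
		pthread_rwlock_unlock(&lock);
		free(copy);
		goto again;
	}
	for (i = 0; i < count; i++)
		copy[i] = items[i];
	pthread_rwlock_unlock(&lock);

	for (i = 0; i < count; i++)
		printf("%d\n", copy[i]);
	free(copy);
	return (0);
}
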
*/ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); rw_wlock(&rctl_lock); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } rw_wunlock(&rctl_lock); } static void rctl_init(void) { if (!racct_enable) return; rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } #else /* !RCTL */ int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* !RCTL */ Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 290856) +++ head/sys/kern/kern_resource.c (revision 290857) @@ -1,1438 +1,1443 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct rwlock uihashtbl_lock; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); static int donice(struct thread *td, struct proc *chgp, int n); static struct uidinfo *uilookup(uid_t uid); static void ruxagg_locked(struct rusage_ext *rux, struct thread *td); static __inline int lim_shared(struct plimit *limp); /* * Resource controls and accounting. */ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif int sys_getpriority(struct thread *td, register struct getpriority_args *uap) { struct proc *p; struct pgrp *pg; int error, low; error = 0; low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) low = td->td_proc->p_nice; else { p = pfind(uap->who); if (p == NULL) break; if (p_cansee(td, p) == 0) low = p->p_nice; PROC_UNLOCK(p); } break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0 && p->p_ucred->cr_uid == uap->who) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (low == PRIO_MAX + 1 && error == 0) error = ESRCH; td->td_retval[0] = low; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif int sys_setpriority(struct thread *td, struct setpriority_args *uap) { struct proc *curp, *p; struct pgrp *pg; int found = 0, error = 0; curp = td->td_proc; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) { PROC_LOCK(curp); error = donice(td, curp, uap->prio); PROC_UNLOCK(curp); } else { p = pfind(uap->who); if (p == NULL) break; error = p_cansee(td, p); if (error == 0) error = donice(td, p, uap->prio); PROC_UNLOCK(p); } found++; break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p->p_ucred->cr_uid == uap->who && p_cansee(td, p) == 0) { error = donice(td, p, uap->prio); 
found++; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (found == 0 && error == 0) error = ESRCH; return (error); } /* * Set "nice" for a (whole) process. */ static int donice(struct thread *td, struct proc *p, int n) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = p_cansched(td, p))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); sched_nice(p, n); return (0); } static int unprivileged_idprio; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_idprio, CTLFLAG_RW, &unprivileged_idprio, 0, "Allow non-root users to set an idle priority"); /* * Set realtime priority for LWP. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_thread_args { int function; lwpid_t lwpid; struct rtprio *rtp; }; #endif int sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap) { struct proc *p; struct rtprio rtp; struct thread *td1; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->lwpid == 0 || uap->lwpid == td->td_tid) { p = td->td_proc; td1 = td; PROC_LOCK(p); } else { /* Only look up thread in current process */ td1 = tdfind(uap->lwpid, curproc->p_pid); if (td1 == NULL) return (ESRCH); p = td1->td_proc; } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; pri_to_rtp(td1, &rtp); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ /* * Realtime priority has to be restricted for reasons which * should be obvious. However, for idleprio processes, there is * a potential for system deadlock if an idleprio process gains * a lock on a resource that other processes need (and the * idleprio process can't run due to a CPU-bound normal * process). Fix me! XXX * * This problem is not only related to idleprio process. * A user level program can obtain a file lock and hold it * indefinitely. Additionally, without idleprio processes it is * still conceivable that a program with low priority will never * get to run. In short, allowing this feature might make it * easier to lock a resource indefinitely, but it is not the * only thing that makes it possible. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME || (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && unprivileged_idprio == 0)) { error = priv_check(td, PRIV_SCHED_RTPRIO); if (error) break; } error = rtp_to_pri(&rtp, td1); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } /* * Set realtime priority. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif int sys_rtprio(struct thread *td, register struct rtprio_args *uap) { struct proc *p; struct thread *tdp; struct rtprio rtp; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; /* * Return OUR priority if no pid specified, * or if one is, report the highest priority * in the process. 
There isn't much more you can do as * there is only room to return a single priority. * Note: specifying our own pid is not the same * as leaving it zero. */ if (uap->pid == 0) { pri_to_rtp(td, &rtp); } else { struct rtprio rtp2; rtp.type = RTP_PRIO_IDLE; rtp.prio = RTP_PRIO_MAX; FOREACH_THREAD_IN_PROC(p, tdp) { pri_to_rtp(tdp, &rtp2); if (rtp2.type < rtp.type || (rtp2.type == rtp.type && rtp2.prio < rtp.prio)) { rtp.type = rtp2.type; rtp.prio = rtp2.prio; } } } PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* * Disallow setting rtprio in most cases if not superuser. * See the comment in sys_rtprio_thread about idprio * threads holding a lock. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME || (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && !unprivileged_idprio)) { error = priv_check(td, PRIV_SCHED_RTPRIO); if (error) break; } /* * If we are setting our own priority, set just our * thread but if we are doing another process, * do all the threads on that process. If we * specify our own pid we do the latter. */ if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { FOREACH_THREAD_IN_PROC(p, td) { if ((error = rtp_to_pri(&rtp, td)) != 0) break; } } break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } int rtp_to_pri(struct rtprio *rtp, struct thread *td) { u_char newpri, oldclass, oldpri; switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_REALTIME + rtp->prio; break; case RTP_PRIO_NORMAL: if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) return (EINVAL); newpri = PRI_MIN_TIMESHARE + rtp->prio; break; case RTP_PRIO_IDLE: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_IDLE + rtp->prio; break; default: return (EINVAL); } thread_lock(td); oldclass = td->td_pri_class; sched_class(td, rtp->type); /* XXX fix */ oldpri = td->td_user_pri; sched_user_prio(td, newpri); if (td->td_user_pri != oldpri && (oldclass != RTP_PRIO_NORMAL || td->td_pri_class != RTP_PRIO_NORMAL)) sched_prio(td, td->td_user_pri); if (TD_ON_UPILOCK(td) && oldpri != newpri) { critical_enter(); thread_unlock(td); umtx_pi_adjust(td, oldpri); critical_exit(); } else thread_unlock(td); return (0); } void pri_to_rtp(struct thread *td, struct rtprio *rtp) { thread_lock(td); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; break; case PRI_TIMESHARE: rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE; break; case PRI_IDLE: rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE; break; default: break; } rtp->type = td->td_pri_class; thread_unlock(td); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int osetrlimit(struct thread *td, register struct osetrlimit_args *uap) { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; error = kern_setrlimit(td, uap->which, &lim); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int ogetrlimit(struct thread *td, register struct ogetrlimit_args *uap) { struct orlimit olim; struct rlimit rl; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rl); /* * XXX would be more correct to convert only RLIM_INFINITY to the * old 
RLIM_INFINITY and fail with EOVERFLOW for other larger * values. Most 64->32 and 32->16 conversions, including not * unimportant ones of uids are even more broken than what we * do here (they blindly truncate). We don't do this correctly * here since we have little experience with EOVERFLOW yet. * Elsewhere, getuid() can't fail... */ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur; olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max; error = copyout(&olim, uap->rlp, sizeof(olim)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif int sys_setrlimit(struct thread *td, register struct __setrlimit_args *uap) { struct rlimit alim; int error; if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit)))) return (error); error = kern_setrlimit(td, uap->which, &alim); return (error); } static void lim_cb(void *arg) { struct rlimit rlim; struct thread *td; struct proc *p; p = arg; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if the process exceeds its cpu resource allocation. If * it reaches the max, arrange to kill the process in ast(). */ if (p->p_cpulimit == RLIM_INFINITY) return; PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); } PROC_STATUNLOCK(p); if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) { lim_rlimit_proc(p, RLIMIT_CPU, &rlim); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim.rlim_max) p->p_cpulimit += 5; kern_psignal(p, SIGXCPU); } } if ((p->p_flag & P_WEXIT) == 0) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); } int kern_setrlimit(struct thread *td, u_int which, struct rlimit *limp) { return (kern_proc_setrlimit(td, td->td_proc, which, limp)); } int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp) { struct plimit *newlim, *oldlim; register struct rlimit *alimp; struct rlimit oldssiz; int error; if (which >= RLIM_NLIMITS) return (EINVAL); /* * Preserve historical bugs by treating negative limits as unsigned. 
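
lim_cb() above is what a process observes from userland as the CPU limit: once accumulated CPU time passes the soft RLIMIT_CPU value the process receives SIGXCPU (and the enforced limit is nudged up by five seconds so the signal can repeat), while passing the hard limit kills it. A small illustrative program exercising the soft-limit side through the standard setrlimit(2) interface (the one-second limit and the busy loop are only for demonstration):

#include <sys/resource.h>

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

static volatile sig_atomic_t got_xcpu;

static void
xcpu_handler(int sig)
{

	got_xcpu = 1;
}

int
main(void)
{
	struct rlimit rl;

	signal(SIGXCPU, xcpu_handler);

	/* Soft limit of 1 CPU second, generous hard limit. */
	rl.rlim_cur = 1;
	rl.rlim_max = 10;
	if (setrlimit(RLIMIT_CPU, &rl) != 0) {
		perror("setrlimit");
		exit(1);
	}

	/* Burn CPU until the kernel delivers SIGXCPU. */
	while (!got_xcpu)
		;
	printf("received SIGXCPU after exceeding the soft RLIMIT_CPU\n");
	return (0);
}
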
*/ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; oldssiz.rlim_cur = 0; newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; alimp = &oldlim->pl_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) { PROC_UNLOCK(p); lim_free(newlim); return (error); } if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; lim_copy(newlim, oldlim); alimp = &newlim->pl_rlimit[which]; switch (which) { case RLIMIT_CPU: if (limp->rlim_cur != RLIM_INFINITY && p->p_cpulimit == RLIM_INFINITY) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); p->p_cpulimit = limp->rlim_cur; break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) limp->rlim_cur = maxdsiz; if (limp->rlim_max > maxdsiz) limp->rlim_max = maxdsiz; break; case RLIMIT_STACK: if (limp->rlim_cur > maxssiz) limp->rlim_cur = maxssiz; if (limp->rlim_max > maxssiz) limp->rlim_max = maxssiz; oldssiz = *alimp; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(&oldssiz, RLIMIT_STACK); break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; if (limp->rlim_cur < 1) limp->rlim_cur = 1; if (limp->rlim_max < 1) limp->rlim_max = 1; break; } if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(limp, which); *alimp = *limp; p->p_limit = newlim; PROC_UPDATE_COW(p); PROC_UNLOCK(p); lim_free(oldlim); if (which == RLIMIT_STACK && /* * Skip calls from exec_new_vmspace(), done when stack is * not mapped yet. */ (td != curthread || (p->p_flag & P_INEXEC) == 0)) { /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. */ if (limp->rlim_cur != oldssiz.rlim_cur) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > oldssiz.rlim_cur) { prot = p->p_sysent->sv_stackprot; size = limp->rlim_cur - oldssiz.rlim_cur; addr = p->p_sysent->sv_usrstack - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = oldssiz.rlim_cur - limp->rlim_cur; addr = p->p_sysent->sv_usrstack - oldssiz.rlim_cur; } addr = trunc_page(addr); size = round_page(size); (void)vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE); } } return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int sys_getrlimit(struct thread *td, register struct __getrlimit_args *uap) { struct rlimit rlim; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rlim); error = copyout(&rlim, uap->rlp, sizeof(struct rlimit)); return (error); } /* * Transform the running time and tick information for children of proc p * into user and system time usage. */ void calccru(struct proc *p, struct timeval *up, struct timeval *sp) { PROC_LOCK_ASSERT(p, MA_OWNED); calcru1(p, &p->p_crux, up, sp); } /* * Transform the running time and tick information in proc p into user * and system time usage. If appropriate, include the current time slice * on this CPU. 
*/ void calcru(struct proc *p, struct timeval *up, struct timeval *sp) { struct thread *td; uint64_t runtime, u; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_STATLOCK_ASSERT(p, MA_OWNED); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ td = curthread; if (td->td_proc == p) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } /* Make sure the per-thread stats are current. */ FOREACH_THREAD_IN_PROC(p, td) { if (td->td_incruntime == 0) continue; ruxagg(p, td); } calcru1(p, &p->p_rux, up, sp); } /* Collect resource usage for a single thread. */ void rufetchtd(struct thread *td, struct rusage *ru) { struct proc *p; uint64_t runtime, u; p = td->td_proc; PROC_STATLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we are getting stats for the current thread, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ if (td == curthread) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } ruxagg(p, td); *ru = td->td_ru; calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime); } static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp) { /* {user, system, interrupt, total} {ticks, usec}: */ uint64_t ut, uu, st, su, it, tt, tu; ut = ruxp->rux_uticks; st = ruxp->rux_sticks; it = ruxp->rux_iticks; tt = ut + st + it; if (tt == 0) { /* Avoid divide by zero */ st = 1; tt = 1; } tu = cputick2usec(ruxp->rux_runtime); if ((int64_t)tu < 0) { /* XXX: this should be an assert /phk */ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n", (intmax_t)tu, p->p_pid, p->p_comm); tu = ruxp->rux_tu; } if (tu >= ruxp->rux_tu) { /* * The normal case, time increased. * Enforce monotonicity of bucketed numbers. */ uu = (tu * ut) / tt; if (uu < ruxp->rux_uu) uu = ruxp->rux_uu; su = (tu * st) / tt; if (su < ruxp->rux_su) su = ruxp->rux_su; } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) { /* * When we calibrate the cputicker, it is not uncommon to * see the presumably fixed frequency increase slightly over * time as a result of thermal stabilization and NTP * discipline (of the reference clock). We therefore ignore * a bit of backwards slop because we expect to catch up * shortly. We use a 3 microsecond limit to catch low * counts and a 1% limit for high counts. */ uu = ruxp->rux_uu; su = ruxp->rux_su; tu = ruxp->rux_tu; } else { /* tu < ruxp->rux_tu */ /* * What happened here was likely that a laptop, which ran at * a reduced clock frequency at boot, kicked into high gear. * The wisdom of spamming this message in that case is * dubious, but it might also be indicative of something * serious, so lets keep it and hope laptops can be made * more truthful about their CPU speed via ACPI. 
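
calcru1() above splits the total measured runtime tu (microseconds derived from cputicks) across user and system time in proportion to the statclock tick counts, then clamps each bucket so the values reported earlier never run backwards. A worked sketch of the proportional split with sample numbers (illustrative only):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Sample tick counts and total runtime in microseconds. */
	uint64_t ut = 700, st = 250, it = 50;	/* user/sys/intr ticks */
	uint64_t tt = ut + st + it;		/* 1000 ticks total */
	uint64_t tu = 2000000;			/* 2 s of measured runtime */
	uint64_t uu, su;

	/* Same proportional split as calcru1(). */
	uu = (tu * ut) / tt;			/* 1400000 us of user time */
	su = (tu * st) / tt;			/*  500000 us of system time */

	printf("user %ju us, system %ju us, rest %ju us\n",
	    (uintmax_t)uu, (uintmax_t)su, (uintmax_t)(tu - uu - su));
	return (0);
}
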
*/ printf("calcru: runtime went backwards from %ju usec " "to %ju usec for pid %d (%s)\n", (uintmax_t)ruxp->rux_tu, (uintmax_t)tu, p->p_pid, p->p_comm); uu = (tu * ut) / tt; su = (tu * st) / tt; } ruxp->rux_uu = uu; ruxp->rux_su = su; ruxp->rux_tu = tu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif int sys_getrusage(register struct thread *td, register struct getrusage_args *uap) { struct rusage ru; int error; error = kern_getrusage(td, uap->who, &ru); if (error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_getrusage(struct thread *td, int who, struct rusage *rup) { struct proc *p; int error; error = 0; p = td->td_proc; PROC_LOCK(p); switch (who) { case RUSAGE_SELF: rufetchcalc(p, rup, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_CHILDREN: *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_THREAD: PROC_STATLOCK(p); thread_lock(td); rufetchtd(td, rup); thread_unlock(td); PROC_STATUNLOCK(p); break; default: error = EINVAL; } PROC_UNLOCK(p); return (error); } void rucollect(struct rusage *ru, struct rusage *ru2) { long *ip, *ip2; int i; if (ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2) { rux->rux_runtime += rux2->rux_runtime; rux->rux_uticks += rux2->rux_uticks; rux->rux_sticks += rux2->rux_sticks; rux->rux_iticks += rux2->rux_iticks; rux->rux_uu += rux2->rux_uu; rux->rux_su += rux2->rux_su; rux->rux_tu += rux2->rux_tu; rucollect(ru, ru2); } /* * Aggregate tick counts into the proc's rusage_ext. */ static void ruxagg_locked(struct rusage_ext *rux, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); PROC_STATLOCK_ASSERT(td->td_proc, MA_OWNED); rux->rux_runtime += td->td_incruntime; rux->rux_uticks += td->td_uticks; rux->rux_sticks += td->td_sticks; rux->rux_iticks += td->td_iticks; } void ruxagg(struct proc *p, struct thread *td) { thread_lock(td); ruxagg_locked(&p->p_rux, td); ruxagg_locked(&td->td_rux, td); td->td_incruntime = 0; td->td_uticks = 0; td->td_iticks = 0; td->td_sticks = 0; thread_unlock(td); } /* * Update the rusage_ext structure and fetch a valid aggregate rusage * for proc p if storage for one is supplied. */ void rufetch(struct proc *p, struct rusage *ru) { struct thread *td; PROC_STATLOCK_ASSERT(p, MA_OWNED); *ru = p->p_ru; if (p->p_numthreads > 0) { FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); rucollect(ru, &td->td_ru); } } } /* * Atomically perform a rufetch and a calcru together. * Consumers, can safely assume the calcru is executed only once * rufetch is completed. */ void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp) { PROC_STATLOCK(p); rufetch(p, ru); calcru(p, up, sp); PROC_STATUNLOCK(p); } /* * Allocate a new resource limits structure and initialize its * reference count and mutex pointer. 
*/ struct plimit * lim_alloc() { struct plimit *limp; limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK); refcount_init(&limp->pl_refcnt, 1); return (limp); } struct plimit * lim_hold(struct plimit *limp) { refcount_acquire(&limp->pl_refcnt); return (limp); } static __inline int lim_shared(struct plimit *limp) { return (limp->pl_refcnt > 1); } void lim_fork(struct proc *p1, struct proc *p2) { PROC_LOCK_ASSERT(p1, MA_OWNED); PROC_LOCK_ASSERT(p2, MA_OWNED); p2->p_limit = lim_hold(p1->p_limit); callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0); if (p1->p_cpulimit != RLIM_INFINITY) callout_reset_sbt(&p2->p_limco, SBT_1S, 0, lim_cb, p2, C_PREL(1)); } void lim_free(struct plimit *limp) { if (refcount_release(&limp->pl_refcnt)) free((void *)limp, M_PLIMIT); } /* * Make a copy of the plimit structure. * We share these structures copy-on-write after fork. */ void lim_copy(struct plimit *dst, struct plimit *src) { KASSERT(!lim_shared(dst), ("lim_copy to shared limit")); bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit)); } /* * Return the hard limit for a particular system resource. The * which parameter specifies the index into the rlimit array. */ rlim_t lim_max(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_max); } rlim_t lim_max_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_max); } /* * Return the current (soft) limit for a particular system resource. * The which parameter which specifies the index into the rlimit array */ rlim_t lim_cur(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_cur); } rlim_t lim_cur_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_cur); } /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'. */ void lim_rlimit(struct thread *td, int which, struct rlimit *rlp) { struct proc *p = td->td_proc; MPASS(td == curthread); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = td->td_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp) { PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = p->p_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void uihashinit() { uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); rw_init(&uihashtbl_lock, "uidinfo hash"); } /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_lock must be locked. * Increase refcount on uidinfo struct returned. */ static struct uidinfo * uilookup(uid_t uid) { struct uihashhead *uipp; struct uidinfo *uip; rw_assert(&uihashtbl_lock, RA_LOCKED); uipp = UIHASH(uid); LIST_FOREACH(uip, uipp, ui_hash) if (uip->ui_uid == uid) { uihold(uip); break; } return (uip); } /* * Find or allocate a struct uidinfo for a particular uid. * Returns with uidinfo struct referenced. * uifree() should be called on a struct uidinfo when released. 
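
The plimit handling above is a plain refcounted copy-on-write scheme: lim_hold() bumps pl_refcnt when a child inherits the parent's limits, lim_shared() reports whether anyone else still points at the structure, and a writer builds a private copy with lim_alloc()/lim_copy() before modifying it (as kern_proc_setrlimit() does further up). A compact generic sketch of the same idea, with placeholder names and no locking:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for struct plimit; not a kernel type. */
struct limits {
	int	refcnt;
	long	values[4];
};

static struct limits *
limits_alloc(void)
{
	struct limits *lp = calloc(1, sizeof(*lp));

	lp->refcnt = 1;
	return (lp);
}

static struct limits *
limits_hold(struct limits *lp)
{

	lp->refcnt++;		/* the kernel uses refcount_acquire() */
	return (lp);
}

static void
limits_free(struct limits *lp)
{

	if (--lp->refcnt == 0)	/* refcount_release() in the kernel */
		free(lp);
}

/* Return a private copy the caller may modify, releasing the shared one. */
static struct limits *
limits_cow(struct limits *lp)
{
	struct limits *new;

	if (lp->refcnt == 1)	/* not shared: modify in place */
		return (lp);
	new = limits_alloc();
	memcpy(new->values, lp->values, sizeof(new->values));
	limits_free(lp);
	return (new);
}

int
main(void)
{
	struct limits *parent = limits_alloc();
	struct limits *child = limits_hold(parent);	/* shared after "fork" */

	child = limits_cow(child);	/* writer gets its own copy */
	child->values[0] = 42;
	printf("parent %ld, child %ld\n", parent->values[0], child->values[0]);
	limits_free(parent);
	limits_free(child);
	return (0);
}
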
*/ struct uidinfo * uifind(uid_t uid) { struct uidinfo *new_uip, *uip; rw_rlock(&uihashtbl_lock); uip = uilookup(uid); rw_runlock(&uihashtbl_lock); if (uip != NULL) return (uip); new_uip = malloc(sizeof(*new_uip), M_UIDINFO, M_WAITOK | M_ZERO); racct_create(&new_uip->ui_racct); refcount_init(&new_uip->ui_ref, 1); new_uip->ui_uid = uid; mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF); rw_wlock(&uihashtbl_lock); /* * There's a chance someone created our uidinfo while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate uidinfo. */ if ((uip = uilookup(uid)) == NULL) { LIST_INSERT_HEAD(UIHASH(uid), new_uip, ui_hash); rw_wunlock(&uihashtbl_lock); uip = new_uip; } else { rw_wunlock(&uihashtbl_lock); racct_destroy(&new_uip->ui_racct); mtx_destroy(&new_uip->ui_vmsize_mtx); free(new_uip, M_UIDINFO); } return (uip); } /* * Place another refcount on a uidinfo struct. */ void uihold(struct uidinfo *uip) { refcount_acquire(&uip->ui_ref); } /*- * Since uidinfo structs have a long lifetime, we use an * opportunistic refcounting scheme to avoid locking the lookup hash * for each release. * * If the refcount hits 0, we need to free the structure, * which means we need to lock the hash. * Optimal case: * After locking the struct and lowering the refcount, if we find * that we don't need to free, simply unlock and return. * Suboptimal case: * If refcount lowering results in need to free, bump the count * back up, lose the lock and acquire the locks in the proper * order to try again. */ void uifree(struct uidinfo *uip) { int old; /* Prepare for optimal case. */ old = uip->ui_ref; if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1)) return; /* Prepare for suboptimal case. */ rw_wlock(&uihashtbl_lock); if (refcount_release(&uip->ui_ref) == 0) { rw_wunlock(&uihashtbl_lock); return; } racct_destroy(&uip->ui_racct); LIST_REMOVE(uip, ui_hash); rw_wunlock(&uihashtbl_lock); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %ld\n", uip->ui_uid, uip->ui_sbsize); if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); if (uip->ui_vmsize != 0) printf("freeing uidinfo: uid = %d, swapuse = %lld\n", uip->ui_uid, (unsigned long long)uip->ui_vmsize); mtx_destroy(&uip->ui_vmsize_mtx); free(uip, M_UIDINFO); } #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, - void *arg2, void *arg3), void *arg2, void *arg3) + void *arg2, void *arg3), void (*pre)(void), void (*post)(void), + void *arg2, void *arg3) { struct uidinfo *uip; struct uihashhead *uih; rw_rlock(&uihashtbl_lock); + if (pre != NULL) + (pre)(); for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) { LIST_FOREACH(uip, uih, ui_hash) { (callback)(uip->ui_racct, arg2, arg3); } } + if (post != NULL) + (post)(); rw_runlock(&uihashtbl_lock); } #endif static inline int chglimit(struct uidinfo *uip, long *limit, int diff, rlim_t max, const char *name) { /* Don't allow them to exceed max, but allow subtraction. */ if (diff > 0 && max != 0) { if (atomic_fetchadd_long(limit, (long)diff) + diff > max) { atomic_subtract_long(limit, (long)diff); return (0); } } else { atomic_add_long(limit, (long)diff); if (*limit < 0) printf("negative %s for uid = %d\n", name, uip->ui_uid); } return (1); } /* * Change the count associated with number of processes * a given user is using. 
When 'max' is 0, don't enforce a limit */ int chgproccnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_proccnt, diff, max, "proccnt")); } /* * Change the total socket buffer size a user has used. */ int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t max) { int diff, rv; diff = to - *hiwat; if (diff > 0 && max == 0) { rv = 0; } else { rv = chglimit(uip, &uip->ui_sbsize, diff, max, "sbsize"); if (rv != 0) *hiwat = to; } return (rv); } /* * Change the count associated with number of pseudo-terminals * a given user is using. When 'max' is 0, don't enforce a limit */ int chgptscnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_ptscnt, diff, max, "ptscnt")); } int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_kqcnt, diff, max, "kqcnt")); }
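
chglimit() and the wrappers above avoid taking a lock for per-uid accounting: they add the delta with an atomic fetch-add, and if that pushed the counter over the limit they atomically subtract it back out and report failure. A self-contained sketch of the same check using C11 atomics (the names are placeholders, not kernel interfaces):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long counter;

/*
 * Add "diff" to the counter unless doing so would exceed "max"
 * (max == 0 means unlimited), mirroring the fetch-add / back-out
 * dance in chglimit().  Returns 1 on success, 0 if over the limit.
 */
static int
try_charge(long diff, long max)
{
	long old;

	if (diff > 0 && max != 0) {
		old = atomic_fetch_add(&counter, diff);
		if (old + diff > max) {
			atomic_fetch_sub(&counter, diff);
			return (0);
		}
		return (1);
	}
	atomic_fetch_add(&counter, diff);
	return (1);
}

int
main(void)
{

	printf("%d\n", try_charge(5, 8));	/* 1: 5 <= 8 */
	printf("%d\n", try_charge(5, 8));	/* 0: would reach 10 > 8 */
	printf("%d\n", try_charge(-2, 8));	/* 1: subtraction always allowed */
	printf("%ld\n", atomic_load(&counter));	/* 3 */
	return (0);
}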