diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index e505e9bf1276..f1c81d8813bd 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -1,4616 +1,4850 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define PRISON0_HOSTUUID_MODULE "hostuuid" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_state = PRISON_STATE_ALIVE, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL_STATIC, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); struct bool_flags { const char *name; const char *noname; volatile u_int flag; }; struct jailsys_flags { const char *name; unsigned disable; unsigned new; }; /* allprison, allprison_racct and lastprid are protected by allprison_lock. 
*/ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int get_next_prid(struct prison **insprp); static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); static int prison_lock_xlock(struct prison *pr, int flags); static void prison_free_not_last(struct prison *pr); static void prison_proc_free_not_last(struct prison *pr); static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable); static char *prison_path(struct prison *pr1, struct prison *pr2); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ #define PD_DEUREF 0x02 /* Decrement pr_uref */ #define PD_KILL 0x04 /* Remove jail, kill processes, etc */ #define PD_LOCKED 0x10 /* pr_mtx is held */ #define PD_LIST_SLOCKED 0x20 /* allprison_lock is held shared */ #define PD_LIST_XLOCKED 0x40 /* allprison_lock is held exclusive */ #define PD_OP_FLAGS 0x07 /* Operation flags */ #define PD_LOCK_FLAGS 0x70 /* Lock status flags */ /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static struct bool_flags pr_flag_bool[] = { {"persist", "nopersist", PR_PERSIST}, #ifdef INET {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL}, #endif #ifdef INET6 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL}, #endif }; const size_t pr_flag_bool_size = sizeof(pr_flag_bool); static struct jailsys_flags pr_flag_jailsys[] = { {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, #endif #ifdef INET6 {"ip6", PR_IP6_USER, PR_IP6_USER}, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); /* * Make this array full-size so dynamic parameters can be added. * It is protected by prison0.mtx, but lockless reading is allowed * with an atomic check of the flag values. 
*/ static struct bool_flags pr_flag_allow[NBBY * NBPW] = { {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME}, {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC}, {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS}, {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS}, {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT}, {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS}, {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF}, {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK}, {"allow.reserved_ports", "allow.noreserved_ports", PR_ALLOW_RESERVED_PORTS}, {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF}, {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug", PR_ALLOW_UNPRIV_DEBUG}, {"allow.suser", "allow.nosuser", PR_ALLOW_SUSER}, }; static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC; const size_t pr_flag_allow_size = sizeof(pr_flag_allow); #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \ PR_ALLOW_RESERVED_PORTS | \ PR_ALLOW_UNPRIV_DEBUG | \ PR_ALLOW_SUSER) #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { uint8_t *file, *data; size_t size; char buf[sizeof(prison0.pr_hostuuid)]; bool valid; prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); /* If we have a preloaded hostuuid, use it. */ file = preload_search_by_type(PRISON0_HOSTUUID_MODULE); if (file != NULL) { data = preload_fetch_addr(file); size = preload_fetch_size(file); if (data != NULL) { /* * The preloaded data may include trailing whitespace, almost * certainly a newline; skip over any whitespace or * non-printable characters to be safe. */ while (size > 0 && data[size - 1] <= 0x20) { size--; } valid = false; /* * Not NUL-terminated when passed from loader, but * validate_uuid requires that due to using sscanf (as * does the subsequent strlcpy, since it still reads * past the given size to return the true length); * bounce to a temporary buffer to fix. */ if (size >= sizeof(buf)) goto done; memcpy(buf, data, size); buf[size] = '\0'; if (validate_uuid(buf, size, NULL, 0) != 0) goto done; valid = true; (void)strlcpy(prison0.pr_hostuuid, buf, sizeof(prison0.pr_hostuuid)); done: if (bootverbose && !valid) { printf("hostuuid: preload data malformed: '%.*s'\n", (int)size, data); } } } if (bootverbose) printf("hostuuid: using %s\n", prison0.pr_hostuuid); } /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. 
*/ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + nitems(pr_flag_allow) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; struct bool_flags *bf; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { optiov[opt.uio_iovcnt].iov_base = __DECONST(char *, (jail_default_allow & bf->flag) ? bf->name : bf->noname); optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 
1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= nitems(optiov), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. 
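
As context for the even-iovcnt check below: jail_set(2) options travel as name/value iovec pairs, and kern_jail() above builds exactly such pairs in optiov[]. A minimal userland sketch of the same layout, not part of this patch; the helper name, jail name and path are placeholders:

#include <sys/param.h>
#include <sys/jail.h>
#include <sys/uio.h>
#include <string.h>

static void
set_pair(struct iovec *iov, const char *name, void *value, size_t len)
{
	iov[0].iov_base = __DECONST(char *, name);
	iov[0].iov_len = strlen(name) + 1;
	iov[1].iov_base = value;
	iov[1].iov_len = len;
}

int
example_create_jail(void)
{
	struct iovec iov[6];
	char name[] = "example";	/* placeholder jail name */
	char path[] = "/";		/* placeholder root path */

	set_pair(iov + 0, "name", name, sizeof(name));
	set_pair(iov + 2, "path", path, sizeof(path));
	set_pair(iov + 4, "persist", NULL, 0);	/* boolean: presence is enough */

	/* Two iovecs per parameter, hence the even-iovcnt requirement. */
	return (jail_set(iov, 6, JAIL_CREATE));
}
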
*/ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } +#if defined(INET) || defined(INET6) +typedef int prison_addr_cmp_t(const void *, const void *); +typedef bool prison_addr_valid_t(const void *); +static const struct pr_family { + size_t size; + prison_addr_cmp_t *cmp; + prison_addr_valid_t *valid; + int ip_flag; +} pr_families[PR_FAMILY_MAX] = { +#ifdef INET + [PR_INET] = { + .size = sizeof(struct in_addr), + .cmp = prison_qcmp_v4, + .valid = prison_valid_v4, + .ip_flag = PR_IP4_USER, + }, +#endif +#ifdef INET6 + [PR_INET6] = { + .size = sizeof(struct in6_addr), + .cmp = prison_qcmp_v6, + .valid = prison_valid_v6, + .ip_flag = PR_IP6_USER, + }, +#endif +}; + +/* + * Network address lists (pr_addrs) allocation for jails. The addresses + * are accessed locklessly by the network stack, thus need to be protected by + * the network epoch. + */ +struct prison_ip { + struct epoch_context ctx; + uint32_t ips; +#ifdef FUTURE_C + union { + struct in_addr pr_ip4[]; + struct in6_addr pr_ip6[]; + }; +#else /* No future C :( */ +#define PR_IP(pip, i) ((const char *)((pip) + 1) + pr_families[af].size * (i)) +#define PR_IPD(pip, i) ((char *)((pip) + 1) + pr_families[af].size * (i)) +#endif +}; + +static struct prison_ip * +prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags) +{ + struct prison_ip *pip; + + pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size, + M_PRISON, flags); + if (pip != NULL) + pip->ips = cnt; + return (pip); +} + +/* + * Allocate and copyin user supplied address list, sorting and validating. + * kern_jail_set() helper. + */ +static struct prison_ip * +prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt) +{ + prison_addr_cmp_t *const cmp = pr_families[af].cmp; + const size_t size = pr_families[af].size; + struct prison_ip *pip; + + pip = prison_ip_alloc(af, cnt, M_WAITOK); + bcopy(op, pip + 1, cnt * size); + /* + * IP addresses are all sorted but ip[0] to preserve + * the primary IP address as given from userland. + * This special IP is used for unbound outgoing + * connections as well for "loopback" traffic in case + * source address selection cannot find any more fitting + * address to connect from. + */ + if (cnt > 1) + qsort((char *)(pip + 1) + size, cnt - 1, size, + pr_families[af].cmp); + /* + * Check for duplicate addresses and do some simple + * zero and broadcast checks. If users give other bogus + * addresses it is their problem. + */ + for (int i = 0; i < cnt; i++) { + if (!pr_families[af].valid(PR_IP(pip, i))) { + free(pip, M_PRISON); + return (NULL); + } + if (i + 1 < cnt && + (cmp(PR_IP(pip, 0), PR_IP(pip, i + 1)) == 0 || + cmp(PR_IP(pip, i), PR_IP(pip, i + 1)) == 0)) { + free(pip, M_PRISON); + return (NULL); + } + } + + return (pip); +} + +/* + * Allocate and dup parent prison address list. + * kern_jail_set() helper. + */ +static void +prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af) +{ + + if (ppr->pr_addrs[af] != NULL) { + pr->pr_addrs[af] = prison_ip_alloc(af, + ppr->pr_addrs[af]->ips, M_WAITOK); + bcopy(ppr->pr_addrs[af], pr->pr_addrs[af], + pr->pr_addrs[af]->ips * pr_families[af].size); + } +} + +/* + * Make sure the new set of IP addresses is a subset of the parent's list. + * Don't worry about the parent being unlocked, as any setting is done with + * allprison_lock held. + * kern_jail_set() helper. 
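
prison_ip_copyin() above keeps the first (primary) address exactly where userland put it and sorts only the tail, which is what lets the duplicate check and the later binary search in prison_ip_check() stay simple. A stand-alone user-space illustration of that invariant, using plain uint32_t values in place of struct in_addr; the function names are illustrative only:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

static int
cmp_u32(const void *a, const void *b)
{
	uint32_t x = *(const uint32_t *)a, y = *(const uint32_t *)b;

	return (x < y ? -1 : x > y ? 1 : 0);
}

/*
 * addr[0] stays exactly as userland supplied it (the primary address);
 * only addr[1..n-1] are sorted.  With that layout a duplicate can appear
 * only as two equal neighbours in the sorted tail or as the primary
 * showing up again in the tail, so one linear pass finds it.
 */
static bool
sort_and_check(uint32_t *addr, size_t n)
{
	if (n > 1)
		qsort(addr + 1, n - 1, sizeof(addr[0]), cmp_u32);
	for (size_t i = 0; i + 1 < n; i++)
		if (addr[0] == addr[i + 1] || addr[i] == addr[i + 1])
			return (false);
	return (true);
}
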
+ */ +static bool +prison_ip_parent_match(const struct prison_ip *ppip, + const struct prison_ip *pip, const pr_family_t af) +{ + prison_addr_cmp_t *const cmp = pr_families[af].cmp; + int i, j; + + if (ppip == NULL) + return (false); + + for (i = 0; i < ppip->ips; i++) + if (cmp(PR_IP(pip, 0), PR_IP(ppip, i)) == 0) + break; + + if (i == ppip->ips) + /* Main address not present in parent. */ + return (false); + + if (pip->ips > 1) { + for (i = j = 1; i < pip->ips; i++) { + if (cmp(PR_IP(pip, i), PR_IP(ppip, 0)) == 0) + /* Equals to parent primary address. */ + continue; + for (; j < ppip->ips; j++) + if (cmp(PR_IP(pip, i), PR_IP(ppip, j)) == 0) + break; + if (j == ppip->ips) + break; + } + if (j == ppip->ips) + /* Address not present in parent. */ + return (false); + } + return (true); +} + +/* + * Check for conflicting IP addresses. We permit them if there is no more + * than one IP on each jail. If there is a duplicate on a jail with more + * than one IP stop checking and return error. + * kern_jail_set() helper. + */ +static bool +prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr, + const struct prison_ip *pip, pr_family_t af) +{ + const struct prison *tppr, *tpr; + int descend; + +#ifdef VIMAGE + for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) + if (tppr->pr_flags & PR_VNET) + break; +#else + tppr = &prison0; +#endif + FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { + if (tpr == pr || +#ifdef VIMAGE + (tpr != tppr && (tpr->pr_flags & PR_VNET)) || +#endif + !prison_isalive(tpr)) { + descend = 0; + continue; + } + if (!(tpr->pr_flags & pr_families[af].ip_flag)) + continue; + descend = 0; + if (tpr->pr_addrs[af] == NULL || + (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1)) + continue; + for (int i = 0; i < pip->ips; i++) + if (prison_ip_check(tpr, af, PR_IP(pip, i)) == 0) + return (false); + } + + return (true); +} + +_Static_assert(offsetof(struct prison_ip, ctx) == 0, + "prison must start with epoch context"); +static void +prison_ip_free_deferred(epoch_context_t ctx) +{ + + free(ctx, M_PRISON); +} + +static void +prison_ip_free(struct prison_ip *pip) +{ + + if (pip != NULL) + NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx); +} + +static void +prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new) +{ + struct prison_ip **mem, *old; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + + mem = &pr->pr_addrs[af]; + + old = *mem; + ck_pr_store_ptr(mem, new); + prison_ip_free(old); +} + +/* + * Restrict a prison's IP address list with its parent's, possibly replacing + * it. Return true if the replacement buffer was used (or would have been). + * kern_jail_set() helper. + */ +static bool +prison_ip_restrict(struct prison *pr, const pr_family_t af, + struct prison_ip *new) +{ + const struct prison_ip *ppip = pr->pr_parent->pr_addrs[af]; + const struct prison_ip *pip = pr->pr_addrs[af]; + int (*const cmp)(const void *, const void *) = pr_families[af].cmp; + const size_t size = pr_families[af].size; + uint32_t ips; + bool alloced; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + + /* + * Due to epoch-synchronized access to the IP address lists we always + * allocate a new list even if the old one has enough space. We could + * atomically update an IPv4 address inside a list, but that would + * screw up sorting, and in case of IPv6 we can't even atomically write + * one. + */ + ips = (pr->pr_flags & pr_families[af].ip_flag) ? 
pip->ips : ppip->ips; + if (ips == 0) { + prison_ip_set(pr, af, NULL); + return (false); + } + if (new == NULL) { + new = prison_ip_alloc(af, ips, M_NOWAIT); + if (new == NULL) + return (true); + alloced = true; + } else + alloced = false; + if (!(pr->pr_flags & pr_families[af].ip_flag)) { + /* This has no user settings, so just copy the parent's list. */ + bcopy(ppip, new, ips * size); + } else { + /* Remove addresses that aren't in the parent. */ + int i; + + i = 0; /* index in pip */ + ips = 0; /* index in new */ + + for (int pi = 0; pi < ppip->ips; pi++) + if (cmp(PR_IP(pip, 0), PR_IP(ppip, pi)) == 0) { + /* Found our primary address in parent. */ + bcopy(PR_IP(pip, i), PR_IPD(new, ips), size); + i++; + ips++; + break; + } + for (int pi = 1; i < pip->ips; ) { + /* Check against primary, which is unsorted. */ + if (cmp(PR_IP(pip, i), PR_IP(ppip, 0)) == 0) { + /* Matches parent's primary address. */ + bcopy(PR_IP(pip, i), PR_IPD(new, ips), size); + i++; + ips++; + continue; + } + /* The rest are sorted. */ + switch (pi >= ppip->ips ? -1 : + cmp(PR_IP(pip, i), PR_IP(ppip, pi))) { + case -1: + i++; + break; + case 0: + bcopy(PR_IP(pr, i), PR_IPD(new, ips), size); + i++; + pi++; + ips++; + break; + case 1: + pi++; + break; + } + } + if (ips == 0) { + if (alloced) + prison_ip_free(new); + new = NULL; + } + } + prison_ip_set(pr, af, new); + return (new != NULL ? true : false); +} + +/* + * Fast-path check if an address belongs to a prison. + */ +int +prison_ip_check(const struct prison *pr, const pr_family_t af, + const void *addr) +{ + int (*const cmp)(const void *, const void *) = pr_families[af].cmp; + const struct prison_ip *pip; + int i, a, z, d; + + MPASS(mtx_owned(&pr->pr_mtx) || + in_epoch(net_epoch_preempt) || + sx_xlocked(&allprison_lock)); + + pip = ck_pr_load_ptr(&pr->pr_addrs[af]); + if (__predict_false(pip == NULL)) + return (EAFNOSUPPORT); + + /* Check the primary IP. */ + if (cmp(PR_IP(pip, 0), addr) == 0) + return (0); + + /* + * All the other IPs are sorted so we can do a binary search. + */ + a = 0; + z = pip->ips - 2; + while (a <= z) { + i = (a + z) / 2; + d = cmp(PR_IP(pip, i + 1), addr); + if (d > 0) + z = i - 1; + else if (d < 0) + a = i + 1; + else + return (0); + } + + return (EADDRNOTAVAIL); +} + +/* + * Grab primary IP. Historically required mutex, but nothing prevents + * us to support epoch-protected access. Is it used in fast path? 
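
The address lists are read locklessly from the network stack, so prison_ip_set() publishes a complete replacement list with a single ck_pr_store_ptr() and prison_ip_free() retires the old one via NET_EPOCH_CALL() only after in-flight readers have left the network epoch. A rough user-space analogue of that publish/retire discipline, with C11 atomics standing in for ck_pr and an immediate free() standing in for the epoch deferral; struct and function names here are illustrative, not part of the patch:

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* User-space stand-in for struct prison_ip: a count plus a flexible array. */
struct ip_list {
	uint32_t ips;
	uint32_t addr[];
};

static _Atomic(struct ip_list *) cur_list;

/*
 * Writer, analogous to prison_ip_set(): build a complete new list, publish
 * it with a single atomic pointer store, then retire the old one.  The
 * kernel defers the free through NET_EPOCH_CALL() so readers still inside
 * a network-epoch section never see freed memory; plain free() stands in
 * for that deferral here.
 */
static void
publish_list(const uint32_t *addr, uint32_t n)
{
	struct ip_list *new, *old;

	new = malloc(sizeof(*new) + n * sizeof(new->addr[0]));
	if (new == NULL)
		return;
	new->ips = n;
	memcpy(new->addr, addr, n * sizeof(new->addr[0]));

	old = atomic_exchange(&cur_list, new);
	free(old);	/* kernel: deferred via the epoch callback instead */
}

/* Reader, analogous to the lockless path through prison_ip_check(). */
static int
list_contains(uint32_t a)
{
	struct ip_list *l = atomic_load(&cur_list);

	if (l == NULL)
		return (0);
	for (uint32_t i = 0; i < l->ips; i++)
		if (l->addr[i] == a)
			return (1);
	return (0);
}
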
+ * in{6}_jail.c helper + */ +const void * +prison_ip_get0(const struct prison *pr, const pr_family_t af) +{ + const struct prison_ip *pip = pr->pr_addrs[af]; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + MPASS(pip); + + return (pip + 1); +} + +u_int +prison_ip_cnt(const struct prison *pr, const pr_family_t af) +{ + + return (pr->pr_addrs[af]->ips); +} +#endif /* defined(INET) || defined(INET6) */ + int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET - struct in_addr *ip4; + struct prison_ip *ip4; #endif #ifdef INET6 - struct in6_addr *ip6; + struct prison_ip *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) - struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int born, created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; -#if defined(INET) || defined(INET6) - int ii, ij; -#endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; uint64_t pr_allow_diff; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. 
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { vfs_flagopt(opts, bf->name, &pr_flags, bf->flag); vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag); } ch_flags |= pr_flags; for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!jsf->disable) { error = EINVAL; goto done_free; } pr_flags |= jsf->disable; break; case JAIL_SYS_NEW: pr_flags |= jsf->new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= jsf->new | jsf->disable; } if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { vfs_flagopt(opts, bf->name, &pr_allow, bf->flag); vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); if (error == ENOENT) domain = NULL; else if 
(error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; - else if (ip4s & (sizeof(*ip4) - 1)) { + else if (ip4s & (sizeof(struct in_addr) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { - ip4s /= sizeof(*ip4); + ip4s /= sizeof(struct in_addr); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } - ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); - bcopy(op, ip4, ip4s * sizeof(*ip4)); - /* - * IP addresses are all sorted but ip[0] to preserve - * the primary IP address as given from userland. - * This special IP is used for unbound outgoing - * connections as well for "loopback" traffic in case - * source address selection cannot find any more fitting - * address to connect from. - */ - if (ip4s > 1) - qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), - prison_qcmp_v4); - /* - * Check for duplicate addresses and do some simple - * zero and broadcast checks. If users give other bogus - * addresses it is their problem. - * - * We do not have to care about byte order for these - * checks so we will do them in NBO. 
- */ - for (ii = 0; ii < ip4s; ii++) { - if (ip4[ii].s_addr == INADDR_ANY || - ip4[ii].s_addr == INADDR_BROADCAST) { - error = EINVAL; - goto done_free; - } - if ((ii+1) < ip4s && - (ip4[0].s_addr == ip4[ii+1].s_addr || - ip4[ii].s_addr == ip4[ii+1].s_addr)) { - error = EINVAL; - goto done_free; - } + ip4 = prison_ip_copyin(PR_INET, op, ip4s); + if (ip4 == NULL) { + error = EINVAL; + goto done_free; } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; - else if (ip6s & (sizeof(*ip6) - 1)) { + else if (ip6s & (sizeof(struct in6_addr) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { - ip6s /= sizeof(*ip6); + ip6s /= sizeof(struct in6_addr); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } - ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); - bcopy(op, ip6, ip6s * sizeof(*ip6)); - if (ip6s > 1) - qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), - prison_qcmp_v6); - for (ii = 0; ii < ip6s; ii++) { - if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { - error = EINVAL; - goto done_free; - } - if ((ii+1) < ip6s && - (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || - IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) - { - error = EINVAL; - goto done_free; - } + ip6 = prison_ip_copyin(PR_INET6, op, ip6s); + if (ip6 == NULL) { + error = EINVAL; + goto done_free; } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || osrelstr[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len >= OSRELEASELEN) { error = ENAMETOOLONG; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) { path = g_path; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root); } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. 
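
For reference, the ip4.addr option parsed above carries its value as a bare array of struct in_addr: the kernel derives the address count from the iovec length, which is why that length must be an exact multiple of sizeof(struct in_addr), and the first element becomes the jail's primary address. A sketch of how userland might fill that pair when creating a jail; the helper name and the 192.0.2.x addresses are placeholders:

#include <sys/cdefs.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static struct in_addr jail_addrs[2];

static void
fill_ip4_option(struct iovec iov[2])
{
	/* Placeholder addresses; the first entry becomes the primary address. */
	(void)inet_pton(AF_INET, "192.0.2.10", &jail_addrs[0]);
	(void)inet_pton(AF_INET, "192.0.2.11", &jail_addrs[1]);

	iov[0].iov_base = __DECONST(char *, "ip4.addr");
	iov[0].iov_len = sizeof("ip4.addr");
	iov[1].iov_base = jail_addrs;
	/* The value length must be a whole number of struct in_addr. */
	iov[1].iov_len = sizeof(jail_addrs);
}
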
*/ pr = NULL; inspr = NULL; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); drflags = PD_LIST_XLOCKED; ppr = mypr; if (!prison_isalive(ppr)) { /* This jail is dying. This process will surely follow. */ error = EAGAIN; goto done_deref; } if (jid != 0) { if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_deref; } /* * See if a requested jid already exists. Keep track of * where it can be inserted later. */ TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; if (inspr->pr_id > jid) break; pr = inspr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; inspr = NULL; break; } if (pr != NULL) { /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { /* * Even creators that cannot see the jail will * get EEXIST. */ error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_deref; } if (!prison_ischild(mypr, pr)) { /* * Updaters get ENOENT if they cannot see the * jail. This is true even for CREATE | UPDATE, * which normally cannot give this error. */ error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_deref; } ppr = pr->pr_parent; if (!prison_isalive(ppr)) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", ppr->pr_id); goto done_deref; } if (!prison_isalive(pr)) { if (!(flags & JAIL_DYING)) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_deref; } if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } else { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_deref; } } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_deref; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } mtx_unlock(&ppr->pr_mtx); if (!prison_isalive(ppr)) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_deref; } *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && !strcmp(tpr->pr_name + pnamelen, namelc)) { if (prison_isalive(tpr)) { if (pr == NULL && cuflags != JAIL_CREATE) { /* * Use this jail * for updates. */ pr = tpr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; break; } /* * Create, or update(jid): * name must not exist in an * active sibling jail. 
*/ error = EEXIST; vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_deref; } if (pr == NULL && cuflags != JAIL_CREATE) { deadpr = tpr; } } } /* If no active jail is found, use a dying one. */ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { pr = deadpr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_deref; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_deref; } /* If there's no prison to update, create a new one and link it in. */ created = pr == NULL; if (created) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_deref; } if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); goto done_deref; } pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); pr->pr_state = PRISON_STATE_INVALID; refcount_init(&pr->pr_ref, 1); refcount_init(&pr->pr_uref, 0); drflags |= PD_DEREF; LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); pr->pr_id = jid; if (inspr != NULL) TAILQ_INSERT_BEFORE(inspr, pr, pr_list); else TAILQ_INSERT_TAIL(&allprison, pr, pr_list); pr->pr_parent = ppr; prison_hold(ppr); prison_proc_hold(ppr); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; - if (ppr->pr_ip4 != NULL) { - pr->pr_ip4s = ppr->pr_ip4s; - pr->pr_ip4 = malloc(pr->pr_ip4s * - sizeof(struct in_addr), M_PRISON, - M_WAITOK); - bcopy(ppr->pr_ip4, pr->pr_ip4, - pr->pr_ip4s * sizeof(*pr->pr_ip4)); - } + prison_ip_dup(ppr, pr, PR_INET); } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; - if (ppr->pr_ip6 != NULL) { - pr->pr_ip6s = ppr->pr_ip6s; - pr->pr_ip6 = malloc(pr->pr_ip6s * - sizeof(struct in6_addr), M_PRISON, - M_WAITOK); - bcopy(ppr->pr_ip6, pr->pr_ip6, - pr->pr_ip6s * sizeof(*pr->pr_ip6)); - } + prison_ip_dup(ppr, pr, PR_INET6); } #endif } #endif /* Source address selection is always on by default. */ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = jail_default_enforce_statfs; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? 
osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strlcpy(pr->pr_osrelease, ppr->pr_osrelease, sizeof(pr->pr_osrelease)); else strlcpy(pr->pr_osrelease, osrelstr, sizeof(pr->pr_osrelease)); #ifdef VIMAGE /* Allocate a new vnet if specified. */ pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet; #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an error. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else { /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ prison_hold(pr); drflags |= PD_DEREF; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref; } #endif } /* Do final error checking before setting anything. */ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { - if (ppr->pr_flags & PR_IP4) { - /* - * Make sure the new set of IP addresses is a - * subset of the parent's list. Don't worry - * about the parent being unlocked, as any - * setting is done with allprison_lock held. - */ - for (ij = 0; ij < ppr->pr_ip4s; ij++) - if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) - break; - if (ij == ppr->pr_ip4s) { - error = EPERM; - goto done_deref; - } - if (ip4s > 1) { - for (ii = ij = 1; ii < ip4s; ii++) { - if (ip4[ii].s_addr == - ppr->pr_ip4[0].s_addr) - continue; - for (; ij < ppr->pr_ip4s; ij++) - if (ip4[ii].s_addr == - ppr->pr_ip4[ij].s_addr) - break; - if (ij == ppr->pr_ip4s) - break; - } - if (ij == ppr->pr_ip4s) { - error = EPERM; - goto done_deref; - } - } + if ((ppr->pr_flags & PR_IP4) && + !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4, + PR_INET)) { + error = EPERM; + goto done_deref; } - /* - * Check for conflicting IP addresses. We permit them - * if there is no more than one IP on each jail. If - * there is a duplicate on a jail with more than one - * IP stop checking and return error. 
- */ -#ifdef VIMAGE - for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) - if (tppr->pr_flags & PR_VNET) - break; -#else - tppr = &prison0; -#endif - FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { - if (tpr == pr || -#ifdef VIMAGE - (tpr != tppr && (tpr->pr_flags & PR_VNET)) || -#endif - !prison_isalive(tpr)) { - descend = 0; - continue; - } - if (!(tpr->pr_flags & PR_IP4_USER)) - continue; - descend = 0; - if (tpr->pr_ip4 == NULL || - (ip4s == 1 && tpr->pr_ip4s == 1)) - continue; - for (ii = 0; ii < ip4s; ii++) { - if (prison_check_ip4_locked(tpr, &ip4[ii]) == - 0) { - error = EADDRINUSE; - vfs_opterror(opts, - "IPv4 addresses clash"); - goto done_deref; - } - } + if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) { + error = EADDRINUSE; + vfs_opterror(opts, "IPv4 addresses clash"); + goto done_deref; } } #endif #ifdef INET6 if (ip6s > 0) { - if (ppr->pr_flags & PR_IP6) { - /* - * Make sure the new set of IP addresses is a - * subset of the parent's list. - */ - for (ij = 0; ij < ppr->pr_ip6s; ij++) - if (IN6_ARE_ADDR_EQUAL(&ip6[0], - &ppr->pr_ip6[ij])) - break; - if (ij == ppr->pr_ip6s) { - error = EPERM; - goto done_deref; - } - if (ip6s > 1) { - for (ii = ij = 1; ii < ip6s; ii++) { - if (IN6_ARE_ADDR_EQUAL(&ip6[ii], - &ppr->pr_ip6[0])) - continue; - for (; ij < ppr->pr_ip6s; ij++) - if (IN6_ARE_ADDR_EQUAL( - &ip6[ii], &ppr->pr_ip6[ij])) - break; - if (ij == ppr->pr_ip6s) - break; - } - if (ij == ppr->pr_ip6s) { - error = EPERM; - goto done_deref; - } - } + if ((ppr->pr_flags & PR_IP6) && + !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6, + PR_INET6)) { + error = EPERM; + goto done_deref; } - /* Check for conflicting IP addresses. */ -#ifdef VIMAGE - for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) - if (tppr->pr_flags & PR_VNET) - break; -#else - tppr = &prison0; -#endif - FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { - if (tpr == pr || -#ifdef VIMAGE - (tpr != tppr && (tpr->pr_flags & PR_VNET)) || -#endif - !prison_isalive(tpr)) { - descend = 0; - continue; - } - if (!(tpr->pr_flags & PR_IP6_USER)) - continue; - descend = 0; - if (tpr->pr_ip6 == NULL || - (ip6s == 1 && tpr->pr_ip6s == 1)) - continue; - for (ii = 0; ii < ip6s; ii++) { - if (prison_check_ip6_locked(tpr, &ip6[ii]) == - 0) { - error = EADDRINUSE; - vfs_opterror(opts, - "IPv6 addresses clash"); - goto done_deref; - } - } + if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) { + error = EADDRINUSE; + vfs_opterror(opts, "IPv6 addresses clash"); + goto done_deref; } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 
0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } } } pr_allow_diff = pr_allow & ~ppr->pr_allow; if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) { error = EPERM; goto done_deref; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; /* At this point, all valid parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref; } } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; - free(pr->pr_ip4, M_PRISON); - pr->pr_ip4s = ip4s; - pr->pr_ip4 = ip4; + prison_ip_set(pr, PR_INET, ip4); ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif - if (prison_restrict_ip4(tpr, NULL)) { + if (prison_ip_restrict(tpr, PR_INET, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; - free(pr->pr_ip6, M_PRISON); - pr->pr_ip6s = ip6s; - pr->pr_ip6 = ip6; + prison_ip_set(pr, PR_INET6, ip6); ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif - if (prison_restrict_ip6(tpr, NULL)) { + if (prison_ip_restrict(tpr, PR_INET6, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. 
*/ strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; root = NULL; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. */ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; if ((tallow = ch_allow & ~pr_allow)) prison_set_allow_locked(pr, tallow, 0); /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. */ born = !prison_isalive(pr); if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) { if (pr_flags & PR_PERSIST) { prison_hold(pr); /* * This may make a dead prison alive again, but wait * to label it as such until after OSD calls have had * a chance to run (and perhaps to fail). */ refcount_acquire(&pr->pr_uref); } else { drflags |= PD_DEUREF; prison_free_not_last(pr); } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; /* * Any errors past this point will need to de-persist newly created * prisons, as well as call remove methods. */ if (born) drflags |= PD_KILL; #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. 
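
The redo_ip4/redo_ip6 loops below follow a common shape: prison_ip_restrict() runs under the prison mutex, where only M_NOWAIT allocations are possible, so the caller preallocates a buffer with M_WAITOK outside the lock and repeats the pass with it. A condensed user-space analogue of that retry pattern, with a pthread mutex and malloc() standing in for the kernel primitives; function names are illustrative only:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Stand-in for prison_ip_restrict(): with no preallocated buffer it may be
 * unable to proceed (the kernel's M_NOWAIT allocation can fail under the
 * mutex); with one supplied it takes ownership and succeeds.
 */
static bool
locked_work(void *prealloc)
{
	if (prealloc == NULL)
		return (false);		/* would have had to sleep for memory */
	/* The real code installs prealloc as the new list; free it here. */
	free(prealloc);
	return (true);
}

static void
work_with_retry(size_t bufsize)
{
	void *buf = NULL;
	bool ok;

	for (;;) {
		pthread_mutex_lock(&lock);
		ok = locked_work(buf);	/* consumes buf when it succeeds */
		pthread_mutex_unlock(&lock);
		if (ok)
			break;
		/* Sleepable allocation happens only with the lock dropped. */
		buf = malloc(bufsize);	/* kernel: M_WAITOK, which cannot fail */
	}
}
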
*/ #ifdef INET while (redo_ip4) { - ip4s = pr->pr_ip4s; - ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); + ip4s = pr->pr_addrs[PR_INET]->ips; + ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif - if (prison_restrict_ip4(tpr, ip4)) { + if (prison_ip_restrict(tpr, PR_INET, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { - ip6s = pr->pr_ip6s; - ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); + ip6s = pr->pr_addrs[PR_INET6]->ips; + ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif - if (prison_restrict_ip6(tpr, ip6)) { + if (prison_ip_restrict(tpr, PR_INET6, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) goto done_deref; } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) goto done_deref; /* * A new prison is now ready to be seen; either it has gained a user * reference via persistence, or is about to gain one via attachment. */ if (born) { drflags = prison_lock_xlock(pr, drflags); pr->pr_state = PRISON_STATE_ALIVE; } /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { error = do_jail_attach(td, pr, prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS)); drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); if (error) { vfs_opterror(opts, "attach failed"); goto done_deref; } } #ifdef RACCT if (racct_enable && !created) { if (drflags & PD_LOCKED) { mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; } if (drflags & PD_LIST_XLOCKED) { sx_xunlock(&allprison_lock); drflags &= ~PD_LIST_XLOCKED; } prison_racct_modify(pr); } #endif drflags &= ~PD_KILL; td->td_retval[0] = pr->pr_id; done_deref: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (drflags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); if (root != NULL) vrele(root); done_errmsg: if (error) { /* Write the error message back to userspace. */ if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: #ifdef INET - free(ip4, M_PRISON); + prison_ip_free(ip4); #endif #ifdef INET6 - free(ip6, M_PRISON); + prison_ip_free(ip6); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * Find the next available prison ID. Return the ID on success, or zero * on failure. Also set a pointer to the allprison list entry the prison * should be inserted before. */ static int get_next_prid(struct prison **insprp) { struct prison *inspr; int jid, maxid; jid = lastprid % JAIL_MAX + 1; if (TAILQ_EMPTY(&allprison) || TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) { /* * A common case is for all jails to be implicitly numbered, * which means they'll go on the end of the list, at least * for the first JAIL_MAX times. 
*/ inspr = NULL; } else { /* * Take two passes through the allprison list: first starting * with the proposed jid, then ending with it. */ for (maxid = JAIL_MAX; maxid != 0; ) { TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; if (inspr->pr_id > jid) { /* Found an opening. */ maxid = 0; break; } if (++jid > maxid) { if (lastprid == maxid || lastprid == 0) { /* * The entire legal range * has been traversed */ return 0; } /* Try again from the start. */ jid = 1; maxid = lastprid; break; } } if (inspr == NULL) { /* Found room at the end of the list. */ break; } } } *insprp = inspr; lastprid = jid; return (jid); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct bool_flags *bf; struct jailsys_flags *jsf; struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; unsigned f; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; pr = NULL; /* * Find the prison specified by one of: lastjid, jid, name. */ sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && ((flags & JAIL_DYING) || prison_isalive(pr)) && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; goto found_prison; } } error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done; } else if (error != ENOENT) goto done; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done; } } else if (error != ENOENT) goto done; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done; } pr = prison_find_name(mypr, name); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done; } else if (error != ENOENT) goto done; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done; found_prison: /* Get the parameters of the prison. */ prison_hold(pr); drflags |= PD_DEREF; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done; i = (pr->pr_parent == mypr) ? 
0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done; #ifdef INET - error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, - pr->pr_ip4s * sizeof(*pr->pr_ip4)); + error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET] + 1, + pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips * + pr_families[PR_INET].size : 0 ); if (error != 0 && error != ENOENT) goto done; #endif #ifdef INET6 - error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, - pr->pr_ip6s * sizeof(*pr->pr_ip6)); + error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6] + 1, + pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips * + pr_families[PR_INET6].size : 0 ); if (error != 0 && error != ENOENT) goto done; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { i = (pr->pr_flags & bf->flag) ? 1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE : (f == jsf->new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, jsf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { i = (pr->pr_allow & bf->flag) ? 
1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } i = !prison_isalive(pr); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done; prison_deref(pr, drflags); pr = NULL; drflags = 0; /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } done: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); if (error && errmsg_pos >= 0) { /* Write the error message back to userspace. */ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } if (!prison_isalive(pr)) { /* Silently ignore already-dying prisons. */ mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); return (0); } prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED); return (0); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* Do not allow a process to attach to a prison that is not alive. 
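 * A dying prison can still be found by its jid, so it must be rejected
 * here rather than attached to.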
*/ if (!prison_isalive(pr)) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } static int do_jail_attach(struct thread *td, struct prison *pr, int drflags) { struct proc *p; struct ucred *newcred, *oldcred; int error; mtx_assert(&pr->pr_mtx, MA_OWNED); sx_assert(&allprison_lock, SX_LOCKED); drflags &= PD_LOCK_FLAGS; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ prison_hold(pr); refcount_acquire(&pr->pr_uref); drflags |= PD_DEREF | PD_DEUREF; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, drflags); return (error); } sx_unlock(&allprison_lock); drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED); /* * Reparent the newly attached process to this jail. */ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root); if ((error = pwd_chroot_chdir(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; proc_set_cred(p, newcred); setsugid(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); /* * If the prison was killed while changing credentials, die along * with it. */ if (!prison_isalive(pr)) { PROC_LOCK(p); kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } return (0); e_unlock: VOP_UNLOCK(pr->pr_root); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ sx_slock(&allprison_lock); drflags |= PD_LIST_SLOCKED; (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, drflags); return (error); } /* * Returns a locked prison instance, or NULL on failure. */ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id < prid) continue; if (pr->pr_id > prid) break; KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); return (pr); } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); return (pr); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 
0 : strlen(mypr->pr_name) + 1; deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); if (prison_isalive(pr)) { mtx_lock(&pr->pr_mtx); return (pr); } deadpr = pr; } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) mtx_lock(&deadpr->pr_mtx); return (deadpr); } /* * See if a prison has the specific flag set. The prison should be locked, * unless checking for flags that are only set at jail creation (such as * PR_IP4 and PR_IP6), or only the single bit is examined, without regard * to any other prison data. */ int prison_flag(struct ucred *cred, unsigned flag) { return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { return ((cred->cr_prison->pr_allow & flag) != 0); } /* * Hold a prison reference, by incrementing pr_ref. It is generally * an error to hold a prison that does not already have a reference. * A prison record will remain valid as long as it has at least one * reference, and will not be removed as long as either the prison * mutex or the allprison lock is held (allprison_lock may be shared). */ void prison_hold_locked(struct prison *pr) { /* Locking is no longer required. */ prison_hold(pr); } void prison_hold(struct prison *pr) { #ifdef INVARIANTS int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref); KASSERT(was_valid, ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_acquire(&pr->pr_ref); #endif } /* * Remove a prison reference. If that was the last reference, the * prison will be removed (at a later time). */ void prison_free_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); /* * Locking is no longer required, but unlock because the caller * expects it. */ mtx_unlock(&pr->pr_mtx); prison_free(pr); } void prison_free(struct prison *pr) { KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_ref)) { /* * Don't remove the last reference in this context, * in case there are locks held. */ taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } } static void prison_free_not_last(struct prison *pr) { #ifdef INVARIANTS int lastref; KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); lastref = refcount_release(&pr->pr_ref); KASSERT(!lastref, ("prison_free_not_last freed last ref on prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_release(&pr->pr_ref); #endif } /* * Hold a prison for user visibility, by incrementing pr_uref. * It is generally an error to hold a prison that isn't already * user-visible, except through the jail system calls. It is also * an error to hold an invalid prison. A prison record will remain * alive as long as it has at least one user reference, and will not * be set to the dying state until the prison mutex and allprison_lock * are both freed. */ void prison_proc_hold(struct prison *pr) { #ifdef INVARIANTS int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref); KASSERT(was_alive, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); #else refcount_acquire(&pr->pr_uref); #endif } /* * Remove a prison user reference. If it was the last reference, the * prison will be considered "dying", and may be removed once all of * its references are dropped. */ void prison_proc_free(struct prison *pr) { /* * Locking is only required when releasing the last reference.
* This allows assurance that a locked prison will remain alive * until it is unlocked. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { /* * Don't remove the last user reference in this context, * which is expected to be a process that is not only locked, * but also half dead. Add a reference so any calls to * prison_free() won't re-submit the task. */ prison_hold(pr); mtx_lock(&pr->pr_mtx); KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC), ("Redundant last reference in prison_proc_free (jid=%d)", pr->pr_id)); pr->pr_flags |= PR_COMPLETE_PROC; mtx_unlock(&pr->pr_mtx); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } } static void prison_proc_free_not_last(struct prison *pr) { #ifdef INVARIANTS int lastref; KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); lastref = refcount_release(&pr->pr_uref); KASSERT(!lastref, ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_release(&pr->pr_uref); #endif } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; int drflags; /* * This could be called to release the last reference, or the last * user reference (plus the reference held in prison_proc_free). */ drflags = prison_lock_xlock(pr, PD_DEREF); if (pr->pr_flags & PR_COMPLETE_PROC) { pr->pr_flags &= ~PR_COMPLETE_PROC; drflags |= PD_DEUREF; } prison_deref(pr, drflags); } /* * Remove a prison reference and/or user reference (usually). * This assumes context that allows sleeping (for allprison_lock), * with no non-sleeping locks held, except perhaps the prison itself. * If there are no more references, release and delist the prison. * On completion, the prison lock and the allprison lock are both * unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prisonlist freeprison; struct prison *killpr, *rpr, *ppr, *tpr; struct proc *p; killpr = NULL; TAILQ_INIT(&freeprison); /* * Release this prison as requested, which may cause its parent * to be released, and then maybe its grandparent, etc. */ for (;;) { if (flags & PD_KILL) { /* Kill the prison and its descendants. */ KASSERT(pr != &prison0, ("prison_deref trying to kill prison0")); if (!(flags & PD_DEREF)) { prison_hold(pr); flags |= PD_DEREF; } flags = prison_lock_xlock(pr, flags); prison_deref_kill(pr, &freeprison); } if (flags & PD_DEUREF) { /* Drop a user reference. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { if (!(flags & PD_DEREF)) { prison_hold(pr); flags |= PD_DEREF; } flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_uref) && pr->pr_state == PRISON_STATE_ALIVE) { /* * When the last user reference goes, * this becomes a dying prison. */ KASSERT( refcount_load(&prison0.pr_uref) > 0, ("prison0 pr_uref=0")); pr->pr_state = PRISON_STATE_DYING; mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); } } } if (flags & PD_KILL) { /* * Any remaining user references are probably processes * that need to be killed, either in this prison or its * descendants. */ if (refcount_load(&pr->pr_uref) > 0) killpr = pr; /* Make sure the parent prison doesn't get killed. */ flags &= ~PD_KILL; } if (flags & PD_DEREF) { /* Drop a reference.
*/ KASSERT(refcount_load(&pr->pr_ref) > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_ref)) { flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_ref)) { /* * When the last reference goes, * unlink the prison and set it aside. */ KASSERT( refcount_load(&pr->pr_uref) == 0, ("prison_deref: last ref, " "but still has %d urefs (jid=%d)", pr->pr_uref, pr->pr_id)); KASSERT( refcount_load(&prison0.pr_ref) != 0, ("prison0 pr_ref=0")); pr->pr_state = PRISON_STATE_INVALID; TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); TAILQ_INSERT_TAIL(&freeprison, pr, pr_list); for (ppr = pr->pr_parent; ppr != NULL; ppr = ppr->pr_parent) ppr->pr_childcount--; /* * Removing a prison frees references * from its parent. */ mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; pr = pr->pr_parent; flags |= PD_DEREF | PD_DEUREF; continue; } } } break; } /* Release all the prison locks. */ if (flags & PD_LOCKED) mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); /* Kill any processes attached to a killed prison. */ if (killpr != NULL) { sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred != NULL) { for (ppr = p->p_ucred->cr_prison; ppr != &prison0; ppr = ppr->pr_parent) if (ppr == killpr) { kern_psignal(p, SIGKILL); break; } } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } /* * Finish removing any unreferenced prisons, which couldn't happen * while allprison_lock was held (to avoid a LOR on vrele). */ TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { #ifdef VIMAGE if (rpr->pr_vnet != rpr->pr_parent->pr_vnet) vnet_destroy(rpr->pr_vnet); #endif if (rpr->pr_root != NULL) vrele(rpr->pr_root); mtx_destroy(&rpr->pr_mtx); #ifdef INET - free(rpr->pr_ip4, M_PRISON); + prison_ip_free(rpr->pr_addrs[PR_INET]); #endif #ifdef INET6 - free(rpr->pr_ip6, M_PRISON); + prison_ip_free(rpr->pr_addrs[PR_INET6]); #endif if (rpr->pr_cpuset != NULL) cpuset_rel(rpr->pr_cpuset); osd_jail_exit(rpr); #ifdef RACCT if (racct_enable) prison_racct_detach(rpr); #endif TAILQ_REMOVE(&freeprison, rpr, pr_list); free(rpr, M_PRISON); } } /* * Kill the prison and its descendants. Mark them as dying, clear the * persist flag, and call module remove methods. */ static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) { struct prison *cpr, *ppr, *rpr; bool descend; /* * Unlike the descendants, the target prison can be killed * even if it is currently dying. This is useful for failed * creation in jail_set(2). 
*/ KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to kill dead prison %p (jid=%d).", pr, pr->pr_id)); refcount_acquire(&pr->pr_uref); pr->pr_state = PRISON_STATE_DYING; mtx_unlock(&pr->pr_mtx); rpr = NULL; FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) { if (descend) { if (!prison_isalive(cpr)) { descend = false; continue; } prison_hold(cpr); prison_proc_hold(cpr); mtx_lock(&cpr->pr_mtx); cpr->pr_state = PRISON_STATE_DYING; cpr->pr_flags |= PR_REMOVE; mtx_unlock(&cpr->pr_mtx); continue; } if (!(cpr->pr_flags & PR_REMOVE)) continue; (void)osd_jail_call(cpr, PR_METHOD_REMOVE, NULL); mtx_lock(&cpr->pr_mtx); cpr->pr_flags &= ~PR_REMOVE; if (cpr->pr_flags & PR_PERSIST) { cpr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(cpr); prison_free_not_last(cpr); } (void)refcount_release(&cpr->pr_uref); if (refcount_release(&cpr->pr_ref)) { /* * When the last reference goes, unlink the prison * and set it aside for prison_deref() to handle. * Delay unlinking the sibling list to keep the loop * safe. */ if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); rpr = cpr; rpr->pr_state = PRISON_STATE_INVALID; TAILQ_REMOVE(&allprison, rpr, pr_list); TAILQ_INSERT_TAIL(freeprison, rpr, pr_list); /* * Removing a prison frees references from its parent. */ ppr = rpr->pr_parent; prison_proc_free_not_last(ppr); prison_free_not_last(ppr); for (; ppr != NULL; ppr = ppr->pr_parent) ppr->pr_childcount--; } mtx_unlock(&cpr->pr_mtx); } if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); mtx_lock(&pr->pr_mtx); if (pr->pr_flags & PR_PERSIST) { pr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(pr); prison_free_not_last(pr); } (void)refcount_release(&pr->pr_uref); } /* * Given the current locking state in the flags, make sure allprison_lock * is held exclusive, and the prison is locked. Return flags indicating * the new state. */ static int prison_lock_xlock(struct prison *pr, int flags) { if (!(flags & PD_LIST_XLOCKED)) { /* * Get allprison_lock, which may be an upgrade, * and may require unlocking the prison. */ if (flags & PD_LOCKED) { mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; } if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } flags &= ~PD_LIST_SLOCKED; } else sx_xlock(&allprison_lock); flags |= PD_LIST_XLOCKED; } if (!(flags & PD_LOCKED)) { /* Lock the prison mutex. */ mtx_lock(&pr->pr_mtx); flags |= PD_LOCKED; } return flags; } /* * Set or clear a permission bit in the pr_allow field, passing restrictions * (cleared permission) down to child jails. */ void prison_set_allow(struct ucred *cred, unsigned flag, int enable) { struct prison *pr; pr = cred->cr_prison; sx_slock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_set_allow_locked(pr, flag, enable); mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); } static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable) { struct prison *cpr; int descend; if (enable != 0) pr->pr_allow |= flag; else { pr->pr_allow &= ~flag; FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) cpr->pr_allow &= ~flag; } } /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. 
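 * to a set of jail addresses, so the address-family check does not
 * apply to them.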
*/ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); - if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) + if ((pr->pr_flags & PR_IP4) && + pr->pr_addrs[PR_INET] == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); - if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) + if ((pr->pr_flags & PR_IP6) && + pr->pr_addrs[PR_INET6] == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). * * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. The IPv4 address is passed in NBO. */ int prison_if(struct ucred *cred, const struct sockaddr *sa) { #ifdef INET const struct sockaddr_in *sai; #endif #ifdef INET6 const struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (const struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (const struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0. */ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* * Return true if the prison is currently alive. A prison is alive if it * holds user references and it isn't being removed. */ bool -prison_isalive(struct prison *pr) +prison_isalive(const struct prison *pr) { if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE)) return (false); return (true); } /* * Return true if the prison is currently valid. A prison is valid if it has * been fully created, and is not being destroyed. Note that dying prisons * are still considered valid. Invalid prisons won't be found under normal * circumstances, as they're only put in that state by functions that have * an exclusive hold on allprison_lock. */ bool prison_isvalid(struct prison *pr) { if (__predict_false(pr->pr_state == PRISON_STATE_INVALID)) return (false); if (__predict_false(refcount_load(&pr->pr_ref) == 0)) return (false); return (true); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential.
*/ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } void getjailname(struct ucred *cred, char *name, size_t len) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(name, cred->cr_prison->pr_name, len); mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 in case the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" * the status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If the jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. * This is an ugly check, but this is the only situation when the jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have a situation where the jail's root directory * is "/some/path" and the mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear the current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If the jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear the current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen.
*/ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check whether permission for a specific privilege is granted within jail. We * have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { struct prison *pr; int error; /* * Some policies have custom handlers. This routine should not be * called for them. See priv_check_cred(). */ switch (priv) { case PRIV_VFS_LOOKUP: case PRIV_VFS_GENERATION: KASSERT(0, ("prison_priv_check instead of a custom handler " "called for %d\n", priv)); } if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME: case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_SETLANPCP: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. */ case PRIV_NET80211_VAP_GETKEY: case PRIV_NET80211_VAP_MANAGE: #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. */ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit fall through to next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints.
*/ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: pr = cred->cr_prison; prison_lock(pr); if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2) error = 0; else error = EPERM; prison_unlock(pr); return (error); /* * Jails should hold no disposition on the PRIV_VFS_READ_DIR * policy. priv_check_cred will not specifically allow it, and * we may want a MAC policy to allow it. */ case PRIV_VFS_READ_DIR: return (0); /* * Conditionally allow locking (unlocking) physical pages * in memory. */ case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK) return (0); else return (EPERM); /* * Conditionally allow jailed root to bind reserved ports. */ case PRIV_NETINET_RESERVEDPORT: if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) return (0); else return (EPERM); /* * Allow jailed root to reuse in-use ports. */ case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail.
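 * (This is controlled by the allow.raw_sockets jail parameter.)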
*/ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); /* * Do not allow a process inside a jail to read the kernel * message buffer unless explicitly permitted. */ case PRIV_MSGBUF: if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF) return (0); return (EPERM); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges, many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. */ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jails"); +#if defined(INET) || defined(INET6) +/* + * Copy address array to memory that would be then SYSCTL_OUT-ed. + * sysctl_jail_list() helper. 
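+ * If the supplied buffer is too small, it is reallocated with the prison + * unlocked, and then the copy is retried.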
+ */ +static void +prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len) +{ + const size_t size = pr_families[af].size; + + again: + mtx_assert(&pr->pr_mtx, MA_OWNED); + if (pr->pr_addrs[af] != NULL) { + if (*len < pr->pr_addrs[af]->ips) { + *len = pr->pr_addrs[af]->ips; + mtx_unlock(&pr->pr_mtx); + *out = realloc(*out, *len * size, M_TEMP, M_WAITOK); + mtx_lock(&pr->pr_mtx); + goto again; + } + bcopy(pr->pr_addrs[af] + 1, *out, pr->pr_addrs[af]->ips * size); + } +} +#endif + static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { -#if defined(INET) || defined(INET6) - again: -#endif mtx_lock(&cpr->pr_mtx); #ifdef INET - if (cpr->pr_ip4s > 0) { - if (ip4s < cpr->pr_ip4s) { - ip4s = cpr->pr_ip4s; - mtx_unlock(&cpr->pr_mtx); - ip4 = realloc(ip4, ip4s * - sizeof(struct in_addr), M_TEMP, M_WAITOK); - goto again; - } - bcopy(cpr->pr_ip4, ip4, - cpr->pr_ip4s * sizeof(struct in_addr)); - } + prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s); #endif #ifdef INET6 - if (cpr->pr_ip6s > 0) { - if (ip6s < cpr->pr_ip6s) { - ip6s = cpr->pr_ip6s; - mtx_unlock(&cpr->pr_mtx); - ip6 = realloc(ip6, ip6s * - sizeof(struct in6_addr), M_TEMP, M_WAITOK); - goto again; - } - bcopy(cpr->pr_ip6, ip6, - cpr->pr_ip6s * sizeof(struct in6_addr)); - } + prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s); #endif bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_state; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET - xp->pr_ip4s = cpr->pr_ip4s; + xp->pr_ip4s = ip4s; #endif #ifdef INET6 - xp->pr_ip6s = cpr->pr_ip6s; + xp->pr_ip6s = ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns vnet?"); #if defined(INET) || defined(INET6) 
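/* Per-address-family limit on the number of jail IP addresses (deprecated). */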
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { int error, i; /* Get the current flag value, and convert it to a boolean. */ if (req->td->td_ucred->cr_prison == &prison0) { mtx_lock(&prison0.pr_mtx); i = (jail_default_allow & arg2) != 0; mtx_unlock(&prison0.pr_mtx); } else i = prison_allow(req->td->td_ucred, arg2); if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. */ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. 
*/ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. */ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, 
"Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may lock (unlock) physical pages in memory"); SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may bind sockets to reserved ports"); SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may read the kernel message buffer"); SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, "B", "Unprivileged processes may use process debugging facilities"); SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, "B", "Processes in jail with uid 0 have privilege"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); /* * Add a dynamic parameter allow., or allow... Return * its associated bit in the pr_allow bitmask, or zero if the parameter was * not created. */ unsigned prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, const char *descr) { struct bool_flags *bf; struct sysctl_oid *parent; char *allow_name, *allow_noname, *allowed; #ifndef NO_SYSCTL_DESCR char *descr_deprecated; #endif u_int allow_flag; if (prefix ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name) < 0 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) { free(allow_name, M_PRISON); return 0; } /* * See if this parameter has already beed added, i.e. a module was * previously loaded/unloaded. */ mtx_lock(&prison0.pr_mtx); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { if (strcmp(bf->name, allow_name) == 0) { allow_flag = bf->flag; goto no_add; } } /* * Find a free bit in pr_allow_all, failing if there are none * (which shouldn't happen as long as we keep track of how many * potential dynamic flags exist). */ for (allow_flag = 1;; allow_flag <<= 1) { if (allow_flag == 0) goto no_add; if ((pr_allow_all & allow_flag) == 0) break; } /* Note the parameter in the next open slot in pr_flag_allow. */ for (bf = pr_flag_allow; ; bf++) { if (bf == pr_flag_allow + nitems(pr_flag_allow)) { /* This should never happen, but is not fatal. */ allow_flag = 0; goto no_add; } if (atomic_load_int(&bf->flag) == 0) break; } bf->name = allow_name; bf->noname = allow_noname; pr_allow_all |= allow_flag; /* * prison0 always has permission for the new parameter. * Other jails must have it granted to them. */ prison0.pr_allow |= allow_flag; /* The flag indicates a valid entry, so make sure it is set last. */ atomic_store_rel_int(&bf->flag, allow_flag); mtx_unlock(&prison0.pr_mtx); /* * Create sysctls for the paramter, and the back-compat global * permission. */ parent = prefix ? 
SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(&sysctl___security_jail_param_allow), OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr) : &sysctl___security_jail_param_allow; (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, "B", descr); if ((prefix ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name) : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) { #ifndef NO_SYSCTL_DESCR (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)", descr); #endif (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag, sysctl_jail_default_allow, "I", descr_deprecated); #ifndef NO_SYSCTL_DESCR free(descr_deprecated, M_TEMP); #endif free(allowed, M_TEMP); } return allow_flag; no_add: mtx_unlock(&prison0.pr_mtx); free(allow_name, M_PRISON); free(allow_noname, M_PRISON); return allow_flag; } /* * The VFS system will register jail-aware filesystems here. They each get * a parameter allow.mount.xxxfs and a flag to check when a jailed user * attempts to mount. */ void prison_add_vfs(struct vfsconf *vfsp) { #ifdef NO_SYSCTL_DESCR vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, NULL); #else char *descr; (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system", vfsp->vfc_name); vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, descr); free(descr, M_TEMP); #endif } #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); if (pre != NULL) (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); if (post != NULL) (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. 
*/ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (refcount_release_if_not_last(&prr->prr_refcount)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. */ static void prison_racct_modify(struct prison *pr) { #ifdef RCTL struct proc *p; struct ucred *cred; #endif struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); #ifdef RCTL /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); rctl_proc_ucred_changed(p, cred); crfree(cred); } #endif sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) int ii; #endif unsigned f; #ifdef INET char ip4buf[INET_ADDRSTRLEN]; #endif #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" state = %s\n", pr->pr_state == PRISON_STATE_ALIVE ? "alive" : pr->pr_state == PRISON_STATE_DYING ? "dying" : "invalid"); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? 
pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) if (pr->pr_flags & bf->flag) db_printf(" %s", bf->name); for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); db_printf(" %-16s= %s\n", jsf->name, (f != 0 && f == jsf->disable) ? "disable" : (f == jsf->new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) if (pr->pr_allow & bf->flag) db_printf(" %s", bf->name); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET - db_printf(" ip4s = %d\n", pr->pr_ip4s); - for (ii = 0; ii < pr->pr_ip4s; ii++) - db_printf(" %s %s\n", - ii == 0 ? "ip4.addr =" : " ", - inet_ntoa_r(pr->pr_ip4[ii], ip4buf)); + if (pr->pr_addrs[PR_INET] != NULL) { + pr_family_t af = PR_INET; + + db_printf(" ip4s = %d\n", pr->pr_addrs[af]->ips); + for (ii = 0; ii < pr->pr_addrs[af]->ips; ii++) + db_printf(" %s %s\n", + ii == 0 ? "ip4.addr =" : " ", + inet_ntoa_r( + *(const struct in_addr *)PR_IP(pr, ii), + ip4buf)); + } #endif #ifdef INET6 - db_printf(" ip6s = %d\n", pr->pr_ip6s); - for (ii = 0; ii < pr->pr_ip6s; ii++) - db_printf(" %s %s\n", - ii == 0 ? "ip6.addr =" : " ", - ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); + if (pr->pr_addrs[PR_INET6] != NULL) { + pr_family_t af = PR_INET6; + + db_printf(" ip6s = %d\n", pr->pr_addrs[af]->ips); + for (ii = 0; ii < pr->pr_addrs[af]->ips; ii++) + db_printf(" %s %s\n", + ii == 0 ? "ip6.addr =" : " ", + ip6_sprintf(ip6buf, + (const struct in6_addr *)PR_IP(pr, ii))); + } #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 70fbe32c05ac..914ca57dee38 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -1,1726 +1,1728 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); +#include "opt_inet.h" + #define IN_HISTORICAL_NETS /* include class masks */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_gifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static bool ia_need_loopback_route(const struct in_ifaddr *); VNET_DEFINE_STATIC(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DEFINE_STATIC(bool, broadcast_lowest); #define V_broadcast_lowest VNET(broadcast_lowest) SYSCTL_BOOL(_net_inet_ip, OID_AUTO, broadcast_lowest, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(broadcast_lowest), 0, "Treat lowest address on a subnet (host 0) as broadcast"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { u_long i = ntohl(in.s_addr); struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) return (1); } return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. 
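/*
 * Illustrative userland sketch (not part of this change): in_localaddr()
 * above treats an address as local when it falls inside the subnet of some
 * configured interface address, comparing in host byte order just like the
 * (i & ia_subnetmask) == ia_subnet test.  The helper name and the sample
 * prefix are invented for the example.
 */
#include <arpa/inet.h>
#include <stdio.h>

static int
toy_in_subnet(const char *addr, const char *ifaddr, const char *mask)
{
	struct in_addr a, i, m;

	if (inet_pton(AF_INET, addr, &a) != 1 ||
	    inet_pton(AF_INET, ifaddr, &i) != 1 ||
	    inet_pton(AF_INET, mask, &m) != 1)
		return (0);
	return ((ntohl(a.s_addr) & ntohl(m.s_addr)) ==
	    (ntohl(i.s_addr) & ntohl(m.s_addr)));
}

int
main(void)
{
	/* Prints "1 0": first address is on-link, second is not. */
	printf("%d %d\n",
	    toy_in_subnet("192.0.2.33", "192.0.2.1", "255.255.255.0"),
	    toy_in_subnet("198.51.100.7", "192.0.2.1", "255.255.255.0"));
	return (0);
}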
*/ bool in_localip(struct in_addr in) { struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) return (true); return (false); } /* * Like in_localip(), but FIB-aware. */ bool in_localip_fib(struct in_addr in, uint16_t fib) { struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr && ia->ia_ifa.ifa_ifp->if_fib == fib) return (true); return (false); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) return (1); } return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *original_ia) { struct epoch_tracker et; in_addr_t original_addr = IA_SIN(original_ia)->sin_addr.s_addr; uint32_t original_fib = original_ia->ia_ifa.ifa_ifp->if_fib; struct in_ifaddr *ia; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(ia, INADDR_HASH(original_addr), ia_hash) { in_addr_t addr = IA_SIN(ia)->sin_addr.s_addr; uint32_t fib = ia->ia_ifa.ifa_ifp->if_fib; if (!V_rt_add_addr_allfibs && (original_fib != fib)) continue; if ((original_ia != ia) && (original_addr == addr)) { ifa_ref(&ia->ia_ifa); NET_EPOCH_EXIT(et); return (ia); } } NET_EPOCH_EXIT(et); return (NULL); } /* * Tries to find first IPv4 address in the provided fib. * Prefers non-loopback addresses and return loopback IFF * @loopback_ok is set. * * Returns ifa or NULL. */ struct in_ifaddr * in_findlocal(uint32_t fibnum, bool loopback_ok) { struct in_ifaddr *ia = NULL, *ia_lo = NULL; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { uint32_t ia_fib = ia->ia_ifa.ifa_ifp->if_fib; if (!V_rt_add_addr_allfibs && (fibnum != ia_fib)) continue; if (!IN_LOOPBACK(ntohl(IA_SIN(ia)->sin_addr.s_addr))) break; if (loopback_ok) ia_lo = ia; } if (ia == NULL) ia = ia_lo; return (ia); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i) || IN_ZERONET(i) || IN_LOOPBACK(i)) return (0); return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). 
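/*
 * Illustrative userland sketch (not part of this change): the SIOCGIF*
 * requests dispatched by in_control() are what a process issues through
 * ioctl(2) on any IPv4 socket to read an interface address.  The interface
 * name "em0" is a placeholder.
 */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	char buf[INET_ADDRSTRLEN];
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		return (1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
	if (ioctl(s, SIOCGIFADDR, &ifr) == 0)
		printf("%s\n", inet_ntop(AF_INET,
		    &((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr,
		    buf, sizeof(buf)));
	close(s);
	return (0);
}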
*/ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCGIFALIAS: sx_xlock(&in_control_sx); error = in_gifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } NET_EPOCH_EXIT(et); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid != 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. 
*/ iaIsFirst = true; ia = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; else iaIsFirst = false; } NET_EPOCH_EXIT(et); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * If netmask isn't supplied, use historical default. * This is deprecated for interfaces other than loopback * or point-to-point; warn in other cases. In the future * we should return an error rather than warning. */ if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) printf("%s: set address: WARNING: network mask " "should be specified; using historical default\n", ifp->if_xname); if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ sx_assert(&in_control_sx, SA_XLOCKED); CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); CK_LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { error = in_addprefix(ia); if (error) goto fail1; } /* * Add a loopback route to self. 
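/*
 * Illustrative userland sketch (not part of this change): when SIOCAIFADDR
 * arrives without a netmask, in_aifaddr_ioctl() above falls back to the
 * historical class A/B/C mask and then derives the directed broadcast as
 * subnet | ~mask.  The constants below mirror IN_CLASSA_NET, IN_CLASSB_NET
 * and IN_CLASSC_NET; the helper name and sample address are invented.
 */
#include <arpa/inet.h>
#include <stdio.h>
#include <stdint.h>

static uint32_t
historical_mask(uint32_t addr_hbo)
{
	if ((addr_hbo & 0x80000000u) == 0)		/* class A: 0xxx... */
		return (0xff000000u);
	if ((addr_hbo & 0xc0000000u) == 0x80000000u)	/* class B: 10xx... */
		return (0xffff0000u);
	return (0xffffff00u);				/* class C and beyond */
}

int
main(void)
{
	struct in_addr ia, bc;
	uint32_t a, m;
	char buf[INET_ADDRSTRLEN];

	inet_pton(AF_INET, "10.1.2.3", &ia);
	a = ntohl(ia.s_addr);
	m = historical_mask(a);
	bc.s_addr = htonl((a & m) | ~m);		/* directed broadcast */
	/* Prints "mask /8 broadcast 10.255.255.255". */
	printf("mask /%d broadcast %s\n", __builtin_popcount(m),
	    inet_ntop(AF_INET, &bc, buf, sizeof(buf)));
	return (0);
}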
*/ if (vhid == 0 && ia_need_loopback_route(ia)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } /* * Note: we don't need extra reference for ifa, since we called * with sx lock held, and ifaddr can not be deleted in concurrent * thread. */ EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, ifa, IFADDR_EVENT_ADD); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ sx_assert(&in_control_sx, SA_XLOCKED); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); CK_LIST_REMOVE(ia, ia_hash); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ sx_assert(&in_control_sx, SA_XLOCKED); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); CK_LIST_REMOVE(ia, ia_hash); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, cmd == SIOCAIFADDR); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. 
*/ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); if (ii->ii_allhosts) { (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } static int in_gifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; /* * ifra_addr must be present and be of INET family. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); /* * See whether the address exists. */ ia = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) { ia = it; break; } } if (ia == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } ifra->ifra_mask = ia->ia_sockmask; if ((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_family == AF_INET) ifra->ifra_dstaddr = ia->ia_dstaddr; else if ((ifp->if_flags & IFF_BROADCAST) && ia->ia_broadaddr.sin_family == AF_INET) ifra->ifra_broadaddr = ia->ia_broadaddr; else memset(&ifra->ifra_broadaddr, 0, sizeof(ifra->ifra_broadaddr)); NET_EPOCH_EXIT(et); return (0); } static int in_match_ifaddr(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { if (nh->nh_ifa == (struct ifaddr *)arg) return (1); return (0); } static int in_handle_prefix_route(uint32_t fibnum, int cmd, struct sockaddr_in *dst, struct sockaddr_in *netmask, struct ifaddr *ifa, struct ifnet *ifp) { NET_EPOCH_ASSERT(); /* Prepare gateway */ struct sockaddr_dl_short sdl = { .sdl_family = AF_LINK, .sdl_len = sizeof(struct sockaddr_dl_short), .sdl_type = ifa->ifa_ifp->if_type, .sdl_index = ifa->ifa_ifp->if_index, }; struct rt_addrinfo info = { .rti_ifa = ifa, .rti_ifp = ifp, .rti_flags = RTF_PINNED | ((netmask != NULL) ? 0 : RTF_HOST), .rti_info = { [RTAX_DST] = (struct sockaddr *)dst, [RTAX_NETMASK] = (struct sockaddr *)netmask, [RTAX_GATEWAY] = (struct sockaddr *)&sdl, }, /* Ensure we delete the prefix IFF prefix ifa matches */ .rti_filter = in_match_ifaddr, .rti_filterdata = ifa, }; return (rib_handle_ifaddr_info(fibnum, cmd, &info)); } /* * Routing table interaction with interface addresses. * * In general, two types of routes need to be installed: * a) "interface" or "prefix" route, telling user that the addresses * behind the ifa prefix are reached directly. * b) "loopback" route installed for the ifa address, telling user that * the address belongs to local system. * * Handling for (a) and (b) differs in multi-fib aspects, hence they * are implemented in different functions below. * * The cases above may intersect - /32 interface aliases result in * the same prefix produced by (a) and (b). This blurs the definition * of the "loopback" route and complicates interactions. The interaction * table is defined below. The case numbers are used in the multiple * functions below to refer to the particular test case. * * There can be multiple options: * 1) Adding address with prefix on non-p2p/non-loopback interface.
* Example: 192.0.2.1/24. Action: * * add "prefix" route towards 192.0.2.0/24 via @ia interface, * using @ia as an address source. * * add "loopback" route towards 192.0.2.1 via V_loif, saving * @ia ifp in the gateway and using @ia as an address source. * * 2) Adding address with /32 mask to non-p2p/non-loopback interface. * Example: 192.0.2.2/32. Action: * * add "prefix" host route via V_loif, using @ia as an address source. * * 3) Adding address with or without prefix to p2p interface. * Example: 10.0.0.1/24->10.0.0.2. Action: * * add "prefix" host route towards 10.0.0.2 via this interface, using @ia * as an address source. Note: no sense in installing full /24 as the interface * is point-to-point. * * add "loopback" route towards 10.0.9.1 via V_loif, saving * @ia ifp in the gateway and using @ia as an address source. * * 4) Adding address with or without prefix to loopback interface. * Example: 192.0.2.1/24. Action: * * add "prefix" host route via @ia interface, using @ia as an address source. * Note: Skip installing /24 prefix as it would introduce TTL loop * for the traffic destined to these addresses. */ /* * Checks if @ia needs to install loopback route to @ia address via * ifa_maintain_loopback_route(). * * Return true on success. */ static bool ia_need_loopback_route(const struct in_ifaddr *ia) { struct ifnet *ifp = ia->ia_ifp; /* Case 4: Skip loopback interfaces */ if ((ifp->if_flags & IFF_LOOPBACK) || (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)) return (false); /* Clash avoidance: Skip p2p interfaces with both addresses are equal */ if ((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) return (false); /* Case 2: skip /32 prefixes */ if (!(ifp->if_flags & IFF_POINTOPOINT) && (ia->ia_sockmask.sin_addr.s_addr == INADDR_BROADCAST)) return (false); return (true); } /* * Calculate "prefix" route corresponding to @ia. */ static void ia_getrtprefix(const struct in_ifaddr *ia, struct in_addr *prefix, struct in_addr *mask) { if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) { /* Case 3: return host route for dstaddr */ *prefix = ia->ia_dstaddr.sin_addr; mask->s_addr = INADDR_BROADCAST; } else if (ia->ia_ifp->if_flags & IFF_LOOPBACK) { /* Case 4: return host route for ifaddr */ *prefix = ia->ia_addr.sin_addr; mask->s_addr = INADDR_BROADCAST; } else { /* Cases 1,2: return actual ia prefix */ *prefix = ia->ia_addr.sin_addr; *mask = ia->ia_sockmask.sin_addr; prefix->s_addr &= mask->s_addr; } } /* * Adds or delete interface "prefix" route corresponding to @ifa. * Returns 0 on success or errno. */ int in_handle_ifaddr_route(int cmd, struct in_ifaddr *ia) { struct ifaddr *ifa = &ia->ia_ifa; struct in_addr daddr, maddr; struct sockaddr_in *pmask; struct epoch_tracker et; int error; ia_getrtprefix(ia, &daddr, &maddr); struct sockaddr_in mask = { .sin_family = AF_INET, .sin_len = sizeof(struct sockaddr_in), .sin_addr = maddr, }; pmask = (maddr.s_addr != INADDR_BROADCAST) ? &mask : NULL; struct sockaddr_in dst = { .sin_family = AF_INET, .sin_len = sizeof(struct sockaddr_in), .sin_addr.s_addr = daddr.s_addr & maddr.s_addr, }; struct ifnet *ifp = ia->ia_ifp; if ((maddr.s_addr == INADDR_BROADCAST) && (!(ia->ia_ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)))) { /* Case 2: host route on broadcast interface */ ifp = V_loif; } uint32_t fibnum = ifa->ifa_ifp->if_fib; NET_EPOCH_ENTER(et); error = in_handle_prefix_route(fibnum, cmd, &dst, pmask, ifa, ifp); NET_EPOCH_EXIT(et); return (error); } /* * Check if we have a route for the given prefix already. 
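/*
 * Illustrative userland sketch (not part of this change): ia_getrtprefix()
 * above reduces every case to "prefix + mask": a /32 host route for
 * point-to-point peers and loopback addresses, and addr & mask otherwise.
 * The struct and sample values below are invented for the example.
 */
#include <arpa/inet.h>
#include <stdio.h>
#include <stdbool.h>

struct toy_ifaddr {
	struct in_addr	addr;		/* interface address */
	struct in_addr	mask;		/* configured netmask */
	struct in_addr	dst;		/* peer, for point-to-point */
	bool		p2p;
	bool		loopback;
};

static void
toy_getrtprefix(const struct toy_ifaddr *ia, struct in_addr *prefix,
    struct in_addr *mask)
{
	if (ia->p2p) {			/* host route towards the peer */
		*prefix = ia->dst;
		mask->s_addr = htonl(0xffffffffu);
	} else if (ia->loopback) {	/* host route for the address itself */
		*prefix = ia->addr;
		mask->s_addr = htonl(0xffffffffu);
	} else {			/* ordinary interface prefix */
		mask->s_addr = ia->mask.s_addr;
		prefix->s_addr = ia->addr.s_addr & mask->s_addr;
	}
}

int
main(void)
{
	struct toy_ifaddr ia = { .p2p = false, .loopback = false };
	struct in_addr prefix, mask;
	char p[INET_ADDRSTRLEN], m[INET_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.17", &ia.addr);
	inet_pton(AF_INET, "255.255.255.0", &ia.mask);
	toy_getrtprefix(&ia, &prefix, &mask);
	/* Prints "192.0.2.0/255.255.255.0". */
	printf("%s/%s\n", inet_ntop(AF_INET, &prefix, p, sizeof(p)),
	    inet_ntop(AF_INET, &mask, m, sizeof(m)));
	return (0);
}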
*/ static bool in_hasrtprefix(struct in_ifaddr *target) { struct epoch_tracker et; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; bool result = false; ia_getrtprefix(target, &prefix, &mask); /* Look for an existing address with the same prefix, mask, and fib */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { ia_getrtprefix(ia, &p, &m); if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { result = true; break; } } NET_EPOCH_EXIT(et); return (result); } int in_addprefix(struct in_ifaddr *target) { int error; if (in_hasrtprefix(target)) { if (V_nosameprefix) return (EEXIST); else { rt_addrmsg(RTM_ADD, &target->ia_ifa, target->ia_ifp->if_fib); return (0); } } /* * No-one seem to have this prefix route, so we try to insert it. */ rt_addrmsg(RTM_ADD, &target->ia_ifa, target->ia_ifp->if_fib); error = in_handle_ifaddr_route(RTM_ADD, target); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct epoch_tracker et; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if (ia_need_loopback_route(target) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } ia_getrtprefix(target, &prefix, &mask); if ((target->ia_flags & IFA_ROUTE) == 0) { rt_addrmsg(RTM_DELETE, &target->ia_ifa, target->ia_ifp->if_fib); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. 
*/ in_scrubprefixlle(target, 0, flags); return (0); } NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { ia_getrtprefix(ia, &p, &m); if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); NET_EPOCH_EXIT(et); error = in_handle_ifaddr_route(RTM_DELETE, target); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = in_handle_ifaddr_route(RTM_ADD, ia); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } NET_EPOCH_EXIT(et); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ rt_addrmsg(RTM_DELETE, &target->ia_ifa, target->ia_ifp->if_fib); error = in_handle_ifaddr_route(RTM_DELETE, target); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifaliasreq ifr; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* NET_EPOCH_ENTER(et); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. */ bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* NET_EPOCH_EXIT(et); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } int in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Optionally check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (V_broadcast_lowest && ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; int found; NET_EPOCH_ASSERT(); if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } return (found); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. 
*/ void in_ifdetach(struct ifnet *ifp) { IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); IN_MULTI_UNLOCK(); /* * Make sure all multicast deletions invoking if_ioctl() are * completed before returning. Else we risk accessing a freed * ifnet structure pointer. */ inm_release_wait(NULL); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { struct in_multi_head purgeinms; struct in_multi *inm; struct ifmultiaddr *ifma, *next; SLIST_INIT(&purgeinms); IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_WLOCK(ifp); restart: CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; inm_rele_locked(&purgeinms, inm); if (__predict_false(ifma_restart)) { ifma_restart = true; goto restart; } } IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); IN_MULTI_LIST_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); NET_EPOCH_CALL(in_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. 
*/ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); lltable_unlink_entry(llt, lle); } /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct nhop_object *nh; struct in_addr addr; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); addr = ((const struct sockaddr_in *)l3addr)->sin_addr; nh = fib4_lookup(ifp->if_fib, addr, 0, NHR_NONE, 0); if (nh == NULL) return (EINVAL); /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (nh->nh_flags & NHF_GATEWAY) { if (!(nh->nh_flags & NHF_HOST) || nh->nh_ifp->if_type != IFT_ETHER || (nh->nh_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(nh->gw_sa.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { return (EINVAL); } } /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if ((nh->nh_ifp != ifp) && (nh->nh_flags & NHF_HOST) == 0) { struct in_ifaddr *ia = (struct in_ifaddr *)ifaof_ifpforaddr(l3addr, ifp); struct in_addr dst_addr, mask_addr; if (ia == NULL) return (EINVAL); /* * ifaof_ifpforaddr() returns _best matching_ IFA. * It is possible that ifa prefix does not cover our address. * Explicitly verify and fail if that's the case. 
*/ dst_addr = IA_SIN(ia)->sin_addr; mask_addr.s_addr = htonl(ia->ia_subnetmask); if (!IN_ARE_MASKED_ADDR_EQUAL(dst_addr, addr, mask_addr)) return (EINVAL); } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { NET_EPOCH_CALL(in_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. 
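/*
 * Illustrative userland sketch (not part of this change): IN_LLTBL_HASH()
 * above folds all four bytes of the (network byte order) IPv4 address into
 * one byte by repeated shift-and-xor, then masks with the table size,
 * which must be a power of two.  Names and sample addresses are invented.
 */
#include <arpa/inet.h>
#include <stdio.h>

#define	TOY_HSIZE	32	/* mirrors IN_LLTBL_DEFAULT_HSIZE */
#define	TOY_HASH(k, h)	\
	((((((((k) >> 8) ^ (k)) >> 8) ^ (k)) >> 8) ^ (k)) & ((h) - 1))

int
main(void)
{
	const char *addrs[] = { "192.0.2.1", "192.0.2.2", "198.51.100.1" };
	struct in_addr ia;
	size_t i;

	for (i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++) {
		inet_pton(AF_INET, addrs[i], &ia);
		printf("%-14s -> bucket %u\n", addrs[i],
		    (unsigned)TOY_HASH(ia.s_addr, TOY_HSIZE));
	}
	return (0);
}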
*/ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; llt->llt_mark_used = llentry_mark_used; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } diff --git a/sys/netinet/in_jail.c b/sys/netinet/in_jail.c index 11891b56ecbe..d9f864f369cf 100644 --- a/sys/netinet/in_jail.c +++ b/sys/netinet/in_jail.c @@ -1,431 +1,346 @@ /*- * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +static in_addr_t +prison_primary_ip4(const struct prison *pr) +{ + + return (((const struct in_addr *)prison_ip_get0(pr, PR_INET))->s_addr); +} + int prison_qcmp_v4(const void *ip1, const void *ip2) { in_addr_t iaa, iab; /* * We need to compare in HBO here to get the list sorted as expected * by the result of the code. Sorting NBO addresses gives you * interesting results. If you do not understand, do not try. */ iaa = ntohl(((const struct in_addr *)ip1)->s_addr); iab = ntohl(((const struct in_addr *)ip2)->s_addr); /* * Do not simply return the difference of the two numbers, the int is * not wide enough. */ if (iaa > iab) return (1); else if (iaa < iab) return (-1); else return (0); } -/* - * Restrict a prison's IP address list with its parent's, possibly replacing - * it. Return true if the replacement buffer was used (or would have been). - */ -int -prison_restrict_ip4(struct prison *pr, struct in_addr *newip4) +bool +prison_valid_v4(const void *ip) { - int ii, ij, used; - struct prison *ppr; - - ppr = pr->pr_parent; - if (!(pr->pr_flags & PR_IP4_USER)) { - /* This has no user settings, so just copy the parent's list. */ - if (pr->pr_ip4s < ppr->pr_ip4s) { - /* - * There's no room for the parent's list. Use the - * new list buffer, which is assumed to be big enough - * (if it was passed). If there's no buffer, try to - * allocate one. - */ - used = 1; - if (newip4 == NULL) { - newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4), - M_PRISON, M_NOWAIT); - if (newip4 != NULL) - used = 0; - } - if (newip4 != NULL) { - bcopy(ppr->pr_ip4, newip4, - ppr->pr_ip4s * sizeof(*newip4)); - free(pr->pr_ip4, M_PRISON); - pr->pr_ip4 = newip4; - pr->pr_ip4s = ppr->pr_ip4s; - } - return (used); - } - pr->pr_ip4s = ppr->pr_ip4s; - if (pr->pr_ip4s > 0) - bcopy(ppr->pr_ip4, pr->pr_ip4, - pr->pr_ip4s * sizeof(*newip4)); - else if (pr->pr_ip4 != NULL) { - free(pr->pr_ip4, M_PRISON); - pr->pr_ip4 = NULL; - } - } else if (pr->pr_ip4s > 0) { - /* Remove addresses that aren't in the parent. 
*/ - for (ij = 0; ij < ppr->pr_ip4s; ij++) - if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) - break; - if (ij < ppr->pr_ip4s) - ii = 1; - else { - bcopy(pr->pr_ip4 + 1, pr->pr_ip4, - --pr->pr_ip4s * sizeof(*pr->pr_ip4)); - ii = 0; - } - for (ij = 1; ii < pr->pr_ip4s; ) { - if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) { - ii++; - continue; - } - switch (ij >= ppr->pr_ip4s ? -1 : - prison_qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) { - case -1: - bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii, - (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4)); - break; - case 0: - ii++; - ij++; - break; - case 1: - ij++; - break; - } - } - if (pr->pr_ip4s == 0) { - free(pr->pr_ip4, M_PRISON); - pr->pr_ip4 = NULL; - } - } - return (0); + in_addr_t ia = ((const struct in_addr *)ip)->s_addr; + + /* + * We do not have to care about byte order for these + * checks so we will do them in NBO. + */ + return (ia != INADDR_ANY && ia != INADDR_BROADCAST); } /* * Pass back primary IPv4 address of this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. INADDR_ANY). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address returned in NBO. */ int prison_get_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } - if (pr->pr_ip4 == NULL) { + if (pr->pr_addrs[PR_INET] == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } - ia->s_addr = pr->pr_ip4[0].s_addr; + ia->s_addr = prison_primary_ip4(pr); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv4 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv4. */ int prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr lia; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP4_SADDRSEL) return (1); lia.s_addr = INADDR_ANY; error = prison_get_ip4(cred, &lia); if (error) return (error); if (lia.s_addr == INADDR_ANY) return (1); ia->s_addr = lia.s_addr; return (0); } /* * Return true if pr1 and pr2 have the same IPv4 address restrictions. */ int prison_equal_ip4(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); /* * No need to lock since the PR_IP4_USER flag can't be altered for * existing prisons. */ while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP4_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP4_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this * jail. * * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv4. Address passed in in NBO and returned in NBO. 
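/*
 * Illustrative userland sketch (not part of this change): prison_qcmp_v4()
 * above compares addresses in host byte order so that the per-jail address
 * list sorts, and can be binary searched, in natural numeric order.  The
 * sample list and helper names are invented; in the kernel the sorted list
 * now lives in pr_addrs[PR_INET] and is searched via prison_ip_check().
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>

static int
qcmp_v4(const void *ip1, const void *ip2)
{
	in_addr_t a, b;

	a = ntohl(((const struct in_addr *)ip1)->s_addr);
	b = ntohl(((const struct in_addr *)ip2)->s_addr);
	return (a > b ? 1 : a < b ? -1 : 0);
}

int
main(void)
{
	const char *list[] = { "192.0.2.9", "10.0.0.2", "172.16.0.1" };
	struct in_addr ips[3], key;
	size_t i;

	for (i = 0; i < 3; i++)
		inet_pton(AF_INET, list[i], &ips[i]);
	qsort(ips, 3, sizeof(ips[0]), qcmp_v4);

	inet_pton(AF_INET, "172.16.0.1", &key);
	printf("%s\n", bsearch(&key, ips, 3, sizeof(ips[0]), qcmp_v4) != NULL ?
	    "address belongs to the jail" : "EADDRNOTAVAIL");
	return (0);
}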
*/ int prison_local_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; struct in_addr ia0; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } - if (pr->pr_ip4 == NULL) { + if (pr->pr_addrs[PR_INET] == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } ia0.s_addr = ntohl(ia->s_addr); if (ia0.s_addr == INADDR_ANY) { /* * In case there is only 1 IPv4 address, bind directly. */ - if (pr->pr_ip4s == 1) - ia->s_addr = pr->pr_ip4[0].s_addr; + if (prison_ip_cnt(pr, PR_INET) == 1) + ia->s_addr = prison_primary_ip4(pr); mtx_unlock(&pr->pr_mtx); return (0); } error = prison_check_ip4_locked(pr, ia); if (error == EADDRNOTAVAIL && ia0.s_addr == INADDR_LOOPBACK) { - ia->s_addr = pr->pr_ip4[0].s_addr; + ia->s_addr = prison_primary_ip4(pr); error = 0; } mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address passed in in NBO and returned in NBO. */ int prison_remote_ip4(struct ucred *cred, struct in_addr *ia) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } - if (pr->pr_ip4 == NULL) { + if (pr->pr_addrs[PR_INET] == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (ntohl(ia->s_addr) == INADDR_LOOPBACK && prison_check_ip4_locked(pr, ia) == EADDRNOTAVAIL) { - ia->s_addr = pr->pr_ip4[0].s_addr; + ia->s_addr = prison_primary_ip4(pr); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong to the jail. */ int prison_check_ip4_locked(const struct prison *pr, const struct in_addr *ia) { - int i, a, z, d; - /* - * Check the primary IP. - */ - if (pr->pr_ip4[0].s_addr == ia->s_addr) + if (!(pr->pr_flags & PR_IP4)) return (0); - /* - * All the other IPs are sorted so we can do a binary search. - */ - a = 0; - z = pr->pr_ip4s - 2; - while (a <= z) { - i = (a + z) / 2; - d = prison_qcmp_v4(&pr->pr_ip4[i+1], ia); - if (d > 0) - z = i - 1; - else if (d < 0) - a = i + 1; - else - return (0); - } - - return (EADDRNOTAVAIL); + return (prison_ip_check(pr, PR_INET, ia)); } int prison_check_ip4(const struct ucred *cred, const struct in_addr *ia) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP4)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP4)) { mtx_unlock(&pr->pr_mtx); return (0); } - if (pr->pr_ip4 == NULL) { + if (pr->pr_addrs[PR_INET] == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = prison_check_ip4_locked(pr, ia); mtx_unlock(&pr->pr_mtx); return (error); } diff --git a/sys/netinet6/in6_jail.c b/sys/netinet6/in6_jail.c index 65b397f69bf2..05b046fd2aa7 100644 --- a/sys/netinet6/in6_jail.c +++ b/sys/netinet6/in6_jail.c @@ -1,417 +1,330 @@ /*- * Copyright (c) 1999 Poul-Henning Kamp. 
* Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +static void +prison_bcopy_primary_ip6(const struct prison *pr, struct in6_addr *ia6) +{ + + bcopy(prison_ip_get0(pr, PR_INET6), ia6, sizeof(struct in6_addr)); +} + int prison_qcmp_v6(const void *ip1, const void *ip2) { const struct in6_addr *ia6a, *ia6b; int i, rc; ia6a = (const struct in6_addr *)ip1; ia6b = (const struct in6_addr *)ip2; rc = 0; for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) { if (ia6a->s6_addr[i] > ia6b->s6_addr[i]) rc = 1; else if (ia6a->s6_addr[i] < ia6b->s6_addr[i]) rc = -1; } return (rc); } -int -prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6) +bool +prison_valid_v6(const void *ip) { - int ii, ij, used; - struct prison *ppr; - - ppr = pr->pr_parent; - if (!(pr->pr_flags & PR_IP6_USER)) { - /* This has no user settings, so just copy the parent's list. */ - if (pr->pr_ip6s < ppr->pr_ip6s) { - /* - * There's no room for the parent's list. Use the - * new list buffer, which is assumed to be big enough - * (if it was passed). If there's no buffer, try to - * allocate one. - */ - used = 1; - if (newip6 == NULL) { - newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6), - M_PRISON, M_NOWAIT); - if (newip6 != NULL) - used = 0; - } - if (newip6 != NULL) { - bcopy(ppr->pr_ip6, newip6, - ppr->pr_ip6s * sizeof(*newip6)); - free(pr->pr_ip6, M_PRISON); - pr->pr_ip6 = newip6; - pr->pr_ip6s = ppr->pr_ip6s; - } - return (used); - } - pr->pr_ip6s = ppr->pr_ip6s; - if (pr->pr_ip6s > 0) - bcopy(ppr->pr_ip6, pr->pr_ip6, - pr->pr_ip6s * sizeof(*newip6)); - else if (pr->pr_ip6 != NULL) { - free(pr->pr_ip6, M_PRISON); - pr->pr_ip6 = NULL; - } - } else if (pr->pr_ip6s > 0) { - /* Remove addresses that aren't in the parent. 
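/*
 * Illustrative sketch, not part of this patch: prison_qcmp_v6() above has
 * the qsort(9)-compatible signature, so it can keep a jail's candidate
 * address list sorted the way the membership checks expect (primary
 * address in slot 0, the remainder in ascending order so it can be
 * binary-searched).  The helper name is made up for the example.
 */
static void
example_sort_v6_tail(struct in6_addr *addrs, u_int naddrs)
{
	/* Slot 0 stays the primary address; sort only the tail. */
	if (naddrs > 1)
		qsort(addrs + 1, naddrs - 1, sizeof(*addrs), prison_qcmp_v6);
}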
*/ - for (ij = 0; ij < ppr->pr_ip6s; ij++) - if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], - &ppr->pr_ip6[ij])) - break; - if (ij < ppr->pr_ip6s) - ii = 1; - else { - bcopy(pr->pr_ip6 + 1, pr->pr_ip6, - --pr->pr_ip6s * sizeof(*pr->pr_ip6)); - ii = 0; - } - for (ij = 1; ii < pr->pr_ip6s; ) { - if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii], - &ppr->pr_ip6[0])) { - ii++; - continue; - } - switch (ij >= ppr->pr_ip6s ? -1 : - prison_qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) { - case -1: - bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii, - (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6)); - break; - case 0: - ii++; - ij++; - break; - case 1: - ij++; - break; - } - } - if (pr->pr_ip6s == 0) { - free(pr->pr_ip6, M_PRISON); - pr->pr_ip6 = NULL; - } - } - return 0; + const struct in6_addr *ia = ip; + + return (!IN6_IS_ADDR_UNSPECIFIED(ia)); } /* * Pass back primary IPv6 address for this jail. * * If not restricted return success but do not alter the address. Caller has * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_get_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } - if (pr->pr_ip6 == NULL) { + if (pr->pr_addrs[PR_INET6] == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } - bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); + prison_bcopy_primary_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return 1 if we should do proper source address selection or are not jailed. * We will return 0 if we should bypass source address selection in favour * of the primary jail IPv6 address. Only in this case *ia will be updated and * returned in NBO. * Return EAFNOSUPPORT, in case this jail does not allow IPv6. */ int prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; struct in6_addr lia6; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); if (!jailed(cred)) return (1); pr = cred->cr_prison; if (pr->pr_flags & PR_IP6_SADDRSEL) return (1); lia6 = in6addr_any; error = prison_get_ip6(cred, &lia6); if (error) return (error); if (IN6_IS_ADDR_UNSPECIFIED(&lia6)) return (1); bcopy(&lia6, ia6, sizeof(struct in6_addr)); return (0); } /* * Return true if pr1 and pr2 have the same IPv6 address restrictions. */ int prison_equal_ip6(struct prison *pr1, struct prison *pr2) { if (pr1 == pr2) return (1); while (pr1 != &prison0 && #ifdef VIMAGE !(pr1->pr_flags & PR_VNET) && #endif !(pr1->pr_flags & PR_IP6_USER)) pr1 = pr1->pr_parent; while (pr2 != &prison0 && #ifdef VIMAGE !(pr2->pr_flags & PR_VNET) && #endif !(pr2->pr_flags & PR_IP6_USER)) pr2 = pr2->pr_parent; return (pr1 == pr2); } /* * Make sure our (source) address is set to something meaningful to this jail. * * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0) * when needed while binding. * * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail * doesn't allow IPv6. 
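/*
 * Illustrative sketch, not part of this patch: how a hypothetical bind(2)
 * path would use prison_local_ip6(), whose contract is documented here.
 * 0 means the address is usable (possibly rewritten to the jail's primary
 * IPv6 address), EADDRNOTAVAIL means it lies outside the jail, and
 * EAFNOSUPPORT means the jail has no IPv6 at all.  "v6only" mirrors the
 * socket's IN6P_IPV6_V6ONLY setting as described above; the function name
 * is made up for the example.
 */
static int
example_bind_v6(struct ucred *cred, struct sockaddr_in6 *sin6, int v6only)
{
	int error;

	error = prison_local_ip6(cred, &sin6->sin6_addr, v6only);
	if (error != 0)
		return (error);		/* EADDRNOTAVAIL or EAFNOSUPPORT */
	/* sin6->sin6_addr is now safe to bind from inside this jail. */
	return (0);
}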
*/ int prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } - if (pr->pr_ip6 == NULL) { + if (pr->pr_addrs[PR_INET6] == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_UNSPECIFIED(ia6)) { /* * In case there is only 1 IPv6 address, and v6only is true, * then bind directly. */ - if (v6only != 0 && pr->pr_ip6s == 1) - bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); + if (v6only != 0 && prison_ip_cnt(pr, PR_INET6) == 1) + prison_bcopy_primary_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (0); } error = prison_check_ip6_locked(pr, ia6); if (error == EADDRNOTAVAIL && IN6_IS_ADDR_LOOPBACK(ia6)) { - bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); + prison_bcopy_primary_ip6(pr, ia6); error = 0; } mtx_unlock(&pr->pr_mtx); return (error); } /* * Rewrite destination address in case we will connect to loopback address. * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ int prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6) { struct prison *pr; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } - if (pr->pr_ip6 == NULL) { + if (pr->pr_addrs[PR_INET6] == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } if (IN6_IS_ADDR_LOOPBACK(ia6) && prison_check_ip6_locked(pr, ia6) == EADDRNOTAVAIL) { - bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr)); + prison_bcopy_primary_ip6(pr, ia6); mtx_unlock(&pr->pr_mtx); return (0); } /* * Return success because nothing had to be changed. */ mtx_unlock(&pr->pr_mtx); return (0); } /* * Check if given address belongs to the jail referenced by cred/prison. * * Returns 0 if address belongs to jail, * EADDRNOTAVAIL if the address doesn't belong to the jail. */ int prison_check_ip6_locked(const struct prison *pr, const struct in6_addr *ia6) { - int i, a, z, d; - /* - * Check the primary IP. - */ - if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6)) + if (!(pr->pr_flags & PR_IP6)) return (0); - /* - * All the other IPs are sorted so we can do a binary search. 
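/*
 * Illustrative sketch, not part of this patch: the open-coded lookup being
 * removed here (exact match on the primary slot, then a binary search over
 * the sorted remainder) is what the new family-agnostic prison_ip_check()
 * in kern_jail.c replaces.  That implementation is not shown in this hunk;
 * the sketch below merely restates the removed algorithm in generic form,
 * with "cmp" standing in for prison_qcmp_v4()/prison_qcmp_v6() and
 * "addr"/"naddr" for the jail's address array.
 */
static int
example_ip_check(const void *addr, int naddr, size_t size,
    int (*cmp)(const void *, const void *), const void *ia)
{
	const char *base = addr;
	int a, z, i, d;

	if (naddr <= 0)
		return (EADDRNOTAVAIL);
	/* Slot 0 is the primary address and need not be in sorted order. */
	if (cmp(base, ia) == 0)
		return (0);
	/* The remaining naddr - 1 entries are sorted: binary search them. */
	a = 0;
	z = naddr - 2;
	while (a <= z) {
		i = (a + z) / 2;
		d = cmp(base + (size_t)(i + 1) * size, ia);
		if (d > 0)
			z = i - 1;
		else if (d < 0)
			a = i + 1;
		else
			return (0);
	}
	return (EADDRNOTAVAIL);
}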
- */ - a = 0; - z = pr->pr_ip6s - 2; - while (a <= z) { - i = (a + z) / 2; - d = prison_qcmp_v6(&pr->pr_ip6[i+1], ia6); - if (d > 0) - z = i - 1; - else if (d < 0) - a = i + 1; - else - return (0); - } - - return (EADDRNOTAVAIL); + return (prison_ip_check(pr, PR_INET6, ia6)); } int prison_check_ip6(const struct ucred *cred, const struct in6_addr *ia6) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); pr = cred->cr_prison; if (!(pr->pr_flags & PR_IP6)) return (0); mtx_lock(&pr->pr_mtx); if (!(pr->pr_flags & PR_IP6)) { mtx_unlock(&pr->pr_mtx); return (0); } - if (pr->pr_ip6 == NULL) { + if (pr->pr_addrs[PR_INET6] == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); } error = prison_check_ip6_locked(pr, ia6); mtx_unlock(&pr->pr_mtx); return (error); } diff --git a/sys/sys/jail.h b/sys/sys/jail.h index b0183d404352..b7ecfc198b4c 100644 --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -1,471 +1,482 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_JAIL_H_ #define _SYS_JAIL_H_ #ifdef _KERNEL struct jail_v0 { u_int32_t version; char *path; char *hostname; u_int32_t ip_number; }; #endif struct jail { uint32_t version; char *path; char *hostname; char *jailname; uint32_t ip4s; uint32_t ip6s; struct in_addr *ip4; struct in6_addr *ip6; }; #define JAIL_API_VERSION 2 /* * For all xprison structs, always keep the pr_version an int and * the first variable so userspace can easily distinguish them. */ #ifndef _KERNEL struct xprison_v1 { int pr_version; int pr_id; char pr_path[MAXPATHLEN]; char pr_host[MAXHOSTNAMELEN]; u_int32_t pr_ip; }; #endif struct xprison { int pr_version; int pr_id; int pr_state; cpusetid_t pr_cpusetid; char pr_path[MAXPATHLEN]; char pr_host[MAXHOSTNAMELEN]; char pr_name[MAXHOSTNAMELEN]; uint32_t pr_ip4s; uint32_t pr_ip6s; #if 0 /* * sizeof(xprison) will be malloced + size needed for all * IPv4 and IPv6 addesses. Offsets are based numbers of addresses. 
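/*
 * Illustrative sketch, not part of this patch: how a userland consumer of
 * the legacy xprison records (as returned by the security.jail.list
 * sysctl) would walk the variable-length arrays described by the comment
 * above -- pr_ip4s IPv4 addresses appended directly after the fixed-size
 * structure, followed by pr_ip6s IPv6 addresses.  "buf" is assumed to
 * point at one complete record; the function name is made up.
 */
static const char *
example_xprison_addrs(const char *buf, const struct in_addr **ip4,
    const struct in6_addr **ip6)
{
	const struct xprison *xp = (const struct xprison *)buf;
	const char *p = buf + sizeof(*xp);

	*ip4 = (const struct in_addr *)p;
	p += xp->pr_ip4s * sizeof(struct in_addr);
	*ip6 = (const struct in6_addr *)p;
	p += xp->pr_ip6s * sizeof(struct in6_addr);
	return (p);			/* start of the next record */
}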
*/ struct in_addr pr_ip4[]; struct in6_addr pr_ip6[]; #endif }; #define XPRISON_VERSION 3 enum prison_state { PRISON_STATE_INVALID = 0, /* New prison, not ready to be seen */ PRISON_STATE_ALIVE, /* Current prison, visible to all */ PRISON_STATE_DYING /* Removed but holding resources, */ }; /* optionally visible. */ /* * Flags for jail_set and jail_get. */ #define JAIL_CREATE 0x01 /* Create jail if it doesn't exist */ #define JAIL_UPDATE 0x02 /* Update parameters of existing jail */ #define JAIL_ATTACH 0x04 /* Attach to jail upon creation */ #define JAIL_DYING 0x08 /* Allow getting a dying jail */ #define JAIL_SET_MASK 0x0f #define JAIL_GET_MASK 0x08 #define JAIL_SYS_DISABLE 0 #define JAIL_SYS_NEW 1 #define JAIL_SYS_INHERIT 2 #ifndef _KERNEL struct iovec; __BEGIN_DECLS int jail(struct jail *); int jail_set(struct iovec *, unsigned int, int); int jail_get(struct iovec *, unsigned int, int); int jail_attach(int); int jail_remove(int); __END_DECLS #else /* _KERNEL */ #include #include #include #include #include #define JAIL_MAX 999999 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PRISON); #endif #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_PRISON) #include #define HOSTUUIDLEN 64 #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" #define OSRELEASELEN 32 struct racct; struct prison_racct; +typedef enum { + PR_INET = 0, + PR_INET6 = 1, + PR_FAMILY_MAX = 2, +} pr_family_t; + /* * This structure describes a prison. It is pointed to by all struct * ucreds's of the inmates. pr_ref keeps track of them and is used to * delete the structure when the last inmate is dead. * * Lock key: * (a) allprison_lock * (c) set only during creation before the structure is shared, no mutex * required to read * (m) locked by pr_mtx * (p) locked by pr_mtx, and also at least shared allprison_lock required * to update * (q) locked by both pr_mtx and allprison_lock * (r) atomic via refcount(9), pr_mtx and allprison_lock required to * decrement to zero + * (n) read access granted with the network epoch */ struct prison { TAILQ_ENTRY(prison) pr_list; /* (a) all prisons */ int pr_id; /* (c) prison id */ volatile u_int pr_ref; /* (r) refcount */ volatile u_int pr_uref; /* (r) user (alive) refcount */ unsigned pr_flags; /* (p) PR_* flags */ LIST_HEAD(, prison) pr_children; /* (a) list of child jails */ LIST_ENTRY(prison) pr_sibling; /* (a) next in parent's list */ struct prison *pr_parent; /* (c) containing jail */ struct mtx pr_mtx; struct task pr_task; /* (c) destroy task */ struct osd pr_osd; /* (p) additional data */ struct cpuset *pr_cpuset; /* (p) cpuset */ struct vnet *pr_vnet; /* (c) network stack */ struct vnode *pr_root; /* (c) vnode to rdir */ - int pr_ip4s; /* (p) number of v4 IPs */ - int pr_ip6s; /* (p) number of v6 IPs */ - struct in_addr *pr_ip4; /* (p) v4 IPs of jail */ - struct in6_addr *pr_ip6; /* (p) v6 IPs of jail */ + struct prison_ip *pr_addrs[PR_FAMILY_MAX]; /* (p,n) IPs of jail */ struct prison_racct *pr_prison_racct; /* (c) racct jail proxy */ void *pr_sparep[3]; int pr_childcount; /* (a) number of child jails */ int pr_childmax; /* (p) maximum child jails */ unsigned pr_allow; /* (p) PR_ALLOW_* flags */ int pr_securelevel; /* (p) securelevel */ int pr_enforce_statfs; /* (p) statfs permission */ int pr_devfs_rsnum; /* (p) devfs ruleset */ enum prison_state pr_state; /* (q) state in life cycle */ int pr_spare[2]; int pr_osreldate; /* (c) kern.osreldate value */ unsigned long pr_hostid; /* (p) jail hostid */ char pr_name[MAXHOSTNAMELEN]; /* (p) admin jail name */ char 
pr_path[MAXPATHLEN]; /* (c) chroot path */ char pr_hostname[MAXHOSTNAMELEN]; /* (p) jail hostname */ char pr_domainname[MAXHOSTNAMELEN]; /* (p) jail domainname */ char pr_hostuuid[HOSTUUIDLEN]; /* (p) jail hostuuid */ char pr_osrelease[OSRELEASELEN]; /* (c) kern.osrelease value */ }; struct prison_racct { LIST_ENTRY(prison_racct) prr_next; char prr_name[MAXHOSTNAMELEN]; u_int prr_refcount; struct racct *prr_racct; }; #endif /* _KERNEL || _WANT_PRISON */ #ifdef _KERNEL /* Flag bits set via options */ #define PR_PERSIST 0x00000001 /* Can exist without processes */ #define PR_HOST 0x00000002 /* Virtualize hostname et al */ #define PR_IP4_USER 0x00000004 /* Restrict IPv4 addresses */ #define PR_IP6_USER 0x00000008 /* Restrict IPv6 addresses */ #define PR_VNET 0x00000010 /* Virtual network stack */ #define PR_IP4_SADDRSEL 0x00000080 /* Do IPv4 src addr sel. or use the */ /* primary jail address. */ #define PR_IP6_SADDRSEL 0x00000100 /* Do IPv6 src addr sel. or use the */ /* primary jail address. */ /* Internal flag bits */ #define PR_REMOVE 0x01000000 /* In process of being removed */ #define PR_IP4 0x02000000 /* IPv4 restricted or disabled */ /* by this jail or an ancestor */ #define PR_IP6 0x04000000 /* IPv6 restricted or disabled */ /* by this jail or an ancestor */ #define PR_COMPLETE_PROC 0x08000000 /* prison_complete called from */ /* prison_proc_free, releases uref */ /* * Flags for pr_allow * Bits not noted here may be used for dynamic allow.mount.xxxfs. */ #define PR_ALLOW_SET_HOSTNAME 0x00000001 #define PR_ALLOW_SYSVIPC 0x00000002 #define PR_ALLOW_RAW_SOCKETS 0x00000004 #define PR_ALLOW_CHFLAGS 0x00000008 #define PR_ALLOW_MOUNT 0x00000010 #define PR_ALLOW_QUOTAS 0x00000020 #define PR_ALLOW_SOCKET_AF 0x00000040 #define PR_ALLOW_MLOCK 0x00000080 #define PR_ALLOW_READ_MSGBUF 0x00000100 #define PR_ALLOW_UNPRIV_DEBUG 0x00000200 #define PR_ALLOW_SUSER 0x00000400 #define PR_ALLOW_RESERVED_PORTS 0x00008000 #define PR_ALLOW_KMEM_ACCESS 0x00010000 /* reserved, not used yet */ #define PR_ALLOW_ALL_STATIC 0x000187ff /* * PR_ALLOW_DIFFERENCES determines which flags are able to be * different between the parent and child jail upon creation. */ #define PR_ALLOW_DIFFERENCES (PR_ALLOW_UNPRIV_DEBUG) /* * OSD methods */ #define PR_METHOD_CREATE 0 #define PR_METHOD_GET 1 #define PR_METHOD_SET 2 #define PR_METHOD_CHECK 3 #define PR_METHOD_ATTACH 4 #define PR_METHOD_REMOVE 5 #define PR_MAXMETHOD 6 /* * Lock/unlock a prison. * XXX These exist not so much for general convenience, but to be useable in * the FOREACH_PRISON_DESCENDANT_LOCKED macro which can't handle them in * non-function form as currently defined. */ static __inline void prison_lock(struct prison *pr) { mtx_lock(&pr->pr_mtx); } static __inline void prison_unlock(struct prison *pr) { mtx_unlock(&pr->pr_mtx); } /* Traverse a prison's immediate children. */ #define FOREACH_PRISON_CHILD(ppr, cpr) \ LIST_FOREACH(cpr, &(ppr)->pr_children, pr_sibling) /* * Preorder traversal of all of a prison's descendants. * This ugly loop allows the macro to be followed by a single block * as expected in a looping primitive. */ #define FOREACH_PRISON_DESCENDANT(ppr, cpr, descend) \ for ((cpr) = (ppr), (descend) = 1; \ ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children)) \ ? LIST_FIRST(&(cpr)->pr_children) \ : ((cpr) == (ppr) \ ? NULL \ : (((descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ ? LIST_NEXT(cpr, pr_sibling) \ : (cpr)->pr_parent))));) \ if (!(descend)) \ ; \ else /* * As above, but lock descendants on the way down and unlock on the way up. 
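/*
 * Illustrative sketch, not part of this patch: typical use of the preorder
 * traversal above.  The body runs once per proper descendant; clearing
 * "descend" inside the body prunes the subtree below the current prison.
 * Per the lock key, the caller is assumed to hold allprison_lock so the
 * pr_children lists are stable.  The function name is made up.
 */
static int
example_count_descendants(struct prison *ppr)
{
	struct prison *cpr;
	int count, descend;

	count = 0;
	FOREACH_PRISON_DESCENDANT(ppr, cpr, descend) {
		count++;
	}
	return (count);
}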
*/ #define FOREACH_PRISON_DESCENDANT_LOCKED(ppr, cpr, descend) \ for ((cpr) = (ppr), (descend) = 1; \ ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children)) \ ? LIST_FIRST(&(cpr)->pr_children) \ : ((cpr) == (ppr) \ ? NULL \ : ((prison_unlock(cpr), \ (descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ ? LIST_NEXT(cpr, pr_sibling) \ : (cpr)->pr_parent))));) \ if ((descend) ? (prison_lock(cpr), 0) : 1) \ ; \ else /* * As above, but also keep track of the level descended to. */ #define FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(ppr, cpr, descend, level)\ for ((cpr) = (ppr), (descend) = 1, (level) = 0; \ ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children)) \ ? (level++, LIST_FIRST(&(cpr)->pr_children)) \ : ((cpr) == (ppr) \ ? NULL \ : ((prison_unlock(cpr), \ (descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ ? LIST_NEXT(cpr, pr_sibling) \ : (level--, (cpr)->pr_parent)))));) \ if ((descend) ? (prison_lock(cpr), 0) : 1) \ ; \ else /* * Traverse a prison's descendants, visiting both preorder and postorder. */ #define FOREACH_PRISON_DESCENDANT_PRE_POST(ppr, cpr, descend) \ for ((cpr) = (ppr), (descend) = 1; \ ((cpr) = (descend) \ ? ((descend) = !LIST_EMPTY(&(cpr)->pr_children)) \ ? LIST_FIRST(&(cpr)->pr_children) \ : (cpr) \ : ((descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ ? LIST_NEXT(cpr, pr_sibling) \ : cpr->pr_parent) != (ppr);) /* * Attributes of the physical system, and the root of the jail tree. */ extern struct prison prison0; TAILQ_HEAD(prisonlist, prison); extern struct prisonlist allprison; extern struct sx allprison_lock; /* * Sysctls to describe jail parameters. */ SYSCTL_DECL(_security_jail_param); #define SYSCTL_JAIL_PARAM(module, param, type, fmt, descr) \ SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param, \ (type) | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, fmt, descr) #define SYSCTL_JAIL_PARAM_STRING(module, param, access, len, descr) \ SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param, \ CTLTYPE_STRING | CTLFLAG_MPSAFE | (access), NULL, len, \ sysctl_jail_param, "A", descr) #define SYSCTL_JAIL_PARAM_STRUCT(module, param, access, len, fmt, descr)\ SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param, \ CTLTYPE_STRUCT | CTLFLAG_MPSAFE | (access), NULL, len, \ sysctl_jail_param, fmt, descr) #define SYSCTL_JAIL_PARAM_NODE(module, descr) \ SYSCTL_NODE(_security_jail_param, OID_AUTO, module, CTLFLAG_MPSAFE, \ 0, descr) #define SYSCTL_JAIL_PARAM_SUBNODE(parent, module, descr) \ SYSCTL_NODE(_security_jail_param_##parent, OID_AUTO, module, \ CTLFLAG_MPSAFE, 0, descr) #define SYSCTL_JAIL_PARAM_SYS_NODE(module, access, descr) \ SYSCTL_JAIL_PARAM_NODE(module, descr); \ SYSCTL_JAIL_PARAM(_##module, , CTLTYPE_INT | (access), "E,jailsys", \ descr) /* * Kernel support functions for jail(). */ struct ucred; struct mount; struct sockaddr; struct statfs; struct vfsconf; /* * Return 1 if the passed credential is in a jail, otherwise 0. 
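/*
 * Illustrative sketch, not part of this patch: how the SYSCTL_JAIL_PARAM*
 * macros above are used to advertise a jail parameter to jail(8) and
 * libjail.  The "example"/"value" names are made up; a real parameter
 * additionally needs get/set handling in kern_jail.c or in a module's
 * PR_METHOD_GET/PR_METHOD_SET handlers, since these sysctls only describe
 * the parameter's name, type and format.
 */
SYSCTL_JAIL_PARAM_NODE(example, "Example jail parameters");
SYSCTL_JAIL_PARAM(_example, value, CTLTYPE_INT | CTLFLAG_RW, "I",
    "An example integer jail parameter");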
*/ #define jailed(cred) (cred->cr_prison != &prison0) int jailed_without_vnet(struct ucred *); void getcredhostname(struct ucred *, char *, size_t); void getcreddomainname(struct ucred *, char *, size_t); void getcredhostuuid(struct ucred *, char *, size_t); void getcredhostid(struct ucred *, unsigned long *); void getjailname(struct ucred *cred, char *name, size_t len); void prison0_init(void); int prison_allow(struct ucred *, unsigned); int prison_check(struct ucred *cred1, struct ucred *cred2); int prison_owns_vnet(struct ucred *); int prison_canseemount(struct ucred *cred, struct mount *mp); void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp); struct prison *prison_find(int prid); struct prison *prison_find_child(struct prison *, int); struct prison *prison_find_name(struct prison *, const char *); int prison_flag(struct ucred *, unsigned); void prison_free(struct prison *pr); void prison_free_locked(struct prison *pr); void prison_hold(struct prison *pr); void prison_hold_locked(struct prison *pr); void prison_proc_hold(struct prison *); void prison_proc_free(struct prison *); void prison_set_allow(struct ucred *cred, unsigned flag, int enable); int prison_ischild(struct prison *, struct prison *); -bool prison_isalive(struct prison *); +bool prison_isalive(const struct prison *); bool prison_isvalid(struct prison *); +#if defined(INET) || defined(INET6) +int prison_ip_check(const struct prison *, const pr_family_t, const void *); +const void *prison_ip_get0(const struct prison *, const pr_family_t); +u_int prison_ip_cnt(const struct prison *, const pr_family_t); +#endif +#ifdef INET int prison_equal_ip4(struct prison *, struct prison *); int prison_get_ip4(struct ucred *cred, struct in_addr *ia); int prison_local_ip4(struct ucred *cred, struct in_addr *ia); int prison_remote_ip4(struct ucred *cred, struct in_addr *ia); int prison_check_ip4(const struct ucred *, const struct in_addr *); int prison_check_ip4_locked(const struct prison *, const struct in_addr *); int prison_saddrsel_ip4(struct ucred *, struct in_addr *); -int prison_restrict_ip4(struct prison *, struct in_addr *); int prison_qcmp_v4(const void *, const void *); +bool prison_valid_v4(const void *); +#endif #ifdef INET6 int prison_equal_ip6(struct prison *, struct prison *); int prison_get_ip6(struct ucred *, struct in6_addr *); int prison_local_ip6(struct ucred *, struct in6_addr *, int); int prison_remote_ip6(struct ucred *, struct in6_addr *); int prison_check_ip6(const struct ucred *, const struct in6_addr *); int prison_check_ip6_locked(const struct prison *, const struct in6_addr *); int prison_saddrsel_ip6(struct ucred *, struct in6_addr *); -int prison_restrict_ip6(struct prison *, struct in6_addr *); int prison_qcmp_v6(const void *, const void *); +bool prison_valid_v6(const void *); #endif int prison_check_af(struct ucred *cred, int af); int prison_if(struct ucred *cred, const struct sockaddr *sa); char *prison_name(struct prison *, struct prison *); int prison_priv_check(struct ucred *cred, int priv); int sysctl_jail_param(SYSCTL_HANDLER_ARGS); unsigned prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, const char *descr); void prison_add_vfs(struct vfsconf *vfsp); void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3); struct prison_racct *prison_racct_find(const char *name); void prison_racct_hold(struct prison_racct *prr); void prison_racct_free(struct 
prison_racct *prr);

#endif /* _KERNEL */
#endif /* !_SYS_JAIL_H_ */
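/*
 * Illustrative sketch, not part of this patch: reading the new per-family
 * address storage through the accessors declared above.  pr_addrs[] is
 * indexed by pr_family_t; per the lock key it may be read with pr_mtx held
 * (or, per the (n) annotation, from within the network epoch).  This
 * assumes prison_ip_cnt() reports 0 when the jail has no IPv4 list; the
 * function name is made up for the example.
 */
static void
example_log_primary_ip4(struct prison *pr)
{
	const struct in_addr *ia;
	u_int cnt;

	prison_lock(pr);
	cnt = prison_ip_cnt(pr, PR_INET);
	if (cnt > 0) {
		ia = prison_ip_get0(pr, PR_INET);
		printf("jail %s: %u IPv4 address(es), primary 0x%08x\n",
		    pr->pr_name, cnt, ntohl(ia->s_addr));
	}
	prison_unlock(pr);
}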