diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index b0b0fa50e648..d4529e096929 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -1,5149 +1,5162 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_nfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define PRISON0_HOSTUUID_MODULE "hostuuid" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_state = PRISON_STATE_ALIVE, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL_STATIC, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); struct bool_flags { const char *name; const char *noname; volatile u_int flag; }; struct jailsys_flags { const char *name; unsigned disable; unsigned new; }; /* * Handle jail teardown in a dedicated thread to avoid deadlocks from * vnet_destroy(). */ TASKQUEUE_DEFINE_THREAD(jail_remove); /* allprison, allprison_racct and lastprid are protected by allprison_lock. */ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; int lastdeadid = 0; static int get_next_prid(struct prison **insprp); static int get_next_deadid(struct prison **insprp); static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); static int prison_lock_xlock(struct prison *pr, int flags); static void prison_cleanup(struct prison *pr); static void prison_free_not_last(struct prison *pr); static void prison_proc_free_not_last(struct prison *pr); static void prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p); static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable); static char *prison_path(struct prison *pr1, struct prison *pr2); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ #define PD_DEUREF 0x02 /* Decrement pr_uref */ #define PD_KILL 0x04 /* Remove jail, kill processes, etc */ #define PD_LOCKED 0x10 /* pr_mtx is held */ #define PD_LIST_SLOCKED 0x20 /* allprison_lock is held shared */ #define PD_LIST_XLOCKED 0x40 /* allprison_lock is held exclusive */ #define PD_OP_FLAGS 0x07 /* Operation flags */ #define PD_LOCK_FLAGS 0x70 /* Lock status flags */ /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static struct bool_flags pr_flag_bool[] = { {"persist", "nopersist", PR_PERSIST}, #ifdef INET {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL}, #endif #ifdef INET6 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL}, #endif }; const size_t pr_flag_bool_size = sizeof(pr_flag_bool); static struct jailsys_flags pr_flag_jailsys[] = { {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, #endif #ifdef INET6 {"ip6", PR_IP6_USER, PR_IP6_USER}, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); /* * Make this array full-size so dynamic parameters can be added. * It is protected by prison0.mtx, but lockless reading is allowed * with an atomic check of the flag values. */ static struct bool_flags pr_flag_allow[NBBY * NBPW] = { {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME}, {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC}, {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS}, {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS}, {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT}, {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS}, {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF}, {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK}, {"allow.reserved_ports", "allow.noreserved_ports", PR_ALLOW_RESERVED_PORTS}, {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF}, {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug", PR_ALLOW_UNPRIV_DEBUG}, {"allow.suser", "allow.nosuser", PR_ALLOW_SUSER}, #ifdef VIMAGE {"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD}, #endif {"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR}, {"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME}, {"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME}, + {"allow.routing", "allow.norouting", PR_ALLOW_ROUTING}, }; static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC; const size_t pr_flag_allow_size = sizeof(pr_flag_allow); #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \ PR_ALLOW_RESERVED_PORTS | \ PR_ALLOW_UNPRIV_DEBUG | \ PR_ALLOW_SUSER) #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { uint8_t *file, *data; size_t size; char buf[sizeof(prison0.pr_hostuuid)]; bool valid; prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); /* If we have a preloaded hostuuid, use it. */ file = preload_search_by_type(PRISON0_HOSTUUID_MODULE); if (file != NULL) { data = preload_fetch_addr(file); size = preload_fetch_size(file); if (data != NULL) { /* * The preloaded data may include trailing whitespace, almost * certainly a newline; skip over any whitespace or * non-printable characters to be safe. */ while (size > 0 && data[size - 1] <= 0x20) { size--; } valid = false; /* * Not NUL-terminated when passed from loader, but * validate_uuid requires that due to using sscanf (as * does the subsequent strlcpy, since it still reads * past the given size to return the true length); * bounce to a temporary buffer to fix. */ if (size >= sizeof(buf)) goto done; memcpy(buf, data, size); buf[size] = '\0'; if (validate_uuid(buf, size, NULL, 0) != 0) goto done; valid = true; (void)strlcpy(prison0.pr_hostuuid, buf, sizeof(prison0.pr_hostuuid)); done: if (bootverbose && !valid) { printf("hostuuid: preload data malformed: '%.*s'\n", (int)size, data); } } } if (bootverbose) printf("hostuuid: using %s\n", prison0.pr_hostuuid); } /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + nitems(pr_flag_allow) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; struct bool_flags *bf; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { optiov[opt.uio_iovcnt].iov_base = __DECONST(char *, (jail_default_allow & bf->flag) ? bf->name : bf->noname); optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= nitems(optiov), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); freeuio(auio); return (error); } #if defined(INET) || defined(INET6) typedef int prison_addr_cmp_t(const void *, const void *); typedef bool prison_addr_valid_t(const void *); static const struct pr_family { size_t size; prison_addr_cmp_t *cmp; prison_addr_valid_t *valid; int ip_flag; } pr_families[PR_FAMILY_MAX] = { #ifdef INET [PR_INET] = { .size = sizeof(struct in_addr), .cmp = prison_qcmp_v4, .valid = prison_valid_v4, .ip_flag = PR_IP4_USER, }, #endif #ifdef INET6 [PR_INET6] = { .size = sizeof(struct in6_addr), .cmp = prison_qcmp_v6, .valid = prison_valid_v6, .ip_flag = PR_IP6_USER, }, #endif }; /* * Network address lists (pr_addrs) allocation for jails. The addresses * are accessed locklessly by the network stack, thus need to be protected by * the network epoch. */ struct prison_ip { struct epoch_context ctx; uint32_t ips; #ifdef FUTURE_C /* * XXX Variable-length automatic arrays in union may be * supported in future C. */ union { char pr_ip[]; struct in_addr pr_ip4[]; struct in6_addr pr_ip6[]; }; #else /* No future C :( */ char pr_ip[]; #endif }; static char * PR_IP(struct prison_ip *pip, const pr_family_t af, int idx) { MPASS(pip); MPASS(af < PR_FAMILY_MAX); MPASS(idx >= 0 && idx < pip->ips); return (pip->pr_ip + pr_families[af].size * idx); } static struct prison_ip * prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags) { struct prison_ip *pip; pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size, M_PRISON, flags); if (pip != NULL) pip->ips = cnt; return (pip); } /* * Allocate and copyin user supplied address list, sorting and validating. * kern_jail_set() helper. */ static struct prison_ip * prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt) { prison_addr_cmp_t *const cmp = pr_families[af].cmp; const size_t size = pr_families[af].size; struct prison_ip *pip; pip = prison_ip_alloc(af, cnt, M_WAITOK); bcopy(op, pip->pr_ip, cnt * size); /* * IP addresses are all sorted but ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well for "loopback" traffic in case * source address selection cannot find any more fitting * address to connect from. */ if (cnt > 1) qsort(PR_IP(pip, af, 1), cnt - 1, size, cmp); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses it is their problem. */ for (int i = 0; i < cnt; i++) { if (!pr_families[af].valid(PR_IP(pip, af, i))) { free(pip, M_PRISON); return (NULL); } if (i + 1 < cnt && (cmp(PR_IP(pip, af, 0), PR_IP(pip, af, i + 1)) == 0 || cmp(PR_IP(pip, af, i), PR_IP(pip, af, i + 1)) == 0)) { free(pip, M_PRISON); return (NULL); } } return (pip); } /* * Allocate and dup parent prison address list. * kern_jail_set() helper. */ static void prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af) { const struct prison_ip *ppip = ppr->pr_addrs[af]; struct prison_ip *pip; if (ppip != NULL) { pip = prison_ip_alloc(af, ppip->ips, M_WAITOK); bcopy(ppip->pr_ip, pip->pr_ip, pip->ips * pr_families[af].size); pr->pr_addrs[af] = pip; } } /* * Make sure the new set of IP addresses is a subset of the parent's list. * Don't worry about the parent being unlocked, as any setting is done with * allprison_lock held. * kern_jail_set() helper. */ static bool prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip, const pr_family_t af) { prison_addr_cmp_t *const cmp = pr_families[af].cmp; int i, j; if (ppip == NULL) return (false); for (i = 0; i < ppip->ips; i++) if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0) break; if (i == ppip->ips) /* Main address not present in parent. */ return (false); if (pip->ips > 1) { for (i = j = 1; i < pip->ips; i++) { if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) /* Equals to parent primary address. */ continue; for (; j < ppip->ips; j++) if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, j)) == 0) break; if (j == ppip->ips) break; } if (j == ppip->ips) /* Address not present in parent. */ return (false); } return (true); } /* * Check for conflicting IP addresses. We permit them if there is no more * than one IP on each jail. If there is a duplicate on a jail with more * than one IP stop checking and return error. * kern_jail_set() helper. */ static bool prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr, struct prison_ip *pip, pr_family_t af) { const struct prison *tppr, *tpr; int descend; #ifdef VIMAGE for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #else tppr = &prison0; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif !prison_isalive(tpr)) { descend = 0; continue; } if (!(tpr->pr_flags & pr_families[af].ip_flag)) continue; descend = 0; if (tpr->pr_addrs[af] == NULL || (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1)) continue; for (int i = 0; i < pip->ips; i++) if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0) return (false); } return (true); } _Static_assert(offsetof(struct prison_ip, ctx) == 0, "prison must start with epoch context"); static void prison_ip_free_deferred(epoch_context_t ctx) { free(ctx, M_PRISON); } static void prison_ip_free(struct prison_ip *pip) { if (pip != NULL) NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx); } static void prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new) { struct prison_ip **mem, *old; mtx_assert(&pr->pr_mtx, MA_OWNED); mem = &pr->pr_addrs[af]; old = *mem; atomic_store_ptr(mem, new); prison_ip_free(old); } /* * Restrict a prison's IP address list with its parent's, possibly replacing * it. Return true if succeed, otherwise should redo. * kern_jail_set() helper. */ static bool prison_ip_restrict(struct prison *pr, const pr_family_t af, struct prison_ip **newp) { struct prison_ip *ppip = pr->pr_parent->pr_addrs[af]; struct prison_ip *pip = pr->pr_addrs[af]; int (*const cmp)(const void *, const void *) = pr_families[af].cmp; const size_t size = pr_families[af].size; struct prison_ip *new = newp != NULL ? *newp : NULL; uint32_t ips; mtx_assert(&pr->pr_mtx, MA_OWNED); /* * Due to epoch-synchronized access to the IP address lists we always * allocate a new list even if the old one has enough space. We could * atomically update an IPv4 address inside a list, but that would * screw up sorting, and in case of IPv6 we can't even atomically write * one. */ if (ppip == NULL) { if (pip != NULL) prison_ip_set(pr, af, NULL); return (true); } if (!(pr->pr_flags & pr_families[af].ip_flag)) { if (new == NULL) { new = prison_ip_alloc(af, ppip->ips, M_NOWAIT); if (new == NULL) return (false); /* Redo */ } /* This has no user settings, so just copy the parent's list. */ MPASS(new->ips == ppip->ips); bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size); prison_ip_set(pr, af, new); if (newp != NULL) *newp = NULL; /* Used */ } else if (pip != NULL) { /* Remove addresses that aren't in the parent. */ int i; i = 0; /* index in pip */ ips = 0; /* index in new */ if (new == NULL) { new = prison_ip_alloc(af, pip->ips, M_NOWAIT); if (new == NULL) return (false); /* Redo */ } for (int pi = 0; pi < ppip->ips; pi++) if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) { /* Found our primary address in parent. */ bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips), size); i++; ips++; break; } for (int pi = 1; i < pip->ips; ) { /* Check against primary, which is unsorted. */ if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) { /* Matches parent's primary address. */ bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips), size); i++; ips++; continue; } /* The rest are sorted. */ switch (pi >= ppip->ips ? -1 : cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) { case -1: i++; break; case 0: bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips), size); i++; pi++; ips++; break; case 1: pi++; break; } } if (ips == 0) { if (newp == NULL || *newp == NULL) prison_ip_free(new); new = NULL; } else { /* Shrink to real size */ KASSERT((new->ips >= ips), ("Out-of-bounds write to prison_ip %p", new)); new->ips = ips; } prison_ip_set(pr, af, new); if (newp != NULL) *newp = NULL; /* Used */ } return (true); } /* * Fast-path check if an address belongs to a prison. */ int prison_ip_check(const struct prison *pr, const pr_family_t af, const void *addr) { int (*const cmp)(const void *, const void *) = pr_families[af].cmp; struct prison_ip *pip; int i, a, z, d; MPASS(mtx_owned(&pr->pr_mtx) || in_epoch(net_epoch_preempt) || sx_xlocked(&allprison_lock)); pip = atomic_load_ptr(&pr->pr_addrs[af]); if (__predict_false(pip == NULL)) return (EAFNOSUPPORT); /* Check the primary IP. */ if (cmp(PR_IP(pip, af, 0), addr) == 0) return (0); /* * All the other IPs are sorted so we can do a binary search. */ a = 0; z = pip->ips - 2; while (a <= z) { i = (a + z) / 2; d = cmp(PR_IP(pip, af, i + 1), addr); if (d > 0) z = i - 1; else if (d < 0) a = i + 1; else return (0); } return (EADDRNOTAVAIL); } /* * Grab primary IP. Historically required mutex, but nothing prevents * us to support epoch-protected access. Is it used in fast path? * in{6}_jail.c helper */ const void * prison_ip_get0(const struct prison *pr, const pr_family_t af) { const struct prison_ip *pip = pr->pr_addrs[af]; mtx_assert(&pr->pr_mtx, MA_OWNED); MPASS(pip); return (pip->pr_ip); } u_int prison_ip_cnt(const struct prison *pr, const pr_family_t af) { return (pr->pr_addrs[af]->ips); } #endif /* defined(INET) || defined(INET6) */ int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct prison_ip *ip4; #endif #ifdef INET6 struct prison_ip *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int deadid, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; #ifdef INET int ip4s; bool redo_ip4; #endif #ifdef INET6 int ip6s; bool redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; uint64_t pr_allow_diff; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { vfs_flagopt(opts, bf->name, &pr_flags, bf->flag); vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag); } ch_flags |= pr_flags; for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!jsf->disable) { error = EINVAL; goto done_free; } pr_flags |= jsf->disable; break; case JAIL_SYS_NEW: pr_flags |= jsf->new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= jsf->new | jsf->disable; } if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { vfs_flagopt(opts, bf->name, &pr_allow, bf->flag); vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); if (error == ENOENT) domain = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(struct in_addr) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { ip4s /= sizeof(struct in_addr); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = prison_ip_copyin(PR_INET, op, ip4s); if (ip4 == NULL) { error = EINVAL; goto done_free; } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(struct in6_addr) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { ip6s /= sizeof(struct in6_addr); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = prison_ip_copyin(PR_INET6, op, ip6s); if (ip6 == NULL) { error = EINVAL; goto done_free; } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || osrelstr[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len >= OSRELEASELEN) { error = ENAMETOOLONG; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE_PNBUF(&nd); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) { path = g_path; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root); } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. */ pr = NULL; inspr = NULL; deadpr = NULL; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); drflags = PD_LIST_XLOCKED; ppr = mypr; if (!prison_isalive(ppr)) { /* This jail is dying. This process will surely follow. */ error = EAGAIN; goto done_deref; } if (jid != 0) { if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_deref; } /* * See if a requested jid already exists. Keep track of * where it can be inserted later. */ TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; if (inspr->pr_id > jid) break; if (prison_isalive(inspr)) { pr = inspr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else { /* Note a dying jail to handle later. */ deadpr = inspr; } inspr = NULL; break; } if (cuflags == JAIL_CREATE && pr != NULL) { /* * Even creators that cannot see the jail will * get EEXIST. */ error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_deref; } if ((pr == NULL) ? cuflags == JAIL_UPDATE : !prison_ischild(mypr, pr)) { /* * Updaters get ENOENT for nonexistent jails, * or for jails they cannot see. The latter * case is true even for CREATE | UPDATE, * which normally cannot give this error. */ error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_deref; } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_deref; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } mtx_unlock(&ppr->pr_mtx); if (!prison_isalive(ppr)) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_deref; } *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr == pr || !prison_isalive(tpr) || strcmp(tpr->pr_name + pnamelen, namelc)) continue; if (cuflags == JAIL_CREATE || pr != NULL) { /* * Create, or update(jid): name must * not exist in an active sibling jail. */ error = EEXIST; vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_deref; } /* Use this jail for updates. */ pr = tpr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; break; } /* * Update: name must exist if no jid is specified. * As with the jid case, the jail must be currently * visible, or else even CREATE | UPDATE will get * an error. */ if ((pr == NULL) ? cuflags == JAIL_UPDATE : !prison_isalive(pr)) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_deref; } /* If there's no prison to update, create a new one and link it in. */ created = pr == NULL; if (created) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_deref; } if (deadpr != NULL) { /* * The prison being created has the same ID as a dying * one. Handle this by giving the dying jail a new ID. * This may cause some confusion to user space, but * only to those listing dying jails. */ deadid = get_next_deadid(&dinspr); if (deadid == 0) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); goto done_deref; } mtx_lock(&deadpr->pr_mtx); deadpr->pr_id = deadid; mtx_unlock(&deadpr->pr_mtx); if (dinspr == deadpr) inspr = deadpr; else { inspr = TAILQ_NEXT(deadpr, pr_list); TAILQ_REMOVE(&allprison, deadpr, pr_list); if (dinspr != NULL) TAILQ_INSERT_AFTER(&allprison, dinspr, deadpr, pr_list); else TAILQ_INSERT_HEAD(&allprison, deadpr, pr_list); } } if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); goto done_deref; } pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); pr->pr_state = PRISON_STATE_INVALID; refcount_init(&pr->pr_ref, 1); refcount_init(&pr->pr_uref, 0); drflags |= PD_DEREF; LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); pr->pr_id = jid; if (inspr != NULL) TAILQ_INSERT_BEFORE(inspr, pr, pr_list); else TAILQ_INSERT_TAIL(&allprison, pr, pr_list); pr->pr_parent = ppr; prison_hold(ppr); prison_proc_hold(ppr); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; prison_ip_dup(ppr, pr, PR_INET); } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; prison_ip_dup(ppr, pr, PR_INET6); } #endif } #endif /* Source address selection is always on by default. */ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = jail_default_enforce_statfs; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strlcpy(pr->pr_osrelease, ppr->pr_osrelease, sizeof(pr->pr_osrelease)); else strlcpy(pr->pr_osrelease, osrelstr, sizeof(pr->pr_osrelease)); #ifdef VIMAGE /* * Allocate a new vnet if specified. * * Set PR_VNET now if so, so that the vnet is disposed of * properly when the jail is destroyed. */ if (pr_flags & PR_VNET) { pr->pr_flags |= PR_VNET; pr->pr_vnet = vnet_alloc(); } else { pr->pr_vnet = ppr->pr_vnet; } #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an error. */ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else { /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ prison_hold(pr); drflags |= PD_DEREF; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref; } #endif } /* Do final error checking before setting anything. */ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if ((ppr->pr_flags & PR_IP4) && !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4, PR_INET)) { error = EPERM; goto done_deref; } if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref; } } #endif #ifdef INET6 if (ip6s > 0) { if ((ppr->pr_flags & PR_IP6) && !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6, PR_INET6)) { error = EPERM; goto done_deref; } if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref; } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } } } pr_allow_diff = pr_allow & ~ppr->pr_allow; if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) { error = EPERM; goto done_deref; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; /* At this point, all valid parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref; } } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = false; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; prison_ip_set(pr, PR_INET, ip4); ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (!prison_ip_restrict(tpr, PR_INET, NULL)) { redo_ip4 = true; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = false; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; prison_ip_set(pr, PR_INET6, ip6); ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (!prison_ip_restrict(tpr, PR_INET6, NULL)) { redo_ip6 = true; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; root = NULL; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. */ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; if ((tallow = ch_allow & ~pr_allow)) prison_set_allow_locked(pr, tallow, 0); /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. */ if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) { if (pr_flags & PR_PERSIST) { prison_hold(pr); /* * This may be a new prison's first user reference, * but wait to call it alive until after OSD calls * have had a chance to run (and perhaps to fail). */ refcount_acquire(&pr->pr_uref); } else { drflags |= PD_DEUREF; prison_free_not_last(pr); } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; /* * Any errors past this point will need to de-persist newly created * prisons, as well as call remove methods. */ if (created) drflags |= PD_KILL; #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_addrs[PR_INET]->ips; MPASS(ip4 == NULL); ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = false; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (!prison_ip_restrict(tpr, PR_INET, &ip4)) redo_ip4 = true; } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_addrs[PR_INET6]->ips; MPASS(ip6 == NULL); ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = false; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (!prison_ip_restrict(tpr, PR_INET6, &ip6)) redo_ip6 = true; } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ if (created) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) goto done_deref; } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) goto done_deref; /* * A new prison is now ready to be seen; either it has gained a user * reference via persistence, or is about to gain one via attachment. */ if (created) { drflags = prison_lock_xlock(pr, drflags); pr->pr_state = PRISON_STATE_ALIVE; } /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { error = do_jail_attach(td, pr, prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS)); drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); if (error) { vfs_opterror(opts, "attach failed"); goto done_deref; } } #ifdef RACCT if (racct_enable && !created) { if (drflags & PD_LOCKED) { mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; } if (drflags & PD_LIST_XLOCKED) { sx_xunlock(&allprison_lock); drflags &= ~PD_LIST_XLOCKED; } prison_racct_modify(pr); } #endif if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 && (pr->pr_root->v_vflag & VV_ROOT) == 0) printf("Warning jail jid=%d: mountd/nfsd requires a separate" " file system\n", pr->pr_id); drflags &= ~PD_KILL; td->td_retval[0] = pr->pr_id; done_deref: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (drflags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); if (root != NULL) vrele(root); done_errmsg: if (error) { /* Write the error message back to userspace. */ if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else (void)copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: #ifdef INET prison_ip_free(ip4); #endif #ifdef INET6 prison_ip_free(ip6); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * Find the next available prison ID. Return the ID on success, or zero * on failure. Also set a pointer to the allprison list entry the prison * should be inserted before. */ static int get_next_prid(struct prison **insprp) { struct prison *inspr; int jid, maxid; jid = lastprid % JAIL_MAX + 1; if (TAILQ_EMPTY(&allprison) || TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) { /* * A common case is for all jails to be implicitly numbered, * which means they'll go on the end of the list, at least * for the first JAIL_MAX times. */ inspr = NULL; } else { /* * Take two passes through the allprison list: first starting * with the proposed jid, then ending with it. */ for (maxid = JAIL_MAX; maxid != 0; ) { TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; if (inspr->pr_id > jid) { /* Found an opening. */ maxid = 0; break; } if (++jid > maxid) { if (lastprid == maxid || lastprid == 0) { /* * The entire legal range * has been traversed */ return 0; } /* Try again from the start. */ jid = 1; maxid = lastprid; break; } } if (inspr == NULL) { /* Found room at the end of the list. */ break; } } } *insprp = inspr; lastprid = jid; return (jid); } /* * Find the next available ID for a renumbered dead prison. This is the same * as get_next_prid, but counting backward from the end of the range. */ static int get_next_deadid(struct prison **dinsprp) { struct prison *dinspr; int deadid, minid; deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX; /* * Take two reverse passes through the allprison list: first * starting with the proposed deadid, then ending with it. */ for (minid = 1; minid != 0; ) { TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) { if (dinspr->pr_id > deadid) continue; if (dinspr->pr_id < deadid) { /* Found an opening. */ minid = 0; break; } if (--deadid < minid) { if (lastdeadid == minid || lastdeadid == 0) { /* * The entire legal range * has been traversed */ return 0; } /* Try again from the end. */ deadid = JAIL_MAX; minid = lastdeadid; break; } } if (dinspr == NULL) { /* Found room at the beginning of the list. */ break; } } *dinsprp = dinspr; lastdeadid = deadid; return (deadid); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof(struct iovec)); freeuio(auio); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct bool_flags *bf; struct jailsys_flags *jsf; struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; unsigned f; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; pr = NULL; /* * Find the prison specified by one of: lastjid, jid, name. */ sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && ((flags & JAIL_DYING) || prison_isalive(pr)) && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; goto found_prison; } } error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done; } else if (error != ENOENT) goto done; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done; } } else if (error != ENOENT) goto done; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done; } pr = prison_find_name(mypr, name); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done; } else if (error != ENOENT) goto done; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done; found_prison: /* Get the parameters of the prison. */ prison_hold(pr); drflags |= PD_DEREF; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done; i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip, pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips * pr_families[PR_INET].size : 0 ); if (error != 0 && error != ENOENT) goto done; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip, pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips * pr_families[PR_INET6].size : 0 ); if (error != 0 && error != ENOENT) goto done; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { i = (pr->pr_flags & bf->flag) ? 1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE : (f == jsf->new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, jsf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { i = (pr->pr_allow & bf->flag) ? 1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } i = !prison_isalive(pr); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done; prison_deref(pr, drflags); pr = NULL; drflags = 0; /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && (strstr(opt->name, JAIL_META_PRIVATE ".") == opt->name || strstr(opt->name, JAIL_META_SHARED ".") == opt->name)) { /* Communicate back a missing key. */ free(opt->value, M_MOUNT); opt->value = NULL; opt->len = 0; continue; } if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } done: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); if (error && errmsg_pos >= 0) { /* Write the error message back to userspace. */ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else (void)copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } if (!prison_isalive(pr)) { /* Silently ignore already-dying prisons. */ mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); return (0); } prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED); return (0); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* Do not allow a process to attach to a prison that is not alive. */ if (!prison_isalive(pr)) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } static int do_jail_attach(struct thread *td, struct prison *pr, int drflags) { struct proc *p; struct ucred *newcred, *oldcred; int error; mtx_assert(&pr->pr_mtx, MA_OWNED); sx_assert(&allprison_lock, SX_LOCKED); drflags &= PD_LOCK_FLAGS; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ prison_hold(pr); refcount_acquire(&pr->pr_uref); drflags |= PD_DEREF | PD_DEUREF; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, drflags); return (error); } sx_unlock(&allprison_lock); drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED); /* * Reparent the newly attached process to this jail. */ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root); if ((error = pwd_chroot_chdir(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; proc_set_cred(p, newcred); setsugid(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif prison_proc_relink(oldcred->cr_prison, pr, p); prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); /* * If the prison was killed while changing credentials, die along * with it. */ if (!prison_isalive(pr)) { PROC_LOCK(p); kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } return (0); e_unlock: VOP_UNLOCK(pr->pr_root); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ sx_slock(&allprison_lock); drflags |= PD_LIST_SLOCKED; (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, drflags); return (error); } /* * Returns a locked prison instance, or NULL on failure. */ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id < prid) continue; if (pr->pr_id > prid) break; KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); return (pr); } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); return (pr); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); if (prison_isalive(pr)) { mtx_lock(&pr->pr_mtx); return (pr); } deadpr = pr; } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) mtx_lock(&deadpr->pr_mtx); return (deadpr); } /* * See if a prison has the specific flag set. The prison should be locked, * unless checking for flags that are only set at jail creation (such as * PR_IP4 and PR_IP6), or only the single bit is examined, without regard * to any other prison data. */ bool prison_flag(struct ucred *cred, unsigned flag) { return ((cred->cr_prison->pr_flags & flag) != 0); } /* * See if a prison has the specific allow flag set. * The prison *should* be locked, or only a single bit is examined, without * regard to any other prison data. */ bool prison_allow(struct ucred *cred, unsigned flag) { return ((cred->cr_prison->pr_allow & flag) != 0); } /* * Hold a prison reference, by incrementing pr_ref. It is generally * an error to hold a prison that does not already have a reference. * A prison record will remain valid as long as it has at least one * reference, and will not be removed as long as either the prison * mutex or the allprison lock is held (allprison_lock may be shared). */ void prison_hold_locked(struct prison *pr) { /* Locking is no longer required. */ prison_hold(pr); } void prison_hold(struct prison *pr) { #ifdef INVARIANTS int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref); KASSERT(was_valid, ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_acquire(&pr->pr_ref); #endif } /* * Remove a prison reference. If that was the last reference, the * prison will be removed (at a later time). */ void prison_free_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); /* * Locking is no longer required, but unlock because the caller * expects it. */ mtx_unlock(&pr->pr_mtx); prison_free(pr); } void prison_free(struct prison *pr) { KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_ref)) { /* * Don't remove the last reference in this context, * in case there are locks held. */ taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task); } } static void prison_free_not_last(struct prison *pr) { #ifdef INVARIANTS int lastref; KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); lastref = refcount_release(&pr->pr_ref); KASSERT(!lastref, ("prison_free_not_last freed last ref on prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_release(&pr->pr_ref); #endif } /* * Hold a prison for user visibility, by incrementing pr_uref. * It is generally an error to hold a prison that isn't already * user-visible, except through the jail system calls. It is also * an error to hold an invalid prison. A prison record will remain * alive as long as it has at least one user reference, and will not * be set to the dying state until the prison mutex and allprison_lock * are both freed. */ void prison_proc_hold(struct prison *pr) { #ifdef INVARIANTS int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref); KASSERT(was_alive, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); #else refcount_acquire(&pr->pr_uref); #endif } /* * Remove a prison user reference. If it was the last reference, the * prison will be considered "dying", and may be removed once all of * its references are dropped. */ void prison_proc_free(struct prison *pr) { /* * Locking is only required when releasing the last reference. * This allows assurance that a locked prison will remain alive * until it is unlocked. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { /* * Don't remove the last user reference in this context, * which is expected to be a process that is not only locked, * but also half dead. Add a reference so any calls to * prison_free() won't re-submit the task. */ prison_hold(pr); mtx_lock(&pr->pr_mtx); KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC), ("Redundant last reference in prison_proc_free (jid=%d)", pr->pr_id)); pr->pr_flags |= PR_COMPLETE_PROC; mtx_unlock(&pr->pr_mtx); taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task); } } static void prison_proc_free_not_last(struct prison *pr) { #ifdef INVARIANTS int lastref; KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); lastref = refcount_release(&pr->pr_uref); KASSERT(!lastref, ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_release(&pr->pr_uref); #endif } void prison_proc_link(struct prison *pr, struct proc *p) { sx_assert(&allproc_lock, SA_XLOCKED); LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist); } void prison_proc_unlink(struct prison *pr, struct proc *p) { sx_assert(&allproc_lock, SA_XLOCKED); LIST_REMOVE(p, p_jaillist); } static void prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p) { sx_xlock(&allproc_lock); prison_proc_unlink(opr, p); prison_proc_link(npr, p); sx_xunlock(&allproc_lock); } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; int drflags; /* * This could be called to release the last reference, or the last * user reference (plus the reference held in prison_proc_free). */ drflags = prison_lock_xlock(pr, PD_DEREF); if (pr->pr_flags & PR_COMPLETE_PROC) { pr->pr_flags &= ~PR_COMPLETE_PROC; drflags |= PD_DEUREF; } prison_deref(pr, drflags); } static void prison_kill_processes_cb(struct proc *p, void *arg __unused) { kern_psignal(p, SIGKILL); } /* * Note the iteration does not guarantee acting on all processes. * Most notably there may be fork or jail_attach in progress. */ void prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *), void *cbarg) { struct prison *ppr; struct proc *p; if (atomic_load_int(&pr->pr_childcount) == 0) { sx_slock(&allproc_lock); LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) { if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); cb(p, cbarg); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); if (atomic_load_int(&pr->pr_childcount) == 0) return; /* * Some jails popped up during the iteration, fall through to a * system-wide search. */ } sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred != NULL) { for (ppr = p->p_ucred->cr_prison; ppr != NULL; ppr = ppr->pr_parent) { if (ppr == pr) { cb(p, cbarg); break; } } } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } /* * Remove a prison reference and/or user reference (usually). * This assumes context that allows sleeping (for allprison_lock), * with no non-sleeping locks held, except perhaps the prison itself. * If there are no more references, release and delist the prison. * On completion, the prison lock and the allprison lock are both * unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prisonlist freeprison; struct prison *killpr, *rpr, *ppr, *tpr; killpr = NULL; TAILQ_INIT(&freeprison); /* * Release this prison as requested, which may cause its parent * to be released, and then maybe its grandparent, etc. */ for (;;) { if (flags & PD_KILL) { /* Kill the prison and its descendents. */ KASSERT(pr != &prison0, ("prison_deref trying to kill prison0")); if (!(flags & PD_DEREF)) { prison_hold(pr); flags |= PD_DEREF; } flags = prison_lock_xlock(pr, flags); prison_deref_kill(pr, &freeprison); } if (flags & PD_DEUREF) { /* Drop a user reference. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { if (!(flags & PD_DEREF)) { prison_hold(pr); flags |= PD_DEREF; } flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_uref) && pr->pr_state == PRISON_STATE_ALIVE) { /* * When the last user references goes, * this becomes a dying prison. */ KASSERT( refcount_load(&prison0.pr_uref) > 0, ("prison0 pr_uref=0")); pr->pr_state = PRISON_STATE_DYING; mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; prison_cleanup(pr); } } } if (flags & PD_KILL) { /* * Any remaining user references are probably processes * that need to be killed, either in this prison or its * descendants. */ if (refcount_load(&pr->pr_uref) > 0) killpr = pr; /* Make sure the parent prison doesn't get killed. */ flags &= ~PD_KILL; } if (flags & PD_DEREF) { /* Drop a reference. */ KASSERT(refcount_load(&pr->pr_ref) > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_ref)) { flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_ref)) { /* * When the last reference goes, * unlink the prison and set it aside. */ KASSERT( refcount_load(&pr->pr_uref) == 0, ("prison_deref: last ref, " "but still has %d urefs (jid=%d)", pr->pr_uref, pr->pr_id)); KASSERT( refcount_load(&prison0.pr_ref) != 0, ("prison0 pr_ref=0")); pr->pr_state = PRISON_STATE_INVALID; TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); TAILQ_INSERT_TAIL(&freeprison, pr, pr_list); for (ppr = pr->pr_parent; ppr != NULL; ppr = ppr->pr_parent) ppr->pr_childcount--; /* * Removing a prison frees references * from its parent. */ ppr = pr->pr_parent; pr->pr_parent = NULL; mtx_unlock(&pr->pr_mtx); pr = ppr; flags &= ~PD_LOCKED; flags |= PD_DEREF | PD_DEUREF; continue; } } } break; } /* Release all the prison locks. */ if (flags & PD_LOCKED) mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); /* Kill any processes attached to a killed prison. */ if (killpr != NULL) prison_proc_iterate(killpr, prison_kill_processes_cb, NULL); /* * Finish removing any unreferenced prisons, which couldn't happen * while allprison_lock was held (to avoid a LOR on vrele). */ TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { #ifdef VIMAGE if (rpr->pr_flags & PR_VNET) vnet_destroy(rpr->pr_vnet); #endif if (rpr->pr_root != NULL) vrele(rpr->pr_root); mtx_destroy(&rpr->pr_mtx); #ifdef INET prison_ip_free(rpr->pr_addrs[PR_INET]); #endif #ifdef INET6 prison_ip_free(rpr->pr_addrs[PR_INET6]); #endif if (rpr->pr_cpuset != NULL) cpuset_rel(rpr->pr_cpuset); osd_jail_exit(rpr); #ifdef RACCT if (racct_enable) prison_racct_detach(rpr); #endif TAILQ_REMOVE(&freeprison, rpr, pr_list); free(rpr, M_PRISON); } } /* * Kill the prison and its descendants. Mark them as dying, clear the * persist flag, and call module remove methods. */ static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) { struct prison *cpr, *ppr, *rpr; bool descend; /* * Unlike the descendants, the target prison can be killed * even if it is currently dying. This is useful for failed * creation in jail_set(2). */ KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to kill dead prison %p (jid=%d).", pr, pr->pr_id)); refcount_acquire(&pr->pr_uref); pr->pr_state = PRISON_STATE_DYING; mtx_unlock(&pr->pr_mtx); rpr = NULL; FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) { if (descend) { if (!prison_isalive(cpr)) { descend = false; continue; } prison_hold(cpr); prison_proc_hold(cpr); mtx_lock(&cpr->pr_mtx); cpr->pr_state = PRISON_STATE_DYING; cpr->pr_flags |= PR_REMOVE; mtx_unlock(&cpr->pr_mtx); continue; } if (!(cpr->pr_flags & PR_REMOVE)) continue; prison_cleanup(cpr); mtx_lock(&cpr->pr_mtx); cpr->pr_flags &= ~PR_REMOVE; if (cpr->pr_flags & PR_PERSIST) { cpr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(cpr); prison_free_not_last(cpr); } (void)refcount_release(&cpr->pr_uref); if (refcount_release(&cpr->pr_ref)) { /* * When the last reference goes, unlink the prison * and set it aside for prison_deref() to handle. * Delay unlinking the sibling list to keep the loop * safe. */ if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); rpr = cpr; rpr->pr_state = PRISON_STATE_INVALID; TAILQ_REMOVE(&allprison, rpr, pr_list); TAILQ_INSERT_TAIL(freeprison, rpr, pr_list); /* * Removing a prison frees references from its parent. */ ppr = rpr->pr_parent; prison_proc_free_not_last(ppr); prison_free_not_last(ppr); for (; ppr != NULL; ppr = ppr->pr_parent) ppr->pr_childcount--; } mtx_unlock(&cpr->pr_mtx); } if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); prison_cleanup(pr); mtx_lock(&pr->pr_mtx); if (pr->pr_flags & PR_PERSIST) { pr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(pr); prison_free_not_last(pr); } (void)refcount_release(&pr->pr_uref); } /* * Given the current locking state in the flags, make sure allprison_lock * is held exclusive, and the prison is locked. Return flags indicating * the new state. */ static int prison_lock_xlock(struct prison *pr, int flags) { if (!(flags & PD_LIST_XLOCKED)) { /* * Get allprison_lock, which may be an upgrade, * and may require unlocking the prison. */ if (flags & PD_LOCKED) { mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; } if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } flags &= ~PD_LIST_SLOCKED; } else sx_xlock(&allprison_lock); flags |= PD_LIST_XLOCKED; } if (!(flags & PD_LOCKED)) { /* Lock the prison mutex. */ mtx_lock(&pr->pr_mtx); flags |= PD_LOCKED; } return flags; } /* * Release a prison's resources when it starts dying (when the last user * reference is dropped, or when it is killed). */ static void prison_cleanup(struct prison *pr) { sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_NOTOWNED); vfs_exjail_delete(pr); shm_remove_prison(pr); (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); } /* * Set or clear a permission bit in the pr_allow field, passing restrictions * (cleared permission) down to child jails. */ void prison_set_allow(struct ucred *cred, unsigned flag, int enable) { struct prison *pr; pr = cred->cr_prison; sx_slock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_set_allow_locked(pr, flag, enable); mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); } static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable) { struct prison *cpr; int descend; if (enable != 0) pr->pr_allow |= flag; else { pr->pr_allow &= ~flag; FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) cpr->pr_allow &= ~flag; } } /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. */ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_addrs[PR_INET] == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_addrs[PR_INET6] == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: case AF_NETLINK: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). * * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. IPv4 Address passed in in NBO. */ int prison_if(struct ucred *cred, const struct sockaddr *sa) { #ifdef INET const struct sockaddr_in *sai; #endif #ifdef INET6 const struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (const struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (const struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * For mountd/nfsd to run within a prison, it must be: * - A vnet prison. * - PR_ALLOW_NFSD must be set on it. * - The root directory (pr_root) of the prison must be * a file system mount point, so the mountd can hang * export information on it. * - The prison's enforce_statfs cannot be 0, so that * mountd(8) can do exports. */ bool prison_check_nfsd(struct ucred *cred) { if (jailed_without_vnet(cred)) return (false); if (!prison_allow(cred, PR_ALLOW_NFSD)) return (false); if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0) return (false); if (cred->cr_prison->pr_enforce_statfs == 0) return (false); return (true); } /* * Return true if p2 is a child of p1, otherwise false. */ bool prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (true); return (false); } /* * Return true if the prison is currently alive. A prison is alive if it * holds user references and it isn't being removed. */ bool prison_isalive(const struct prison *pr) { if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE)) return (false); return (true); } /* * Return true if the prison is currently valid. A prison is valid if it has * been fully created, and is not being destroyed. Note that dying prisons * are still considered valid. Invalid prisons won't be found under normal * circumstances, as they're only put in that state by functions that have * an exclusive hold on allprison_lock. */ bool prison_isvalid(struct prison *pr) { if (__predict_false(pr->pr_state == PRISON_STATE_INVALID)) return (false); if (__predict_false(refcount_load(&pr->pr_ref) == 0)) return (false); return (true); } /* * Return true if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise false. */ bool jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (false); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (false); #endif return (true); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } void getjailname(struct ucred *cred, char *name, size_t len) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(name, cred->cr_prison->pr_name, len); mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns true in case the prison owns the vnet, false otherwise. */ bool prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return ((cred->cr_prison->pr_flags & PR_VNET) != 0); } #endif /* * Determine whether the subject represented by cred can "see" * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. * This is ugly check, but this is the only situation when jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have situation where jail's root directory * is "/some/path" and mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check with permission for a specific privilege is granted within jail. We * have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { struct prison *pr; int error; /* * Some policies have custom handlers. This routine should not be * called for them. See priv_check_cred(). */ switch (priv) { case PRIV_VFS_LOOKUP: case PRIV_VFS_GENERATION: KASSERT(0, ("prison_priv_check instead of a custom handler " "called for %d\n", priv)); } if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_VFS_GETFH: case PRIV_VFS_MOUNT_EXPORTED: if (!prison_check_nfsd(cred)) return (EPERM); #ifdef notyet case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME : case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_SETLANPCP: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: case PRIV_NET_OVPN: case PRIV_NET_ME: case PRIV_NET_WG: /* * 802.11-related privileges. */ case PRIV_NET80211_VAP_GETKEY: case PRIV_NET80211_VAP_MANAGE: #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. */ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit fall through to next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETCRED: case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: case PRIV_SEEJAILPROC: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * Debuggers should work in jails. */ case PRIV_PROC_MEM_WRITE: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: pr = cred->cr_prison; prison_lock(pr); if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2) error = 0; else error = EPERM; prison_unlock(pr); return (error); /* * Jails should hold no disposition on the PRIV_VFS_READ_DIR * policy. priv_check_cred will not specifically allow it, and * we may want a MAC policy to allow it. */ case PRIV_VFS_READ_DIR: return (0); /* * Conditionally allow privileged process in the jail to * manipulate filesystem extended attributes in the system * namespace. */ case PRIV_VFS_EXTATTR_SYSTEM: if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0) return (0); else return (EPERM); /* * Conditionnaly allow locking (unlocking) physical pages * in memory. */ case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK) return (0); else return (EPERM); /* * Conditionally allow jailed root to bind reserved ports. */ case PRIV_NETINET_RESERVEDPORT: if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) return (0); else return (EPERM); /* * Allow jailed root to reuse in-use ports. */ case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); /* * Do not allow a process inside a jail to read the kernel * message buffer unless explicitly permitted. */ case PRIV_MSGBUF: if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF) return (0); return (EPERM); /* * Conditionally allow privileged process in the jail adjust * machine time. */ case PRIV_ADJTIME: case PRIV_NTP_ADJTIME: if (cred->cr_prison->pr_allow & (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) { return (0); } return (EPERM); /* * Conditionally allow privileged process in the jail set * machine time. */ case PRIV_SETTIMEOFDAY: case PRIV_CLOCK_SETTIME: if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME) return (0); else return (EPERM); + /* + * Conditionally allow privileged process in the jail to modify + * the routing table. + */ + case PRIV_NET_ROUTE: + if (cred->cr_prison->pr_allow & PR_ALLOW_ROUTING) + return (0); + else + return (EPERM); + default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges, many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. */ SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jails"); #if defined(INET) || defined(INET6) /* * Copy address array to memory that would be then SYSCTL_OUT-ed. * sysctl_jail_list() helper. */ static void prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len) { const struct prison_ip *pip; const size_t size = pr_families[af].size; again: mtx_assert(&pr->pr_mtx, MA_OWNED); if ((pip = pr->pr_addrs[af]) != NULL) { if (*len < pip->ips) { *len = pip->ips; mtx_unlock(&pr->pr_mtx); *out = realloc(*out, *len * size, M_TEMP, M_WAITOK); mtx_lock(&pr->pr_mtx); goto again; } bcopy(pip->pr_ip, *out, pip->ips * size); } } #endif static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { mtx_lock(&cpr->pr_mtx); #ifdef INET prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s); #endif #ifdef INET6 prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s); #endif bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_state; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = ip4s; #endif #ifdef INET6 xp->pr_ip6s = ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns vnet?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { int error, i; /* Get the current flag value, and convert it to a boolean. */ if (req->td->td_ucred->cr_prison == &prison0) { mtx_lock(&prison0.pr_mtx); i = (jail_default_allow & arg2) != 0; mtx_unlock(&prison0.pr_mtx); } else i = prison_allow(req->td->td_ucred, arg2); if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. */ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I", "Processes in jail can lock/unlock physical pages in memory"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Limits and stats of child jails"); static int sysctl_jail_children(SYSCTL_HANDLER_ARGS) { struct prison *pr; int i; pr = req->td->td_ucred->cr_prison; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: i = *(int *)((char *)pr + arg2); return (SYSCTL_OUT(req, &i, sizeof(i))); } return (0); } SYSCTL_PROC(_security_jail_children, OID_AUTO, max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children, "I", "Maximum number of child jails"); SYSCTL_PROC(_security_jail_children, OID_AUTO, cur, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children, "I", "Current number of child jails"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. */ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may lock (unlock) physical pages in memory"); SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may bind sockets to reserved ports"); SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may read the kernel message buffer"); SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, "B", "Unprivileged processes may use process debugging facilities"); SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, "B", "Processes in jail with uid 0 have privilege"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW, "B", "Mountd/nfsd may run in the jail"); #endif SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set system-level filesystem extended attributes"); SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may adjust system time"); SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set system time"); +SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may modify routing table"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); /* * Add a dynamic parameter allow., or allow... Return * its associated bit in the pr_allow bitmask, or zero if the parameter was * not created. */ unsigned prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, const char *descr) { struct bool_flags *bf; struct sysctl_oid *parent; char *allow_name, *allow_noname, *allowed; #ifndef NO_SYSCTL_DESCR char *descr_deprecated; #endif u_int allow_flag; if (prefix ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name) < 0 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) { free(allow_name, M_PRISON); return 0; } /* * See if this parameter has already beed added, i.e. a module was * previously loaded/unloaded. */ mtx_lock(&prison0.pr_mtx); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { if (strcmp(bf->name, allow_name) == 0) { allow_flag = bf->flag; goto no_add; } } /* * Find a free bit in pr_allow_all, failing if there are none * (which shouldn't happen as long as we keep track of how many * potential dynamic flags exist). */ for (allow_flag = 1;; allow_flag <<= 1) { if (allow_flag == 0) goto no_add; if ((pr_allow_all & allow_flag) == 0) break; } /* Note the parameter in the next open slot in pr_flag_allow. */ for (bf = pr_flag_allow; ; bf++) { if (bf == pr_flag_allow + nitems(pr_flag_allow)) { /* This should never happen, but is not fatal. */ allow_flag = 0; goto no_add; } if (atomic_load_int(&bf->flag) == 0) break; } bf->name = allow_name; bf->noname = allow_noname; pr_allow_all |= allow_flag; /* * prison0 always has permission for the new parameter. * Other jails must have it granted to them. */ prison0.pr_allow |= allow_flag; /* The flag indicates a valid entry, so make sure it is set last. */ atomic_store_rel_int(&bf->flag, allow_flag); mtx_unlock(&prison0.pr_mtx); /* * Create sysctls for the parameter, and the back-compat global * permission. */ parent = prefix ? SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(&sysctl___security_jail_param_allow), OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr) : &sysctl___security_jail_param_allow; (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, "B", descr); if ((prefix ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name) : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) { #ifndef NO_SYSCTL_DESCR (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)", descr); #endif (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag, sysctl_jail_default_allow, "I", descr_deprecated); #ifndef NO_SYSCTL_DESCR free(descr_deprecated, M_TEMP); #endif free(allowed, M_TEMP); } return allow_flag; no_add: mtx_unlock(&prison0.pr_mtx); free(allow_name, M_PRISON); free(allow_noname, M_PRISON); return allow_flag; } /* * The VFS system will register jail-aware filesystems here. They each get * a parameter allow.mount.xxxfs and a flag to check when a jailed user * attempts to mount. */ void prison_add_vfs(struct vfsconf *vfsp) { #ifdef NO_SYSCTL_DESCR vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, NULL); #else char *descr; (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system", vfsp->vfc_name); vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, descr); free(descr, M_TEMP); #endif } #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); if (pre != NULL) (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); if (post != NULL) (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. */ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (refcount_release_if_not_last(&prr->prr_refcount)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. */ static void prison_racct_modify(struct prison *pr) { #ifdef RCTL struct proc *p; struct ucred *cred; #endif struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); #ifdef RCTL /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); rctl_proc_ucred_changed(p, cred); crfree(cred); } #endif sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) int ii; struct prison_ip *pip; #endif unsigned f; #ifdef INET char ip4buf[INET_ADDRSTRLEN]; #endif #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" state = %s\n", pr->pr_state == PRISON_STATE_ALIVE ? "alive" : pr->pr_state == PRISON_STATE_DYING ? "dying" : "invalid"); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) if (pr->pr_flags & bf->flag) db_printf(" %s", bf->name); for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); db_printf(" %-16s= %s\n", jsf->name, (f != 0 && f == jsf->disable) ? "disable" : (f == jsf->new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) if (pr->pr_allow & bf->flag) db_printf(" %s", bf->name); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET if ((pip = pr->pr_addrs[PR_INET]) != NULL) { db_printf(" ip4s = %d\n", pip->ips); for (ii = 0; ii < pip->ips; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa_r( *(const struct in_addr *)PR_IP(pip, PR_INET, ii), ip4buf)); } #endif #ifdef INET6 if ((pip = pr->pr_addrs[PR_INET6]) != NULL) { db_printf(" ip6s = %d\n", pip->ips); for (ii = 0; ii < pip->ips; ii++) db_printf(" %s %s\n", ii == 0 ? "ip6.addr =" : " ", ip6_sprintf(ip6buf, (const struct in6_addr *)PR_IP(pip, PR_INET6, ii))); } #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c index 30dab2b0d8cf..dcd19b43105c 100644 --- a/sys/netlink/route/rt.c +++ b/sys/netlink/route/rt.c @@ -1,1137 +1,1139 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_route #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static unsigned char get_rtm_type(const struct nhop_object *nh) { int nh_flags = nh->nh_flags; /* Use the fact that nhg runtime flags are only NHF_MULTIPATH */ if (nh_flags & NHF_BLACKHOLE) return (RTN_BLACKHOLE); else if (nh_flags & NHF_REJECT) return (RTN_PROHIBIT); return (RTN_UNICAST); } static uint8_t nl_get_rtm_protocol(const struct nhop_object *nh) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh; uint8_t origin = nhgrp_get_origin(nhg); if (origin != RTPROT_UNSPEC) return (origin); nh = nhg->nhops[0]; } #endif uint8_t origin = nhop_get_origin(nh); if (origin != RTPROT_UNSPEC) return (origin); /* TODO: remove guesswork once all kernel users fill in origin */ int rt_flags = nhop_get_rtflags(nh); if (rt_flags & RTF_PROTO1) return (RTPROT_ZEBRA); if (rt_flags & RTF_STATIC) return (RTPROT_STATIC); return (RTPROT_KERNEL); } static int get_rtmsg_type_from_rtsock(int cmd) { switch (cmd) { case RTM_ADD: case RTM_CHANGE: case RTM_GET: return NL_RTM_NEWROUTE; case RTM_DELETE: return NL_RTM_DELROUTE; } return (0); } /* * fibnum heuristics * * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS * msg rtm_table RTA_TABLE result * RTM_GETROUTE/dump 0 - RT_ALL_FIBS * RTM_GETROUTE/dump 1 - 1 * RTM_GETROUTE/get 0 - 0 * */ static struct nhop_object * rc_get_nhop(const struct rib_cmd_info *rc) { return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new); } static void dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh) { #ifdef INET6 int upper_family; #endif switch (nhop_get_neigh_family(nh)) { case AF_LINK: /* onlink prefix, skip */ break; case AF_INET: nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr); break; #ifdef INET6 case AF_INET6: upper_family = nhop_get_upper_family(nh); if (upper_family == AF_INET6) { struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6); } else if (upper_family == AF_INET) { /* IPv4 over IPv6 */ struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); char buf[20]; struct rtvia *via = (struct rtvia *)&buf[0]; via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &gw6, 16); nlattr_add(nw, NL_RTA_VIA, 17, via); } break; #endif } } static void dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh) { int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return; nla->nla_type = NL_RTA_METRICS; nla->nla_len = nla_len; nla++; nla->nla_type = NL_RTAX_MTU; nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t); *((uint32_t *)(nla + 1)) = nh->nh_mtu; } #ifdef ROUTE_MPATH static void dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm) { uint32_t uidx = nhgrp_get_uidx(nhg); uint32_t num_nhops; const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops); uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags); int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH); if (off == 0) return; for (int i = 0; i < num_nhops; i++) { int nh_off = nlattr_save_offset(nw); struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop); if (rtnh == NULL) return; rtnh->rtnh_flags = 0; rtnh->rtnh_ifindex = if_getindex(wn[i].nh->nh_ifp); rtnh->rtnh_hops = wn[i].weight; dump_rc_nhop_gw(nw, wn[i].nh); uint32_t rtflags = nhop_get_rtflags(wn[i].nh); if (rtflags != base_rtflags) nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, wn[i].nh); rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop); /* * nlattr_add() allocates 4-byte aligned storage, no need to aligh * length here * */ rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off; } nlattr_set_len(nw, off); } #endif static void dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rnd->rnd_nhop)) { dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm); return; } #endif const struct nhop_object *nh = rnd->rnd_nhop; uint32_t rtflags = nhop_get_rtflags(nh); /* * IPv4 over IPv6 * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2), * IPv4 w/ gw * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)], * Direct route: * ('RTA_OIF', 2) */ if (nh->nh_flags & NHF_GATEWAY) dump_rc_nhop_gw(nw, nh); uint32_t uidx = nhop_get_uidx(nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, nh); uint32_t nh_expire = nhop_get_expire(nh); if (nh_expire > 0) nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime); /* In any case, fill outgoing interface */ nlattr_add_u32(nw, NL_RTA_OIF, if_getindex(nh->nh_ifp)); if (rnd->rnd_weight != RT_DEFAULT_WEIGHT) nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight); } /* * Dumps output from a rib command into an rtmsg */ static int dump_px(uint32_t fibnum, const struct nlmsghdr *hdr, const struct rtentry *rt, struct route_nhop_data *rnd, struct nl_writer *nw) { struct rtmsg *rtm; int error = 0; NET_EPOCH_ASSERT(); if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg))) goto enomem; int family = rt_get_family(rt); int rtm_off = nlattr_save_offset(nw); rtm = nlmsg_reserve_object(nw, struct rtmsg); rtm->rtm_family = family; rtm->rtm_dst_len = 0; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; if (fibnum < 255) rtm->rtm_table = (unsigned char)fibnum; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop); rtm->rtm_type = get_rtm_type(rnd->rnd_nhop); nlattr_add_u32(nw, NL_RTA_TABLE, fibnum); int plen = 0; #if defined(INET) || defined(INET6) uint32_t scopeid; #endif switch (family) { #ifdef INET case AF_INET: { struct in_addr addr; rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 4, &addr); break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr addr; rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 16, &addr); break; } #endif default: FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family); error = EAFNOSUPPORT; goto flush; } rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg); if (plen > 0) rtm->rtm_dst_len = plen; dump_rc_nhop(nw, rnd, rtm); if (nlmsg_end(nw)) return (0); enomem: error = ENOMEM; flush: nlmsg_abort(nw); return (error); } static int family_to_group(int family) { switch (family) { case AF_INET: return (RTNLGRP_IPV4_ROUTE); case AF_INET6: return (RTNLGRP_IPV6_ROUTE); } return (0); } static void report_operation(uint32_t fibnum, struct rib_cmd_info *rc, struct nlpcb *nlp, struct nlmsghdr *hdr) { struct nl_writer nw; uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt)); if (nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, 0, false)) { struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE); hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND); switch (rc->rc_cmd) { case RTM_ADD: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; break; case RTM_CHANGE: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_REPLACE; break; case RTM_DELETE: hdr->nlmsg_type = NL_RTM_DELROUTE; break; } dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } rtsock_callback_p->route_f(fibnum, rc); } static void set_scope6(struct sockaddr *sa, struct ifnet *ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); } #endif } struct rta_mpath_nh { struct sockaddr *gw; struct ifnet *ifp; uint8_t rtnh_flags; uint8_t rtnh_weight; }; #define _IN(_field) offsetof(struct rtnexthop, _field) #define _OUT(_field) offsetof(struct rta_mpath_nh, _field) const static struct nlattr_parser nla_p_rtnh[] = { { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip }, { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia }, }; const static struct nlfield_parser nlf_p_rtnh[] = { { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz }, }; #undef _IN #undef _OUT static bool post_p_rtnh(void *_attrs, struct nl_pstate *npt __unused) { struct rta_mpath_nh *attrs = (struct rta_mpath_nh *)_attrs; set_scope6(attrs->gw, attrs->ifp); return (true); } NL_DECLARE_PARSER_EXT(mpath_parser, struct rtnexthop, NULL, nlf_p_rtnh, nla_p_rtnh, post_p_rtnh); struct rta_mpath { u_int num_nhops; struct rta_mpath_nh nhops[0]; }; static int nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { struct rta_mpath *mp; struct rtnexthop *rtnh; uint16_t data_len, len; u_int max_nhops; int error; data_len = nla->nla_len - sizeof(struct nlattr); max_nhops = data_len / sizeof(struct rtnexthop); mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh)); mp->num_nhops = 0; for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) { struct rta_mpath_nh *mpnh; if (__predict_false(rtnh->rtnh_len <= sizeof(*rtnh) || rtnh->rtnh_len > data_len)) { NLMSG_REPORT_ERR_MSG(npt, "%s: bad length %u", __func__, rtnh->rtnh_len); return (EINVAL); } mpnh = &mp->nhops[mp->num_nhops++]; error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser, npt, mpnh); if (error != 0) { NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexthop %u: parse failed", mp->num_nhops - 1); return (error); } len = NL_ITEM_ALIGN(rtnh->rtnh_len); data_len -= len; rtnh = (struct rtnexthop *)((char *)rtnh + len); } if (data_len != 0 || mp->num_nhops == 0) { NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr"); return (EINVAL); } *((struct rta_mpath **)target) = mp; return (0); } struct nl_parsed_route { struct sockaddr *rta_dst; struct sockaddr *rta_gw; struct ifnet *rta_oif; struct rta_mpath *rta_multipath; uint32_t rta_table; uint32_t rta_rtflags; uint32_t rta_nh_id; uint32_t rta_weight; uint32_t rtax_mtu; uint8_t rtm_table; uint8_t rtm_family; uint8_t rtm_dst_len; uint8_t rtm_protocol; uint8_t rtm_type; uint32_t rtm_flags; }; #define _IN(_field) offsetof(struct rtmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_route, _field) static struct nlattr_parser nla_p_rtmetrics[] = { { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics); static const struct nlattr_parser nla_p_rtmsg[] = { { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip }, { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp }, { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip }, { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested }, { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath }, { .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 }, { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 }, { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 }, { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia }, { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 }, }; static const struct nlfield_parser nlf_p_rtmsg[] = { { .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_table), .off_out = _OUT(rtm_table), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 }, }; #undef _IN #undef _OUT static bool post_p_rtmsg(void *_attrs, struct nl_pstate *npt __unused) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_attrs; set_scope6(attrs->rta_dst, attrs->rta_oif); set_scope6(attrs->rta_gw, attrs->rta_oif); return (true); } NL_DECLARE_PARSER_EXT(rtm_parser, struct rtmsg, NULL, nlf_p_rtmsg, nla_p_rtmsg, post_p_rtmsg); struct netlink_walkargs { struct nl_writer *nw; struct route_nhop_data rnd; struct nlmsghdr hdr; struct nlpcb *nlp; uint32_t fibnum; int family; int error; int count; int dumped; int dumped_tables; }; static int dump_rtentry(struct rtentry *rt, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; int error; wa->count++; if (wa->error != 0) return (0); if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp))) return (0); wa->dumped++; rt_get_rnd(rt, &wa->rnd); error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw); IF_DEBUG_LEVEL(LOG_DEBUG3) { char rtbuf[INET6_ADDRSTRLEN + 5]; FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family, "Dump %s, error %d", rt_print_buf(rt, rtbuf, sizeof(rtbuf)), error); } wa->error = error; return (0); } static void dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family) { FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump"); wa->count = 0; wa->dumped = 0; rib_walk(fibnum, family, false, dump_rtentry, wa); wa->dumped_tables++; FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d", wa->count, wa->dumped); } static int dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family) { wa->fibnum = fibnum; if (family == AF_UNSPEC) { for (int i = 0; i < AF_MAX; i++) { if (rt_tables_get_rnh(fibnum, i) != 0) { wa->family = i; dump_rtable_one(wa, fibnum, i); if (wa->error != 0) break; } } } else { if (rt_tables_get_rnh(fibnum, family) != 0) { wa->family = family; dump_rtable_one(wa, fibnum, family); } } return (wa->error); } static int handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { RIB_RLOCK_TRACKER; struct rib_head *rnh; const struct rtentry *rt; struct route_nhop_data rnd; uint32_t fibnum = attrs->rta_table; sa_family_t family = attrs->rtm_family; if (attrs->rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied"); return (EINVAL); } rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return (EAFNOSUPPORT); RIB_RLOCK(rnh); struct sockaddr *dst = attrs->rta_dst; if (attrs->rtm_flags & RTM_F_PREFIX) rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd); else rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } rt_get_rnd(rt, &rnd); rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0); RIB_RUNLOCK(rnh); if (!rt_is_exportable(rt, nlp_get_cred(nlp))) return (ESRCH); IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s", nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)), rt_print_buf(rt, rtbuf, sizeof(rtbuf))); } hdr->nlmsg_type = NL_RTM_NEWROUTE; dump_px(fibnum, hdr, rt, &rnd, npt->nw); return (0); } static int handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family, struct nlmsghdr *hdr, struct nl_writer *nw) { struct netlink_walkargs wa = { .nlp = nlp, .nw = nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_type = NL_RTM_NEWROUTE, .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, }; if (fibnum == RT_TABLE_UNSPEC) { for (int i = 0; i < V_rt_numfibs; i++) { dump_rtable_fib(&wa, fibnum, family); if (wa.error != 0) break; } } else dump_rtable_fib(&wa, fibnum, family); if (wa.error == 0 && wa.dumped_tables == 0) { FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family"); wa.error = ESRCH; // How do we propagate it? } if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (wa.error); } static struct nhop_object * finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror) { /* * The following MUST be filled: * nh_ifp, nh_ifa, nh_gw */ if (nh->gw_sa.sa_family == 0) { /* * Empty gateway. Can be direct route with RTA_OIF set. */ if (nh->nh_ifp != NULL) nhop_set_direct_gw(nh, nh->nh_ifp); else { NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping"); *perror = EINVAL; return (NULL); } /* Both nh_ifp and gateway are set */ } else { /* Gateway is set up, we can derive ifp if not set */ if (nh->nh_ifp == NULL) { uint32_t fibnum = nhop_get_fibnum(nh); uint32_t flags = 0; if (nh->nh_flags & NHF_GATEWAY) flags = RTF_GATEWAY; else if (nh->nh_flags & NHF_HOST) flags = RTF_HOST; struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping"); *perror = EINVAL; return (NULL); } nhop_set_transmit_ifp(nh, ifa->ifa_ifp); } } /* Both nh_ifp and gateway are set */ if (nh->nh_ifa == NULL) { const struct sockaddr *gw_sa = &nh->gw_sa; if (gw_sa->sa_family != dst->sa_family) { /* * Use dst as the target for determining the default * preferred ifa IF * 1) the gateway is link-level (e.g. direct route) * 2) the gateway family is different (e.g. IPv4 over IPv6). */ gw_sa = dst; } struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); if (ifa == NULL) { /* Try link-level ifa. */ gw_sa = &nh->gw_sa; ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping"); *perror = EINVAL; return (NULL); } } nhop_set_src(nh, ifa); } return (nhop_get_nhop(nh, perror)); } static int get_pxflag(const struct nl_parsed_route *attrs) { int pxflag = 0; switch (attrs->rtm_family) { case AF_INET: if (attrs->rtm_dst_len == 32) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; case AF_INET6: if (attrs->rtm_dst_len == 128) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; } return (pxflag); } static int get_op_flags(int nlm_flags) { int op_flags = 0; op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0; op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0; op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0; op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0; return (op_flags); } #ifdef ROUTE_MPATH static int create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh, struct nl_pstate *npt, struct nhop_object **pnh) { int error; if (mpnh->gw == NULL) return (EINVAL); struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) return (ENOMEM); error = nl_set_nexthop_gw(nh, mpnh->gw, mpnh->ifp, npt); if (error != 0) { nhop_free(nh); return (error); } if (mpnh->ifp != NULL) nhop_set_transmit_ifp(nh, mpnh->ifp); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); *pnh = finalize_nhop(nh, attrs->rta_dst, &error); return (error); } #endif static struct nhop_object * create_nexthop_from_attrs(struct nl_parsed_route *attrs, struct nl_pstate *npt, int *perror) { struct nhop_object *nh = NULL; int error = 0; if (attrs->rta_multipath != NULL) { #ifdef ROUTE_MPATH /* Multipath w/o explicit nexthops */ int num_nhops = attrs->rta_multipath->num_nhops; struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops); for (int i = 0; i < num_nhops; i++) { struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i]; error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh); if (error != 0) { for (int j = 0; j < i; j++) nhop_free(wn[j].nh); break; } wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1; } if (error == 0) { struct rib_head *rh = nhop_get_rh(wn[0].nh); struct nhgrp_object *nhg; nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family, wn, num_nhops, perror); if (nhg != NULL) { if (attrs->rtm_protocol > RTPROT_STATIC) nhgrp_set_origin(nhg, attrs->rtm_protocol); nhg = nhgrp_get_nhgrp(nhg, perror); } for (int i = 0; i < num_nhops; i++) nhop_free(wn[i].nh); if (nhg != NULL) return ((struct nhop_object *)nhg); error = *perror; } #else error = ENOTSUP; #endif *perror = error; } else { nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) { *perror = ENOMEM; return (NULL); } if (attrs->rta_gw != NULL) { *perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt); if (*perror != 0) { nhop_free(nh); return (NULL); } } if (attrs->rta_oif != NULL) nhop_set_transmit_ifp(nh, attrs->rta_oif); if (attrs->rtax_mtu != 0) nhop_set_mtu(nh, attrs->rtax_mtu, true); if (attrs->rta_rtflags & RTF_BROADCAST) nhop_set_broadcast(nh, true); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); switch (attrs->rtm_type) { case RTN_UNICAST: break; case RTN_BLACKHOLE: nhop_set_blackhole(nh, RTF_BLACKHOLE); break; case RTN_PROHIBIT: case RTN_UNREACHABLE: nhop_set_blackhole(nh, RTF_REJECT); break; /* TODO: return ENOTSUP for other types if strict option is set */ } nh = finalize_nhop(nh, attrs->rta_dst, perror); } return (nh); } static int rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc = {}; struct nhop_object *nh = NULL; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); /* Check if we have enough data */ if (attrs.rta_dst == NULL) { NL_LOG(LOG_DEBUG, "missing RTA_DST"); return (EINVAL); } /* pre-2.6.19 Linux API compatibility */ if (attrs.rtm_table > 0 && attrs.rta_table == 0) attrs.rta_table = attrs.rtm_table; if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (attrs.rta_nh_id != 0) { /* Referenced uindex */ int pxflag = get_pxflag(&attrs); nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id, pxflag, &error); if (error != 0) return (error); } else { nh = create_nexthop_from_attrs(&attrs, npt, &error); if (error != 0) { NL_LOG(LOG_DEBUG, "Error creating nexthop"); return (error); } } if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0) attrs.rta_weight = RT_DEFAULT_WEIGHT; struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight }; int op_flags = get_op_flags(hdr->nlmsg_flags); error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, &rnd, op_flags, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data; if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw)) return (0); if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp)) return (0); return (1); } static int rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); if (attrs.rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set"); return (ESRCH); } if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } error = rib_del_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, path_match_func, &attrs, (attrs.rta_rtflags & RTF_PINNED) ? RTM_F_FORCE : 0, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); if (attrs.rta_table >= V_rt_numfibs || attrs.rtm_family > AF_MAX) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (hdr->nlmsg_flags & NLM_F_DUMP) error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw); else error = handle_rtm_getroute(nlp, &attrs, hdr, npt); return (error); } void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) { struct nl_writer nw; int family, nlm_flags = 0; family = rt_get_family(rc->rc_rt); /* XXX: check if there are active listeners first */ /* TODO: consider passing PID/type/seq */ switch (rc->rc_cmd) { case RTM_ADD: nlm_flags = NLM_F_EXCL | NLM_F_CREATE; break; case RTM_CHANGE: nlm_flags = NLM_F_REPLACE; break; case RTM_DELETE: nlm_flags = 0; break; } IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "received event %s for %s / nlm_flags=%X", rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)), nlm_flags); } struct nlmsghdr hdr = { .nlmsg_flags = nlm_flags, .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd), }; struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; uint32_t group_id = family_to_group(family); if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id, 0, false)) { NL_LOG(LOG_DEBUG, "error allocating event buffer"); return; } dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_GETROUTE, .name = "RTM_GETROUTE", .cb = &rtnl_handle_getroute, .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_DELROUTE, .name = "RTM_DELROUTE", .cb = &rtnl_handle_delroute, .priv = PRIV_NET_ROUTE, + .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_NEWROUTE, .name = "RTM_NEWROUTE", .cb = &rtnl_handle_newroute, .priv = PRIV_NET_ROUTE, + .flags = RTNL_F_ALLOW_NONVNET_JAIL, } }; static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser}; void rtnl_routes_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, nitems(cmd_handlers)); } diff --git a/sys/sys/jail.h b/sys/sys/jail.h index 90fcf8cd5a47..08caa9f49270 100644 --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -1,501 +1,502 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_JAIL_H_ #define _SYS_JAIL_H_ #ifdef _KERNEL struct jail_v0 { u_int32_t version; char *path; char *hostname; u_int32_t ip_number; }; #endif struct jail { uint32_t version; char *path; char *hostname; char *jailname; uint32_t ip4s; uint32_t ip6s; struct in_addr *ip4; struct in6_addr *ip6; }; #define JAIL_API_VERSION 2 /* * For all xprison structs, always keep the pr_version an int and * the first variable so userspace can easily distinguish them. */ #ifndef _KERNEL struct xprison_v1 { int pr_version; int pr_id; char pr_path[MAXPATHLEN]; char pr_host[MAXHOSTNAMELEN]; u_int32_t pr_ip; }; #endif struct xprison { int pr_version; int pr_id; int pr_state; cpusetid_t pr_cpusetid; char pr_path[MAXPATHLEN]; char pr_host[MAXHOSTNAMELEN]; char pr_name[MAXHOSTNAMELEN]; uint32_t pr_ip4s; uint32_t pr_ip6s; #if 0 /* * sizeof(xprison) will be malloced + size needed for all * IPv4 and IPv6 addesses. Offsets are based numbers of addresses. */ struct in_addr pr_ip4[]; struct in6_addr pr_ip6[]; #endif }; #define XPRISON_VERSION 3 enum prison_state { PRISON_STATE_INVALID = 0, /* New prison, not ready to be seen */ PRISON_STATE_ALIVE, /* Current prison, visible to all */ PRISON_STATE_DYING /* Removed but holding resources, */ }; /* optionally visible. */ /* * Flags for jail_set and jail_get. */ #define JAIL_CREATE 0x01 /* Create jail if it doesn't exist */ #define JAIL_UPDATE 0x02 /* Update parameters of existing jail */ #define JAIL_ATTACH 0x04 /* Attach to jail upon creation */ #define JAIL_DYING 0x08 /* Allow getting a dying jail */ #define JAIL_SET_MASK 0x0f /* JAIL_DYING is deprecated/ignored here */ #define JAIL_GET_MASK 0x08 #define JAIL_SYS_DISABLE 0 #define JAIL_SYS_NEW 1 #define JAIL_SYS_INHERIT 2 #ifndef _KERNEL struct iovec; __BEGIN_DECLS int jail(struct jail *); int jail_set(struct iovec *, unsigned int, int); int jail_get(struct iovec *, unsigned int, int); int jail_attach(int); int jail_remove(int); __END_DECLS #else /* _KERNEL */ #include #include #include #include #include #define JAIL_MAX 999999 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PRISON); #endif #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_PRISON) #include #define HOSTUUIDLEN 64 #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" #define OSRELEASELEN 32 #define JAIL_META_PRIVATE "meta" #define JAIL_META_SHARED "env" struct racct; struct prison_racct; typedef enum { PR_INET = 0, PR_INET6 = 1, PR_FAMILY_MAX = 2, } pr_family_t; /* * This structure describes a prison. It is pointed to by all struct * ucreds's of the inmates. pr_ref keeps track of them and is used to * delete the structure when the last inmate is dead. * * Lock key: * (a) allprison_lock * (A) allproc_lock * (c) set only during creation before the structure is shared, no mutex * required to read * (m) locked by pr_mtx * (p) locked by pr_mtx, and also at least shared allprison_lock required * to update * (q) locked by both pr_mtx and allprison_lock * (r) atomic via refcount(9), pr_mtx and allprison_lock required to * decrement to zero * (n) read access granted with the network epoch */ struct prison { TAILQ_ENTRY(prison) pr_list; /* (a) all prisons */ int pr_id; /* (c) prison id */ volatile u_int pr_ref; /* (r) refcount */ volatile u_int pr_uref; /* (r) user (alive) refcount */ unsigned pr_flags; /* (p) PR_* flags */ LIST_HEAD(, prison) pr_children; /* (a) list of child jails */ LIST_HEAD(, proc) pr_proclist; /* (A) list of jailed processes */ LIST_ENTRY(prison) pr_sibling; /* (a) next in parent's list */ struct prison *pr_parent; /* (c) containing jail */ struct mtx pr_mtx; struct task pr_task; /* (c) destroy task */ struct osd pr_osd; /* (p) additional data */ struct cpuset *pr_cpuset; /* (p) cpuset */ struct vnet *pr_vnet; /* (c) network stack */ struct vnode *pr_root; /* (c) vnode to rdir */ struct prison_ip *pr_addrs[PR_FAMILY_MAX]; /* (p,n) IPs of jail */ struct prison_racct *pr_prison_racct; /* (c) racct jail proxy */ void *pr_sparep[3]; int pr_childcount; /* (a) number of child jails */ int pr_childmax; /* (p) maximum child jails */ unsigned pr_allow; /* (p) PR_ALLOW_* flags */ int pr_securelevel; /* (p) securelevel */ int pr_enforce_statfs; /* (p) statfs permission */ int pr_devfs_rsnum; /* (p) devfs ruleset */ enum prison_state pr_state; /* (q) state in life cycle */ volatile int pr_exportcnt; /* (r) count of mount exports */ int pr_spare; int pr_osreldate; /* (c) kern.osreldate value */ unsigned long pr_hostid; /* (p) jail hostid */ char pr_name[MAXHOSTNAMELEN]; /* (p) admin jail name */ char pr_path[MAXPATHLEN]; /* (c) chroot path */ char pr_hostname[MAXHOSTNAMELEN]; /* (p) jail hostname */ char pr_domainname[MAXHOSTNAMELEN]; /* (p) jail domainname */ char pr_hostuuid[HOSTUUIDLEN]; /* (p) jail hostuuid */ char pr_osrelease[OSRELEASELEN]; /* (c) kern.osrelease value */ }; struct prison_racct { LIST_ENTRY(prison_racct) prr_next; char prr_name[MAXHOSTNAMELEN]; u_int prr_refcount; struct racct *prr_racct; }; #endif /* _KERNEL || _WANT_PRISON */ #ifdef _KERNEL /* Flag bits set via options */ #define PR_PERSIST 0x00000001 /* Can exist without processes */ #define PR_HOST 0x00000002 /* Virtualize hostname et al */ #define PR_IP4_USER 0x00000004 /* Restrict IPv4 addresses */ #define PR_IP6_USER 0x00000008 /* Restrict IPv6 addresses */ #define PR_VNET 0x00000010 /* Virtual network stack */ #define PR_IP4_SADDRSEL 0x00000080 /* Do IPv4 src addr sel. or use the */ /* primary jail address. */ #define PR_IP6_SADDRSEL 0x00000100 /* Do IPv6 src addr sel. or use the */ /* primary jail address. */ /* Internal flag bits */ #define PR_REMOVE 0x01000000 /* In process of being removed */ #define PR_IP4 0x02000000 /* IPv4 restricted or disabled */ /* by this jail or an ancestor */ #define PR_IP6 0x04000000 /* IPv6 restricted or disabled */ /* by this jail or an ancestor */ #define PR_COMPLETE_PROC 0x08000000 /* prison_complete called from */ /* prison_proc_free, releases uref */ /* * Flags for pr_allow * Bits not noted here may be used for dynamic allow.mount.xxxfs. */ #define PR_ALLOW_SET_HOSTNAME 0x00000001 #define PR_ALLOW_SYSVIPC 0x00000002 #define PR_ALLOW_RAW_SOCKETS 0x00000004 #define PR_ALLOW_CHFLAGS 0x00000008 #define PR_ALLOW_MOUNT 0x00000010 #define PR_ALLOW_QUOTAS 0x00000020 #define PR_ALLOW_SOCKET_AF 0x00000040 #define PR_ALLOW_MLOCK 0x00000080 #define PR_ALLOW_READ_MSGBUF 0x00000100 #define PR_ALLOW_UNPRIV_DEBUG 0x00000200 #define PR_ALLOW_SUSER 0x00000400 #define PR_ALLOW_RESERVED_PORTS 0x00008000 #define PR_ALLOW_KMEM_ACCESS 0x00010000 /* reserved, not used yet */ #define PR_ALLOW_NFSD 0x00020000 #define PR_ALLOW_EXTATTR 0x00040000 #define PR_ALLOW_ADJTIME 0x00080000 #define PR_ALLOW_SETTIME 0x00100000 -#define PR_ALLOW_ALL_STATIC 0x001f87ff +#define PR_ALLOW_ROUTING 0x00200000 +#define PR_ALLOW_ALL_STATIC 0x003f87ff /* * PR_ALLOW_DIFFERENCES determines which flags are able to be * different between the parent and child jail upon creation. */ #define PR_ALLOW_DIFFERENCES (PR_ALLOW_UNPRIV_DEBUG) /* * OSD methods */ #define PR_METHOD_CREATE 0 #define PR_METHOD_GET 1 #define PR_METHOD_SET 2 #define PR_METHOD_CHECK 3 #define PR_METHOD_ATTACH 4 #define PR_METHOD_REMOVE 5 #define PR_MAXMETHOD 6 /* * Lock/unlock a prison. * XXX These exist not so much for general convenience, but to be useable in * the FOREACH_PRISON_DESCENDANT_LOCKED macro which can't handle them in * non-function form as currently defined. */ static __inline void prison_lock(struct prison *pr) { mtx_lock(&pr->pr_mtx); } static __inline void prison_unlock(struct prison *pr) { mtx_unlock(&pr->pr_mtx); } /* Traverse a prison's immediate children. */ #define FOREACH_PRISON_CHILD(ppr, cpr) \ LIST_FOREACH(cpr, &(ppr)->pr_children, pr_sibling) /* * Preorder traversal of all of a prison's descendants. * This ugly loop allows the macro to be followed by a single block * as expected in a looping primitive. */ #define FOREACH_PRISON_DESCENDANT(ppr, cpr, descend) \ for ((cpr) = (ppr), (descend) = 1; \ ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children)) \ ? LIST_FIRST(&(cpr)->pr_children) \ : ((cpr) == (ppr) \ ? NULL \ : (((descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ ? LIST_NEXT(cpr, pr_sibling) \ : (cpr)->pr_parent))));) \ if (!(descend)) \ ; \ else /* * As above, but lock descendants on the way down and unlock on the way up. */ #define FOREACH_PRISON_DESCENDANT_LOCKED(ppr, cpr, descend) \ for ((cpr) = (ppr), (descend) = 1; \ ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children)) \ ? LIST_FIRST(&(cpr)->pr_children) \ : ((cpr) == (ppr) \ ? NULL \ : ((prison_unlock(cpr), \ (descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ ? LIST_NEXT(cpr, pr_sibling) \ : (cpr)->pr_parent))));) \ if ((descend) ? (prison_lock(cpr), 0) : 1) \ ; \ else /* * As above, but also keep track of the level descended to. */ #define FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(ppr, cpr, descend, level)\ for ((cpr) = (ppr), (descend) = 1, (level) = 0; \ ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children)) \ ? (level++, LIST_FIRST(&(cpr)->pr_children)) \ : ((cpr) == (ppr) \ ? NULL \ : ((prison_unlock(cpr), \ (descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ ? LIST_NEXT(cpr, pr_sibling) \ : (level--, (cpr)->pr_parent)))));) \ if ((descend) ? (prison_lock(cpr), 0) : 1) \ ; \ else /* * Traverse a prison's descendants, visiting both preorder and postorder. */ #define FOREACH_PRISON_DESCENDANT_PRE_POST(ppr, cpr, descend) \ for ((cpr) = (ppr), (descend) = 1; \ ((cpr) = (descend) \ ? ((descend) = !LIST_EMPTY(&(cpr)->pr_children)) \ ? LIST_FIRST(&(cpr)->pr_children) \ : (cpr) \ : ((descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ ? LIST_NEXT(cpr, pr_sibling) \ : cpr->pr_parent) != (ppr);) /* * Attributes of the physical system, and the root of the jail tree. */ extern struct prison prison0; TAILQ_HEAD(prisonlist, prison); extern struct prisonlist allprison; extern struct sx allprison_lock; /* * Sysctls to describe jail parameters. */ SYSCTL_DECL(_security_jail); SYSCTL_DECL(_security_jail_param); #define SYSCTL_JAIL_PARAM_DECL(name) \ SYSCTL_DECL(_security_jail_param_##name) #define SYSCTL_JAIL_PARAM(module, param, type, fmt, descr) \ SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param, \ (type) | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, fmt, descr) #define SYSCTL_JAIL_PARAM_STRING(module, param, access, len, descr) \ SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param, \ CTLTYPE_STRING | CTLFLAG_MPSAFE | (access), NULL, len, \ sysctl_jail_param, "A", descr) #define SYSCTL_JAIL_PARAM_STRUCT(module, param, access, len, fmt, descr) \ SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param, \ CTLTYPE_STRUCT | CTLFLAG_MPSAFE | (access), NULL, len, \ sysctl_jail_param, fmt, descr) #define SYSCTL_JAIL_PARAM_NODE(module, descr) \ SYSCTL_NODE(_security_jail_param, OID_AUTO, module, CTLFLAG_MPSAFE, \ 0, descr) #define SYSCTL_JAIL_PARAM_SUBNODE(parent, module, descr) \ SYSCTL_NODE(_security_jail_param_##parent, OID_AUTO, module, \ CTLFLAG_MPSAFE, 0, descr) #define SYSCTL_JAIL_PARAM_SYS_NODE(module, access, descr) \ SYSCTL_JAIL_PARAM_NODE(module, descr); \ SYSCTL_JAIL_PARAM(_##module, , CTLTYPE_INT | (access), "E,jailsys", \ descr) #define SYSCTL_JAIL_PARAM_SYS_SUBNODE(parent, module, access, descr) \ SYSCTL_JAIL_PARAM_SUBNODE(parent, module, descr); \ SYSCTL_JAIL_PARAM(_##parent##_##module, , CTLTYPE_INT | (access), \ "E,jailsys", descr) /* * Kernel support functions for jail(). */ struct ucred; struct mount; struct sockaddr; struct statfs; struct vfsconf; /* * Return 1 if the passed credential is in a jail, otherwise 0. */ #define jailed(cred) (cred->cr_prison != &prison0) bool jailed_without_vnet(struct ucred *); void getcredhostname(struct ucred *, char *, size_t); void getcreddomainname(struct ucred *, char *, size_t); void getcredhostuuid(struct ucred *, char *, size_t); void getcredhostid(struct ucred *, unsigned long *); void getjailname(struct ucred *cred, char *name, size_t len); void prison0_init(void); bool prison_allow(struct ucred *, unsigned); int prison_check(struct ucred *cred1, struct ucred *cred2); bool prison_check_nfsd(struct ucred *cred); bool prison_owns_vnet(struct ucred *); int prison_canseemount(struct ucred *cred, struct mount *mp); void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp); struct prison *prison_find(int prid); struct prison *prison_find_child(struct prison *, int); struct prison *prison_find_name(struct prison *, const char *); bool prison_flag(struct ucred *, unsigned); void prison_free(struct prison *pr); void prison_free_locked(struct prison *pr); void prison_hold(struct prison *pr); void prison_hold_locked(struct prison *pr); void prison_proc_hold(struct prison *); void prison_proc_free(struct prison *); void prison_proc_link(struct prison *, struct proc *); void prison_proc_unlink(struct prison *, struct proc *); void prison_proc_iterate(struct prison *, void (*)(struct proc *, void *), void *); void prison_set_allow(struct ucred *cred, unsigned flag, int enable); bool prison_ischild(struct prison *, struct prison *); bool prison_isalive(const struct prison *); bool prison_isvalid(struct prison *); #if defined(INET) || defined(INET6) int prison_ip_check(const struct prison *, const pr_family_t, const void *); const void *prison_ip_get0(const struct prison *, const pr_family_t); u_int prison_ip_cnt(const struct prison *, const pr_family_t); #endif #ifdef INET bool prison_equal_ip4(struct prison *, struct prison *); int prison_get_ip4(struct ucred *cred, struct in_addr *ia); int prison_local_ip4(struct ucred *cred, struct in_addr *ia); int prison_remote_ip4(struct ucred *cred, struct in_addr *ia); int prison_check_ip4(const struct ucred *, const struct in_addr *); int prison_check_ip4_locked(const struct prison *, const struct in_addr *); bool prison_saddrsel_ip4(struct ucred *, struct in_addr *); int prison_qcmp_v4(const void *, const void *); bool prison_valid_v4(const void *); #endif #ifdef INET6 bool prison_equal_ip6(struct prison *, struct prison *); int prison_get_ip6(struct ucred *, struct in6_addr *); int prison_local_ip6(struct ucred *, struct in6_addr *, int); int prison_remote_ip6(struct ucred *, struct in6_addr *); int prison_check_ip6(const struct ucred *, const struct in6_addr *); int prison_check_ip6_locked(const struct prison *, const struct in6_addr *); bool prison_saddrsel_ip6(struct ucred *, struct in6_addr *); int prison_qcmp_v6(const void *, const void *); bool prison_valid_v6(const void *); #endif int prison_check_af(struct ucred *cred, int af); int prison_if(struct ucred *cred, const struct sockaddr *sa); char *prison_name(struct prison *, struct prison *); int prison_priv_check(struct ucred *cred, int priv); int sysctl_jail_param(SYSCTL_HANDLER_ARGS); unsigned prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, const char *descr); void prison_add_vfs(struct vfsconf *vfsp); void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3); struct prison_racct *prison_racct_find(const char *name); void prison_racct_hold(struct prison_racct *prr); void prison_racct_free(struct prison_racct *prr); #endif /* _KERNEL */ #endif /* !_SYS_JAIL_H_ */ diff --git a/usr.sbin/jail/jail.8 b/usr.sbin/jail/jail.8 index 3426f4f0d600..8d7bc25a8694 100644 --- a/usr.sbin/jail/jail.8 +++ b/usr.sbin/jail/jail.8 @@ -1,1575 +1,1578 @@ .\" Copyright (c) 2000, 2003 Robert N. M. Watson .\" Copyright (c) 2008-2012 James Gritton .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd September 19, 2024 +.Dd May 11, 2025 .Dt JAIL 8 .Os .Sh NAME .Nm jail .Nd "manage system jails" .Sh SYNOPSIS .Ss From Configuration File .Nm .Op Fl cm .Op Fl Cdqv .Op Fl f Ar conf_file .Op Fl p Ar limit .Op Ar jail .Nm .Op Fl r .Op Fl Cqv .Op Fl f Ar conf_file .Op Fl p Ar limit .Op Cm * | Ar jail ... .Ss Without Configuration File .Nm .Op Fl cm .Op Fl dhilqv .Op Fl J Ar jid_file .Op Fl u Ar username .Op Fl U Ar username .Ar param Ns = Ns Ar value ... .Op Cm command Ns = Ns Ar command ... .Nm .Op Fl rR .Op Fl qv .Op Cm * | Ar jail ... .Ss Show Parameters .Nm .Op Fl f Ar conf_file .Fl e .Ar separator .Ss Backward Compatibility .Nm .Op Fl dhilqv .Op Fl J Ar jid_file .Op Fl u Ar username .Op Fl U Ar username .Op Fl n Ar jailname .Op Fl s Ar securelevel .Ar path hostname ip Ns Op Cm \&, Ns Ar ... .Ar command ... .Sh DESCRIPTION The .Nm utility creates new jails, or modifies or removes existing jails. It can also print a list of configured jails and their parameters. A jail .Pq or Dq prison is specified via parameters on the command line, or in the .Xr jail.conf 5 file. .Pp At least one of the options .Fl c , .Fl e , .Fl m or .Fl r must be specified. These options are used alone or in combination to describe the operation to perform: .Bl -tag -width indent .It Fl c Create a new jail. The jail .Va jid and .Va name parameters (if specified on the command line) must not refer to an existing jail. .It Fl e Ar separator Exhibit a list of all configured non-wildcard jails and their parameters. No jail creation, modification or removal performed if this option is used. The .Ar separator string is used to separate parameters. Use .Xr jls 8 utility to list running jails. .It Fl m Modify an existing jail. One of the .Va jid or .Va name parameters must exist and refer to an existing jail. Some parameters may not be changed on a running jail. .It Fl r Remove the .Ar jail specified by jid or name. All jailed processes are killed, and all jails that are children of this jail are also removed. .It Fl rc Restart an existing jail. The jail is first removed and then re-created, as if .Dq Nm Fl r and .Dq Nm Fl c were run in succession. .It Fl cm Create a jail if it does not exist, or modify the jail if it does exist. .It Fl mr Modify an existing jail. The jail may be restarted if necessary to modify parameters than could not otherwise be changed. .It Fl cmr Create a jail if it doesn't exist, or modify (and possibly restart) the jail if it does exist. .El .Pp Other available options are: .Bl -tag -width indent .It Fl C Clean up after an already-removed jail, running commands and operations that are typically run following jail removal. .It Fl f Ar conf_file Use configuration file .Ar conf_file instead of the default .Pa /etc/jail.conf . .It Fl h Resolve the .Va host.hostname parameter (or .Va hostname ) and add all IP addresses returned by the resolver to the list of addresses for this jail. This is equivalent to the .Va ip_hostname parameter. .It Fl i Output (only) the jail identifier of the newly created jail(s). This implies the .Fl q option. .It Fl J Ar jid_file Write a .Ar jid_file file, containing the parameters used to start the jail. .It Fl l Run commands in a clean environment. This is deprecated and is equivalent to the exec.clean parameter. .It Fl n Ar jailname Set the jail's name. This is deprecated and is equivalent to the .Va name parameter. .It Fl p Ar limit Limit the number of commands from .Va exec.* that can run simultaneously. .It Fl q Suppress the message printed whenever a jail is created, modified or removed. Only error messages will be printed. .It Fl R A variation of the .Fl r option that removes an existing jail without using the configuration file. No removal-related parameters for this jail will be used \(em the jail will simply be removed. .It Fl s Ar securelevel Set the .Va kern.securelevel MIB entry to the specified value inside the newly created jail. This is deprecated and is equivalent to the .Va securelevel parameter. .It Fl u Ar username The user name from host environment as whom jailed commands should run. This is deprecated and is equivalent to the .Va exec.jail_user and .Va exec.system_jail_user parameters. .It Fl U Ar username The user name from the jailed environment as whom jailed commands should run. This is deprecated and is equivalent to the .Va exec.jail_user parameter. .It Fl v Print a message on every operation, such as running commands and mounting filesystems. .It Fl d This is deprecated and is equivalent to the .Va allow.dying parameter, which is also deprecated. It used to allow making changes to a .Va dying jail. Now such jails are always replaced when a new jail is created with the same .Va jid or .Va name . .El .Pp If no arguments are given after the options, the operation (except remove) will be performed on all jails specified in the .Xr jail.conf 5 file. A single argument of a jail name will operate only on the specified jail. The .Fl r and .Fl R options can also remove running jails that aren't in the .Xr jail.conf 5 file, specified by name or jid. .Pp An argument of .Dq * is a wildcard that will operate on all jails, regardless of whether they appear in .Xr jail.conf 5 ; this is the surest way for .Fl r to remove all jails. If hierarchical jails exist, a partial-matching wildcard definition may be specified. For example, an argument of .Dq foo.* would apply to jails with names like .Dq foo.bar and .Dq foo.bar.baz . .Pp A jail may also be specified via parameters directly on the command line in .Dq name=value form, ignoring the contents of .Xr jail.conf 5 . For backward compatibility, the command line may also have four fixed parameters, without names: .Ar path , .Ar hostname , .Ar ip , and .Ar command . .Ss Jail Parameters Parameters in the .Xr jail.conf 5 file, or on the command line, are generally of the form .Dq name=value . Some parameters are boolean, and do not have a value but are set by the name alone with or without a .Dq no prefix, e.g. .Va persist or .Va nopersist . They can also be given the values .Dq true and .Dq false . Other parameters may have more than one value, specified as a comma-separated list, or with .Dq += in the configuration file (see .Xr jail.conf 5 for details). List-based parameters may also be specified multiple times on the command line, i.e., .Dq name=value1,value2 and .Dq name=value1 name=value2 are equivalent for such parameters. .Pp The .Nm utility recognizes two classes of parameters. There are the true jail parameters that are passed to the kernel when the jail is created, which can be seen with .Xr jls 8 , and can (usually) be changed with .Dq Nm Fl m . Then there are pseudo-parameters that are only used by .Nm itself. .Pp Jails have a set of core parameters, and kernel modules can add their own jail parameters. The current set of available parameters can be retrieved via .Dq Nm sysctl Fl d Va security.jail.param . Any parameters not set will be given default values, often based on the current environment. The core parameters are: .Bl -tag -width indent .It Va jid The jail identifier. This will be assigned automatically to a new jail (or can be explicitly set), and can be used to identify the jail for later modification, or for such commands as .Xr jls 8 or .Xr jexec 8 . .It Va name The jail name. This is an arbitrary string that identifies a jail (except it may not contain a .Sq \&. ) . Like the .Va jid , it can be passed to later .Nm commands, or to .Xr jls 8 or .Xr jexec 8 . If no .Va name is supplied, a default is assumed that is the same as the .Va jid . The .Va name parameter is implied by the .Xr jail.conf 5 file format, and need not be explicitly set when using the configuration file. .It Va path The directory which is to be the root of the jail. Any commands run inside the jail, either by .Nm or from .Xr jexec 8 , are run from this directory. .It Va ip4.addr A list of IPv4 addresses assigned to the jail. If this is set, the jail is restricted to using only these addresses. Any attempts to use other addresses fail, and attempts to use wildcard addresses silently use the jailed address instead. For IPv4 the first address given will be used as the source address when source address selection on unbound sockets cannot find a better match. It is only possible to start multiple jails with the same IP address if none of the jails has more than this single overlapping IP address assigned to itself. .It Va ip4.saddrsel A boolean option to change the formerly mentioned behaviour and disable IPv4 source address selection for the jail in favour of the primary IPv4 address of the jail. Source address selection is enabled by default for all jails and the .Va ip4.nosaddrsel setting of a parent jail is not inherited for any child jails. .It Va ip4 Control the availability of IPv4 addresses. Possible values are .Dq inherit to allow unrestricted access to all system addresses, .Dq new to restrict addresses via .Va ip4.addr , and .Dq disable to stop the jail from using IPv4 entirely. Setting the .Va ip4.addr parameter implies a value of .Dq new . .It Va ip6.addr , Va ip6.saddrsel , Va ip6 A set of IPv6 options for the jail, the counterparts to .Va ip4.addr , .Va ip4.saddrsel and .Va ip4 above. .It Va vnet Create the jail with its own virtual network stack, with its own network interfaces, addresses, routing table, etc. The kernel must have been compiled with the .Sy VIMAGE option for this to be available. Possible values are .Dq inherit to use the system network stack, possibly with restricted IP addresses, and .Dq new to create a new network stack. .It Va host.hostname The hostname of the jail. Other similar parameters are .Va host.domainname , .Va host.hostuuid and .Va host.hostid . .It Va host Set the origin of hostname and related information. Possible values are .Dq inherit to use the system information and .Dq new for the jail to use the information from the above fields. Setting any of the above fields implies a value of .Dq new . .It Va securelevel The value of the jail's .Va kern.securelevel sysctl. A jail never has a lower securelevel than its parent system, but by setting this parameter it may have a higher one. If the system securelevel is changed, any jail securelevels will be at least as secure. .It Va devfs_ruleset The number of the devfs ruleset that is enforced for mounting devfs in this jail. A value of zero (default) means no ruleset is enforced. Descendant jails inherit the parent jail's devfs ruleset enforcement. Mounting devfs inside a jail is possible only if the .Va allow.mount and .Va allow.mount.devfs permissions are effective and .Va enforce_statfs is set to a value lower than 2. Devfs rules and rulesets cannot be viewed or modified from inside a jail. .Pp NOTE: It is important that only appropriate device nodes in devfs be exposed to a jail; access to disk devices in the jail may permit processes in the jail to bypass the jail sandboxing by modifying files outside of the jail. See .Xr devfs 8 for information on how to use devfs rules to limit access to entries in the per-jail devfs. A simple devfs ruleset for jails is available as ruleset #4 in .Pa /etc/defaults/devfs.rules . .It Va children.max The number of child jails allowed to be created by this jail (or by other jails under this jail). This limit is zero by default, indicating the jail is not allowed to create child jails. See the .Sx "Hierarchical Jails" section for more information. .It Va children.cur The number of descendants of this jail, including its own child jails and any jails created under them. .It Va enforce_statfs This determines what information processes in a jail are able to get about mount points. It affects the behaviour of the following syscalls: .Xr statfs 2 , .Xr fstatfs 2 , .Xr getfsstat 2 , and .Xr fhstatfs 2 (as well as similar compatibility syscalls). When set to 0, all mount points are available without any restrictions. When set to 1, only mount points below the jail's chroot directory are visible. In addition to that, the path to the jail's chroot directory is removed from the front of their pathnames. When set to 2 (default), above syscalls can operate only on a mount-point where the jail's chroot directory is located. .It Va persist Setting this boolean parameter allows a jail to exist without any processes. Normally, a command is run as part of jail creation, and then the jail is destroyed as its last process exits. A new jail must have either the .Va persist parameter or .Va exec.start or .Va command pseudo-parameter set. .It Va cpuset.id The ID of the cpuset associated with this jail (read-only). .It Va dying This is true if the jail is in the process of shutting down (read-only). .It Va parent The .Va jid of the parent of this jail, or zero if this is a top-level jail (read-only). .It Va osrelease The string for the jail's .Va kern.osrelease sysctl and uname -r. .It Va osreldate The number for the jail's .Va kern.osreldate and uname -K. .It Va meta , Va env An arbitrary string associated with the jail. Its maximum buffer size is controlled by the global .Va security.jail.meta_maxbufsize sysctl, which can only be adjusted by the non-jailed root user. While the .Va meta is hidden from the jail, the .Va env is readable through the .Va security.jail.env sysctl. .Pp Each buffer can be treated as a set of key=value\\n strings. In order to add or replace a specific key the .Va meta.keyname=value or .Va env.keyname=value parameter notations must be used. While .Va meta.keyname= or .Va env.keyname= reset the value to an empty string, the .Va meta.keyname or .Va env.keyname notations, without the equal sign, remove the given key. Respectively, the same .Va meta.keyname or .Va env.keyname notations are used to query a specific key while reading jail parameters using such commands as .Xr jls 8 . Multiple keys can be queried or modified with a single command. .It Va allow.* Some restrictions of the jail environment may be set on a per-jail basis. With the exception of .Va allow.set_hostname and .Va allow.reserved_ports , these boolean parameters are off by default. .Bl -tag -width indent .It Va allow.set_hostname The jail's hostname may be changed via .Xr hostname 1 or .Xr sethostname 3 . .It Va allow.sysvipc A process within the jail has access to System V IPC primitives. This is deprecated in favor of the per-module parameters (see below). When this parameter is set, it is equivalent to setting .Va sysvmsg , .Va sysvsem , and .Va sysvshm all to .Dq inherit . .It Va allow.raw_sockets The jail root is allowed to create raw sockets. Setting this parameter allows utilities like .Xr ping 8 and .Xr traceroute 8 to operate inside the jail. If this is set, the source IP addresses are enforced to comply with the IP address bound to the jail, regardless of whether or not the .Dv IP_HDRINCL flag has been set on the socket. Since raw sockets can be used to configure and interact with various network subsystems, extra caution should be used where privileged access to jails is given out to untrusted parties. .It Va allow.chflags Normally, privileged users inside a jail are treated as unprivileged by .Xr chflags 2 . When this parameter is set, such users are treated as privileged, and may manipulate system file flags subject to the usual constraints on .Va kern.securelevel . .It Va allow.mount privileged users inside the jail will be able to mount and unmount file system types marked as jail-friendly. The .Xr lsvfs 1 command can be used to find file system types available for mount from within a jail. This permission is effective only if .Va enforce_statfs is set to a value lower than 2. .It Va allow.mount.devfs privileged users inside the jail will be able to mount and unmount the devfs file system. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. The devfs ruleset should be restricted from the default by using the .Va devfs_ruleset option. .It Va allow.quotas The jail root may administer quotas on the jail's filesystem(s). This includes filesystems that the jail may share with other jails or with non-jailed parts of the system. .It Va allow.read_msgbuf Jailed users may read the kernel message buffer. If the .Va security.bsd.unprivileged_read_msgbuf MIB entry is zero, this will be restricted to the root user. .It Va allow.socket_af Sockets within a jail are normally restricted to IPv4, IPv6, local (UNIX), and route. This allows access to other protocol stacks that have not had jail functionality added to them. .It Va allow.mlock Locking or unlocking physical pages in memory are normally not available within a jail. When this parameter is set, users may .Xr mlock 2 or .Xr munlock 2 memory subject to .Va security.bsd.unprivileged_mlock and resource limits. .It Va allow.nfsd The .Xr mountd 8 , .Xr nfsd 8 , .Xr nfsuserd 8 , .Xr gssd 8 and .Xr rpc.tlsservd 8 daemons are permitted to run inside a properly configured vnet-enabled jail. The jail's root must be a file system mount point and .Va enforce_statfs must not be set to 0, so that .Xr mountd 8 can export file systems visible within the jail. .Va enforce_statfs must be set to 1 if file systems mounted under the jail's file system need to be exported by .Xr mount 8 . For exporting only the jail's file system, a setting of 2 is sufficient. If the kernel configuration does not include the .Sy NFSD option, .Pa nfsd.ko must be loaded outside of the jails. This is normally done by adding .Dq nfsd to .Va kld_list in the .Xr rc.conf 5 file outside of the jails. Similarily, if the .Xr gssd 8 is to be run in a jail, either the kernel .Sy KGSSAPI option needs to be specified or .Dq kgssapi and .Dq kgssapi_krb5 need to be in .Va kld_list in the .Xr rc.conf 5 file outside of the jails. .It Va allow.reserved_ports The jail root may bind to ports lower than 1024. .It Va allow.unprivileged_proc_debug Unprivileged processes in the jail may use debugging facilities. .It Va allow.suser The value of the jail's .Va security.bsd.suser_enabled sysctl. The super-user will be disabled automatically if its parent system has it disabled. The super-user is enabled by default. .It Va allow.extattr Allow privileged process in the jail to manipulate filesystem extended attributes in the system namespace. .It Va allow.adjtime Allow privileged process in the jail to slowly adjusting global operating system time. For example through utilities like .Xr ntpd 8 . .It Va allow.settime Allow privileged process in the jail to set global operating system data and time. For example through utilities like .Xr date 1 . This permission includes also .Va allow.adjtime . +.It Va allow.routing +Allow privileged process in the non-VNET jail to modify the system routing +table. .El .El .Pp Kernel modules may add their own parameters, which only exist when the module is loaded. These are typically headed under a parameter named after the module, with values of .Dq inherit to give the jail full use of the module, .Dq new to encapsulate the jail in some module-specific way, and .Dq disable to make the module unavailable to the jail. There also may be other parameters to define jail behavior within the module. Module-specific parameters include: .Bl -tag -width indent .It Va allow.mount.fdescfs privileged users inside the jail will be able to mount and unmount the fdescfs file system. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. .It Va allow.mount.fusefs privileged users inside the jail will be able to mount and unmount fuse-based file systems. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. .It Va allow.mount.nullfs privileged users inside the jail will be able to mount and unmount the nullfs file system. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. .It Va allow.mount.procfs privileged users inside the jail will be able to mount and unmount the procfs file system. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. .It Va allow.mount.linprocfs privileged users inside the jail will be able to mount and unmount the linprocfs file system. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. .It Va allow.mount.linsysfs privileged users inside the jail will be able to mount and unmount the linsysfs file system. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. .It Va allow.mount.tmpfs privileged users inside the jail will be able to mount and unmount the tmpfs file system. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. .It Va allow.mount.zfs privileged users inside the jail will be able to mount and unmount the ZFS file system. This permission is effective only together with .Va allow.mount and only when .Va enforce_statfs is set to a value lower than 2. See .Xr zfs-jail 8 for information on how to configure the ZFS filesystem to operate from within a jail. .It Va allow.vmm The jail may access .Xr vmm 4 . This flag is only available when the .Xr vmm 4 kernel module is loaded. .It Va linux Determine how a jail's Linux emulation environment appears. A value of .Dq inherit will keep the same environment, and .Dq new will give the jail its own environment (still originally inherited when the jail is created). .It Va linux.osname , linux.osrelease , linux.oss_version The Linux OS name, OS release, and OSS version associated with this jail. .It Va sysvmsg Allow access to SYSV IPC message primitives. If set to .Dq inherit , all IPC objects on the system are visible to this jail, whether they were created by the jail itself, the base system, or other jails. If set to .Dq new , the jail will have its own key namespace, and can only see the objects that it has created; the system (or parent jail) has access to the jail's objects, but not to its keys. If set to .Dq disable , the jail cannot perform any sysvmsg-related system calls. .It Va sysvsem, sysvshm Allow access to SYSV IPC semaphore and shared memory primitives, in the same manner as .Va sysvmsg . .It Va zfs.mount_snapshot When set to 1, jailed users may access the contents of ZFS snapshots under the filesystem's .Pa .zfs directory. If .Va allow.mount.zfs is set, the snapshots may also be mounted. .El .Pp There are pseudo-parameters that are not passed to the kernel, but are used by .Nm to set up the jail environment, often by running specified commands when jails are created or removed. The .Va exec.* command parameters are .Xr sh 1 command lines that are run in either the system or jail environment. They may be given multiple values, which would run the specified commands in sequence. All commands must succeed (return a zero exit status), or the jail will not be created or removed, as appropriate. .Pp The pseudo-parameters are: .Bl -tag -width indent .It Va exec.prepare Command(s) to run in the system environment to prepare a jail for creation. These commands are executed before assigning IP addresses and mounting filesystems, so they may be used to create a new jail filesystem if it does not already exist. .It Va exec.prestart Command(s) to run in the system environment before a jail is created. .It Va exec.created Command(s) to run in the system environment right after a jail has been created, but before commands (or services) get executed in the jail. .It Va exec.start Command(s) to run in the jail environment when a jail is created. A typical command to run is .Dq sh /etc/rc . .It Va command A synonym for .Va exec.start for use when specifying a jail directly on the command line. Unlike other parameters whose value is a single string, .Va command uses the remainder of the .Nm command line as its own arguments. .It Va exec.poststart Command(s) to run in the system environment after a jail is created, and after any .Va exec.start commands have completed. .It Va exec.prestop Command(s) to run in the system environment before a jail is removed. .It Va exec.stop Command(s) to run in the jail environment before a jail is removed, and after any .Va exec.prestop commands have completed. A typical command to run is .Dq sh /etc/rc.shutdown jail . .It Va exec.poststop Command(s) to run in the system environment after a jail is removed. .It Va exec.release Command(s) to run in the system environment after all other actions are done. These commands are executed after unmounting filesystems and removing IP addresses, so they may be used to remove a jail filesystem if it is no longer needed. .It Va exec.clean Run commands in a clean environment. The environment is discarded except for .Ev HOME , SHELL , TERM and .Ev USER . .Ev HOME and .Ev SHELL are set to the target login's default values. .Ev USER is set to the target login. .Ev TERM is imported from the current environment. .Ev PATH is set to "/bin:/usr/bin". The environment variables from the login class capability database for the target login are also set. If a user is specified (as with .Va exec.jail_user ) , commands are run from that (possibly jailed) user's directory. .It Va exec.jail_user The user to run commands as, when running in the jail environment. The default is to run the commands as the current user. .It Va exec.system_jail_user This boolean option looks for the .Va exec.jail_user in the system .Xr passwd 5 file, instead of in the jail's file. .It Va exec.system_user The user to run commands as, when running in the system environment. The default is to run the commands as the current user. .It Va exec.timeout The maximum amount of time to wait for a command to complete, in seconds. If a command is still running after this timeout has passed, the jail will not be created or removed, as appropriate. .It Va exec.consolelog A file to direct command output (stdout and stderr) to. .It Va exec.fib The FIB (routing table) to set when running commands inside the jail. .It Va stop.timeout The maximum amount of time to wait for a jail's processes to exit after sending them a .Dv SIGTERM signal (which happens after the .Va exec.stop commands have completed). After this many seconds have passed, the jail will be removed, which will kill any remaining processes. If this is set to zero, no .Dv SIGTERM is sent and the jail is immediately removed. The default is 10 seconds. .It Va interface A network interface to add the jail's IP addresses .Va ( ip4.addr and .Va ip6.addr ) to. An alias for each address will be added to the interface before the jail is created, and will be removed from the interface after the jail is removed. .It Va ip4.addr In addition to the IP addresses that are passed to the kernel, an interface, netmask and additional parameters (as supported by .Xr ifconfig 8 Ns ) may also be specified, in the form .Dq Ar interface Ns | Ns Ar ip-address Ns / Ns Ar netmask param ... . If an interface is given before the IP address, an alias for the address will be added to that interface, as it is with the .Va interface parameter. If a netmask in either dotted-quad or CIDR form is given after an IP address, it will be used when adding the IP alias. If additional parameters are specified then they will also be used when adding the IP alias. .It Va ip6.addr In addition to the IP addresses that are passed to the kernel, an interface, prefix and additional parameters (as supported by .Xr ifconfig 8 Ns ) may also be specified, in the form .Dq Ar interface Ns | Ns Ar ip-address Ns / Ns Ar prefix param ... . .It Va vnet.interface A list of network interfaces to give to a vnet-enabled jail after is it created. The interfaces will automatically be released when the jail is removed. .It Va zfs.dataset A list of ZFS datasets to be attached to the jail. This requires .Va allow.mount.zfs to be set. See .Xr zfs-jail 8 for information on how to configure a ZFS dataset to be operated from within a jail. .It Va ip_hostname Resolve the .Va host.hostname parameter and add all IP addresses returned by the resolver to the list of addresses .Po Va ip4.addr or .Va ip6.addr Pc for this jail. This may affect default address selection for outgoing IPv4 connections from jails. The address first returned by the resolver for each address family will be used as the primary address. .It Va mount A filesystem to mount before creating the jail (and to unmount after removing it), given as a single .Xr fstab 5 line. .It Va mount.fstab An .Xr fstab 5 format file containing filesystems to mount before creating a jail. .It Va mount.devfs Mount a .Xr devfs 4 filesystem on the chrooted .Pa /dev directory, and apply the ruleset in the .Va devfs_ruleset parameter (or a default of ruleset 4: devfsrules_jail) to restrict the devices visible inside the jail. .It Va mount.fdescfs Mount a .Xr fdescfs 4 filesystem on the chrooted .Pa /dev/fd directory. .It Va mount.procfs Mount a .Xr procfs 4 filesystem on the chrooted .Pa /proc directory. .It Va allow.dying This is deprecated and has no effect. It used to allow making changes to a .Va dying jail. Now such jails are always replaced when a new jail is created with the same .Va jid or .Va name . .It Va depend Specify a jail (or jails) that this jail depends on. When this jail is to be created, any jail(s) it depends on must already exist. If not, they will be created automatically, up to the completion of the last .Va exec.poststart command, before any action will taken to create this jail. When jails are removed the opposite is true: this jail will be removed, up to the last .Va exec.poststop command, before any jail(s) it depends on are stopped. .El .Sh EXAMPLES Jails are typically set up using one of two philosophies: either to constrain a specific application (possibly running with privilege), or to create a .Dq "virtual system image" running a variety of daemons and services. In both cases, a fairly complete file system install of .Fx is required, so as to provide the necessary command line tools, daemons, libraries, application configuration files, etc. However, for a virtual server configuration, a fair amount of additional work is required so as to replace the .Dq boot process. This manual page documents the configuration steps necessary to support either of these steps, although the configuration steps may need to be refined based on local requirements. .Ss "Setting up a Jail Directory Tree" To set up a jail directory tree containing an entire .Fx distribution, the following .Xr sh 1 command script can be used: .Bd -literal -offset indent D=/here/is/the/jail cd /usr/src mkdir -p $D make world DESTDIR=$D make distribution DESTDIR=$D .Ed .Pp In many cases this example would put far more in the jail than needed. In the other extreme case a jail might contain only one file: the executable to be run in the jail. .Pp We recommend experimentation, and caution that it is a lot easier to start with a .Dq fat jail and remove things until it stops working, than it is to start with a .Dq thin jail and add things until it works. .Ss "Setting Up a Jail" Do what was described in .Sx "Setting Up a Jail Directory Tree" to build the jail directory tree. For the sake of this example, we will assume you built it in .Pa /data/jail/testjail , for a jail named .Dq testjail . Substitute below as needed with your own directory, IP address, and hostname. .Ss "Setting up the Host Environment" First, set up the real system's environment to be .Dq jail-friendly . For consistency, we will refer to the parent box as the .Dq "host environment" , and to the jailed virtual machine as the .Dq "jail environment" . Since jails are implemented using IP aliases, one of the first things to do is to disable IP services on the host system that listen on all local IP addresses for a service. If a network service is present in the host environment that binds all available IP addresses rather than specific IP addresses, it may service requests sent to jail IP addresses if the jail did not bind the port. This means changing .Xr inetd 8 to only listen on the appropriate IP address, and so forth. Add the following to .Pa /etc/rc.conf in the host environment: .Bd -literal -offset indent sendmail_enable="NO" inetd_flags="-wW -a 192.0.2.23" rpcbind_enable="NO" .Ed .Pp .Li 192.0.2.23 is the native IP address for the host system, in this example. Daemons that run out of .Xr inetd 8 can be easily configured to use only the specified host IP address. Other daemons will need to be manually configured \(em for some this is possible through .Xr rc.conf 5 flags entries; for others it is necessary to modify per-application configuration files, or to recompile the application. The following frequently deployed services must have their individual configuration files modified to limit the application to listening to a specific IP address: .Pp To configure .Xr sshd 8 , it is necessary to modify .Pa /etc/ssh/sshd_config . .Pp To configure .Xr sendmail 8 , it is necessary to modify .Pa /etc/mail/sendmail.cf . .Pp In addition, a number of services must be recompiled in order to run them in the host environment. This includes most applications providing services using .Xr rpc 3 , such as .Xr rpcbind 8 , .Xr nfsd 8 , and .Xr mountd 8 . In general, applications for which it is not possible to specify which IP address to bind should not be run in the host environment unless they should also service requests sent to jail IP addresses. Attempting to serve NFS from the host environment may also cause confusion, and cannot be easily reconfigured to use only specific IPs, as some NFS services are hosted directly from the kernel. Any third-party network software running in the host environment should also be checked and configured so that it does not bind all IP addresses, which would result in those services also appearing to be offered by the jail environments. .Pp Once these daemons have been disabled or fixed in the host environment, it is best to reboot so that all daemons are in a known state, to reduce the potential for confusion later (such as finding that when you send mail to a jail, and its sendmail is down, the mail is delivered to the host, etc.). .Ss "Configuring the Jail" Start any jail for the first time without configuring the network interface so that you can clean it up a little and set up accounts. As with any machine (virtual or not), you will need to set a root password, time zone, etc. Some of these steps apply only if you intend to run a full virtual server inside the jail; others apply both for constraining a particular application or for running a virtual server. .Pp Start a shell in the jail: .Bd -literal -offset indent jail -c path=/data/jail/testjail mount.devfs \\ host.hostname=testhostname ip4.addr=192.0.2.100 \\ command=/bin/sh .Ed .Pp Assuming no errors, you will end up with a shell prompt within the jail. You can now run .Xr bsdconfig 8 and do the post-install configuration to set various configuration options, or perform these actions manually by editing .Pa /etc/rc.conf , etc. .Pp .Bl -bullet -offset indent -compact .It Configure .Pa /etc/resolv.conf so that name resolution within the jail will work correctly. .It Run .Xr newaliases 1 to quell .Xr sendmail 8 warnings. .It Set a root password, probably different from the real host system. .It Set the timezone. .It Add accounts for users in the jail environment. .It Install any packages the environment requires. .El .Pp You may also want to perform any package-specific configuration (web servers, SSH servers, etc), patch up .Pa /etc/syslog.conf so it logs as you would like, etc. If you are not using a virtual server, you may wish to modify .Xr syslogd 8 in the host environment to listen on the syslog socket in the jail environment; in this example, the syslog socket would be stored in .Pa /data/jail/testjail/var/run/log . .Pp Exit from the shell, and the jail will be shut down. .Ss "Starting the Jail" You are now ready to restart the jail and bring up the environment with all of its daemons and other programs. Create an entry for the jail in .Pa /etc/jail.conf : .Bd -literal -offset indent testjail { path = /tmp/jail/testjail; mount.devfs; host.hostname = testhostname; ip4.addr = 192.0.2.100; interface = em0; exec.start = "/bin/sh /etc/rc"; exec.stop = "/bin/sh /etc/rc.shutdown jail"; } .Ed .Pp To start a virtual server environment, .Pa /etc/rc is run to launch various daemons and services, and .Pa /etc/rc.shutdown is run to shut them down when the jail is removed. If you are running a single application in the jail, substitute the command used to start the application for .Dq /bin/sh /etc/rc ; there may be some script available to cleanly shut down the application, or it may be sufficient to go without a stop command, and have .Nm send .Dv SIGTERM to the application. .Pp Start the jail by running: .Bd -literal -offset indent jail -c testjail .Ed .Pp A few warnings may be produced; however, it should all work properly. You should be able to see .Xr inetd 8 , .Xr syslogd 8 , and other processes running within the jail using .Xr ps 1 , with the .Ql J flag appearing beside jailed processes. To see an active list of jails, use .Xr jls 8 . If .Xr sshd 8 is enabled in the jail environment, you should be able to .Xr ssh 1 to the hostname or IP address of the jailed environment, and log in using the accounts you created previously. .Pp It is possible to have jails started at boot time. Please refer to the .Dq jail_* variables in .Xr rc.conf 5 for more information. .Ss "Managing the Jail" Normal machine shutdown commands, such as .Xr halt 8 , .Xr reboot 8 , and .Xr shutdown 8 , cannot be used successfully within the jail. To kill all processes from within a jail, you may use one of the following commands, depending on what you want to accomplish: .Bd -literal -offset indent kill -TERM -1 kill -KILL -1 .Ed .Pp This will send the .Dv SIGTERM or .Dv SIGKILL signals to all processes in the jail \(em be careful not to run this from the host environment! Once all of the jail's processes have died, unless the jail was created with the .Va persist parameter, the jail will be removed. Depending on the intended use of the jail, you may also want to run .Pa /etc/rc.shutdown from within the jail. .Pp To shut down the jail from the outside, simply remove it with: .Bd -literal -offset indent jail -r .Ed .Pp which will run any commands specified by .Va exec.stop , and then send .Dv SIGTERM and eventually .Dv SIGKILL to any remaining jailed processes. .Pp The .Pa /proc/ Ns Ar pid Ns Pa /status file contains, as its last field, the name of the jail in which the process runs, or .Dq Li - to indicate that the process is not running within a jail. The .Xr ps 1 command also shows a .Ql J flag for processes in a jail. .Pp You can also list/kill processes based on their jail ID. To show processes and their jail ID, use the following command: .Pp .Dl "ps ax -o pid,jid,args" .Pp To show and then kill processes in jail number 3 use the following commands: .Bd -literal -offset indent pgrep -lfj 3 pkill -j 3 .Ed or: .Pp .Dl "killall -j 3" .Ss "Jails and File Systems" It is not possible to .Xr mount 8 or .Xr umount 8 any file system inside a jail unless the file system is marked jail-friendly, the jail's .Va allow.mount parameter is set, and the jail's .Va enforce_statfs parameter is lower than 2. .Pp Multiple jails sharing the same file system can influence each other. For example, a user in one jail can fill the file system, leaving no space for processes in the other jail. Trying to use .Xr quota 1 to prevent this will not work either, as the file system quotas are not aware of jails but only look at the user and group IDs. This means the same user ID in two jails share a single file system quota. One would need to use one file system per jail to make this work. .Ss "Sysctl MIB Entries" The read-only entry .Va security.jail.jailed can be used to determine if a process is running inside a jail (value is one) or not (value is zero). .Pp The variable .Va security.jail.jail_max_af_ips determines how may address per address family a jail may have. The default is 255. .Pp Some MIB variables have per-jail settings. Changes to these variables by a jailed process do not affect the host environment, only the jail environment. These variables are .Va kern.securelevel , .Va security.bsd.suser_enabled , .Va kern.hostname , .Va kern.domainname , .Va kern.hostid , and .Va kern.hostuuid . .Ss "Hierarchical Jails" By setting a jail's .Va children.max parameter, processes within a jail may be able to create jails of their own. These child jails are kept in a hierarchy, with jails only able to see and/or modify the jails they created (or those jails' children). Each jail has a read-only .Va parent parameter, containing the .Va jid of the jail that created it; a .Va jid of 0 indicates the jail is a child of the current jail (or is a top-level jail if the current process isn't jailed). .Pp Jailed processes are not allowed to confer greater permissions than they themselves are given, e.g., if a jail is created with .Va allow.nomount , it is not able to create a jail with .Va allow.mount set. Similarly, such restrictions as .Va ip4.addr and .Va securelevel may not be bypassed in child jails. .Pp A child jail may in turn create its own child jails if its own .Va children.max parameter is set (remember it is zero by default). These jails are visible to and can be modified by their parent and all ancestors. .Pp Jail names reflect this hierarchy, with a full name being an MIB-type string separated by dots. For example, if a base system process creates a jail .Dq foo , and a process under that jail creates another jail .Dq bar , then the second jail will be seen as .Dq foo.bar in the base system (though it is only seen as .Dq bar to any processes inside jail .Dq foo ) . Jids on the other hand exist in a single space, and each jail must have a unique jid. .Pp Like the names, a child jail's .Va path appears relative to its creator's own .Va path . This is by virtue of the child jail being created in the chrooted environment of the first jail. .Sh SEE ALSO .Xr date 1 , .Xr killall 1 , .Xr lsvfs 1 , .Xr newaliases 1 , .Xr pgrep 1 , .Xr pkill 1 , .Xr ps 1 , .Xr quota 1 , .Xr adjtime 2 , .Xr clock_settime 2 , .Xr jail_set 2 , .Xr ntp_adjtime 2 , .Xr devfs 4 , .Xr fdescfs 4 , .Xr linprocfs 4 , .Xr linsysfs 4 , .Xr procfs 4 , .Xr vmm 4 , .Xr jail.conf 5 , .Xr rc.conf 5 , .Xr sysctl.conf 5 , .Xr bsdconfig 8 , .Xr chroot 8 , .Xr devfs 8 , .Xr halt 8 , .Xr ifconfig 8 , .Xr inetd 8 , .Xr jexec 8 , .Xr jls 8 , .Xr mount 8 , .Xr mountd 8 , .Xr nfsd 8 , .Xr ntpd 8 , .Xr reboot 8 , .Xr rpcbind 8 , .Xr sendmail 8 , .Xr shutdown 8 , .Xr sysctl 8 , .Xr syslogd 8 , .Xr umount 8 , .Xr zfs-jail 8 , .Xr extattr 9 .Sh HISTORY The .Nm utility appeared in .Fx 4.0 . Hierarchical/extensible jails were introduced in .Fx 8.0 . The configuration file was introduced in .Fx 9.1 . .Sh AUTHORS .An -nosplit The jail feature was written by .An Poul-Henning Kamp for R&D Associates who contributed it to .Fx . .Pp .An Robert Watson wrote the extended documentation, found a few bugs, added a few new features, and cleaned up the userland jail environment. .Pp .An Bjoern A. Zeeb added multi-IP jail support for IPv4 and IPv6 based on a patch originally done by .An Pawel Jakub Dawidek for IPv4. .Pp .An James Gritton added the extensible jail parameters, hierarchical jails, and the configuration file. .Sh BUGS It might be a good idea to add an address alias flag such that daemons listening on all IPs .Pq Dv INADDR_ANY will not bind on that address, which would facilitate building a safe host environment such that host daemons do not impose on services offered from within jails. Currently, the simplest answer is to minimize services offered on the host, possibly limiting it to services offered from .Xr inetd 8 which is easily configurable. .Sh NOTES Great care should be taken when managing directories visible within the jail. For example, if a jailed process has its current working directory set to a directory that is moved out of the jail's chroot, then the process may gain access to the file space outside of the jail. It is recommended that directories always be copied, rather than moved, out of a jail. .Pp In addition, there are several ways in which an unprivileged user outside the jail can cooperate with a privileged user inside the jail and thereby obtain elevated privileges in the host environment. Most of these attacks can be mitigated by ensuring that the jail root is not accessible to unprivileged users in the host environment. Regardless, as a general rule, untrusted users with privileged access to a jail should not be given access to the host environment.