Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -3849,6 +3849,7 @@ kern/kern_tslog.c optional tslog kern/kern_umtx.c standard kern/kern_uuid.c standard +kern/kern_vps.c optional vimage kern/kern_xxx.c standard kern/link_elf.c standard kern/linker_if.m standard Index: sys/kern/kern_jail.c =================================================================== --- sys/kern/kern_jail.c +++ sys/kern/kern_jail.c @@ -62,6 +62,10 @@ #include #include #include +#include +#ifdef VIMAGE +#include +#endif #include #include @@ -74,6 +78,10 @@ #include +#ifdef VIMAGE +FEATURE(vimage, "VIMAGE kernel virtualization"); +#endif + #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); @@ -107,7 +115,7 @@ .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE - .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, + .pr_flags = PR_HOST|PR_VNET|PR_VPS|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif @@ -171,6 +179,9 @@ {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, +#ifdef ENABLE_VPS + {"vps", 0, PR_VPS }, +#endif #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, @@ -627,6 +638,11 @@ vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } + if ((flags & JAIL_UPDATE) && (ch_flags & PR_VPS)) { + error = EINVAL; + vfs_opterror(opts, "vps cannot be changed after creation"); + goto done_errmsg; + } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { @@ -1801,6 +1817,43 @@ goto done_errmsg; } +#ifdef VIMAGE + /* Allocate a new vps if specified. 
*/ +#ifdef ENABLE_VPS + if (pr_flags & PR_VPS) { +#else + if (0) { +#endif + vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); + if ((error = change_dir(pr->pr_root, td)) != 0) + goto c_unlock; +#ifdef MAC + if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) + goto c_unlock; +#endif +c_unlock: + VOP_UNLOCK(pr->pr_root, 0); + if (error || (error = pwd_chroot(td, pr->pr_root))) { + vfs_opterror(opts, "vps chroot failed"); + if (!created) + prison_deref(pr, PD_DEREF); + goto done_errmsg; + } + + /* We temporary need a ref as otheriwse a prhold will panic. */ + mtx_lock(&pr->pr_mtx); + pr->pr_ref++; + pr->pr_uref++; + mtx_unlock(&pr->pr_mtx); + pr->pr_vps = vps_alloc(pr); + mtx_lock(&pr->pr_mtx); + pr->pr_ref--; + pr->pr_uref--; + mtx_unlock(&pr->pr_mtx); + } else { + pr->pr_vps = ppr->pr_vps; + } +#endif /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); @@ -2285,6 +2338,28 @@ /* * Kill all processes unfortunate enough to be attached to this prison. */ +#ifdef VIMAGE +#ifdef ENABLE_VPS + if (pr->pr_vps) { + /* + * Send signal to init and let init do it's job. + * This should run rc.shutdown and processes should go away. + * All but init? We need to catch the tail-end of reboot(2) + * and handle appropriately for the non-default vpss. + * vps_destroy() will ensure init and swapper will also go + * away and might sleep. If they do not go something will + * hold refs on cred and prisons. + * XXX There are other places which might do that for a long + * time as well. + */ + CURVPS_SET(pr->pr_vps); + shutdown_nice(RB_HALT|RB_POWEROFF); + vps_destroy(pr->pr_vps); + CURVPS_RESTORE(); + } else +#endif +#endif + { sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); @@ -2294,6 +2369,7 @@ PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); + } /* Remove the temporary reference added by jail_remove. 
*/ prison_deref(pr, deuref | PD_DEREF); } @@ -2348,6 +2424,24 @@ struct ucred *newcred, *oldcred; int error; +#ifdef VIMAGE + /* + * Do not allow to migrate a process between virtual process spaces. + * Use the console to attach to it. Getting all process spaces things + * right, including a new pid, progress group, session, terminal, + * tracing is one thing (with a lot of work) and may break apps if the + * pid changes, the pgrp no longer has the same (p)id; getting things + * restored to oriinal state and properly re-parented is virtually + * impossile. So do what we do on a normal machine, present a terminal + * to login to. + */ + if (pr->pr_flags & PR_VPS) { + mtx_unlock(&pr->pr_mtx); + sx_sunlock(&allprison_lock); + return (EPERM); + } +#endif + /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two @@ -2628,6 +2722,8 @@ #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); + KASSERT(pr->pr_vps == NULL, ("%s: pr %p pr_vps %p != NULL\n", + __func__, pr, pr->pr_vps)); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); @@ -2912,9 +3008,9 @@ #ifdef VIMAGE /* * Determine whether the prison represented by cred owns - * its vnet rather than having it inherited. + * its vnet/vps rather than having it inherited. * - * Returns 1 in case the prison owns the vnet, 0 otherwise. + * Returns 1 in case the prison owns the vnet/vps, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) @@ -2926,6 +3022,17 @@ */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } + +int +prison_owns_vps(struct ucred *cred) +{ + + /* + * vps cannot be added/removed after jail creation, + * so no need to lock here. + */ + return (cred->cr_prison->pr_flags & PR_VPS ? 
1 : 0); +} #endif /* @@ -3542,6 +3649,26 @@ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns vnet?"); +static int +sysctl_jail_vps(SYSCTL_HANDLER_ARGS) +{ + int error, havevps; +#ifdef VIMAGE + struct ucred *cred = req->td->td_ucred; + + havevps = jailed(cred) && prison_owns_vps(cred); +#else + havevps = 0; +#endif + error = SYSCTL_OUT(req, &havevps, sizeof(havevps)); + + return (error); +} + +SYSCTL_PROC(_security_jail, OID_AUTO, vps, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + sysctl_jail_vps, "I", "Jail owns vps?"); + #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, @@ -3697,6 +3824,10 @@ #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); +#ifdef ENABLE_VPS +SYSCTL_JAIL_PARAM(, vps, CTLTYPE_INT | CTLFLAG_RDTUN, + "E,jailsys", "Virtual process space"); +#endif #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); @@ -4074,6 +4205,7 @@ ? 
pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); + db_printf(" vps = %p\n", pr->pr_vps); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -83,7 +83,7 @@ "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x470, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x518, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x528, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); @@ -103,7 +103,7 @@ "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x2e8, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x334, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x33c, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); Index: sys/kern/kern_vps.c =================================================================== --- /dev/null +++ sys/kern/kern_vps.c @@ -0,0 +1,828 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2004-2009 University of Zagreb + * Copyright (c) 2006-2009 FreeBSD Foundation + * Copyright (c) 2018 iXsystems, Inc. + * All rights reserved. + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Portions of this software were developed by Bjoern Zeeb + * under sponsorship from iXsystems, Inc. + * + * Copyright (c) 2009 Jeffrey Roberson + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_kdb.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef DDB +#include +#include +#endif + + +/*- + * This file implements core functions for virtual process spaces: + * + * - Virtual process space management functions. + * + * - Virtual process space memory allocator, which virtualizes global + * variables in the process space. + * + * - Virtualized SYSINIT's/SYSUNINIT's, which allow process spaces + * to register startup/shutdown events to be run for each virtual process + * space instance. 
+ */ + +static MALLOC_DEFINE(M_VPS, "vps", "process space control block"); + +/* + * The virtual process space list has two read-write locks, one sleepable and + * the other not, so that the list can be stablized and walked in a variety + * of process space contexts. Both must be acquired exclusively to modify + * the list, but a read lock of either lock is sufficient to walk the list. + */ +struct rwlock vps_rwlock; +struct sx vps_sxlock; + +#define VPS_LIST_WLOCK() do { \ + sx_xlock(&vps_sxlock); \ + rw_wlock(&vps_rwlock); \ +} while (0) + +#define VPS_LIST_WUNLOCK() do { \ + rw_wunlock(&vps_rwlock); \ + sx_xunlock(&vps_sxlock); \ +} while (0) + +struct vps_list_head vps_head; +struct vps *vps0; + +/* + * The virtual process space allocator provides storage for virtualized + * global variables. These variables are defined/declared using the + * VPS_DEFINE()/VPS_DECLARE() macros, which place them in the 'set_vps' + * linker set. The details of the implementation are somewhat subtle, but + * allow the majority of most process subsystems to maintain + * virtualization-agnostic. + * + * The virtual process space allocator handles variables in the base kernel + * vs. modules in similar but different ways. In both cases, virtualized + * global variables are marked as such by being declared to be part of the + * vps linker set. These "primary" copies of global variables serve two + * functions: + * + * (1) They contain static initialization or "default" values for global + * variables which will be propagated to each virtual process space + * instance when created. As with normal global variables, they default + * to zero-filled. + * + * (2) They act as unique global names by which the variable can be referred + * to, regardless of process space instance. The single global symbol + * will be used to calculate the location of a per-virtual instance + * variable at run-time. 
+ * + * Each virtual process space instance has a complete copy of each + * virtualized global variable, stored in a malloc'd block of memory + * referred to by vps->vps_data_mem. Critical to the design is that each + * per-instance memory block is laid out identically to the primary block so + * that the offset of each global variable is the same across all blocks. + * To optimize run-time access, a precalculated 'base' address, + * vps->vps_data_base, is stored in each vps, and is the amount that can + * be added to the address of a 'primary' instance of a variable to get to the + * per-vps instance. + * + * Virtualized global variables are handled in a similar manner, but as each + * module has its own 'set_vps' linker set, and we want to keep all + * virtualized globals togther, we reserve space in the kernel's linker set + * for potential module variables using a per-vps character array, + * 'modspace'. The virtual process space allocator maintains a free list to + * track what space in the array is free (all, initially) and as modules are + * linked, allocates portions of the space to specific globals. The kernel + * module linker queries the virtual process space allocator and will + * bind references of the global to the location during linking. It also + * calls into the virtual process space allocator, once the memory is + * initialized, in order to propagate the new static initializations to all + * existing virtual process space instances so that the soon-to-be executing + * module will find every process space instance with proper default values. + */ + +/* + * Number of bytes of data in the 'set_vps' linker set, and hence the total + * size of all kernel virtualized global variables, and the malloc(9) type + * that will be used to allocate it. 
+ */ +#define VPS_BYTES (VPS_STOP - VPS_START) + +static MALLOC_DEFINE(M_VPS_DATA, "vps_data", "VPS data"); + +/* + * VPS_MODMIN is the minimum number of bytes we will reserve for the sum of + * global variables across all loaded modules. As this actually sizes an + * array declared as a virtualized global variable in the kernel itself, and + * we want the virtualized global variable space to be page-sized, we may + * have more space than that in practice. + */ +#define VPS_MODMIN 8192 +#define VPS_SIZE roundup2(VPS_BYTES, PAGE_SIZE) + +/* + * Space to store virtualized global variables from loadable kernel modules, + * and the free list to manage it. + */ +static VPS_DEFINE(char, modspace[VPS_MODMIN]); + +/* + * Global lists of subsystem constructor and destructors for vpss. They are + * registered via VPS_SYSINIT() and VPS_SYSUNINIT(). Both lists are + * protected by the vps_sysinit_sxlock global lock. + */ +static TAILQ_HEAD(vps_sysinit_head, vps_sysinit) vps_constructors = + TAILQ_HEAD_INITIALIZER(vps_constructors); +static TAILQ_HEAD(vps_sysuninit_head, vps_sysinit) vps_destructors = + TAILQ_HEAD_INITIALIZER(vps_destructors); + +struct sx vps_sysinit_sxlock; + +#define VPS_SYSINIT_WLOCK() sx_xlock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_WUNLOCK() sx_xunlock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_RLOCK() sx_slock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_RUNLOCK() sx_sunlock(&vps_sysinit_sxlock); + +/* XXX-BZ should probably be vpd_* instead of vnd_* but in the hope to + * harmonize most of this later on keep the names the same for now. 
+ */
+struct vps_data_free {
+	uintptr_t	vnd_start;
+	int		vnd_len;
+	TAILQ_ENTRY(vps_data_free) vnd_link;
+};
+
+static MALLOC_DEFINE(M_VPS_DATA_FREE, "vps_data_free",
+    "VPS resource accounting");
+static TAILQ_HEAD(, vps_data_free) vps_data_free_head =
+    TAILQ_HEAD_INITIALIZER(vps_data_free_head);
+static struct sx vps_data_free_lock;
+
+SDT_PROVIDER_DEFINE(vps);
+SDT_PROBE_DEFINE1(vps, functions, vps_alloc, entry, "int");
+SDT_PROBE_DEFINE2(vps, functions, vps_alloc, alloc, "int", "struct vps *");
+SDT_PROBE_DEFINE2(vps, functions, vps_alloc, return, "int", "struct vps *");
+SDT_PROBE_DEFINE2(vps, functions, vps_destroy, entry, "int", "struct vps *");
+SDT_PROBE_DEFINE1(vps, functions, vps_destroy, return, "int");
+
+#ifdef DDB
+static void db_show_vps_print_vs(struct vps_sysinit *, int);
+#endif
+
+/*
+ * Allocate a virtual process space.
+ */
+struct vps *
+vps_alloc(struct prison *pr)
+{
+	struct vps *vps;
+
+	SDT_PROBE1(vps, functions, vps_alloc, entry, __LINE__);
+	vps = malloc(sizeof(struct vps), M_VPS, M_WAITOK | M_ZERO);
+	vps->vps_magic_n = VPS_MAGIC_N;
+	vps->vps_state = 0;
+	vps->vps_pr = pr;
+	/* Cheat for vps_sysinit() to get creds right. */
+	pr->pr_vps = vps;
+	SDT_PROBE2(vps, functions, vps_alloc, alloc, __LINE__, vps);
+
+	/*
+	 * Allocate storage for virtualized global variables and copy in
+	 * initial values from our 'primary' copy.
+	 */
+	vps->vps_data_mem = malloc(VPS_SIZE, M_VPS_DATA, M_WAITOK);
+	memcpy(vps->vps_data_mem, (void *)VPS_START, VPS_BYTES);
+
+	/*
+	 * All use of vps-specific data will immediately subtract VPS_START
+	 * from the base memory pointer, so pre-calculate that now to avoid
+	 * it on each use.
+	 */
+	vps->vps_data_base = (uintptr_t)vps->vps_data_mem - VPS_START;
+
+	/* Initialize / attach vps module instances.
*/ + CURVPS_SET_QUIET(vps); + vps_sysinit(); + CURVPS_RESTORE(); + + VPS_LIST_WLOCK(); + LIST_INSERT_HEAD(&vps_head, vps, vps_le); + VPS_LIST_WUNLOCK(); + + SDT_PROBE2(vps, functions, vps_alloc, return, __LINE__, vps); + return (vps); +} + +/* + * Destroy a virtual process space. + */ +void +vps_destroy(struct vps *vps) +{ + + SDT_PROBE2(vps, functions, vps_destroy, entry, __LINE__, vps); + + VPS_LIST_WLOCK(); + LIST_REMOVE(vps, vps_le); + VPS_LIST_WUNLOCK(); + + CURVPS_SET_QUIET(vps); + vps_sysuninit(); + CURVPS_RESTORE(); + + /* + * Release storage for the virtual process space instance. + */ + free(vps->vps_data_mem, M_VPS_DATA); + vps->vps_data_mem = NULL; + vps->vps_data_base = 0; + vps->vps_pr->pr_vps = NULL; + vps->vps_pr = NULL; + vps->vps_magic_n = 0xdeadbeef; + free(vps, M_VPS); + SDT_PROBE1(vps, functions, vps_destroy, return, __LINE__); +} + +/* + * Boot time initialization and allocation of virtual process space. + */ +static void +vps_init_prelink(void *arg __unused) +{ + + rw_init(&vps_rwlock, "vps_rwlock"); + sx_init(&vps_sxlock, "vps_sxlock"); + sx_init(&vps_sysinit_sxlock, "vps_sysinit_sxlock"); + LIST_INIT(&vps_head); +} +SYSINIT(vps_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST, + vps_init_prelink, NULL); + +static void +vps0_init(void *arg __unused) +{ + + if (bootverbose) + printf("VIMAGE (virtualized process space) enabled\n"); + + /* + * We MUST clear curvps in vi_init_done() before going SMP, + * otherwise CURVPS_SET() macros would scream about unnecessary + * curvps recursions. + */ + curvps = prison0.pr_vps = vps0 = vps_alloc(&prison0); +} +SYSINIT(vps0_init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, vps0_init, NULL); + +#if 0 +/* Compared to vnets, nuking the vps of the current thread does not go down well. 
*/ +static void +vps_init_done(void *unused __unused) +{ + + curvps = NULL; +} +SYSINIT(vps_init_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, vps_init_done, NULL); +#endif + +/* + * Once on boot, initialize the modspace freelist to entirely cover modspace. + */ +static void +vps_data_startup(void *dummy __unused) +{ + struct vps_data_free *df; + + df = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO); + df->vnd_start = (uintptr_t)&VPS_NAME(modspace); + df->vnd_len = VPS_MODMIN; + TAILQ_INSERT_HEAD(&vps_data_free_head, df, vnd_link); + sx_init(&vps_data_free_lock, "vps_data alloc lock"); +} +SYSINIT(vps_data, SI_SUB_KLD, SI_ORDER_FIRST, vps_data_startup, 0); + +/* Dummy VPS_SYSINIT to make sure we always reach the final end state. */ +static void +vps_sysinit_done(void *unused __unused) +{ + + return; +} +VPS_SYSINIT(vps_sysinit_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, + vps_sysinit_done, NULL); + +/* + * When a module is loaded and requires storage for a virtualized global + * variable, allocate space from the modspace free list. This interface + * should be used only by the kernel linker. + */ +void * +vps_data_alloc(int size) +{ + struct vps_data_free *df; + void *s; + + s = NULL; + size = roundup2(size, sizeof(void *)); + sx_xlock(&vps_data_free_lock); + TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) { + if (df->vnd_len < size) + continue; + if (df->vnd_len == size) { + s = (void *)df->vnd_start; + TAILQ_REMOVE(&vps_data_free_head, df, vnd_link); + free(df, M_VPS_DATA_FREE); + break; + } + s = (void *)df->vnd_start; + df->vnd_len -= size; + df->vnd_start = df->vnd_start + size; + break; + } + sx_xunlock(&vps_data_free_lock); + + return (s); +} + +/* + * Free space for a virtualized global variable on module unload. 
+ */
+void
+vps_data_free(void *start_arg, int size)
+{
+	struct vps_data_free *df;
+	struct vps_data_free *dn;
+	uintptr_t start;
+	uintptr_t end;
+
+	size = roundup2(size, sizeof(void *));
+	start = (uintptr_t)start_arg;
+	end = start + size;
+	/*
+	 * Free a region of space and merge it with as many neighbors as
+	 * possible.  Keeping the list sorted simplifies this operation.
+	 */
+	sx_xlock(&vps_data_free_lock);
+	TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) {
+		if (df->vnd_start > end)
+			break;
+		/*
+		 * If we expand at the end of an entry we may have to merge
+		 * it with the one following it as well.
+		 */
+		if (df->vnd_start + df->vnd_len == start) {
+			df->vnd_len += size;
+			dn = TAILQ_NEXT(df, vnd_link);
+			/* dn may be NULL when df is the last free region. */
+			if (dn != NULL &&
+			    df->vnd_start + df->vnd_len == dn->vnd_start) {
+				df->vnd_len += dn->vnd_len;
+				TAILQ_REMOVE(&vps_data_free_head, dn,
+				    vnd_link);
+				free(dn, M_VPS_DATA_FREE);
+			}
+			sx_xunlock(&vps_data_free_lock);
+			return;
+		}
+		if (df->vnd_start == end) {
+			df->vnd_start = start;
+			df->vnd_len += size;
+			sx_xunlock(&vps_data_free_lock);
+			return;
+		}
+	}
+	dn = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO);
+	dn->vnd_start = start;
+	dn->vnd_len = size;
+	if (df)
+		TAILQ_INSERT_BEFORE(df, dn, vnd_link);
+	else
+		TAILQ_INSERT_TAIL(&vps_data_free_head, dn, vnd_link);
+	sx_xunlock(&vps_data_free_lock);
+}
+
+/*
+ * When a new virtualized global variable has been allocated, propagate its
+ * initial value to each already-allocated virtual process space instance.
+ */
+void
+vps_data_copy(void *start, int size)
+{
+	struct vps *vps;
+
+	VPS_LIST_RLOCK();
+	LIST_FOREACH(vps, &vps_head, vps_le)
+		memcpy((void *)((uintptr_t)vps->vps_data_base +
+		    (uintptr_t)start), start, size);
+	VPS_LIST_RUNLOCK();
+}
+
+/*
+ * Support for special SYSINIT handlers registered via VPS_SYSINIT()
+ * and VPS_SYSUNINIT().
+ */ +void +vps_register_sysinit(void *arg) +{ + struct vps_sysinit *vs, *vs2; + struct vps *vps; + + vs = arg; + KASSERT(vs->subsystem >= SI_SUB_INTRINSIC, ("vps sysinit too early")); + + /* Add the constructor to the global list of vps constructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_FOREACH(vs2, &vps_constructors, link) { + if (vs2->subsystem > vs->subsystem) + break; + if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) + break; + } + if (vs2 != NULL) + TAILQ_INSERT_BEFORE(vs2, vs, link); + else + TAILQ_INSERT_TAIL(&vps_constructors, vs, link); + + /* + * Invoke the constructor on all the existing vpss when it is + * registered. + */ + VPS_FOREACH(vps) { + CURVPS_SET_QUIET(vps); + vs->func(vs->arg); + CURVPS_RESTORE(); + } + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_deregister_sysinit(void *arg) +{ + struct vps_sysinit *vs; + + vs = arg; + + /* Remove the constructor from the global list of vps constructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_REMOVE(&vps_constructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_register_sysuninit(void *arg) +{ + struct vps_sysinit *vs, *vs2; + + vs = arg; + + /* Add the destructor to the global list of vps destructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_FOREACH(vs2, &vps_destructors, link) { + if (vs2->subsystem > vs->subsystem) + break; + if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) + break; + } + if (vs2 != NULL) + TAILQ_INSERT_BEFORE(vs2, vs, link); + else + TAILQ_INSERT_TAIL(&vps_destructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_deregister_sysuninit(void *arg) +{ + struct vps_sysinit *vs; + struct vps *vps; + + vs = arg; + + /* + * Invoke the destructor on all the existing vpss when it is + * deregistered. + */ + VPS_SYSINIT_WLOCK(); + VPS_FOREACH(vps) { + CURVPS_SET_QUIET(vps); + vs->func(vs->arg); + CURVPS_RESTORE(); + } + + /* Remove the destructor from the global list of vps destructors. 
*/ + TAILQ_REMOVE(&vps_destructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +/* + * Invoke all registered vps constructors on the current vps. Used during + * vps construction. The caller is responsible for ensuring the new vps is + * the current vps and that the vps_sysinit_sxlock lock is locked. + */ +void +vps_sysinit(void) +{ + struct vps_sysinit *vs; + struct vps *vps; + + vps = curvps; + VPS_SYSINIT_RLOCK(); + TAILQ_FOREACH(vs, &vps_constructors, link) { + curvps->vps_state = vs->subsystem; + vs->func(vs->arg); + KASSERT((curvps == vps), + ("%s: vs %p subsystem %u order %u func %p returned " + "with curvps altered: curvps %p should be %p\n", + __func__, vs, vs->subsystem, vs->order, vs->func, + curvps, vps)); + } + VPS_SYSINIT_RUNLOCK(); +} + +/* + * Invoke all registered vps destructors on the current vps. Used during + * vps destruction. The caller is responsible for ensuring the dying vps + * the current vps and that the vps_sysinit_sxlock lock is locked. + */ +void +vps_sysuninit(void) +{ + struct vps_sysinit *vs; + + VPS_SYSINIT_RLOCK(); + TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head, + link) { + curvps->vps_state = vs->subsystem; + vs->func(vs->arg); + } + VPS_SYSINIT_RUNLOCK(); +} + +/* + * EVENTHANDLER(9) extensions. + */ +/* + * Invoke the eventhandler function originally registered with the possibly + * registered argument for all virtual process space instances. + * + * This iterator can only be used for eventhandlers that do not take any + * additional arguments, as we do ignore the variadic arguments from the + * EVENTHANDLER_INVOKE() call. + */ +void +vps_global_eventhandler_iterator_func(void *arg, ...) +{ + VPS_ITERATOR_DECL(vps_iter); + struct eventhandler_entry_vimage *v_ee; + + /* + * There is a bug here in that we should actually cast things to + * (struct eventhandler_entry_ ## name *) but that's not easily + * possible in here so just re-using the variadic version we + * defined for the generic vimage case. 
+ */ + v_ee = arg; + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg); + CURVPS_RESTORE(); + } + VPS_LIST_RUNLOCK(); +} + +#ifdef VPS_DEBUG +struct vps_recursion { + SLIST_ENTRY(vps_recursion) vnr_le; + const char *prev_fn; + const char *where_fn; + int where_line; + struct vps *old_vps; + struct vps *new_vps; +}; + +static SLIST_HEAD(, vps_recursion) vps_recursions = + SLIST_HEAD_INITIALIZER(vps_recursions); + +static void +vps_print_recursion(struct vps_recursion *vnr, int brief) +{ + + if (!brief) + printf("CURVPS_SET() recursion in "); + printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line, + vnr->prev_fn); + if (brief) + printf(", "); + else + printf("\n "); + printf("%p -> %p\n", vnr->old_vps, vnr->new_vps); +} + +void +vps_log_recursion(struct vps *old_vps, const char *old_fn, int line) +{ + struct vps_recursion *vnr; + + /* Skip already logged recursion events. */ + SLIST_FOREACH(vnr, &vps_recursions, vnr_le) + if (vnr->prev_fn == old_fn && + vnr->where_fn == curthread->td_vps_lpush && + vnr->where_line == line && + (vnr->old_vps == vnr->new_vps) == (curvps == old_vps)) + return; + + vnr = malloc(sizeof(*vnr), M_VPS, M_NOWAIT | M_ZERO); + if (vnr == NULL) + panic("%s: malloc failed", __func__); + vnr->prev_fn = old_fn; + vnr->where_fn = curthread->td_vps_lpush; + vnr->where_line = line; + vnr->old_vps = old_vps; + vnr->new_vps = curvps; + + SLIST_INSERT_HEAD(&vps_recursions, vnr, vnr_le); + + vps_print_recursion(vnr, 0); +#ifdef KDB + kdb_backtrace(); +#endif +} +#endif /* VPS_DEBUG */ + +/* + * DDB(4). + */ +#ifdef DDB +static void +db_vps_print(struct vps *vps) +{ + + db_printf("vps = %p\n", vps); + db_printf(" vps_magic_n = %#08x (%s, orig %#08x)\n", + vps->vps_magic_n, + (vps->vps_magic_n == VPS_MAGIC_N) ? 
+ "ok" : "mismatch", VPS_MAGIC_N); + db_printf(" vps_data_mem = %p\n", vps->vps_data_mem); + db_printf(" vps_data_base = %#jx\n", + (uintmax_t)vps->vps_data_base); + db_printf(" vps_state = %#08x\n", vps->vps_state); + db_printf("\n"); +} + +DB_SHOW_ALL_COMMAND(vpss, db_show_all_vpss) +{ + VPS_ITERATOR_DECL(vps_iter); + + VPS_FOREACH(vps_iter) { + db_vps_print(vps_iter); + if (db_pager_quit) + break; + } +} + +DB_SHOW_COMMAND(vps, db_show_vps) +{ + + if (!have_addr) { + db_printf("usage: show vps \n"); + return; + } + + db_vps_print((struct vps *)addr); +} + +static void +db_show_vps_print_vs(struct vps_sysinit *vs, int ddb) +{ + const char *vsname, *funcname; + c_db_sym_t sym; + db_expr_t offset; + +#define xprint(...) \ + if (ddb) \ + db_printf(__VA_ARGS__); \ + else \ + printf(__VA_ARGS__) + + if (vs == NULL) { + xprint("%s: no vps_sysinit * given\n", __func__); + return; + } + + sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset); + db_symbol_values(sym, &vsname, NULL); + sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset); + db_symbol_values(sym, &funcname, NULL); + xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs); + xprint(" %#08x %#08x\n", vs->subsystem, vs->order); + xprint(" %p(%s)(%p)\n", + vs->func, (funcname != NULL) ? 
funcname : "", vs->arg); +#undef xprint +} + +DB_SHOW_COMMAND(vps_sysinit, db_show_vps_sysinit) +{ + struct vps_sysinit *vs; + + db_printf("VPS_SYSINIT vs Name(Ptr)\n"); + db_printf(" Subsystem Order\n"); + db_printf(" Function(Name)(Arg)\n"); + TAILQ_FOREACH(vs, &vps_constructors, link) { + db_show_vps_print_vs(vs, 1); + if (db_pager_quit) + break; + } +} + +DB_SHOW_COMMAND(vps_sysuninit, db_show_vps_sysuninit) +{ + struct vps_sysinit *vs; + + db_printf("VPS_SYSUNINIT vs Name(Ptr)\n"); + db_printf(" Subsystem Order\n"); + db_printf(" Function(Name)(Arg)\n"); + TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head, + link) { + db_show_vps_print_vs(vs, 1); + if (db_pager_quit) + break; + } +} + +DB_COMMAND(setcurvps, db_setcurvps) +{ + struct vps *vps; + + if (!have_addr) { + db_printf("usage: setcurvps \n"); + return; + } + + vps = (struct vps *)addr; + db_printf("curvps %p -> %p\n", curvps, vps); + curvps = vps; + db_vps_print(vps); +} + +#ifdef VPS_DEBUG +DB_SHOW_COMMAND(vpsrcrs, db_show_vpsrcrs) +{ + struct vps_recursion *vnr; + + SLIST_FOREACH(vnr, &vps_recursions, vnr_le) + vps_print_recursion(vnr, 1); +} +#endif +#endif /* DDB */ Index: sys/net/vnet.c =================================================================== --- sys/net/vnet.c +++ sys/net/vnet.c @@ -80,8 +80,6 @@ * stack instance. 
*/ -FEATURE(vimage, "VIMAGE kernel virtualization"); - static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); /* @@ -307,7 +305,7 @@ sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock"); LIST_INIT(&vnet_head); } -SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, +SYSINIT(vnet_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST, vnet_init_prelink, NULL); static void Index: sys/sys/jail.h =================================================================== --- sys/sys/jail.h +++ sys/sys/jail.h @@ -166,6 +166,7 @@ struct osd pr_osd; /* (p) additional data */ struct cpuset *pr_cpuset; /* (p) cpuset */ struct vnet *pr_vnet; /* (c) network stack */ + struct vps *pr_vps; /* (c) process space */ struct vnode *pr_root; /* (c) vnode to rdir */ int pr_ip4s; /* (p) number of v4 IPs */ int pr_ip6s; /* (p) number of v6 IPs */ @@ -209,6 +210,7 @@ /* primary jail address. */ #define PR_IP6_SADDRSEL 0x00000100 /* Do IPv6 src addr sel. or use the */ /* primary jail address. */ +#define PR_VPS 0x00000200 /* Virtual process space */ /* Internal flag bits */ #define PR_IP4 0x02000000 /* IPv4 restricted or disabled */ @@ -370,6 +372,7 @@ int prison_allow(struct ucred *, unsigned); int prison_check(struct ucred *cred1, struct ucred *cred2); int prison_owns_vnet(struct ucred *); +int prison_owns_vps(struct ucred *); int prison_canseemount(struct ucred *cred, struct mount *mp); void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp); Index: sys/sys/kernel.h =================================================================== --- sys/sys/kernel.h +++ sys/sys/kernel.h @@ -102,7 +102,7 @@ SI_SUB_MTX_POOL_DYNAMIC = 0x1AC0000, /* dynamic mutex pool */ SI_SUB_LOCK = 0x1B00000, /* various locks */ SI_SUB_EVENTHANDLER = 0x1C00000, /* eventhandler init */ - SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */ + SI_SUB_VIMAGE_PRELINK = 0x1E00000, /* VIMAGE init before modules */ SI_SUB_KLD = 0x2000000, /* KLD and module setup */ 
SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/ SI_SUB_RACCT = 0x2110000, /* resource accounting */ @@ -159,7 +159,7 @@ SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */ SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/ SI_SUB_SYSCALLS = 0xd800000, /* register system calls */ - SI_SUB_VNET_DONE = 0xdc00000, /* vnet registration complete */ + SI_SUB_VNET_DONE = 0xdc00000, /* VNET registration complete */ SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/ SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/ SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/ @@ -170,6 +170,7 @@ SI_SUB_SMP = 0xf000000, /* start the APs*/ #endif SI_SUB_RACCTD = 0xf100000, /* start racctd*/ + SI_SUB_VIMAGE_DONE = 0xf800000, /* VIMAGE initialization done */ SI_SUB_LAST = 0xfffffff /* final initialization */ }; Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -364,6 +364,8 @@ /* LP64 hole */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ + struct vps *td_vps; /* (k) Effective vps. */ + const char *td_vps_lpush; /* (k) Debugging vps push / pop. 
*/ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ Index: sys/sys/sysctl.h =================================================================== --- sys/sys/sysctl.h +++ sys/sys/sysctl.h @@ -104,6 +104,7 @@ #define CTLFLAG_CAPWR 0x00004000 /* Can be written in capability mode */ #define CTLFLAG_STATS 0x00002000 /* Statistics, not a tuneable */ #define CTLFLAG_NOFETCH 0x00001000 /* Don't fetch tunable from getenv() */ +#define CTLFLAG_VPS 0x00000800 /* Prisons with vps can fiddle */ #define CTLFLAG_CAPRW (CTLFLAG_CAPRD|CTLFLAG_CAPWR) /* Index: sys/sys/vps.h =================================================================== --- /dev/null +++ sys/sys/vps.h @@ -0,0 +1,381 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2006-2009 University of Zagreb + * Copyright (c) 2006-2009 FreeBSD Foundation + * Copyright (c) 2018 iXsystems, Inc. + * All rights reserved. + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Portions of this software were developed by Bjoern Zeeb + * under sponsorship from iXsystems, Inc. + * + * Copyright (c) 2009 Jeffrey Roberson + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * This header file defines several sets of interfaces supporting virtualized + * process space: + * + * - Definition of 'struct vps' and functions and macros to allocate/free/ + * manipulate it. + * + * - A virtual process stack memory allocator, which provides support for + * virtualized global variables via a special linker set, set_vps. + * + * - Virtualized sysinits/sysuninits, which allow constructors and + * destructors to be run for each process space as virtual + * instances are created and destroyed. + * + * If VIMAGE isn't compiled into the kernel, virtualized global variables + * compile to normal global variables, and virtualized sysinits to regular + * sysinits. + */ + +#ifndef _SYS_VPS_H_ +#define _SYS_VPS_H_ + +/* + * struct vps describes a virtualized process space, and is primarily a + * pointer to storage for virtualized global variables. Expose to userspace + * as required for libkvm. 
+ */
+#if defined(_KERNEL) || defined(_WANT_VPS)
+#include <sys/queue.h>
+
+struct vps {
+	LIST_ENTRY(vps) vps_le;		/* all vps list */
+	u_int vps_magic_n;
+	u_int vps_state;		/* SI_SUB_* */
+	void *vps_data_mem;
+	uintptr_t vps_data_base;
+	struct prison *vps_pr;		/* Put init on this if set. */
+};
+#define	VPS_MAGIC_N	0x0f0307e2
+
+/*
+ * These two virtual process space allocator definitions are also required
+ * for libkvm so that it can evaluate virtualized global variables.
+ */
+#define	VPS_SETNAME	"set_vps"
+#define	VPS_SYMPREFIX	"vps_entry_"
+#endif
+
+#ifdef _KERNEL
+#ifdef VIMAGE
+#include <sys/lock.h>
+#include <sys/proc.h>			/* for struct thread */
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+
+/*
+ * Location of the kernel's 'set_vps' linker set.
+ */
+extern uintptr_t *__start_set_vps;
+__GLOBL(__start_set_vps);
+extern uintptr_t *__stop_set_vps;
+__GLOBL(__stop_set_vps);
+
+#define	VPS_START	(uintptr_t)&__start_set_vps
+#define	VPS_STOP	(uintptr_t)&__stop_set_vps
+
+/*
+ * Functions to allocate and destroy virtual process spaces.
+ */
+struct vps *vps_alloc(struct prison *);
+void	vps_destroy(struct vps *);
+
+/*
+ * The current virtual process space -- we may wish to move this to struct
+ * pcpu in the future.
+ */
+#define	curvps	curthread->td_vps
+
+/*
+ * Various macros -- get and set the current process space, but also
+ * assertions.
+ */ +#if defined(INVARIANTS) || defined(VPS_DEBUG) +#define VPS_ASSERT(exp, msg) do { \ + if (!(exp)) \ + panic msg; \ +} while (0) +#else +#define VPS_ASSERT(exp, msg) do { \ +} while (0) +#endif + +#ifdef VPS_DEBUG +void vps_log_recursion(struct vps *, const char *, int); + +#define CURVPS_SET_QUIET(arg) \ + VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \ + ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \ + __FILE__, __LINE__, __func__, curvps, (arg))); \ + struct vps *saved_vps = curvps; \ + const char *saved_vps_lpush = curthread->td_vps_lpush; \ + curvps = arg; \ + curthread->td_vps_lpush = __func__; + +#define CURVPS_SET_VERBOSE(arg) \ + CURVPS_SET_QUIET(arg) \ + if (saved_vps) \ + vps_log_recursion(saved_vps, saved_vps_lpush, __LINE__); + +#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg) + +#define CURVPS_RESTORE() \ + VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \ + saved_vps->vps_magic_n == VPS_MAGIC_N), \ + ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \ + __FILE__, __LINE__, __func__, curvps, saved_vps)); \ + curvps = saved_vps; \ + curthread->td_vps_lpush = saved_vps_lpush; +#else /* !VPS_DEBUG */ + +#define CURVPS_SET_QUIET(arg) \ + VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \ + ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \ + __FILE__, __LINE__, __func__, curvps, (arg))); \ + struct vps *saved_vps = curvps; \ + curvps = arg; + +#define CURVPS_SET_VERBOSE(arg) \ + CURVPS_SET_QUIET(arg) + +#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg) + +#define CURVPS_RESTORE() \ + VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \ + saved_vps->vps_magic_n == VPS_MAGIC_N), \ + ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \ + __FILE__, __LINE__, __func__, curvps, saved_vps)); \ + curvps = saved_vps; +#endif /* VPS_DEBUG */ + +extern struct vps *vps0; +#define IS_DEFAULT_VPS(arg) ((arg) == vps0) + +#define CRED_TO_VPS(cr) (cr)->cr_prison->pr_vps +#define TD_TO_VPS(td) CRED_TO_VPS((td)->td_ucred) +#define 
P_TO_VPS(p)	CRED_TO_VPS((p)->p_ucred)
+
+/*
+ * Global linked list of all virtual process spaces, along with read locks to
+ * access it.  If a caller may sleep while accessing the list, it must use
+ * the sleepable lock macros.
+ */
+LIST_HEAD(vps_list_head, vps);
+extern struct vps_list_head vps_head;
+extern struct rwlock vps_rwlock;
+extern struct sx vps_sxlock;
+
+#define	VPS_LIST_RLOCK()		sx_slock(&vps_sxlock)
+#define	VPS_LIST_RLOCK_NOSLEEP()	rw_rlock(&vps_rwlock)
+#define	VPS_LIST_RUNLOCK()		sx_sunlock(&vps_sxlock)
+#define	VPS_LIST_RUNLOCK_NOSLEEP()	rw_runlock(&vps_rwlock)
+
+/*
+ * Iteration macros to walk the global list of virtual process spaces.
+ */
+#define	VPS_ITERATOR_DECL(arg)	struct vps *arg
+#define	VPS_FOREACH(arg)	LIST_FOREACH((arg), &vps_head, vps_le)
+
+/*
+ * Virtual process space memory allocator, which allows global variables to
+ * be automatically instantiated for each process space instance.
+ */
+#define	VPS_NAME(n)		vps_entry_##n
+#define	VPS_DECLARE(t, n)	extern t VPS_NAME(n)
+#define	VPS_DEFINE(t, n)	t VPS_NAME(n) __section(VPS_SETNAME) __used
+#define	_VPS_PTR(b, n)		(__typeof(VPS_NAME(n))*)		\
+				    ((b) + (uintptr_t)&VPS_NAME(n))
+
+#define	_VPS(b, n)		(*_VPS_PTR(b, n))
+
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define	VPS_VPS_PTR(vps, n)	_VPS_PTR((vps)->vps_data_base, n)
+#define	VPS_VPS(vps, n)		(*VPS_VPS_PTR((vps), n))
+
+#define	VPS_PTR(n)		VPS_VPS_PTR(curvps, n)
+#define	VPS(n)			VPS_VPS(curvps, n)
+
+/*
+ * Virtual process space allocator interfaces from the kernel linker.
+ */
+void	*vps_data_alloc(int size);
+void	 vps_data_copy(void *start, int size);
+void	 vps_data_free(void *start_arg, int size);
+
+/*
+ * Virtual sysinit mechanism, allowing process space components to declare
+ * startup and shutdown methods to be run when virtual process space
+ * instances are created and destroyed.
+ */
+#include <sys/kernel.h>
+
+/*
+ * SYSINIT/SYSUNINIT variants that provide per-vps constructors and
+ * destructors.
+ */ +struct vps_sysinit { + enum sysinit_sub_id subsystem; + enum sysinit_elem_order order; + sysinit_cfunc_t func; + const void *arg; + TAILQ_ENTRY(vps_sysinit) link; +}; + +#define VPS_SYSINIT(ident, subsystem, order, func, arg) \ + static struct vps_sysinit ident ## _vps_init = { \ + subsystem, \ + order, \ + (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ + (arg) \ + }; \ + SYSINIT(vps_init_ ## ident, subsystem, order, \ + vps_register_sysinit, &ident ## _vps_init); \ + SYSUNINIT(vps_init_ ## ident, subsystem, order, \ + vps_deregister_sysinit, &ident ## _vps_init) + +#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \ + static struct vps_sysinit ident ## _vps_uninit = { \ + subsystem, \ + order, \ + (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ + (arg) \ + }; \ + SYSINIT(vps_uninit_ ## ident, subsystem, order, \ + vps_register_sysuninit, &ident ## _vps_uninit); \ + SYSUNINIT(vps_uninit_ ## ident, subsystem, order, \ + vps_deregister_sysuninit, &ident ## _vps_uninit) + +/* + * Run per-vps sysinits or sysuninits during vps creation/destruction. + */ +void vps_sysinit(void); +void vps_sysuninit(void); + +/* + * Interfaces for managing per-vps constructors and destructors. + */ +void vps_register_sysinit(void *arg); +void vps_register_sysuninit(void *arg); +void vps_deregister_sysinit(void *arg); +void vps_deregister_sysuninit(void *arg); + +/* + * EVENTHANDLER(9) extensions. 
+ */
+#include <sys/eventhandler.h>
+
+void	vps_global_eventhandler_iterator_func(void *, ...);
+#define	VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \
+do {									\
+	if (IS_DEFAULT_VPS(curvps)) {					\
+		(tag) = vimage_eventhandler_register(NULL, #name, func,	\
+		    arg, priority,					\
+		    vps_global_eventhandler_iterator_func);		\
+	}								\
+} while(0)
+#define	VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority)	\
+do {									\
+	if (IS_DEFAULT_VPS(curvps)) {					\
+		vimage_eventhandler_register(NULL, #name, func,		\
+		    arg, priority,					\
+		    vps_global_eventhandler_iterator_func);		\
+	}								\
+} while(0)
+
+#else /* !VIMAGE */
+
+/*
+ * Various virtual process space macros compile to no-ops without VIMAGE.
+ */
+#define	curvps			NULL
+
+#define	VPS_ASSERT(exp, msg)
+#define	CURVPS_SET(arg)
+#define	CURVPS_SET_QUIET(arg)
+#define	CURVPS_RESTORE()
+
+#define	VPS_LIST_RLOCK()
+#define	VPS_LIST_RLOCK_NOSLEEP()
+#define	VPS_LIST_RUNLOCK()
+#define	VPS_LIST_RUNLOCK_NOSLEEP()
+#define	VPS_ITERATOR_DECL(arg)
+#define	VPS_FOREACH(arg)
+
+#define	IS_DEFAULT_VPS(arg)	1
+#define	CRED_TO_VPS(cr)		NULL
+#define	TD_TO_VPS(td)		NULL
+#define	P_TO_VPS(p)		NULL
+
+/*
+ * Versions of the vps macros that compile to normal global variables and
+ * standard sysctl definitions.
+ */
+#define	VPS_NAME(n)		n
+#define	VPS_DECLARE(t, n)	extern t n
+#define	VPS_DEFINE(t, n)	t n
+#define	_VPS_PTR(b, n)		&VPS_NAME(n)
+
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define	VPS_VPS_PTR(vps, n)	(&(n))
+#define	VPS_VPS(vps, n)		(n)
+
+#define	VPS_PTR(n)		(&(n))
+#define	VPS(n)			(n)
+
+/*
+ * When VIMAGE isn't compiled into the kernel, VPS_SYSINIT/VPS_SYSUNINIT
+ * map into normal sysinits, which have the same ordering properties.
+ */
+#define	VPS_SYSINIT(ident, subsystem, order, func, arg)			\
+	SYSINIT(ident, subsystem, order, func, arg)
+#define	VPS_SYSUNINIT(ident, subsystem, order, func, arg)		\
+	SYSUNINIT(ident, subsystem, order, func, arg)
+
+/*
+ * Without VIMAGE revert to the default implementation.
+ */ +#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ + (tag) = eventhandler_register(NULL, #name, func, arg, priority) +#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ + eventhandler_register(NULL, #name, func, arg, priority) +#endif /* VIMAGE */ +#endif /* _KERNEL */ + +#endif /* !_SYS_VPS_H_ */