Index: sys/ddb/db_expr.c =================================================================== --- sys/ddb/db_expr.c +++ sys/ddb/db_expr.c @@ -58,7 +58,8 @@ if (t == tIDENT) { if (!db_value_of_name(db_tok_string, valuep) && !db_value_of_name_pcpu(db_tok_string, valuep) && - !db_value_of_name_vnet(db_tok_string, valuep)) { + !db_value_of_name_vnet(db_tok_string, valuep) && + !db_value_of_name_vps(db_tok_string, valuep)) { db_printf("Symbol '%s' not found\n", db_tok_string); db_error(NULL); /*NOTREACHED*/ Index: sys/ddb/db_ps.c =================================================================== --- sys/ddb/db_ps.c +++ sys/ddb/db_ps.c @@ -398,6 +398,9 @@ db_printf(" last involuntary switch: %d ms ago\n", 1000 * delta / hz); } +#ifdef VIMAGE + db_printf(" vnet: %p vps: %p\n", td->td_vnet, td->td_vps); +#endif } DB_SHOW_COMMAND(proc, db_show_proc) Index: sys/ddb/db_sym.c =================================================================== --- sys/ddb/db_sym.c +++ sys/ddb/db_sym.c @@ -37,8 +37,10 @@ #include #include +#include #include #include +#include #include @@ -69,6 +71,7 @@ #ifdef VIMAGE static void *db_vnet = NULL; +static void *db_vps = NULL; #endif /* @@ -168,6 +171,53 @@ return (0); } } + +/* + * Validate the virtual process space pointer used to interpret per-vps global + * variable expansion. Right now we don't do much here, really we should + * walk the global vps list to check it's an OK pointer. + */ +int +db_var_db_vps(struct db_variable *vp, db_expr_t *valuep, int op) +{ + + switch (op) { + case DB_VAR_GET: + *valuep = (db_expr_t)db_vps; + return (1); + + case DB_VAR_SET: + db_vps = *(void **)valuep; + return (1); + + default: + db_printf("db_var_db_vps: unknown operation\n"); + return (0); + } +} + +/* + * Read-only variable reporting the current vps, which is what we use when + * db_vps is set to NULL. 
+ */ +int +db_var_curvps(struct db_variable *vp, db_expr_t *valuep, int op) +{ + + switch (op) { + case DB_VAR_GET: + *valuep = (db_expr_t)curvps; + return (1); + + case DB_VAR_SET: + db_printf("Read-only variable.\n"); + return (0); + + default: + db_printf("db_var_curvps: unknown operation\n"); + return (0); + } +} #endif /* @@ -278,6 +328,33 @@ #endif } +bool +db_value_of_name_vps(const char *name, db_expr_t *valuep) +{ +#ifdef VIMAGE + static char tmp[256]; + db_expr_t value; + c_db_sym_t sym; + struct vps *vps; + + if (db_vps != NULL) + vps = db_vps; + else + vps = curvps; + snprintf(tmp, sizeof(tmp), "vps_entry_%s", name); + sym = db_lookup(tmp); + if (sym == C_DB_SYM_NULL) + return (false); + db_symbol_values(sym, &name, &value); + if (value < VPS_START || value >= VPS_STOP) + return (false); + *valuep = (db_expr_t)((uintptr_t)value + vps->vps_data_base); + return (true); +#else + return (false); +#endif +} + /* * Lookup a symbol. * If the symbol has a qualifier (e.g., ux:vm_map), Index: sys/ddb/db_variables.h =================================================================== --- sys/ddb/db_variables.h +++ sys/ddb/db_variables.h @@ -56,8 +56,10 @@ extern db_varfcn_t db_var_curcpu; /* DPCPU default CPU */ extern db_varfcn_t db_var_curvnet; /* Default vnet */ +extern db_varfcn_t db_var_curvps; /* Default vps */ extern db_varfcn_t db_var_db_cpu; /* DPCPU active CPU */ extern db_varfcn_t db_var_db_vnet; /* Active vnet */ +extern db_varfcn_t db_var_db_vps; /* Active vps */ int db_read_variable(struct db_variable *, db_expr_t *); int db_write_variable(struct db_variable *, db_expr_t); Index: sys/ddb/db_variables.c =================================================================== --- sys/ddb/db_variables.c +++ sys/ddb/db_variables.c @@ -53,6 +53,8 @@ #ifdef VIMAGE { "curvnet", NULL, db_var_curvnet }, { "db_vnet", NULL, db_var_db_vnet }, + { "curvps", NULL, db_var_curvps }, + { "db_vps", NULL, db_var_db_vps }, #endif }; static struct db_variable *db_evars = 
db_vars + nitems(db_vars); Index: sys/ddb/ddb.h =================================================================== --- sys/ddb/ddb.h +++ sys/ddb/ddb.h @@ -229,6 +229,7 @@ bool db_value_of_name(const char *name, db_expr_t *valuep); bool db_value_of_name_pcpu(const char *name, db_expr_t *valuep); bool db_value_of_name_vnet(const char *name, db_expr_t *valuep); +bool db_value_of_name_vps(const char *name, db_expr_t *valuep); int db_write_bytes(vm_offset_t addr, size_t size, char *data); void db_command_register(struct command_table *, struct command *); void db_command_unregister(struct command_table *, struct command *); Index: sys/kern/init_main.c =================================================================== --- sys/kern/init_main.c +++ sys/kern/init_main.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,7 @@ #include #include #include +#include #include @@ -106,6 +108,15 @@ VPS_DEFINE(struct proc *, initproc); VPS_DEFINE(struct proc *, vproc0); +#ifdef VIMAGE +/* + * Initialize to -2; after kproc_create() our thread will still be + * forked from thread0 and in the wrong vps. Once that is fixed it will + * see the local copy and not the DEFAULT_VPS one. Make sure we have + * a value that we can spin on until this happens. + */ +VPS_DEFINE(int, vpsdying) = -2; +#endif #ifndef BOOTHOWTO #define BOOTHOWTO 0 @@ -501,6 +512,9 @@ td->td_cpuset = cpuset_thread0(); td->td_domain.dr_policy = td->td_cpuset->cs_domain; prison0_init(); +#ifdef VIMAGE + td->td_vps = vps0; +#endif p->p_peers = 0; p->p_leader = p; p->p_reaper = p; @@ -603,7 +617,183 @@ racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } -SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); +SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_init, NULL); + +#ifdef VIMAGE +static void +vps_swapper(void *dummy __unused) +{ + + /* + * Make sure the surgical changes to V_vproc0 are done before + * entering the long-lasting loop. 
Otherwise we may start + * acquiring locks and accessing variables based on the wrong + * credential leading to, e.g., panics when trying to unlock a + * lock from a different context which may not be locked. + * When entering the function our credentials might still point + * to the DEFAULT_VPS; see comment for V_vpsdying declaration above. + */ + while (V_vpsdying < 0) + pause("wswvps", hz/2); + + /* + * Now hand over this thread to swapper. + */ + swapper(); + + if (V_vpsdying < 1) + panic("%s: swapper curtd %p ended but V_vpsdying %d\n", + __func__, curthread, V_vpsdying); + + kproc_exit(0); +} + +static void +proc0_init_vps(void *dummy __unused) +{ + struct ucred *newcred, *savecred; + struct thread *td; + struct prison *pr; + struct uidinfo tmpuinfo; + struct loginclass tmplc = { + .lc_name = "", + }; + int error; + + /* vps0 is handled normally in p0init. */ + if (IS_DEFAULT_VPS(curvps)) + return; + + KASSERT((curvps->vps_pr != NULL && curvps != vps0), + ("%s: curvps %p has vps_pr %p or is vps0 %p\n", + __func__, curvps, curvps->vps_pr, vps0)); + KASSERT((curvps == curvps->vps_pr->pr_vps), + ("%s: curvps %p != curvps->vps_pr %p ->pr_vps %p\n", + __func__, curvps, curvps->vps_pr, curvps->vps_pr->pr_vps)); + + /* + * Initialized the non-default VPS version to < 0 so vps_swapper() + * will spin once the credential is changed before all other surgery + * has happened. + */ + V_vpsdying = -1; + + /* + * Default is nprocs = 1 for vps0; need to set it to 0 here as our + * "proc0" and with that initproc are forked and not manually constructed. + */ + V_nprocs = 0; + + /* + * Set lastpid to -1 so that our swapper gets 0. + */ + V_lastpid = -1; + + error = kproc_create(vps_swapper, NULL, &V_vproc0, 0, 0, "vps%u", + curvps->vps_pr->pr_id); + if (error) + panic("%s: cannot create vps %p swapper: %d\n", + __func__, curvps, error); + + /* Create credentials. Copied from proc0. Just using vps_pr. 
*/ + newcred = crget(); + newcred->cr_ngroups = 1; /* group 0 */ + /* A hack to prevent uifind from tripping over NULL pointers. */ + savecred = curthread->td_ucred; + curthread->td_ucred = newcred; + tmpuinfo.ui_uid = 1; + newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo; + newcred->cr_uidinfo = uifind(0); + newcred->cr_ruidinfo = uifind(0); + newcred->cr_loginclass = &tmplc; + newcred->cr_loginclass = loginclass_find("default"); + /* End hack. creds get properly set later with thread_cow_get_proc */ + curthread->td_ucred = savecred; + PROC_LOCK(V_vproc0); + newcred->cr_prison = curvps->vps_pr; + prison_hold(newcred->cr_prison); + /* The kernel process was accounted to thread0's prison. */ + prison_proc_hold(newcred->cr_prison); + prison_proc_free(savecred->cr_prison); + V_vproc0->p_treeflag |= P_TREE_REAPER; + savecred = proc_set_cred(V_vproc0, newcred); + PROC_UNLOCK(V_vproc0); +#ifdef AUDIT + audit_cred_kproc0(newcred); +#endif +#ifdef MAC + mac_cred_create_swapper(newcred); +#endif + crfree(savecred); + + PROC_LOCK(V_vproc0); + td = FIRST_THREAD_IN_PROC(V_vproc0); + thread_cow_get_proc(td, V_vproc0); + PROC_UNLOCK(V_vproc0); + KASSERT(curvps->vps_pr == + FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred->cr_prison, + ("%s:%d: curvps %p vps_pr %p != FTIP(V_vproc0 %p)->td_ucred %p " + "cr_prison %p\n", __func__, __LINE__, + curvps, curvps->vps_pr, V_vproc0, + FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred, + FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred->cr_prison)); + KASSERT(curvps == TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)), + ("%s:%d: curvps %p != TD_TO_VPS(..(V_vproc0 %p)) %p\n", + __func__, __LINE__, + curvps, V_vproc0, TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)))); + + /* Chroot it. 
+ */ + td = FIRST_THREAD_IN_PROC(V_vproc0); + pr = curvps->vps_pr; + vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); + if ((error = change_dir(pr->pr_root, td)) != 0) { + printf("%s: td %p change_dir %p failed: %d\n", + __func__, td, pr->pr_root, error); + goto err; + } +#ifdef MAC + if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) { + printf("%s: td %p mac_vnode_check_chroot %p failed: %d\n", + __func__, td, pr->pr_root, error); + goto err; + } +#endif + VOP_UNLOCK(pr->pr_root, 0); + if ((error = pwd_chroot(td, pr->pr_root))) { + printf("%s: td %p pwd_chroot %p failed: %d\n", + __func__, td, pr->pr_root, error); + goto err; + } + + V_vpsdying = 0; + return; + +err: + /* XXX could panic or signal the jail to abort; cannot really stop. */ + return; +} +VPS_SYSINIT(p0init_vps, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_init_vps, NULL); + +static void +proc0_uninit_vps(void *dummy __unused) +{ + + if (IS_DEFAULT_VPS(curvps)) + return; + + /* + * XXX ideally we want to get that state from elsewhere; + * neither prison, nor vps state, .. lends itself though. + */ + V_vpsdying = 1; + wakeup(V_vproc0); + + while (V_vproc0 != NULL || + !LIST_EMPTY(&V_zombproc) || !LIST_EMPTY(&V_allproc)) + pause("p0uvps", hz/2); +} +VPS_SYSUNINIT(p0uninit_vps, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_uninit_vps, NULL); +#endif /* ARGSUSED*/ static void @@ -712,7 +902,8 @@ td = curthread; p = td->td_proc; - vfs_mountroot(); + if (IS_DEFAULT_VPS(curvps)) + vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); @@ -736,8 +927,8 @@ while ((path = strsep(&tmp_init_path, ":")) != NULL) { pathlen = strlen(path) + 1; if (bootverbose) - printf("start_init: trying %s\n", path); - + printf("%s: trying %s\n", __func__, path); + /* * Move out the boot flag argument. 
*/ @@ -822,39 +1013,60 @@ struct thread *td; int error; + KASSERT(curvps == FIRST_THREAD_IN_PROC(V_vproc0)->td_vps, + ("%s: curvps %p != V_vproc0 %p first td %p td_vps %p\n", + __func__, curvps, V_vproc0, FIRST_THREAD_IN_PROC(V_vproc0), + FIRST_THREAD_IN_PROC(V_vproc0)->td_vps)); + KASSERT(curvps == TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)), + ("%s: curvps %p != TD_TO_VPS(..(V_vproc0 %p)) %p\n", + __func__, curvps, V_vproc0, + TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)))); + bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &V_initproc; - error = fork1(&thread0, &fr); + td = FIRST_THREAD_IN_PROC(V_vproc0); + error = fork1(td, &fr); if (error) panic("cannot fork init: %d\n", error); - KASSERT(V_initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); + KASSERT(V_initproc->p_pid == 1, ("%s: initproc->p_pid(%d) != 1", + __func__, V_initproc->p_pid)); + KASSERT(curvps == FIRST_THREAD_IN_PROC(V_initproc)->td_vps, + ("%s: curvps %p != V_initproc %p first td %p td_vps %p\n", + __func__, curvps, V_initproc, FIRST_THREAD_IN_PROC(V_initproc), + FIRST_THREAD_IN_PROC(V_initproc)->td_vps)); + /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&V_proctree_lock); PROC_LOCK(V_initproc); V_initproc->p_flag |= P_SYSTEM | P_INMEM; V_initproc->p_treeflag |= P_TREE_REAPER; - LIST_INSERT_HEAD(&V_initproc->p_reaplist, V_vproc0, p_reapsibling); oldcred = V_initproc->p_ucred; crcopy(newcred, oldcred); +#ifdef VIMAGE + /* Swap to the correct prison. */ + /* XXX is this really needed or was this related to a V_vproc0 bug? */ + prison_free(newcred->cr_prison); + newcred->cr_prison = curvps->vps_pr; + prison_hold(newcred->cr_prison); +#endif #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif + /* This will also update cowgen. 
*/ proc_set_cred(V_initproc, newcred); - td = FIRST_THREAD_IN_PROC(V_initproc); - crfree(td->td_ucred); - td->td_ucred = crhold(V_initproc->p_ucred); PROC_UNLOCK(V_initproc); sx_xunlock(&V_proctree_lock); crfree(oldcred); + cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(V_initproc), start_init, NULL); } -SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); +VPS_SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. @@ -870,4 +1082,42 @@ sched_add(td, SRQ_BORING); thread_unlock(td); } -SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); +VPS_SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); + +#ifdef VIMAGE +static void +reapinit(void *ident __unused) +{ + struct proc *p, *p2; + + while (V_nprocs > 2) { + sx_slock(&V_allproc_lock); + LIST_FOREACH(p, &V_allproc, p_list) { + if (p->p_pid <= 1) + continue; + PROC_LOCK(p); + kern_psignal(p, SIGKILL); + PROC_UNLOCK(p); + } + sx_sunlock(&V_allproc_lock); + pause("reapin1t", hz/2); + } + + sx_xlock(&V_proctree_lock); + LIST_FOREACH_SAFE(p, &V_zombproc, p_list, p2) { + PROC_LOCK(p); + proc_reap(FIRST_THREAD_IN_PROC(V_vproc0), p, NULL, 0); + sx_xlock(&V_proctree_lock); + } + sx_xunlock(&V_proctree_lock); + + while (V_nprocs > 1) + pause("reapinit", hz/2); + + /* Only our "swapper" left. */ + KASSERT(V_nprocs == 1, ("%s: vps %p V_nprocs %d != 1", + __func__, curvps, V_nprocs)); +} +/* Run very first. 
*/ +VPS_SYSUNINIT(reapinit, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, reapinit, NULL); +#endif Index: sys/kern/kern_clock.c =================================================================== --- sys/kern/kern_clock.c +++ sys/kern/kern_clock.c @@ -194,6 +194,14 @@ tryl = 0; for (;;) { +#ifdef VIMAGE + VPS_ITERATOR_DECL(vps); + + VPS_LIST_RLOCK(); + VPS_FOREACH(vps) { +again: + CURVPS_SET_QUIET(vps); +#endif blkticks = blktime_threshold * hz; slpticks = slptime_threshold * hz; @@ -204,11 +212,17 @@ */ if (!sx_try_slock(&V_allproc_lock)) { if (tryl > 100) - panic("%s: possible deadlock detected on allproc_lock\n", + panic("%s: possible deadlock detected on allproc_lock\n", __func__); tryl++; + CURVPS_RESTORE(); + pause("allproc", sleepfreq * hz); +#ifdef VIMAGE + goto again; +#else continue; +#endif } tryl = 0; FOREACH_PROC_IN_SYSTEM(p) { @@ -297,6 +311,12 @@ } sx_sunlock(&V_allproc_lock); +#ifdef VIMAGE + CURVPS_RESTORE(); + } + VPS_LIST_RUNLOCK(); +#endif + /* Sleep for sleepfreq seconds. */ pause("-", sleepfreq * hz); } Index: sys/kern/kern_exit.c =================================================================== --- sys/kern/kern_exit.c +++ sys/kern/kern_exit.c @@ -96,6 +96,11 @@ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exit, "int"); +#ifdef VIMAGE +VPS_DECLARE(int, vrebooting); /* kern_reboot() has been called. */ +#define V_vrebooting VPS(vrebooting) +#endif + /* Hook for NFS teardown procedure. */ void (*nlminfo_release_p)(struct proc *p); @@ -133,9 +138,15 @@ struct proc *p1, *p2, *ptmp; sx_assert(&V_proctree_lock, SX_LOCKED); - KASSERT(p != V_initproc, ("reaper_abandon_children for initproc")); - if ((p->p_treeflag & P_TREE_REAPER) == 0) + /* init inside a vps may die on prison_remove. 
*/ + KASSERT(!IS_DEFAULT_VPS(curvps) || p != V_initproc, + ("%s: for initproc %p", __func__, p)); + if ((p->p_treeflag & P_TREE_REAPER) == 0) { + KASSERT((p != V_initproc && p->p_pid != 1 && p->p_pid != 0), + ("%s:%d curvps %p p %p pid %d p_treeflag %#x", + __func__, __LINE__, curvps, p, p->p_pid, p->p_treeflag)); return; + } p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { LIST_REMOVE(p2, p_reapsibling); @@ -148,7 +159,8 @@ PROC_UNLOCK(p2); } } - KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty")); + KASSERT(LIST_EMPTY(&p->p_reaplist), + ("%s: p %p p_reaplist not empty", __func__, p)); p->p_treeflag &= ~P_TREE_REAPER; } @@ -203,9 +215,19 @@ * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. */ - if (p == V_initproc && rebooting == 0) { + if (p == V_initproc && (rebooting == 0 +#ifdef VIMAGE + || V_vrebooting +#endif + )) { printf("init died (signal %d, exit %d)\n", signo, rval); - panic("Going nowhere without my init!"); +#ifdef VIMAGE + if (!IS_DEFAULT_VPS(TD_TO_VPS(td))) + /* XXX-BZ make this jail go away. */ ; + else +#endif + panic("%s: Going nowhere without my init! 
td %p", + __func__, td); } /* @@ -829,7 +851,7 @@ sx_assert(&V_proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); - KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); + KASSERT(p->p_state == PRS_ZOMBIE, ("%s: !PRS_ZOMBIE", __func__)); mtx_spin_wait_unlocked(&p->p_slock); @@ -953,7 +975,7 @@ #endif KASSERT(FIRST_THREAD_IN_PROC(p), - ("proc_reap: no residual thread!")); + ("%s: no residual thread!", __func__)); uma_zfree(proc_zone, p); atomic_add_int(&V_nprocs, -1); } Index: sys/kern/kern_fork.c =================================================================== --- sys/kern/kern_fork.c +++ sys/kern/kern_fork.c @@ -237,7 +237,7 @@ static VPS_DEFINE(int, pidchecked) = 0; #define V_pidchecked VPS(pidchecked) -static int +int fork_findpid(int flags) { struct proc *p; @@ -494,6 +494,7 @@ td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE + td2->td_vps = TD_TO_VPS(td); td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif Index: sys/kern/kern_jail.c =================================================================== --- sys/kern/kern_jail.c +++ sys/kern/kern_jail.c @@ -78,10 +78,6 @@ #include -#ifdef VIMAGE -FEATURE(vimage, "VIMAGE kernel virtualization"); -#endif - #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); @@ -179,9 +175,7 @@ {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, -#ifdef ENABLE_VPS - {"vps", 0, PR_VPS }, -#endif + {"vps", 0, PR_VPS}, #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, @@ -1819,11 +1813,7 @@ #ifdef VIMAGE /* Allocate a new vps if specified. */ -#ifdef ENABLE_VPS if (pr_flags & PR_VPS) { -#else - if (0) { -#endif vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto c_unlock; @@ -2339,7 +2329,6 @@ * Kill all processes unfortunate enough to be attached to this prison. */ #ifdef VIMAGE -#ifdef ENABLE_VPS if (pr->pr_vps) { /* * Send signal to init and let init do it's job. 
@@ -2357,7 +2346,6 @@ vps_destroy(pr->pr_vps); CURVPS_RESTORE(); } else -#endif #endif { sx_slock(&V_allproc_lock); @@ -2421,7 +2409,7 @@ * right, including a new pid, progress group, session, terminal, * tracing is one thing (with a lot of work) and may break apps if the * pid changes, the pgrp no longer has the same (p)id; getting things - * restored to oriinal state and properly re-parented is virtually + * restored to original state and properly re-parented is virtually * impossile. So do what we do on a normal machine, present a terminal * to login to. */ @@ -2722,7 +2710,8 @@ #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); - KASSERT(pr->pr_vps == NULL, ("%s: pr %p pr_vps %p != NULL\n", + KASSERT((pr->pr_vps == ppr->pr_vps || pr->pr_vps == NULL), + ("%s: pr %p pr_vps %p != NULL\n", __func__, pr, pr->pr_vps)); #endif if (pr->pr_root != NULL) @@ -3824,11 +3813,9 @@ #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); -#ifdef ENABLE_VPS SYSCTL_JAIL_PARAM(, vps, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual process space"); #endif -#endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); Index: sys/kern/kern_kthread.c =================================================================== --- sys/kern/kern_kthread.c +++ sys/kern/kern_kthread.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,8 @@ #include #include #include +#include + #include #include @@ -89,7 +92,7 @@ struct thread *td; struct proc *p2; - if (!V_vproc0->p_stats) + if (!proc0.p_stats) panic("kproc_create called too soon"); bzero(&fr, sizeof(fr)); @@ -160,7 +163,26 @@ */ sx_xlock(&V_proctree_lock); PROC_LOCK(p); - proc_reparent(p, V_initproc); +#ifdef VIMAGE + /* + * In the VIMAGE case if the kproc is our virtual "swapper" + * do not reparent it to our init as otherwise it would create + * a circle and never go away. 
Let the parent vps reap it + * as it was setup. And it needs to be the init there and + * not the swapper(kernel). + */ + if (!IS_DEFAULT_VPS(TD_TO_VPS(FIRST_THREAD_IN_PROC(p))) && + p->p_pid == 0) { + struct proc *init0; + + CURVPS_SET_QUIET(vps0) + init0 = V_initproc; + CURVPS_RESTORE(); + + proc_reparent(p, init0); + } else +#endif + proc_reparent(p, V_initproc); PROC_UNLOCK(p); sx_xunlock(&V_proctree_lock); @@ -260,7 +282,7 @@ va_list ap; struct thread *newtd, *oldtd; - if (!V_vproc0->p_stats) + if (!proc0.p_stats) panic("kthread_add called too soon"); /* If no process supplied, put it on proc0 */ @@ -288,6 +310,9 @@ TSTHREAD(newtd, newtd->td_name); newtd->td_proc = p; /* needed for cpu_copy_thread */ +#ifdef VIMAGE + newtd->td_vps = TD_TO_VPS(oldtd); +#endif /* might be further optimized for kthread */ cpu_copy_thread(newtd, oldtd); /* put the designated function(arg) as the resume context */ Index: sys/kern/kern_proc.c =================================================================== --- sys/kern/kern_proc.c +++ sys/kern/kern_proc.c @@ -196,12 +196,36 @@ LIST_INIT(&V_zombproc); V_pidhashtbl = hashinit(maxproc / 4, M_PROC, &V_pidhash); V_pgrphashtbl = hashinit(maxproc / 4, M_PROC, &V_pgrphash); - proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), - proc_ctor, proc_dtor, proc_init, proc_fini, - UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + if (IS_DEFAULT_VPS(curvps)) { + proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), + proc_ctor, proc_dtor, proc_init, proc_fini, + UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + } } VPS_SYSINIT(procinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procinit, NULL); +#ifdef VIMAGE +static void +procdestroy(void *ident __unused) +{ + + KASSERT((LIST_EMPTY(&V_allproc)), ("%s: list allproc %p not empty\n", + __func__, &V_allproc)); + KASSERT((LIST_EMPTY(&V_zombproc)), ("%s: list zombproc %p not empty\n", + __func__, &V_zombproc)); + + /* proc_zone */ + hashdestroy(V_pgrphashtbl, M_PROC, V_pgrphash); + hashdestroy(V_pidhashtbl, M_PROC, V_pidhash); + + 
mtx_destroy(&V_ppeers_lock); + sx_destroy(&V_proctree_lock); + sx_destroy(&V_allproc_lock); +} +VPS_SYSUNINIT(procdestroy, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procdestroy, + NULL); +#endif + /* * Prepare a proc for use. */ Index: sys/kern/kern_prot.c =================================================================== --- sys/kern/kern_prot.c +++ sys/kern/kern_prot.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -1860,8 +1861,10 @@ crfree(struct ucred *cr) { - KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); - KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); + KASSERT(cr->cr_ref > 0, ("%s: bad ucred %p refcount: %d", + __func__, cr, cr->cr_ref)); + KASSERT(cr->cr_ref != 0xdeadc0de, + ("%s: dangling reference to ucred %p", __func__, cr)); if (refcount_release(&cr->cr_ref)) { /* * Some callers of crget(), such as nfs_statfs(), Index: sys/kern/kern_resource.c =================================================================== --- sys/kern/kern_resource.c +++ sys/kern/kern_resource.c @@ -1236,6 +1236,27 @@ VPS_SYSINIT(uihashinit_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit_vps, NULL); +#ifdef VIMAGE +static void +uihashdestroy_vps(void *ident __unused) +{ + struct uidinfo *uip; + struct uihashhead *uih; + int i; + + i = 0; + for (uih = &V_uihashtbl[V_uihash]; uih >= V_uihashtbl; uih--) + LIST_FOREACH(uip, uih, ui_hash) + i++; + if (i == 0) + hashdestroy(V_uihashtbl, M_UIDINFO, V_uihash); + else + printf("%s: leaking %d uihash entries\n", __func__, i); +} +VPS_SYSUNINIT(uihashdestroy_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND, + uihashdestroy_vps, NULL); +#endif + /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_lock must be locked. 
@@ -1406,7 +1427,8 @@ return (0); } } else if (new < 0) - printf("negative %s for uid = %d\n", name, uip->ui_uid); + printf("%s: curthread %p uip %p negative %s for uid = %d\n", + __func__, curthread, uip, name, uip->ui_uid); return (1); } Index: sys/kern/kern_shutdown.c =================================================================== --- sys/kern/kern_shutdown.c +++ sys/kern/kern_shutdown.c @@ -204,6 +204,10 @@ int dumping; /* system is dumping */ int rebooting; /* system is rebooting */ +#ifdef VIMAGE +VPS_DEFINE(int, vrebooting); /* vps is rebooting */ +#define V_vrebooting VPS(vrebooting) +#endif static struct dumperinfo dumper; /* our selected dumper */ /* Context information for dump-debuggers. */ @@ -276,29 +280,42 @@ if (error == 0) { if (uap->opt & RB_REROOT) error = kern_reroot(); - else + else { +#ifdef VIMAGE + /* XXX Can argue that we should never make it here. */ + /* Init will want to _exit() in this case. */ + if (!IS_DEFAULT_VPS(TD_TO_VPS(curthread))) { + V_vrebooting = 1; + return (error); + } +#endif kern_reboot(uap->opt); + } } return (error); } +static VPS_DEFINE(int, vhowto); +#define V_vhowto VPS(vhowto) + static void shutdown_nice_task_fn(void *arg, int pending __unused) { - int howto; - howto = (uintptr_t)arg; + CURVPS_SET((struct vps *)arg); /* Send a signal to init(8) and have it shutdown the world. 
*/ PROC_LOCK(V_initproc); - if (howto & RB_POWEROFF) + if (V_vhowto & RB_POWEROFF) kern_psignal(V_initproc, SIGUSR2); - else if (howto & RB_POWERCYCLE) + else if (V_vhowto & RB_POWERCYCLE) kern_psignal(V_initproc, SIGWINCH); - else if (howto & RB_HALT) + else if (V_vhowto & RB_HALT) kern_psignal(V_initproc, SIGUSR1); else kern_psignal(V_initproc, SIGINT); PROC_UNLOCK(V_initproc); + V_vhowto = 0; + CURVPS_RESTORE(); } static struct task shutdown_nice_task = TASK_INITIALIZER(0, @@ -312,9 +329,21 @@ { if (V_initproc != NULL && !SCHEDULER_STOPPED()) { - shutdown_nice_task.ta_context = (void *)(uintptr_t)howto; + + KASSERT(V_vhowto == 0, ("%s: vps %p howto not 0: %d\n", + __func__, curvps, V_vhowto)); + V_vhowto = howto; + shutdown_nice_task.ta_context = (void *)curvps; taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task); } else { +#ifdef VIMAGE + /* XXX Can argue that we should never make it here. */ + /* Init will want to _exit() in this case. */ + if (!IS_DEFAULT_VPS(TD_TO_VPS(curthread))) { + V_vrebooting = 1; + return; + } +#endif /* * No init(8) running, or scheduler would not allow it * to run, so simply reboot. 
Index: sys/kern/kern_sysctl.c =================================================================== --- sys/kern/kern_sysctl.c +++ sys/kern/kern_sysctl.c @@ -2130,6 +2130,7 @@ memlocked = 1; sx_xlock(&sysctlmemlock); } + CURVPS_SET(TD_TO_VPS(td)); CURVNET_SET(TD_TO_VNET(td)); for (;;) { @@ -2142,6 +2143,7 @@ } CURVNET_RESTORE(); + CURVPS_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); Index: sys/kern/kern_thr.c =================================================================== --- sys/kern/kern_thr.c +++ sys/kern/kern_thr.c @@ -32,6 +32,7 @@ #include "opt_posix.h" #include "opt_hwpmc_hooks.h" #include +#include #include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #ifdef HWPMC_HOOKS #include #endif @@ -238,6 +240,9 @@ bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; +#ifdef VIMAGE + newtd->td_vps = TD_TO_VPS(td); +#endif newtd->td_rb_list = newtd->td_rbp_list = newtd->td_rb_inact = 0; thread_cow_get(newtd, td); Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -58,6 +58,9 @@ #ifdef HWPMC_HOOKS #include #endif +#ifdef VIMAGE +#include +#endif #include @@ -451,6 +454,10 @@ PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); +#ifdef VIMAGE + /* Make sure the cached vps stays correct. */ + newtd->td_vps = p->p_ucred->cr_prison->pr_vps; +#endif newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } @@ -460,6 +467,10 @@ { newtd->td_ucred = crhold(td->td_ucred); +#ifdef VIMAGE + /* Make sure to inherit the cached vps as well. */ + newtd->td_vps = td->td_vps; +#endif newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } @@ -489,6 +500,11 @@ oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } +#ifdef VIMAGE + /* Make sure the cached vps stays correct. 
*/ + if (td->td_vps != p->p_ucred->cr_prison->pr_vps) + td->td_vps = p->p_ucred->cr_prison->pr_vps; +#endif if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); Index: sys/kern/kern_vps.c =================================================================== --- sys/kern/kern_vps.c +++ sys/kern/kern_vps.c @@ -276,7 +276,14 @@ SDT_PROBE2(vps, functions, vps_destroy, entry, __LINE__, vps); VPS_LIST_WLOCK(); + if (vps->vps_le.le_prev == NULL && vps->vps_le.le_next == NULL) { + VPS_LIST_WUNLOCK(); + DELAY(10000); + return; + } LIST_REMOVE(vps, vps_le); + vps->vps_le.le_prev = NULL; + vps->vps_le.le_next = NULL; VPS_LIST_WUNLOCK(); CURVPS_SET_QUIET(vps); Index: sys/kern/subr_pcpu.c =================================================================== --- sys/kern/subr_pcpu.c +++ sys/kern/subr_pcpu.c @@ -378,6 +378,7 @@ #ifdef VIMAGE db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); + db_printf("curvps = %p\n", pc->pc_curthread->td_vps); #endif #ifdef WITNESS Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -1065,6 +1065,7 @@ int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); +int fork_findpid(int); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry, Index: sys/vm/vm_swapout.c =================================================================== --- sys/vm/vm_swapout.c +++ sys/vm/vm_swapout.c @@ -656,6 +656,13 @@ int ppri, pri, slptime, swtime; loop: +#ifdef VIMAGE + if (!IS_DEFAULT_VPS(curvps) && V_vpsdying > 0) { + V_vproc0 = NULL; + return; + } +#endif + if (vm_page_count_min()) { vm_wait_min(); goto loop;