Index: sys/arm/arm/pmap-v6.c =================================================================== --- sys/arm/arm/pmap-v6.c +++ sys/arm/arm/pmap-v6.c @@ -6577,7 +6577,7 @@ int npte2 = 0; int i, j, index; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid || p->p_vmspace == NULL) continue; @@ -6605,7 +6605,7 @@ index = 0; printf("\n"); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (npte2); } pte2p = pmap_pte2(pmap, va); @@ -6632,7 +6632,7 @@ } } } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (npte2); } Index: sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c +++ sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c @@ -1022,11 +1022,11 @@ mutex_enter(pid_mtx); #else pp = p; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); while (pp->p_vmspace == pp->p_pptr->p_vmspace) pp = pp->p_pptr; pid = pp->p_pid; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); pp = NULL; rm_rlock(&fasttrap_tp_lock, &tracker); Index: sys/compat/linprocfs/linprocfs.c =================================================================== --- sys/compat/linprocfs/linprocfs.c +++ sys/compat/linprocfs/linprocfs.c @@ -689,8 +689,8 @@ (int)(averunnable.ldavg[2] / averunnable.fscale), (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), 1, /* number of running tasks */ - nprocs, /* number of tasks */ - lastpid /* the last pid */ + V_nprocs, /* number of tasks */ + V_lastpid /* the last pid */ ); return (0); } @@ -708,10 +708,10 @@ vm_offset_t startcode, startdata; getboottime(&boottime); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); if (p->p_vmspace) { startcode = (vm_offset_t)p->p_vmspace->vm_taddr; startdata = 
(vm_offset_t)p->p_vmspace->vm_daddr; @@ -787,11 +787,11 @@ struct kinfo_proc kp; segsz_t lsize; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* * See comments in linprocfs_doprocstatus() regarding the @@ -825,7 +825,7 @@ l_sigset_t siglist, sigignore, sigcatch; int i; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */ @@ -864,7 +864,7 @@ } fill_kinfo_proc(p, &kp); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */ sbuf_printf(sb, "State:\t%s\n", state); Index: sys/compat/linux/linux_file.c =================================================================== --- sys/compat/linux/linux_file.c +++ sys/compat/linux/linux_file.c @@ -149,17 +149,17 @@ fdrop(fp, td); goto done; } - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); if (SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* XXXPJD: Verify if TIOCSCTTY is allowed. */ (void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred, td); } else { PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } fdrop(fp, td); } Index: sys/compat/linux/linux_fork.c =================================================================== --- sys/compat/linux/linux_fork.c +++ sys/compat/linux/linux_fork.c @@ -233,11 +233,11 @@ * the same as that of the calling process. 
*/ if (args->flags & LINUX_CLONE_PARENT) { - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PROC_LOCK(p2); proc_reparent(p2, td->td_proc->p_pptr); PROC_UNLOCK(p2); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } #ifdef DEBUG Index: sys/compat/linux/linux_misc.c =================================================================== --- sys/compat/linux/linux_misc.c +++ sys/compat/linux/linux_misc.c @@ -181,7 +181,7 @@ sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; - sysinfo.procs = nprocs; + sysinfo.procs = V_nprocs; /* The following are only present in newer Linux kernels. */ sysinfo.totalbig = 0; Index: sys/compat/linuxkpi/common/src/linux_current.c =================================================================== --- sys/compat/linuxkpi/common/src/linux_current.c +++ sys/compat/linuxkpi/common/src/linux_current.c @@ -226,22 +226,29 @@ static void linux_current_uninit(void *arg __unused) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; struct task_struct *ts; struct thread *td; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - FOREACH_THREAD_IN_PROC(p, td) { - if ((ts = td->td_lkpi_task) != NULL) { - td->td_lkpi_task = NULL; - put_task_struct(ts); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + FOREACH_THREAD_IN_PROC(p, td) { + if ((ts = td->td_lkpi_task) != NULL) { + td->td_lkpi_task = NULL; + put_task_struct(ts); + } } + PROC_UNLOCK(p); } - PROC_UNLOCK(p); + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); EVENTHANDLER_DEREGISTER(thread_dtor, linuxkpi_thread_dtor_tag); } Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -3852,6 +3852,7 @@ kern/kern_tslog.c optional tslog kern/kern_umtx.c standard kern/kern_uuid.c standard +kern/kern_vps.c optional vimage 
kern/kern_xxx.c standard kern/link_elf.c standard kern/linker_if.m standard Index: sys/ddb/db_command.c =================================================================== --- sys/ddb/db_command.c +++ sys/ddb/db_command.c @@ -693,11 +693,12 @@ * Find the process in question. allproc_lock is not needed * since we're in DDB. */ - /* sx_slock(&allproc_lock); */ + /* Operate on current vps instance only. */ + /* sx_slock(&V_allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) if (p->p_pid == pid) break; - /* sx_sunlock(&allproc_lock); */ + /* sx_sunlock(&V_allproc_lock); */ if (p == NULL) DB_ERROR(("Can't find process with pid %ld\n", (long) pid)); @@ -875,12 +876,26 @@ } } +static void +_db_stack_trace_all_v(bool active_only) +{ + VPS_ITERATOR_DECL(vps_iter); + + /* VPS_LIST_RLOCK(); */ + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + _db_stack_trace_all(active_only); + CURVPS_RESTORE(); + } + /* VPS_LIST_RUNLOCK(); */ +} + static void db_stack_trace_active(db_expr_t dummy, bool dummy2, db_expr_t dummy3, char *dummy4) { - _db_stack_trace_all(true); + _db_stack_trace_all_v(true); } static void @@ -888,7 +903,7 @@ char *dummy4) { - _db_stack_trace_all(false); + _db_stack_trace_all_v(false); } /* Index: sys/ddb/db_expr.c =================================================================== --- sys/ddb/db_expr.c +++ sys/ddb/db_expr.c @@ -58,7 +58,8 @@ if (t == tIDENT) { if (!db_value_of_name(db_tok_string, valuep) && !db_value_of_name_pcpu(db_tok_string, valuep) && - !db_value_of_name_vnet(db_tok_string, valuep)) { + !db_value_of_name_vnet(db_tok_string, valuep) && + !db_value_of_name_vps(db_tok_string, valuep)) { db_printf("Symbol '%s' not found\n", db_tok_string); db_error(NULL); /*NOTREACHED*/ Index: sys/ddb/db_ps.c =================================================================== --- sys/ddb/db_ps.c +++ sys/ddb/db_ps.c @@ -90,10 +90,11 @@ char state[9]; int np, rflag, sflag, dflag, lflag, wflag; - np = nprocs; + np = V_nprocs; - if (!LIST_EMPTY(&allproc)) - p = 
LIST_FIRST(&allproc); + /* Operate on current vps instance only. */ + if (!LIST_EMPTY(&V_allproc)) + p = LIST_FIRST(&V_allproc); else p = &proc0; @@ -217,8 +218,9 @@ p = LIST_NEXT(p, p_list); if (p == NULL && np > 0) - p = LIST_FIRST(&zombproc); + p = LIST_FIRST(&V_zombproc); } + db_printf("nprocs = %d, np = %d\n", V_nprocs, np); } static void @@ -397,6 +399,9 @@ db_printf(" last involuntary switch: %d ms ago\n", 1000 * delta / hz); } +#ifdef VIMAGE + db_printf(" vnet: %p vps: %p\n", td->td_vnet, td->td_vps); +#endif } DB_SHOW_COMMAND(proc, db_show_proc) @@ -475,6 +480,7 @@ db_findstack_cmd(db_expr_t addr, bool have_addr, db_expr_t dummy3 __unused, char *dummy4 __unused) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; struct thread *td; struct kstack_cache_entry *ks_ce; @@ -487,15 +493,22 @@ return; } - FOREACH_PROC_IN_SYSTEM(p) { - FOREACH_THREAD_IN_PROC(p, td) { - if (td->td_kstack <= saddr && saddr < td->td_kstack + - PAGE_SIZE * td->td_kstack_pages) { - db_printf("Thread %p\n", td); - return; + /* VPS_LIST_RLOCK(); */ + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + FOREACH_PROC_IN_SYSTEM(p) { + FOREACH_THREAD_IN_PROC(p, td) { + if (td->td_kstack <= saddr && + saddr < td->td_kstack + + PAGE_SIZE * td->td_kstack_pages) { + db_printf("Thread %p\n", td); + return; + } } } + CURVPS_RESTORE(); } + /* VPS_LIST_RUNLOCK(); */ for (ks_ce = kstack_cache; ks_ce != NULL; ks_ce = ks_ce->next_ks_entry) { Index: sys/ddb/db_sym.c =================================================================== --- sys/ddb/db_sym.c +++ sys/ddb/db_sym.c @@ -37,8 +37,10 @@ #include #include +#include #include #include +#include #include @@ -69,6 +71,7 @@ #ifdef VIMAGE static void *db_vnet = NULL; +static void *db_vps = NULL; #endif /* @@ -168,6 +171,53 @@ return (0); } } + +/* + * Validate the virtual process space pointer used to interpret per-vps global + * variable expansion. Right now we don't do much here, really we should + * walk the global vps list to check it's an OK pointer. 
+ */ +int +db_var_db_vps(struct db_variable *vp, db_expr_t *valuep, int op) +{ + + switch (op) { + case DB_VAR_GET: + *valuep = (db_expr_t)db_vps; + return (1); + + case DB_VAR_SET: + db_vps = *(void **)valuep; + return (1); + + default: + db_printf("db_var_db_vps: unknown operation\n"); + return (0); + } +} + +/* + * Read-only variable reporting the current vps, which is what we use when + * db_vps is set to NULL. + */ +int +db_var_curvps(struct db_variable *vp, db_expr_t *valuep, int op) +{ + + switch (op) { + case DB_VAR_GET: + *valuep = (db_expr_t)curvps; + return (1); + + case DB_VAR_SET: + db_printf("Read-only variable.\n"); + return (0); + + default: + db_printf("db_var_curvps: unknown operation\n"); + return (0); + } +} #endif /* @@ -278,6 +328,33 @@ #endif } +bool +db_value_of_name_vps(const char *name, db_expr_t *valuep) +{ +#ifdef VIMAGE + static char tmp[256]; + db_expr_t value; + c_db_sym_t sym; + struct vps *vps; + + if (db_vps != NULL) + vps = db_vps; + else + vps = curvps; + snprintf(tmp, sizeof(tmp), "vps_entry_%s", name); + sym = db_lookup(tmp); + if (sym == C_DB_SYM_NULL) + return (false); + db_symbol_values(sym, &name, &value); + if (value < VPS_START || value >= VPS_STOP) + return (false); + *valuep = (db_expr_t)((uintptr_t)value + vps->vps_data_base); + return (true); +#else + return (false); +#endif +} + /* * Lookup a symbol. * If the symbol has a qualifier (e.g., ux:vm_map), Index: sys/ddb/db_thread.c =================================================================== --- sys/ddb/db_thread.c +++ sys/ddb/db_thread.c @@ -135,11 +135,12 @@ if (td != NULL) return (td); if (check_pid) { + /* Operate on current vps instance only. 
*/ FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid == decaddr) return (FIRST_THREAD_IN_PROC(p)); } - LIST_FOREACH(p, &zombproc, p_list) { + LIST_FOREACH(p, &V_zombproc, p_list) { if (p->p_pid == decaddr) return (FIRST_THREAD_IN_PROC(p)); } @@ -161,11 +162,12 @@ decaddr = db_hex2dec(addr); if (decaddr != -1) { + /* Operate on current vps instance only. */ FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid == decaddr) return (p); } - LIST_FOREACH(p, &zombproc, p_list) { + LIST_FOREACH(p, &V_zombproc, p_list) { if (p->p_pid == decaddr) return (p); } Index: sys/ddb/db_variables.h =================================================================== --- sys/ddb/db_variables.h +++ sys/ddb/db_variables.h @@ -56,8 +56,10 @@ extern db_varfcn_t db_var_curcpu; /* DPCPU default CPU */ extern db_varfcn_t db_var_curvnet; /* Default vnet */ +extern db_varfcn_t db_var_curvps; /* Default vps */ extern db_varfcn_t db_var_db_cpu; /* DPCPU active CPU */ extern db_varfcn_t db_var_db_vnet; /* Active vnet */ +extern db_varfcn_t db_var_db_vps; /* Active vps */ int db_read_variable(struct db_variable *, db_expr_t *); int db_write_variable(struct db_variable *, db_expr_t); Index: sys/ddb/db_variables.c =================================================================== --- sys/ddb/db_variables.c +++ sys/ddb/db_variables.c @@ -53,6 +53,8 @@ #ifdef VIMAGE { "curvnet", NULL, db_var_curvnet }, { "db_vnet", NULL, db_var_db_vnet }, + { "curvps", NULL, db_var_curvps }, + { "db_vps", NULL, db_var_db_vps }, #endif }; static struct db_variable *db_evars = db_vars + nitems(db_vars); Index: sys/ddb/ddb.h =================================================================== --- sys/ddb/ddb.h +++ sys/ddb/ddb.h @@ -229,6 +229,7 @@ bool db_value_of_name(const char *name, db_expr_t *valuep); bool db_value_of_name_pcpu(const char *name, db_expr_t *valuep); bool db_value_of_name_vnet(const char *name, db_expr_t *valuep); +bool db_value_of_name_vps(const char *name, db_expr_t *valuep); int db_write_bytes(vm_offset_t addr, size_t 
size, char *data); void db_command_register(struct command_table *, struct command *); void db_command_unregister(struct command_table *, struct command *); Index: sys/dev/filemon/filemon.c =================================================================== --- sys/dev/filemon/filemon.c +++ sys/dev/filemon/filemon.c @@ -210,6 +210,7 @@ static void filemon_untrack_processes(struct filemon *filemon) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; sx_assert(&filemon->lock, SA_XLOCKED); @@ -223,18 +224,24 @@ * filemon_event_process_exit() will lock on filemon->lock * which we hold. */ - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - /* - * No PROC_LOCK is needed to compare here since it is - * guaranteed to not change since we have its filemon - * locked. Everything that changes this p_filemon will - * be locked on it. - */ - if (p->p_filemon == filemon) - filemon_proc_drop(p); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + /* + * No PROC_LOCK is needed to compare here since it is + * guaranteed to not change since we have its filemon + * locked. Everything that changes this p_filemon will + * be locked on it. + */ + if (p->p_filemon == filemon) + filemon_proc_drop(p); + } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); /* * It's possible some references were acquired but will be Index: sys/dev/hwpmc/hwpmc_mod.c =================================================================== --- sys/dev/hwpmc/hwpmc_mod.c +++ sys/dev/hwpmc/hwpmc_mod.c @@ -1203,7 +1203,7 @@ * this PMC. */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); top = p; @@ -1227,7 +1227,7 @@ (void) pmc_detach_process(top, pm); done: - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return error; } @@ -1312,7 +1312,7 @@ * partially attached proc tree. 
*/ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); top = p; @@ -1333,7 +1333,7 @@ } done: - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); if (LIST_EMPTY(&pm->pm_targets)) pm->pm_flags &= ~PMC_F_ATTACH_DONE; @@ -2025,7 +2025,7 @@ PROC_UNLOCK(p); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); top = p; @@ -2044,7 +2044,7 @@ } } done: - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } /* @@ -5364,6 +5364,7 @@ static void pmc_process_allproc(struct pmc *pm) { + VPS_ITERATOR_DECL(vps_iter); struct pmc_owner *po; struct thread *td; struct proc *p; @@ -5371,15 +5372,22 @@ po = pm->pm_owner; if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) return; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - pmclog_process_proccreate(po, p, 0 /* sync */); - PROC_LOCK(p); - FOREACH_THREAD_IN_PROC(p, td) - pmclog_process_threadcreate(po, td, 0 /* sync */); - PROC_UNLOCK(p); + + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + pmclog_process_proccreate(po, p, 0 /* sync */); + PROC_LOCK(p); + FOREACH_THREAD_IN_PROC(p, td) + pmclog_process_threadcreate(po, td, 0 /* sync */); + PROC_UNLOCK(p); + } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); pmclog_flush(po, 0); } Index: sys/fs/devfs/devfs_vnops.c =================================================================== --- sys/fs/devfs/devfs_vnops.c +++ sys/fs/devfs/devfs_vnops.c @@ -596,7 +596,7 @@ if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); VI_LOCK(vp); @@ -609,7 +609,7 @@ VI_UNLOCK(vp); SESS_UNLOCK(p->p_session); } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (oldvp != NULL) vrele(oldvp); } else @@ -813,9 +813,9 @@ if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty 
*/ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (td->td_proc->p_session->s_ttyvp == vp) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (0); } @@ -826,7 +826,7 @@ td->td_proc->p_session->s_ttydp = cdev2priv(dev); SESS_UNLOCK(td->td_proc->p_session); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* Get rid of reference to old control tty */ if (vpold) Index: sys/fs/nfs/nfsport.h =================================================================== --- sys/fs/nfs/nfsport.h +++ sys/fs/nfs/nfsport.h @@ -692,8 +692,8 @@ #define NFSUNLOCKMNT(m) mtx_unlock(&((m)->nm_mtx)) #define NFSLOCKREQUEST(r) mtx_lock(&((r)->r_mtx)) #define NFSUNLOCKREQUEST(r) mtx_unlock(&((r)->r_mtx)) -#define NFSPROCLISTLOCK() sx_slock(&allproc_lock) -#define NFSPROCLISTUNLOCK() sx_sunlock(&allproc_lock) +#define NFSPROCLISTLOCK() sx_slock(&V_allproc_lock) +#define NFSPROCLISTUNLOCK() sx_sunlock(&V_allproc_lock) #define NFSLOCKSOCKREQ(r) mtx_lock(&((r)->nr_mtx)) #define NFSUNLOCKSOCKREQ(r) mtx_unlock(&((r)->nr_mtx)) #define NFSLOCKDS(d) mtx_lock(&((d)->nfsclds_mtx)) Index: sys/fs/pseudofs/pseudofs_vnops.c =================================================================== --- sys/fs/pseudofs/pseudofs_vnops.c +++ sys/fs/pseudofs/pseudofs_vnops.c @@ -705,7 +705,7 @@ { int visible; - sx_assert(&allproc_lock, SX_SLOCKED); + sx_assert(&V_allproc_lock, SX_SLOCKED); pfs_assert_owned(pd); again: if (*pn == NULL) { @@ -716,9 +716,14 @@ *pn = (*pn)->pn_next; } if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) { + /* + * Operate on current vps instance only. + * We must not iterate over all vps as duplicate process space + * would not work at all and leak a lot of information. 
+ */ /* next process */ if (*p == NULL) - *p = LIST_FIRST(&allproc); + *p = LIST_FIRST(&V_allproc); else *p = LIST_NEXT(*p, p_list); /* out of processes: next node */ @@ -791,12 +796,12 @@ if (resid == 0) PFS_RETURN (0); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); pfs_lock(pd); /* check if the directory is visible to the caller */ if (!pfs_visible(curthread, pd, pid, true, &proc)) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); pfs_unlock(pd); PFS_RETURN (ENOENT); } @@ -810,7 +815,7 @@ if (proc != NULL) PROC_UNLOCK(proc); pfs_unlock(pd); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); PFS_RETURN (0); } } @@ -860,7 +865,7 @@ if (proc != NULL) PROC_UNLOCK(proc); pfs_unlock(pd); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); i = 0; STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) { if (error == 0) Index: sys/i386/i386/pmap.c =================================================================== --- sys/i386/i386/pmap.c +++ sys/i386/i386/pmap.c @@ -5799,7 +5799,7 @@ int npte = 0; int index; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; @@ -5822,7 +5822,7 @@ index = 0; printf("\n"); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (npte); } pte = pmap_pte(pmap, va); @@ -5847,7 +5847,7 @@ } } } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (npte); } #endif Index: sys/i386/ibcs2/ibcs2_sysvec.c =================================================================== --- sys/i386/ibcs2/ibcs2_sysvec.c +++ sys/i386/ibcs2/ibcs2_sysvec.c @@ -109,6 +109,7 @@ static int ibcs2_modevent(module_t mod, int type, void *unused) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p = NULL; int rval = 0; @@ -117,14 +118,20 @@ break; case MOD_UNLOAD: /* if this was an ELF module we'd use elf_brand_inuse()... 
*/ - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - if (p->p_sysent == &ibcs2_svr3_sysvec) { - rval = EBUSY; - break; + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_sysent == &ibcs2_svr3_sysvec) { + rval = EBUSY; + break; + } } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); break; default: rval = EOPNOTSUPP; Index: sys/kern/imgact_elf.c =================================================================== --- sys/kern/imgact_elf.c +++ sys/kern/imgact_elf.c @@ -238,17 +238,24 @@ int __elfN(brand_inuse)(Elf_Brandinfo *entry) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; int rval = FALSE; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - if (p->p_sysent == entry->sysvec) { - rval = TRUE; - break; + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_sysent == entry->sysvec) { + rval = TRUE; + break; + } } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); return (rval); } @@ -2106,10 +2113,10 @@ KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(elf_kinfo_proc_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); kern_proc_out(p, sb, ELF_KERN_PROC_MASK); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } *sizep = size; } Index: sys/kern/init_main.c =================================================================== --- sys/kern/init_main.c +++ sys/kern/init_main.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,7 @@ #include #include #include +#include #include @@ -103,7 +105,18 @@ struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; -struct proc *initproc; +VPS_DEFINE(struct proc *, 
initproc); + +VPS_DEFINE(struct proc *, vproc0); +#ifdef VIMAGE +/* + * Initialize to -2; after kproc_create() our thread will still be + * forked from thread0 and in the wrong vps. Once that is fixed it will + * see the local copy and not the DEFAULT_VPS one. Make sure we have + * a value that we can spin on until this happens. + */ +VPS_DEFINE(int, vpsdying) = -2; +#endif #ifndef BOOTHOWTO #define BOOTHOWTO 0 @@ -461,9 +474,8 @@ p->p_osrel = osreldate; /* - * Initialize thread and process structures. + * Initialize thread structures. */ - procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* @@ -475,7 +487,8 @@ /* * Create process 0 (the swapper). */ - LIST_INSERT_HEAD(&allproc, p, p_list); + V_vproc0 = p; + LIST_INSERT_HEAD(&V_allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; @@ -511,6 +524,9 @@ td->td_cpuset = cpuset_thread0(); td->td_domain.dr_policy = td->td_cpuset->cs_domain; prison0_init(); +#ifdef VIMAGE + td->td_vps = vps0; +#endif p->p_peers = 0; p->p_leader = p; p->p_reaper = p; @@ -549,7 +565,7 @@ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ - siginit(&proc0); + siginit(V_vproc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); @@ -614,7 +630,184 @@ racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } -SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); +SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_init, NULL); + +#ifdef VIMAGE +static void +vps_swapper(void *dummy __unused) +{ + + /* + * Make sure the surgical changes to V_vproc0 are done before + * entering the long-lasting loop. Otherwise we may start + * acquiring locks and accessing variables based on the wrong + * credential leading to, e.g., panics when trying to unlock a + * lock from a different context which may not be locked. 
+ * When entering the function our credentials might still point + * to the DEFAULT_VPS; see comment for V_vpsdying declaration above. + */ + while (V_vpsdying < 0) + pause("wswvps", hz/2); + + /* + * Now hand over this thread to swapper. + */ + swapper(); + + if (V_vpsdying < 1) + panic("%s: swapper curtd %p ended but V_vpsdying %d\n", + __func__, curthread, V_vpsdying); + + kproc_exit(0); +} + +static void +proc0_init_vps(void *dummy __unused) +{ + struct ucred *newcred, *savecred; + struct thread *td; + struct prison *pr; + struct uidinfo tmpuinfo; + struct loginclass tmplc = { + .lc_name = "", + }; + int error; + + /* vps0 is handled normally in p0init. */ + if (IS_DEFAULT_VPS(curvps)) + return; + + KASSERT((curvps->vps_pr != NULL && curvps != vps0), + ("%s: curvps %p has vps_pr %p or is vps0 %p\n", + __func__, curvps, curvps->vps_pr, vps0)); + KASSERT((curvps == curvps->vps_pr->pr_vps), + ("%s: curvps %p != curvps->vps_pr %p ->pr_vps %p\n", + __func__, curvps, curvps->vps_pr, curvps->vps_pr->pr_vps)); + + /* + * Initialized the non-default VPS version to < 0 so vps_swapper() + * will spin once the credential is changed before all other surgery + * has happened. + */ + V_vpsdying = -1; + + /* + * Default is nprocs = 1 for vps0; need to set it to 0 here as our + * "proc0" and with that initproc are forked and not manually constructed. + */ + V_nprocs = 0; + + /* + * Set lastpid to -1 so that our swapper gets 0. + */ + V_lastpid = -1; + + error = kproc_create(vps_swapper, NULL, &V_vproc0, 0, 0, "vps%u", + curvps->vps_pr->pr_id); + if (error) + panic("%s: cannot create vps %p swapper: %d\n", + __func__, curvps, error); + + /* Create credentials. Copied from proc0. Just using vps_pr. */ + newcred = crget(); + newcred->cr_ngroups = 1; /* group 0 */ + /* A hack to prevent uifind from tripping over NULL pointers. 
*/ + savecred = curthread->td_ucred; + curthread->td_ucred = newcred; + tmpuinfo.ui_uid = 1; + newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo; + newcred->cr_uidinfo = uifind(0); + newcred->cr_ruidinfo = uifind(0); + newcred->cr_loginclass = &tmplc; + newcred->cr_loginclass = loginclass_find("default"); + /* End hack. creds get properly set later with thread_cow_get_proc */ + curthread->td_ucred = savecred; + PROC_LOCK(V_vproc0); + newcred->cr_prison = curvps->vps_pr; + prison_hold(newcred->cr_prison); + /* The kernel process was accounted to thread0's prison. */ + prison_proc_hold(newcred->cr_prison); + prison_proc_free(savecred->cr_prison); + V_vproc0->p_treeflag |= P_TREE_REAPER; + savecred = proc_set_cred(V_vproc0, newcred); + PROC_UNLOCK(V_vproc0); +#ifdef AUDIT + audit_cred_kproc0(newcred); +#endif +#ifdef MAC + mac_cred_create_swapper(newcred); +#endif + crfree(savecred); + + PROC_LOCK(V_vproc0); + td = FIRST_THREAD_IN_PROC(V_vproc0); + thread_cow_get_proc(td, V_vproc0); + PROC_UNLOCK(V_vproc0); + KASSERT(curvps->vps_pr == + FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred->cr_prison, + ("%s:%d: curvps %p vps_pr %p != FTIP(V_vproc0 %p)->td_ucred %p " + "cr_prison %p\n", __func__, __LINE__, + curvps, curvps->vps_pr, V_vproc0, + FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred, + FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred->cr_prison)); + KASSERT(curvps == TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)), + ("%s:%d: curvps %p != TD_TO_VPS(..(V_vproc0 %p)) %p\n", + __func__, __LINE__, + curvps, V_vproc0, TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)))); + + /* Chroot it. 
*/ + td = FIRST_THREAD_IN_PROC(V_vproc0); + pr = curvps->vps_pr; + vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); + if ((error = change_dir(pr->pr_root, td)) != 0) { + printf("%s: td %p change_dir %p failed: %d\n", + __func__, td, pr->pr_root, error); + goto err; + } +#ifdef MAC + if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) { + printf("%s: td %p mac_vnode_check_chroot %p failed: %d\n", + __func__, td, pr->pr_root, error); + goto err; + } +#endif + VOP_UNLOCK(pr->pr_root, 0); + if ((error = pwd_chroot(td, pr->pr_root))) { + printf("%s: td %p pwd_chroot %p failed: %d\n", + __func__, td, pr->pr_root, error); + goto err; + } + + V_vpsdying = 0; + return; + +err: + /* XXX could panic or signal the jail to abort; cannot really stop. */ + return; +} +VPS_SYSINIT(p0init_vps, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_init_vps, NULL); + +static void +proc0_uninit_vps(void *dummy __unused) +{ + + if (IS_DEFAULT_VPS(curvps)) + return; + + /* + * XXX ideally we want to get that state from elsewhere; + * neither prison, nor vps state, .. lends itself though. + */ + V_vpsdying = 1; + wakeup(V_vproc0); + + /* Operate on current vps instance only. */ + while (V_vproc0 != NULL || + !LIST_EMPTY(&V_zombproc) || !LIST_EMPTY(&V_allproc)) + pause("p0uvps", hz/2); +} +VPS_SYSUNINIT(p0uninit_vps, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_uninit_vps, NULL); +#endif /* ARGSUSED*/ static void @@ -628,8 +821,9 @@ /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. + * Operate on vps0 instance only. 
*/ - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -649,7 +843,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); @@ -729,7 +923,8 @@ td = curthread; p = td->td_proc; - vfs_mountroot(); + if (IS_DEFAULT_VPS(curvps)) + vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); @@ -753,8 +948,8 @@ while ((path = strsep(&tmp_init_path, ":")) != NULL) { pathlen = strlen(path) + 1; if (bootverbose) - printf("start_init: trying %s\n", path); - + printf("%s: trying %s\n", __func__, path); + /* * Move out the boot flag argument. */ @@ -839,38 +1034,60 @@ struct thread *td; int error; + KASSERT(curvps == FIRST_THREAD_IN_PROC(V_vproc0)->td_vps, + ("%s: curvps %p != V_vproc0 %p first td %p td_vps %p\n", + __func__, curvps, V_vproc0, FIRST_THREAD_IN_PROC(V_vproc0), + FIRST_THREAD_IN_PROC(V_vproc0)->td_vps)); + KASSERT(curvps == TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)), + ("%s: curvps %p != TD_TO_VPS(..(V_vproc0 %p)) %p\n", + __func__, curvps, V_vproc0, + TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)))); + bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; - fr.fr_procp = &initproc; - error = fork1(&thread0, &fr); + fr.fr_procp = &V_initproc; + td = FIRST_THREAD_IN_PROC(V_vproc0); + error = fork1(td, &fr); if (error) panic("cannot fork init: %d\n", error); - KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); + KASSERT(V_initproc->p_pid == 1, ("%s: initproc->p_pid(%d) != 1", + __func__, V_initproc->p_pid)); + KASSERT(curvps == FIRST_THREAD_IN_PROC(V_initproc)->td_vps, + ("%s: curvps %p != V_initproc %p first td %p td_vps %p\n", + __func__, curvps, V_initproc, FIRST_THREAD_IN_PROC(V_initproc), + FIRST_THREAD_IN_PROC(V_initproc)->td_vps)); + /* divorce init's credentials from the kernel's */ newcred = crget(); - 
sx_xlock(&proctree_lock); - PROC_LOCK(initproc); - initproc->p_flag |= P_SYSTEM | P_INMEM; - initproc->p_treeflag |= P_TREE_REAPER; - oldcred = initproc->p_ucred; + sx_xlock(&V_proctree_lock); + PROC_LOCK(V_initproc); + V_initproc->p_flag |= P_SYSTEM | P_INMEM; + V_initproc->p_treeflag |= P_TREE_REAPER; + oldcred = V_initproc->p_ucred; crcopy(newcred, oldcred); +#ifdef VIMAGE + /* Swap to the correct prison. */ + /* XXX is this really needed or was this related to a V_vproc0 bug? */ + prison_free(newcred->cr_prison); + newcred->cr_prison = curvps->vps_pr; + prison_hold(newcred->cr_prison); +#endif #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif - proc_set_cred(initproc, newcred); - td = FIRST_THREAD_IN_PROC(initproc); - crfree(td->td_ucred); - td->td_ucred = crhold(initproc->p_ucred); - PROC_UNLOCK(initproc); - sx_xunlock(&proctree_lock); + /* This will also update cowgen. */ + proc_set_cred(V_initproc, newcred); + PROC_UNLOCK(V_initproc); + sx_xunlock(&V_proctree_lock); crfree(oldcred); - cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), + + cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(V_initproc), start_init, NULL); } -SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); +VPS_SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. 
@@ -880,10 +1097,49 @@ { struct thread *td; - td = FIRST_THREAD_IN_PROC(initproc); + td = FIRST_THREAD_IN_PROC(V_initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); thread_unlock(td); } -SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); +VPS_SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); + +#ifdef VIMAGE +static void +reapinit(void *ident __unused) +{ + struct proc *p, *p2; + + while (V_nprocs > 2) { + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_pid <= 1) + continue; + PROC_LOCK(p); + kern_psignal(p, SIGKILL); + PROC_UNLOCK(p); + } + sx_sunlock(&V_allproc_lock); + pause("reapin1t", hz/2); + } + + /* Operate on current vps instance only. */ + sx_xlock(&V_proctree_lock); + LIST_FOREACH_SAFE(p, &V_zombproc, p_list, p2) { + PROC_LOCK(p); + proc_reap(FIRST_THREAD_IN_PROC(V_vproc0), p, NULL, 0); + sx_xlock(&V_proctree_lock); + } + sx_xunlock(&V_proctree_lock); + + while (V_nprocs > 1) + pause("reapinit", hz/2); + + /* Only our "swapper" left. */ + KASSERT(V_nprocs == 1, ("%s: vps %p V_nprocs %d != 1", + __func__, curvps, V_nprocs)); +} +/* Run very first. */ +VPS_SYSUNINIT(reapinit, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, reapinit, NULL); +#endif Index: sys/kern/kern_acct.c =================================================================== --- sys/kern/kern_acct.c +++ sys/kern/kern_acct.c @@ -378,7 +378,7 @@ * Get process accounting information. 
*/ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); /* (1) The terminal from which the process was started */ @@ -386,7 +386,7 @@ acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp); else acct.ac_tty = NODEV; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* (2) The name of the command that ran */ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); Index: sys/kern/kern_clock.c =================================================================== --- sys/kern/kern_clock.c +++ sys/kern/kern_clock.c @@ -184,12 +184,78 @@ static int blktime_threshold = 900; static int sleepfreq = 3; +static __inline void +_deadlres_td_on_lock(struct proc *p, struct thread *td, int blkticks) +{ + int tticks; + + /* + * The thread should be blocked on a turnstile, simply check + * if the turnstile channel is in good state. + */ + MPASS(td->td_blocked != NULL); + + tticks = ticks - td->td_blktick; + thread_unlock(td); + if (tticks > blkticks) { + + /* + * Accordingly with provided thresholds, this thread is stuck + * for too long on a turnstile. + */ + PROC_UNLOCK(p); + sx_sunlock(&V_allproc_lock); + panic("%s: possible deadlock detected for %p, " + "blocked for %d ticks\n", __func__, td, tticks); + } +} + +static __inline void +_deadlres_td_sleep_q(struct proc *p, struct thread *td, int slpticks) +{ + void *wchan; + int i, slptype, tryl, tticks; + + /* + * Check if the thread is sleeping on a lock, otherwise skip the check. + * Drop the thread lock in order to avoid a LOR with the sleepqueue + * spinlock. + */ + wchan = td->td_wchan; + tticks = ticks - td->td_slptick; + thread_unlock(td); + slptype = sleepq_type(wchan); + if ((slptype == SLEEPQ_SX || slptype == SLEEPQ_LK) && + tticks > slpticks) { + + /* + * Accordingly with provided thresholds, this thread is stuck + * for too long on a sleepqueue. + * However, being on a sleepqueue, we might still check for the + * blessed list. 
+ */ + tryl = 0; + for (i = 0; blessed[i] != NULL; i++) { + if (!strcmp(blessed[i], td->td_wmesg)) { + tryl = 1; + break; + } + } + if (tryl != 0) + return; + PROC_UNLOCK(p); + sx_sunlock(&V_allproc_lock); + panic("%s: possible deadlock detected for %p, " + "blocked for %d ticks\n", __func__, td, tticks); + } +} + static void deadlres_td_on_lock(struct proc *p, struct thread *td, int blkticks) { int tticks; - sx_assert(&allproc_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* @@ -214,7 +280,7 @@ void *wchan; int i, slptype, tticks; - sx_assert(&allproc_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* @@ -246,6 +312,7 @@ static void deadlkres(void) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; struct thread *td; int blkticks, slpticks, tryl; @@ -255,41 +322,49 @@ blkticks = blktime_threshold * hz; slpticks = slptime_threshold * hz; - /* - * Avoid to sleep on the sx_lock in order to avoid a - * possible priority inversion problem leading to - * starvation. - * If the lock can't be held after 100 tries, panic. - */ - if (!sx_try_slock(&allproc_lock)) { - if (tryl > 100) - panic("%s: possible deadlock detected " - "on allproc_lock\n", __func__); - tryl++; - pause("allproc", sleepfreq * hz); - continue; - } - tryl = 0; - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_state == PRS_NEW) { - PROC_UNLOCK(p); - continue; + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { +again: + CURVPS_SET_QUIET(vps_iter); + /* + * Avoid to sleep on the sx_lock in order to avoid a + * possible priority inversion problem leading to + * starvation. + * If the lock can't be held after 100 tries, panic. 
+ */ + if (!sx_try_slock(&V_allproc_lock)) { + if (tryl > 100) + panic("%s: possible deadlock detected " + "on allproc_lock\n", __func__); + tryl++; + CURVPS_RESTORE(); + pause("allproc", sleepfreq * hz); + goto again; } - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - if (TD_ON_LOCK(td)) - deadlres_td_on_lock(p, td, - blkticks); - else if (TD_IS_SLEEPING(td) && - TD_ON_SLEEPQ(td)) - deadlres_td_sleep_q(p, td, - slpticks); - thread_unlock(td); + tryl = 0; + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW) { + PROC_UNLOCK(p); + continue; + } + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + if (TD_ON_LOCK(td)) + deadlres_td_on_lock(p, td, + blkticks); + else if (TD_IS_SLEEPING(td) && + TD_ON_SLEEPQ(td)) + deadlres_td_sleep_q(p, td, + slpticks); + thread_unlock(td); + } + PROC_UNLOCK(p); } - PROC_UNLOCK(p); + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); /* Sleep for sleepfreq seconds. */ pause("-", sleepfreq * hz); Index: sys/kern/kern_cpuset.c =================================================================== --- sys/kern/kern_cpuset.c +++ sys/kern/kern_cpuset.c @@ -510,24 +510,32 @@ static void domainset_notify(void) { + VPS_ITERATOR_DECL(vps_iter); struct thread *td; struct proc *p; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_state == PRS_NEW) { + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW) { + PROC_UNLOCK(p); + continue; + } + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + td->td_domain.dr_policy = + td->td_cpuset->cs_domain; + thread_unlock(td); + } PROC_UNLOCK(p); - continue; } - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - td->td_domain.dr_policy = td->td_cpuset->cs_domain; - thread_unlock(td); - } - PROC_UNLOCK(p); + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - 
sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); kernel_object->domain.dr_policy = cpuset_kernel->cs_domain; } Index: sys/kern/kern_descrip.c =================================================================== --- sys/kern/kern_descrip.c +++ sys/kern/kern_descrip.c @@ -1063,7 +1063,7 @@ sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { @@ -1131,14 +1131,14 @@ sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); return (ret); @@ -3190,6 +3190,7 @@ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { + VPS_ITERATOR_DECL(vps_iter); struct filedesc *fdp; struct prison *pr; struct proc *p; @@ -3198,33 +3199,40 @@ if (vrefcnt(olddp) == 1) return; nrele = 0; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - fdp = fdhold(p); - PROC_UNLOCK(p); - if (fdp == NULL) - continue; - FILEDESC_XLOCK(fdp); - if (fdp->fd_cdir == olddp) { - vrefact(newdp); - fdp->fd_cdir = newdp; - nrele++; - } - if (fdp->fd_rdir == olddp) { - vrefact(newdp); - fdp->fd_rdir = newdp; - nrele++; - } - if (fdp->fd_jdir == olddp) { - vrefact(newdp); - fdp->fd_jdir = newdp; - nrele++; + + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + continue; + FILEDESC_XLOCK(fdp); + if (fdp->fd_cdir == olddp) { + vrefact(newdp); + fdp->fd_cdir = newdp; + nrele++; + } + if (fdp->fd_rdir == olddp) { + vrefact(newdp); + fdp->fd_rdir = newdp; + nrele++; + } + if (fdp->fd_jdir == olddp) { + vrefact(newdp); + fdp->fd_jdir = newdp; + nrele++; + } + FILEDESC_XUNLOCK(fdp); + 
fddrop(fdp); } - FILEDESC_XUNLOCK(fdp); - fddrop(fdp); + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; @@ -3307,6 +3315,7 @@ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { + VPS_ITERATOR_DECL(vps_iter); struct xfile xf; struct filedesc *fdp; struct file *fp; @@ -3318,68 +3327,82 @@ return (error); if (req->oldptr == NULL) { n = 0; - sx_slock(&allproc_lock); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state == PRS_NEW) { + PROC_UNLOCK(p); + continue; + } + fdp = fdhold(p); + PROC_UNLOCK(p); + if (fdp == NULL) + continue; + /* overestimates sparse tables. */ + if (fdp->fd_lastfile > 0) + n += fdp->fd_lastfile; + fddrop(fdp); + } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); + } + VPS_LIST_RUNLOCK(); + return (SYSCTL_OUT(req, 0, n * sizeof(xf))); + } + error = 0; + bzero(&xf, sizeof(xf)); + xf.xf_size = sizeof(xf); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } + if (p_cansee(req->td, p) != 0) { + PROC_UNLOCK(p); + continue; + } + xf.xf_pid = p->p_pid; + xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; - /* overestimates sparse tables. 
*/ - if (fdp->fd_lastfile > 0) - n += fdp->fd_lastfile; + FILEDESC_SLOCK(fdp); + for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { + if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) + continue; + xf.xf_fd = n; + xf.xf_file = (kvaddr_t)(uintptr_t)fp; + xf.xf_data = (kvaddr_t)(uintptr_t)fp->f_data; + xf.xf_vnode = (kvaddr_t)(uintptr_t)fp->f_vnode; + xf.xf_type = (kvaddr_t)(uintptr_t)fp->f_type; + xf.xf_count = fp->f_count; + xf.xf_msgcount = 0; + xf.xf_offset = foffset_get(fp); + xf.xf_flag = fp->f_flag; + error = SYSCTL_OUT(req, &xf, sizeof(xf)); + if (error) + break; + } + FILEDESC_SUNLOCK(fdp); fddrop(fdp); - } - sx_sunlock(&allproc_lock); - return (SYSCTL_OUT(req, 0, n * sizeof(xf))); - } - error = 0; - bzero(&xf, sizeof(xf)); - xf.xf_size = sizeof(xf); - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_state == PRS_NEW) { - PROC_UNLOCK(p); - continue; - } - if (p_cansee(req->td, p) != 0) { - PROC_UNLOCK(p); - continue; - } - xf.xf_pid = p->p_pid; - xf.xf_uid = p->p_ucred->cr_uid; - fdp = fdhold(p); - PROC_UNLOCK(p); - if (fdp == NULL) - continue; - FILEDESC_SLOCK(fdp); - for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { - if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) - continue; - xf.xf_fd = n; - xf.xf_file = (kvaddr_t)(uintptr_t)fp; - xf.xf_data = (kvaddr_t)(uintptr_t)fp->f_data; - xf.xf_vnode = (kvaddr_t)(uintptr_t)fp->f_vnode; - xf.xf_type = (kvaddr_t)(uintptr_t)fp->f_type; - xf.xf_count = fp->f_count; - xf.xf_msgcount = 0; - xf.xf_offset = foffset_get(fp); - xf.xf_flag = fp->f_flag; - error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } - FILEDESC_SUNLOCK(fdp); - fddrop(fdp); + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); if (error) break; } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); return (error); } @@ -3930,21 +3953,28 @@ static struct proc * file_to_first_proc(struct file *fp) { + VPS_ITERATOR_DECL(vps_iter); struct filedesc *fdp; struct proc *p; int n; - 
FOREACH_PROC_IN_SYSTEM(p) { - if (p->p_state == PRS_NEW) - continue; - fdp = p->p_fd; - if (fdp == NULL) - continue; - for (n = 0; n <= fdp->fd_lastfile; n++) { - if (fp == fdp->fd_ofiles[n].fde_file) - return (p); + /* VPS_LIST_RLOCK(); */ + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + fdp = p->p_fd; + if (fdp == NULL) + continue; + for (n = 0; n <= fdp->fd_lastfile; n++) { + if (fp == fdp->fd_ofiles[n].fde_file) + return (p); + } } + CURVPS_RESTORE(); } + /* VPS_LIST_RUNLOCK(); */ return (NULL); } @@ -3982,6 +4012,7 @@ DB_SHOW_COMMAND(files, db_show_files) { + VPS_ITERATOR_DECL(vps_iter); struct filedesc *fdp; struct file *fp; struct proc *p; @@ -3989,18 +4020,24 @@ int n; header = 1; - FOREACH_PROC_IN_SYSTEM(p) { - if (p->p_state == PRS_NEW) - continue; - if ((fdp = p->p_fd) == NULL) - continue; - for (n = 0; n <= fdp->fd_lastfile; ++n) { - if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) + /* VPS_LIST_RLOCK(); */ + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + if ((fdp = p->p_fd) == NULL) continue; - db_print_file(fp, header); - header = 0; + for (n = 0; n <= fdp->fd_lastfile; ++n) { + if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) + continue; + db_print_file(fp, header); + header = 0; + } } + CURVPS_RESTORE(); } + /* VPS_LIST_RUNLOCK(); */ } #endif Index: sys/kern/kern_exit.c =================================================================== --- sys/kern/kern_exit.c +++ sys/kern/kern_exit.c @@ -96,6 +96,11 @@ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exit, "int"); +#ifdef VIMAGE +VPS_DECLARE(int, vrebooting); /* kern_reboot() has been called. */ +#define V_vrebooting VPS(vrebooting) +#endif + /* Hook for NFS teardown procedure. 
*/ void (*nlminfo_release_p)(struct proc *p); @@ -106,13 +111,13 @@ { struct proc *p, *parent; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); if ((child->p_treeflag & P_TREE_ORPHANED) == 0) { if (child->p_oppid == 0 || child->p_pptr->p_pid == child->p_oppid) parent = child->p_pptr; else - parent = initproc; + parent = V_initproc; return (parent); } for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) { @@ -132,10 +137,16 @@ { struct proc *p1, *p2, *ptmp; - sx_assert(&proctree_lock, SX_LOCKED); - KASSERT(p != initproc, ("reaper_abandon_children for initproc")); - if ((p->p_treeflag & P_TREE_REAPER) == 0) + sx_assert(&V_proctree_lock, SX_LOCKED); + /* init inside a vps may die on prison_remove. */ + KASSERT(!IS_DEFAULT_VPS(curvps) || p != V_initproc, + ("%s: for initproc %p", __func__, p)); + if ((p->p_treeflag & P_TREE_REAPER) == 0) { + KASSERT((p != V_initproc && p->p_pid != 1 && p->p_pid != 0), + ("%s:%d curvps %p p %p pid %d p_treeflag %#x", + __func__, __LINE__, curvps, p, p->p_pid, p->p_treeflag)); return; + } p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { LIST_REMOVE(p2, p_reapsibling); @@ -148,7 +159,8 @@ PROC_UNLOCK(p2); } } - KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty")); + KASSERT(LIST_EMPTY(&p->p_reaplist), + ("%s: p %p p_reaplist not empty", __func__, p)); p->p_treeflag &= ~P_TREE_REAPER; } @@ -157,7 +169,7 @@ { struct proc *p1; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); if ((p->p_treeflag & P_TREE_ORPHANED) == 0) return; if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) { @@ -203,9 +215,19 @@ * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. 
*/ - if (p == initproc && rebooting == 0) { + if (p == V_initproc && (rebooting == 0 +#ifdef VIMAGE + || V_vrebooting +#endif + )) { printf("init died (signal %d, exit %d)\n", signo, rval); - panic("Going nowhere without my init!"); +#ifdef VIMAGE + if (!IS_DEFAULT_VPS(TD_TO_VPS(td))) + /* XXX-BZ make this jail go away. */ ; + else +#endif + panic("%s: Going nowhere without my init! td %p", + __func__, td); } /* @@ -313,7 +335,7 @@ /* Are we a task leader with peers? */ if (p->p_peers != NULL && p == p->p_leader) { - mtx_lock(&ppeers_lock); + mtx_lock(&V_ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); @@ -322,8 +344,8 @@ q = q->p_peers; } while (p->p_peers != NULL) - msleep(p, &ppeers_lock, PWAIT, "exit1", 0); - mtx_unlock(&ppeers_lock); + msleep(p, &V_ppeers_lock, PWAIT, "exit1", 0); + mtx_unlock(&V_ppeers_lock); } /* @@ -388,7 +410,7 @@ * Remove ourself from our leader's peer list and wake our leader. */ if (p->p_leader->p_peers != NULL) { - mtx_lock(&ppeers_lock); + mtx_lock(&V_ppeers_lock); if (p->p_leader->p_peers != NULL) { q = p->p_leader; while (q->p_peers != p) @@ -396,7 +418,7 @@ q->p_peers = p->p_peers; wakeup(p->p_leader); } - mtx_unlock(&ppeers_lock); + mtx_unlock(&V_ppeers_lock); } vmspace_exit(td); @@ -432,16 +454,17 @@ WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ - sx_xlock(&allproc_lock); + /* Operate on current vps instance only. 
*/ + sx_xlock(&V_allproc_lock); LIST_REMOVE(p, p_list); - LIST_INSERT_HEAD(&zombproc, p, p_list); + LIST_INSERT_HEAD(&V_zombproc, p, p_list); LIST_REMOVE(p, p_hash); - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); /* * Reparent all children processes: @@ -602,7 +625,7 @@ } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); - if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) { + if (p->p_pptr == p->p_reaper || p->p_pptr == V_initproc) { signal_parent = 1; } else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) { @@ -613,7 +636,7 @@ } } else PROC_LOCK(p->p_pptr); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (signal_parent == 1) { childproc_exited(p); @@ -827,9 +850,9 @@ { struct proc *q, *t; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); - KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); + KASSERT(p->p_state == PRS_ZOMBIE, ("%s: !PRS_ZOMBIE", __func__)); mtx_spin_wait_unlocked(&p->p_slock); @@ -843,7 +866,7 @@ * release the proc struct just yet. */ PROC_UNLOCK(p); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return; } @@ -870,7 +893,7 @@ wakeup(t); cv_broadcast(&p->p_pwait); PROC_UNLOCK(t); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return; } p->p_oppid = 0; @@ -880,9 +903,9 @@ * Remove other references to this process to ensure we have an * exclusive reference. 
*/ - sx_xlock(&allproc_lock); + sx_xlock(&V_allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); LIST_REMOVE(p, p_reapsibling); @@ -892,7 +915,7 @@ leavepgrp(p); if (p->p_procdesc != NULL) procdesc_reap(p); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); PROC_LOCK(p); knlist_detach(p->p_klist); @@ -953,9 +976,9 @@ #endif KASSERT(FIRST_THREAD_IN_PROC(p), - ("proc_reap: no residual thread!")); + ("%s: no residual thread!", __func__)); uma_zfree(proc_zone, p); - atomic_add_int(&nprocs, -1); + atomic_add_int(&V_nprocs, -1); } static int @@ -965,7 +988,7 @@ { struct rusage *rup; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); PROC_LOCK(p); @@ -1156,7 +1179,7 @@ bool cont; PROC_LOCK_ASSERT(p, MA_OWNED); - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); MPASS(si_code == CLD_TRAPPED || si_code == CLD_STOPPED || si_code == CLD_CONTINUED); @@ -1170,7 +1193,7 @@ sigqueue_take(p->p_ksi); PROC_UNLOCK(td->td_proc); } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (siginfo != NULL) { siginfo->si_code = si_code; siginfo->si_status = cont ? 
SIGCONT : p->p_xsig; @@ -1223,7 +1246,7 @@ q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); loop_locked: nfound = 0; LIST_FOREACH(p, &q->p_children, p_sibling) { @@ -1307,11 +1330,11 @@ } } if (nfound == 0) { - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (ECHILD); } if (options & WNOHANG) { - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); td->td_retval[0] = 0; return (0); } @@ -1321,7 +1344,7 @@ PROC_UNLOCK(q); goto loop_locked; } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); error = msleep(q, &q->p_mtx, PWAIT | PCATCH | PDROP, "wait", 0); if (error) return (error); @@ -1336,7 +1359,7 @@ proc_reparent(struct proc *child, struct proc *parent) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; Index: sys/kern/kern_fork.c =================================================================== --- sys/kern/kern_fork.c +++ sys/kern/kern_fork.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -184,10 +185,10 @@ return (error); } -int nprocs = 1; /* process 0 */ -int lastpid = 0; -SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, - "Last used PID"); +VPS_DEFINE(int, nprocs) = 1; /* process 0 */ +VPS_DEFINE(int, lastpid) = 0; +SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD|CTLFLAG_VPS, + &VPS_NAME(lastpid), 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make @@ -197,7 +198,8 @@ * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. 
*/ -static int randompid = 0; +static VPS_DEFINE(int, randompid) = 0; +#define V_randompid VPS(randompid) static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) @@ -207,44 +209,46 @@ error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); - sx_xlock(&allproc_lock); - pid = randompid; + sx_xlock(&V_allproc_lock); + pid = V_randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid == 0) - randompid = 0; + V_randompid = 0; else if (pid == 1) /* generate a random PID modulus between 100 and 1123 */ - randompid = 100 + arc4random() % 1024; + V_randompid = 100 + arc4random() % 1024; else if (pid < 0 || pid > pid_max - 100) /* out of range */ - randompid = pid_max - 100; + V_randompid = pid_max - 100; else if (pid < 100) /* Make it reasonable */ - randompid = 100; + V_randompid = 100; else - randompid = pid; + V_randompid = pid; } - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus. Special values: 0: disable, 1: choose random value"); -static int +static VPS_DEFINE(int, pidchecked) = 0; +#define V_pidchecked VPS(pidchecked) + +int fork_findpid(int flags) { struct proc *p; int trypid; - static int pidchecked = 0; /* * Requires allproc_lock in order to iterate over the list * of processes, and proctree_lock to access p_pgrp. */ - sx_assert(&allproc_lock, SX_LOCKED); - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); /* * Find an unused process ID. We remember a range of unused IDs @@ -253,13 +257,13 @@ * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. 
*/ - trypid = lastpid + 1; + trypid = V_lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { - if (randompid) - trypid += arc4random() % randompid; + if (V_randompid) + trypid += arc4random() % V_randompid; } retry: /* @@ -271,12 +275,12 @@ trypid = trypid % pid_max; if (trypid < 100) trypid += 100; - pidchecked = 0; + V_pidchecked = 0; } - if (trypid >= pidchecked) { + if (trypid >= V_pidchecked) { int doingzomb = 0; - pidchecked = PID_MAX; + V_pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater @@ -291,7 +295,8 @@ * reserved pids is limited by process limit times * two. */ - p = LIST_FIRST(&allproc); + /* Operate on current vps instance only. */ + p = LIST_FIRST(&V_allproc); again: for (; p != NULL; p = LIST_NEXT(p, p_list)) { while (p->p_pid == trypid || @@ -301,24 +306,25 @@ (p->p_session != NULL && p->p_session->s_sid == trypid)))) { trypid++; - if (trypid >= pidchecked) + if (trypid >= V_pidchecked) goto retry; } - if (p->p_pid > trypid && pidchecked > p->p_pid) - pidchecked = p->p_pid; + if (p->p_pid > trypid && V_pidchecked > p->p_pid) + V_pidchecked = p->p_pid; if (p->p_pgrp != NULL) { if (p->p_pgrp->pg_id > trypid && - pidchecked > p->p_pgrp->pg_id) - pidchecked = p->p_pgrp->pg_id; + V_pidchecked > p->p_pgrp->pg_id) + V_pidchecked = p->p_pgrp->pg_id; if (p->p_session != NULL && p->p_session->s_sid > trypid && - pidchecked > p->p_session->s_sid) - pidchecked = p->p_session->s_sid; + V_pidchecked > p->p_session->s_sid) + V_pidchecked = p->p_session->s_sid; } } if (!doingzomb) { + /* Operate on current vps instance only. */ doingzomb = 1; - p = LIST_FIRST(&zombproc); + p = LIST_FIRST(&V_zombproc); goto again; } } @@ -327,9 +333,9 @@ * RFHIGHPID does not mess with the lastpid counter during boot. 
*/ if (flags & RFHIGHPID) - pidchecked = 0; + V_pidchecked = 0; else - lastpid = trypid; + V_lastpid = trypid; return (trypid); } @@ -394,8 +400,8 @@ struct filedesc_to_leader *fdtol; struct sigacts *newsigacts; - sx_assert(&proctree_lock, SX_LOCKED); - sx_assert(&allproc_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_XLOCKED); p1 = td->td_proc; @@ -404,14 +410,14 @@ p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG_PID(p2->p_pid); - LIST_INSERT_HEAD(&allproc, p2, p_list); + LIST_INSERT_HEAD(&V_allproc, p2, p_list); allproc_gen++; LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); PROC_LOCK(p2); PROC_LOCK(p1); - sx_xunlock(&allproc_lock); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_allproc_lock); + sx_xunlock(&V_proctree_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); @@ -490,6 +496,7 @@ td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE + td2->td_vps = TD_TO_VPS(td); td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif @@ -554,11 +561,11 @@ * Set up linkage for kernel based threading. */ if ((fr->fr_flags & RFTHREAD) != 0) { - mtx_lock(&ppeers_lock); + mtx_lock(&V_ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; - mtx_unlock(&ppeers_lock); + mtx_unlock(&V_ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); @@ -585,7 +592,7 @@ p2->p_leader = p2; } - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); @@ -648,7 +655,7 @@ LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1) p2->p_reapsubtree = p2->p_pid; - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; @@ -751,7 +758,7 @@ * if being set atm. 
*/ if ((p1->p_ptevents & PTRACE_FORK) != 0) { - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PROC_LOCK(p2); /* @@ -777,7 +784,7 @@ proc_reparent(p2, p1->p_pptr); } PROC_UNLOCK(p2); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } if ((fr->fr_flags & RFSTOPPED) == 0) { @@ -801,6 +808,11 @@ PROC_UNLOCK(p2); } +static VPS_DEFINE(int, curfail); +#define V_curfail VPS(curfail) +static VPS_DEFINE(struct timeval, lastfail); +#define V_lastfail VPS(lastfail) + int fork1(struct thread *td, struct fork_req *fr) { @@ -810,8 +822,6 @@ struct file *fp_procdesc; vm_ooffset_t mem_charged; int error, nprocs_new, ok; - static int curfail; - static struct timeval lastfail; int flags, pages; flags = fr->fr_flags; @@ -881,17 +891,17 @@ * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ - nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; + nprocs_new = atomic_fetchadd_int(&V_nprocs, 1) + 1; if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) { error = EAGAIN; - sx_xlock(&allproc_lock); - if (ppsratecheck(&lastfail, &curfail, 1)) { + sx_xlock(&V_allproc_lock); + if (ppsratecheck(&V_lastfail, &V_curfail, 1)) { printf("maxproc limit exceeded by uid %u (pid %d); " "see tuning(7) and login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); goto fail2; } @@ -973,8 +983,8 @@ STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ - sx_xlock(&proctree_lock); - sx_xlock(&allproc_lock); + sx_xlock(&V_proctree_lock); + sx_xlock(&V_allproc_lock); /* * Increment the count of procs running with this uid. 
Don't allow @@ -995,8 +1005,8 @@ } error = EAGAIN; - sx_xunlock(&allproc_lock); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_allproc_lock); + sx_xunlock(&V_proctree_lock); #ifdef MAC mac_proc_destroy(newproc); #endif @@ -1012,7 +1022,7 @@ fdclose(td, fp_procdesc, *fr->fr_pd_fd); fdrop(fp_procdesc, td); } - atomic_add_int(&nprocs, -1); + atomic_add_int(&V_nprocs, -1); pause("fork", hz / 2); return (error); } Index: sys/kern/kern_jail.c =================================================================== --- sys/kern/kern_jail.c +++ sys/kern/kern_jail.c @@ -62,6 +62,10 @@ #include #include #include +#include +#ifdef VIMAGE +#include +#endif #include #include @@ -107,7 +111,7 @@ .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE - .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, + .pr_flags = PR_HOST|PR_VNET|PR_VPS|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif @@ -171,6 +175,7 @@ {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, + {"vps", 0, PR_VPS}, #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, @@ -627,6 +632,11 @@ vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } + if ((flags & JAIL_UPDATE) && (ch_flags & PR_VPS)) { + error = EINVAL; + vfs_opterror(opts, "vps cannot be changed after creation"); + goto done_errmsg; + } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { @@ -1801,6 +1811,39 @@ goto done_errmsg; } +#ifdef VIMAGE + /* Allocate a new vps if specified. 
 */ + if (pr_flags & PR_VPS) { + vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); + if ((error = change_dir(pr->pr_root, td)) != 0) + goto c_unlock; +#ifdef MAC + if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) + goto c_unlock; +#endif +c_unlock: + VOP_UNLOCK(pr->pr_root, 0); + if (error || (error = pwd_chroot(td, pr->pr_root))) { + vfs_opterror(opts, "vps chroot failed"); + if (!created) + prison_deref(pr, PD_DEREF); + goto done_errmsg; + } + + /* We temporarily need a ref as otherwise a prhold will panic. */ + mtx_lock(&pr->pr_mtx); + pr->pr_ref++; + pr->pr_uref++; + mtx_unlock(&pr->pr_mtx); + pr->pr_vps = vps_alloc(pr); + mtx_lock(&pr->pr_mtx); + pr->pr_ref--; + pr->pr_uref--; + mtx_unlock(&pr->pr_mtx); + } else { + pr->pr_vps = ppr->pr_vps; + } +#endif /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); @@ -2285,7 +2328,28 @@ /* * Kill all processes unfortunate enough to be attached to this prison. */ - sx_slock(&allproc_lock); +#ifdef VIMAGE + if (pr->pr_vps) { + /* + * Send signal to init and let init do its job. + * This should run rc.shutdown and processes should go away. + * All but init? We need to catch the tail-end of reboot(2) + * and handle appropriately for the non-default vps instances. + * vps_destroy() will ensure init and swapper will also go + * away and might sleep. If they do not go, something will + * hold refs on cred and prisons. + * XXX There are other places which might do that for a long + * time as well. + */ + CURVPS_SET(pr->pr_vps); + shutdown_nice(RB_HALT|RB_POWEROFF); + vps_destroy(pr->pr_vps); + CURVPS_RESTORE(); + } else +#endif + { + /* Operate on current vps instance only. */ + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && @@ -2293,7 +2357,8 @@ kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); + } /* Remove the temporary reference added by jail_remove. 
*/ + prison_deref(pr, deuref | PD_DEREF); } @@ -2348,6 +2413,24 @@ struct ucred *newcred, *oldcred; int error; +#ifdef VIMAGE + /* + * Do not allow migrating a process between virtual process spaces. + * Use the console to attach to it. Getting all the process space things + * right, including a new pid, process group, session, terminal, + * tracing is one thing (with a lot of work) and may break apps if the + * pid changes, the pgrp no longer has the same (p)id; getting things + * restored to original state and properly re-parented is virtually + * impossible. So do what we do on a normal machine, present a terminal + * to login to. + */ + if (pr->pr_flags & PR_VPS) { + mtx_unlock(&pr->pr_mtx); + sx_sunlock(&allprison_lock); + return (EPERM); + } +#endif + /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two @@ -2628,6 +2711,9 @@ #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); + KASSERT((pr->pr_vps == ppr->pr_vps || pr->pr_vps == NULL), + ("%s: pr %p pr_vps %p != NULL\n", + __func__, pr, pr->pr_vps)); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); @@ -2912,9 +2998,9 @@ #ifdef VIMAGE /* * Determine whether the prison represented by cred owns - * its vnet rather than having it inherited. + * its vnet/vps rather than having it inherited. * - * Returns 1 in case the prison owns the vnet, 0 otherwise. + * Returns 1 in case the prison owns the vnet/vps, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) @@ -2926,6 +3012,17 @@ */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } + +int +prison_owns_vps(struct ucred *cred) +{ + + /* + * vps cannot be added/removed after jail creation, + * so no need to lock here. + */ + return (cred->cr_prison->pr_flags & PR_VPS ? 
1 : 0); +} #endif /* @@ -3542,6 +3639,26 @@ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns vnet?"); +static int +sysctl_jail_vps(SYSCTL_HANDLER_ARGS) +{ + int error, havevps; +#ifdef VIMAGE + struct ucred *cred = req->td->td_ucred; + + havevps = jailed(cred) && prison_owns_vps(cred); +#else + havevps = 0; +#endif + error = SYSCTL_OUT(req, &havevps, sizeof(havevps)); + + return (error); +} + +SYSCTL_PROC(_security_jail, OID_AUTO, vps, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + sysctl_jail_vps, "I", "Jail owns vps?"); + #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, @@ -3697,6 +3814,8 @@ #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); +SYSCTL_JAIL_PARAM(, vps, CTLTYPE_INT | CTLFLAG_RDTUN, + "E,jailsys", "Virtual process space"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); @@ -4023,12 +4142,12 @@ ASSERT_RACCT_ENABLED(); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return; } @@ -4046,6 +4165,7 @@ /* * Force rctl to reattach rules to processes. */ + /* XXX do we need to do this over all vps instances as well? */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); @@ -4055,7 +4175,7 @@ } #endif - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } @@ -4103,6 +4223,7 @@ ? 
pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); + db_printf(" vps = %p\n", pr->pr_vps); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); Index: sys/kern/kern_kthread.c =================================================================== --- sys/kern/kern_kthread.c +++ sys/kern/kern_kthread.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,8 @@ #include #include #include +#include + #include #include @@ -164,11 +167,30 @@ * Reparent curthread from proc0 to init so that the zombie * is harvested. */ - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PROC_LOCK(p); - proc_reparent(p, initproc); +#ifdef VIMAGE + /* + * In the VIMAGE case if the kproc is our virtual "swapper" + * do not reparent it to our init as otherwise it would create + * a circle and never go away. Let the parent vps reap it + * as it was setup. And it needs to be the init there and + * not the swapper(kernel). + */ + if (!IS_DEFAULT_VPS(TD_TO_VPS(FIRST_THREAD_IN_PROC(p))) && + p->p_pid == 0) { + struct proc *init0; + + CURVPS_SET_QUIET(vps0) + init0 = V_initproc; + CURVPS_RESTORE(); + + proc_reparent(p, init0); + } else +#endif + proc_reparent(p, V_initproc); PROC_UNLOCK(p); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); /* * Wakeup anyone waiting for us to exit. 
@@ -271,7 +293,7 @@ /* If no process supplied, put it on proc0 */ if (p == NULL) - p = &proc0; + p = V_vproc0; /* Initialize our new td */ newtd = thread_alloc(pages); @@ -294,6 +316,9 @@ TSTHREAD(newtd, newtd->td_name); newtd->td_proc = p; /* needed for cpu_copy_thread */ +#ifdef VIMAGE + newtd->td_vps = TD_TO_VPS(oldtd); +#endif /* might be further optimized for kthread */ cpu_copy_thread(newtd, oldtd); /* put the designated function(arg) as the resume context */ Index: sys/kern/kern_ktrace.c =================================================================== --- sys/kern/kern_ktrace.c +++ sys/kern/kern_ktrace.c @@ -952,25 +952,33 @@ * Clear all uses of the tracefile. */ if (ops == KTROP_CLEARFILE) { + VPS_ITERATOR_DECL(vps_iter); int vrele_count; vrele_count = 0; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_tracevp == vp) { - if (ktrcanset(td, p)) { - mtx_lock(&ktrace_mtx); - ktr_freeproc(p, &cred, NULL); - mtx_unlock(&ktrace_mtx); - vrele_count++; - crfree(cred); - } else - error = EPERM; + + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_tracevp == vp) { + if (ktrcanset(td, p)) { + mtx_lock(&ktrace_mtx); + ktr_freeproc(p, &cred, NULL); + mtx_unlock(&ktrace_mtx); + vrele_count++; + crfree(cred); + } else + error = EPERM; + } + PROC_UNLOCK(p); } - PROC_UNLOCK(p); + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); if (vrele_count > 0) { while (vrele_count-- > 0) vrele(vp); @@ -980,14 +988,14 @@ /* * do it */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (uap->pid < 0) { /* * by process group */ pg = pgfind(-uap->pid); if (pg == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); error = ESRCH; goto done; } @@ -1011,7 +1019,7 @@ ret |= ktrops(td, p, ops, facs, vp); } if (nfound == 0) { - sx_sunlock(&proctree_lock); + 
sx_sunlock(&V_proctree_lock); error = ESRCH; goto done; } @@ -1027,7 +1035,7 @@ if (error) { if (p != NULL) PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); goto done; } if (descend) @@ -1035,7 +1043,7 @@ else ret |= ktrops(td, p, ops, facs, vp); } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); if (!ret) error = EPERM; done: @@ -1143,7 +1151,7 @@ p = top; PROC_LOCK_ASSERT(p, MA_OWNED); - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); for (;;) { ret |= ktrops(td, p, ops, facs, vp); /* @@ -1170,6 +1178,7 @@ static void ktr_writerequest(struct thread *td, struct ktr_request *req) { + VPS_ITERATOR_DECL(vps_iter); struct ktr_header *kth; struct vnode *vp; struct proc *p; @@ -1270,22 +1279,28 @@ * credentials for the operation. */ cred = NULL; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_tracevp == vp) { - mtx_lock(&ktrace_mtx); - ktr_freeproc(p, &cred, NULL); - mtx_unlock(&ktrace_mtx); - vrele_count++; - } - PROC_UNLOCK(p); - if (cred != NULL) { - crfree(cred); - cred = NULL; + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_tracevp == vp) { + mtx_lock(&ktrace_mtx); + ktr_freeproc(p, &cred, NULL); + mtx_unlock(&ktrace_mtx); + vrele_count++; + } + PROC_UNLOCK(p); + if (cred != NULL) { + crfree(cred); + cred = NULL; + } } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); while (vrele_count-- > 0) vrele(vp); Index: sys/kern/kern_mib.c =================================================================== --- sys/kern/kern_mib.c +++ sys/kern/kern_mib.c @@ -556,8 +556,8 @@ error = sysctl_handle_int(oidp, &pm, 0, req); if (error || !req->newptr) return (error); - sx_xlock(&proctree_lock); - sx_xlock(&allproc_lock); + sx_xlock(&V_proctree_lock); + sx_xlock(&V_allproc_lock); /* * Only permit the values less 
then PID_MAX. @@ -567,8 +567,8 @@ error = EINVAL; else pid_max = pm; - sx_xunlock(&allproc_lock); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_allproc_lock); + sx_xunlock(&V_proctree_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT | Index: sys/kern/kern_proc.c =================================================================== --- sys/kern/kern_proc.c +++ sys/kern/kern_proc.c @@ -126,15 +126,21 @@ /* * Other process lists */ -struct pidhashhead *pidhashtbl; -u_long pidhash; -struct pgrphashhead *pgrphashtbl; -u_long pgrphash; -struct proclist allproc; -struct proclist zombproc; +VPS_DEFINE(struct pidhashhead *, pidhashtbl); +VPS_DEFINE(u_long, pidhash); +VPS_DEFINE(struct pgrphashhead *, pgrphashtbl); +VPS_DEFINE(u_long, pgrphash); +VPS_DEFINE(struct proclist, allproc); +VPS_DEFINE(struct proclist, zombproc); +#ifndef VIMAGE struct sx __exclusive_cache_line allproc_lock; struct sx __exclusive_cache_line proctree_lock; struct mtx __exclusive_cache_line ppeers_lock; +#else +VPS_DEFINE(struct sx, allproc_lock); +VPS_DEFINE(struct sx, proctree_lock); +VPS_DEFINE(struct mtx, ppeers_lock); +#endif uma_zone_t proc_zone; /* @@ -179,22 +185,46 @@ /* * Initialize global process hashing structures. 
*/ -void +static void procinit(void) { - sx_init(&allproc_lock, "allproc"); - sx_init(&proctree_lock, "proctree"); - mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); - LIST_INIT(&allproc); - LIST_INIT(&zombproc); - pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); - pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); - proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), - proc_ctor, proc_dtor, proc_init, proc_fini, - UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uihashinit(); + sx_init(&V_allproc_lock, "allproc"); + sx_init(&V_proctree_lock, "proctree"); + mtx_init(&V_ppeers_lock, "p_peers", NULL, MTX_DEF); + LIST_INIT(&V_allproc); + LIST_INIT(&V_zombproc); + V_pidhashtbl = hashinit(maxproc / 4, M_PROC, &V_pidhash); + V_pgrphashtbl = hashinit(maxproc / 4, M_PROC, &V_pgrphash); + if (IS_DEFAULT_VPS(curvps)) { + proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), + proc_ctor, proc_dtor, proc_init, proc_fini, + UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + } } +VPS_SYSINIT(procinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procinit, NULL); + +#ifdef VIMAGE +static void +procdestroy(void *ident __unused) +{ + + KASSERT((LIST_EMPTY(&V_allproc)), ("%s: list allproc %p not empty\n", + __func__, &V_allproc)); + KASSERT((LIST_EMPTY(&V_zombproc)), ("%s: list zombproc %p not empty\n", + __func__, &V_zombproc)); + + /* proc_zone */ + hashdestroy(V_pgrphashtbl, M_PROC, V_pgrphash); + hashdestroy(V_pidhashtbl, M_PROC, V_pidhash); + + mtx_destroy(&V_ppeers_lock); + sx_destroy(&V_proctree_lock); + sx_destroy(&V_allproc_lock); +} +VPS_SYSUNINIT(procdestroy, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procdestroy, + NULL); +#endif /* * Prepare a proc for use. 
@@ -303,7 +333,7 @@ inferior(struct proc *p) { - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); for (; p != curproc; p = proc_realparent(p)) { if (p->p_pid == 0) @@ -317,7 +347,7 @@ { struct proc *p; - sx_assert(&allproc_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); LIST_FOREACH(p, PIDHASH(pid), p_hash) { if (p->p_pid == pid) { PROC_LOCK(p); @@ -347,9 +377,9 @@ PROC_LOCK(p); return (p); } - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); p = pfind_locked(pid); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (p); } @@ -361,11 +391,11 @@ { struct proc *p; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); p = pfind_locked(pid); if (p == NULL) p = zpfind_locked(pid); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (p); } @@ -376,7 +406,8 @@ struct proc *p; struct thread *td; - sx_assert(&allproc_lock, SX_LOCKED); + /* Operate on current vps instance only. */ + sx_assert(&V_allproc_lock, SX_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -402,7 +433,7 @@ { struct pgrp *pgrp; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { @@ -426,7 +457,7 @@ if (p->p_pid == pid) { PROC_LOCK(p); } else { - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); if (pid <= PID_MAX) { p = pfind_locked(pid); if (p == NULL && (flags & PGET_NOTWEXIT) == 0) @@ -436,7 +467,7 @@ } else { p = NULL; } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (p == NULL) return (ESRCH); if ((flags & PGET_CANSEE) != 0) { @@ -486,7 +517,7 @@ enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, @@ -547,7 +578,7 @@ enterthispgrp(struct proc *p, 
struct pgrp *pgrp) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); @@ -573,7 +604,7 @@ { struct pgrp *savepgrp; - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); @@ -610,7 +641,7 @@ { struct pgrp *savepgrp; - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; PGRP_LOCK(savepgrp); PROC_LOCK(p); @@ -632,7 +663,7 @@ struct session *savesess; struct tty *tp; - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); @@ -691,7 +722,7 @@ struct session *mysession; struct proc *q; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); @@ -744,7 +775,7 @@ } PROC_UNLOCK(p); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); if (SESS_LEADER(p)) { sp = p->p_session; @@ -781,17 +812,17 @@ } if (ttyvp != NULL) { - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp, 0); } vrele(ttyvp); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); } } fixjobc(p, p->p_pgrp, 0); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } /* @@ -851,10 +882,10 @@ struct proc *p; int i; - for (i = 0; i <= pgrphash; i++) { - if (!LIST_EMPTY(&pgrphashtbl[i])) { + for (i = 0; i <= V_pgrphash; i++) { + if (!LIST_EMPTY(&V_pgrphashtbl[i])) { printf("\tindx %d\n", i); - LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { + LIST_FOREACH(pgrp, &V_pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid 
%ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, @@ -910,7 +941,7 @@ struct timeval boottime; /* For proc_realparent. */ - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); bzero(kp, sizeof(*kp)); @@ -1019,7 +1050,7 @@ kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; - /* XXX proctree_lock */ + /* XXX V_proctree_lock */ tp = sp->s_ttyp; SESS_UNLOCK(sp); } @@ -1209,8 +1240,9 @@ { struct proc *p; - sx_assert(&allproc_lock, SX_LOCKED); - LIST_FOREACH(p, &zombproc, p_list) { + /* Operate on current vps instance only. */ + sx_assert(&V_allproc_lock, SX_LOCKED); + LIST_FOREACH(p, &V_zombproc, p_list) { if (p->p_pid == pid) { PROC_LOCK(p); break; @@ -1227,9 +1259,9 @@ { struct proc *p; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); p = zpfind_locked(pid); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (p); } @@ -1465,11 +1497,11 @@ error = sysctl_wire_old_buffer(req, 0); if (error) return (error); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error == 0) error = sysctl_out_proc(p, req, flags); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (error); } @@ -1502,14 +1534,15 @@ * traced process. Only grab it if we are producing any * data to begin with. */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); } - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { + /* Operate on current vps instance only. */ if (!doingzomb) - p = LIST_FIRST(&allproc); + p = LIST_FIRST(&V_allproc); else - p = LIST_FIRST(&zombproc); + p = LIST_FIRST(&V_zombproc); for (; p != NULL; p = LIST_NEXT(p, p_list)) { /* * Skip embryonic processes. 
@@ -1569,7 +1602,7 @@ PROC_UNLOCK(p); continue; } - /* XXX proctree_lock */ + /* XXX V_proctree_lock */ SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || tty_udev(p->p_session->s_ttyp) != @@ -1609,9 +1642,9 @@ } } out: - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (req->oldptr != NULL) - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (error); } @@ -3095,101 +3128,133 @@ void stop_all_proc(void) { + VPS_ITERATOR_DECL(vps_iter); struct proc *cp, *p; int r, gen; bool restart, seen_stopped, seen_exiting, stopped_some; - cp = curproc; + KASSERT(IS_DEFAULT_VPS(curvps), + ("%s: called from non vps0 %p: vps %p\n", __func__, vps0, curvps)); + + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); +#ifdef VIMAGE + if (saved_vps != vps_iter) + cp = NULL; + else +#endif + cp = curproc; allproc_loop: - sx_xlock(&allproc_lock); - gen = allproc_gen; - seen_exiting = seen_stopped = stopped_some = restart = false; - LIST_REMOVE(cp, p_list); - LIST_INSERT_HEAD(&allproc, cp, p_list); - for (;;) { - p = LIST_NEXT(cp, p_list); - if (p == NULL) - break; + sx_xlock(&V_allproc_lock); + if (cp == NULL) + cp = LIST_FIRST(&V_allproc); + gen = allproc_gen; + seen_exiting = seen_stopped = stopped_some = restart = false; LIST_REMOVE(cp, p_list); - LIST_INSERT_AFTER(p, cp, p_list); - PROC_LOCK(p); - if ((p->p_flag & (P_KPROC | P_SYSTEM | P_TOTAL_STOP)) != 0) { - PROC_UNLOCK(p); - continue; - } - if ((p->p_flag & P_WEXIT) != 0) { - seen_exiting = true; - PROC_UNLOCK(p); - continue; - } - if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { - /* - * Stopped processes are tolerated when there - * are no other processes which might continue - * them. P_STOPPED_SINGLE but not - * P_TOTAL_STOP process still has at least one - * thread running. 
- */ - seen_stopped = true; + LIST_INSERT_HEAD(&V_allproc, cp, p_list); + for (;;) { + p = LIST_NEXT(cp, p_list); + if (p == NULL) + break; + LIST_REMOVE(cp, p_list); + LIST_INSERT_AFTER(p, cp, p_list); + PROC_LOCK(p); + if ((p->p_flag & (P_KPROC | P_SYSTEM | P_TOTAL_STOP)) != 0) { + PROC_UNLOCK(p); + continue; + } + if ((p->p_flag & P_WEXIT) != 0) { + seen_exiting = true; + PROC_UNLOCK(p); + continue; + } + if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { + /* + * Stopped processes are tolerated when there + * are no other processes which might continue + * them. P_STOPPED_SINGLE but not + * P_TOTAL_STOP process still has at least one + * thread running. + */ + seen_stopped = true; + PROC_UNLOCK(p); + continue; + } + _PHOLD(p); + sx_xunlock(&V_allproc_lock); + r = thread_single(p, SINGLE_ALLPROC); + if (r != 0) + restart = true; + else + stopped_some = true; + _PRELE(p); PROC_UNLOCK(p); - continue; + sx_xlock(&V_allproc_lock); } - _PHOLD(p); - sx_xunlock(&allproc_lock); - r = thread_single(p, SINGLE_ALLPROC); - if (r != 0) + /* Catch forked children we did not see in iteration. */ + if (gen != allproc_gen) restart = true; - else - stopped_some = true; - _PRELE(p); - PROC_UNLOCK(p); - sx_xlock(&allproc_lock); - } - /* Catch forked children we did not see in iteration. 
*/ - if (gen != allproc_gen) - restart = true; - sx_xunlock(&allproc_lock); - if (restart || stopped_some || seen_exiting || seen_stopped) { - kern_yield(PRI_USER); - goto allproc_loop; + sx_xunlock(&V_allproc_lock); + if (restart || stopped_some || seen_exiting || seen_stopped) { + kern_yield(PRI_USER); + goto allproc_loop; + } + CURVPS_RESTORE(); } + VPS_LIST_RUNLOCK(); } void resume_all_proc(void) { + VPS_ITERATOR_DECL(vps_iter); struct proc *cp, *p; - cp = curproc; - sx_xlock(&allproc_lock); + KASSERT(IS_DEFAULT_VPS(curvps), + ("%s: called from non vps0 %p: vps %p\n", __func__, vps0, curvps)); + + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); +#ifdef VIMAGE + if (saved_vps != vps_iter) + cp = NULL; + else +#endif + cp = curproc; + sx_xlock(&V_allproc_lock); again: - LIST_REMOVE(cp, p_list); - LIST_INSERT_HEAD(&allproc, cp, p_list); - for (;;) { - p = LIST_NEXT(cp, p_list); - if (p == NULL) - break; LIST_REMOVE(cp, p_list); - LIST_INSERT_AFTER(p, cp, p_list); - PROC_LOCK(p); - if ((p->p_flag & P_TOTAL_STOP) != 0) { - sx_xunlock(&allproc_lock); - _PHOLD(p); - thread_single_end(p, SINGLE_ALLPROC); - _PRELE(p); - PROC_UNLOCK(p); - sx_xlock(&allproc_lock); - } else { - PROC_UNLOCK(p); + LIST_INSERT_HEAD(&V_allproc, cp, p_list); + for (;;) { + p = LIST_NEXT(cp, p_list); + if (p == NULL) + break; + LIST_REMOVE(cp, p_list); + LIST_INSERT_AFTER(p, cp, p_list); + PROC_LOCK(p); + if ((p->p_flag & P_TOTAL_STOP) != 0) { + sx_xunlock(&V_allproc_lock); + _PHOLD(p); + thread_single_end(p, SINGLE_ALLPROC); + _PRELE(p); + PROC_UNLOCK(p); + sx_xlock(&V_allproc_lock); + } else { + PROC_UNLOCK(p); + } } + /* Did the loop above missed any stopped process ? */ + FOREACH_PROC_IN_SYSTEM(p) { + /* No need for proc lock. */ + if ((p->p_flag & P_TOTAL_STOP) != 0) + goto again; + } + sx_xunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - /* Did the loop above missed any stopped process ? */ - FOREACH_PROC_IN_SYSTEM(p) { - /* No need for proc lock. 
*/ - if ((p->p_flag & P_TOTAL_STOP) != 0) - goto again; - } - sx_xunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); } /* #define TOTAL_STOP_DEBUG 1 */ Index: sys/kern/kern_procctl.c =================================================================== --- sys/kern/kern_procctl.c +++ sys/kern/kern_procctl.c @@ -69,7 +69,7 @@ p = top; ret = 0; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); for (;;) { ret |= protect_setchild(td, p, flags); PROC_UNLOCK(p); @@ -128,7 +128,7 @@ reap_acquire(struct thread *td, struct proc *p) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); if (p != curproc) return (EPERM); if ((p->p_treeflag & P_TREE_REAPER) != 0) @@ -145,10 +145,10 @@ reap_release(struct thread *td, struct proc *p) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); if (p != curproc) return (EPERM); - if (p == initproc) + if (p == V_initproc) return (EINVAL); if ((p->p_treeflag & P_TREE_REAPER) == 0) return (EINVAL); @@ -162,7 +162,7 @@ { struct proc *reap, *p2, *first_p; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); bzero(rs, sizeof(*rs)); if ((p->p_treeflag & P_TREE_REAPER) == 0) { reap = p->p_reaper; @@ -170,7 +170,7 @@ reap = p; rs->rs_flags |= REAPER_STATUS_OWNED; } - if (reap == initproc) + if (reap == V_initproc) rs->rs_flags |= REAPER_STATUS_REALINIT; rs->rs_reaper = reap->p_pid; rs->rs_descendants = 0; @@ -199,18 +199,18 @@ u_int i, n; int error; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); PROC_UNLOCK(p); reap = (p->p_treeflag & P_TREE_REAPER) == 0 ? 
p->p_reaper : p; n = i = 0; error = 0; LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling) n++; - sx_unlock(&proctree_lock); + sx_unlock(&V_proctree_lock); if (rp->rp_count < n) n = rp->rp_count; pi = malloc(n * sizeof(*pi), M_TEMP, M_WAITOK); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling) { if (i == n) break; @@ -225,10 +225,10 @@ pip->pi_flags |= REAPER_PIDINFO_REAPER; i++; } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); error = copyout(pi, rp->rp_pids, i * sizeof(*pi)); free(pi, M_TEMP); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); return (error); } @@ -278,7 +278,7 @@ struct reap_kill_tracker *t; int error; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); if (IN_CAPABILITY_MODE(td)) return (ECAPMODE); if (rk->rk_sig <= 0 || rk->rk_sig > _SIG_MAXSIG || @@ -585,12 +585,12 @@ case PROC_REAP_KILL: case PROC_TRACE_CTL: case PROC_TRAPCAP_CTL: - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); tree_locked = true; break; case PROC_REAP_ACQUIRE: case PROC_REAP_RELEASE: - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); tree_locked = true; break; case PROC_TRACE_STATUS: @@ -657,6 +657,6 @@ break; } if (tree_locked) - sx_unlock(&proctree_lock); + sx_unlock(&V_proctree_lock); return (error); } Index: sys/kern/kern_prot.c =================================================================== --- sys/kern/kern_prot.c +++ sys/kern/kern_prot.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -133,10 +134,10 @@ PROC_UNLOCK(p); } else { PROC_UNLOCK(p); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); pp = proc_realparent(p); ppid = pp->p_pid; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } return (ppid); @@ -340,7 +341,7 @@ newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); - 
sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) @@ -353,7 +354,7 @@ newsess = NULL; } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (newpgrp != NULL) free(newpgrp, M_PGRP); @@ -399,7 +400,7 @@ newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; @@ -457,7 +458,7 @@ error = enterthispgrp(targp, pgrp); } done: - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); KASSERT((error == 0) || (newpgrp != NULL), ("setpgid failed and newpgrp is NULL")); if (newpgrp != NULL) @@ -1738,7 +1739,7 @@ } /* Can't trace init when securelevel > 0. */ - if (p == initproc) { + if (p == V_initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); @@ -1860,8 +1861,10 @@ crfree(struct ucred *cr) { - KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); - KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); + KASSERT(cr->cr_ref > 0, ("%s: bad ucred %p refcount: %d", + __func__, cr, cr->cr_ref)); + KASSERT(cr->cr_ref != 0xdeadc0de, + ("%s: dangling reference to ucred %p", __func__, cr)); if (refcount_release(&cr->cr_ref)) { /* * Some callers of crget(), such as nfs_statfs(), Index: sys/kern/kern_racct.c =================================================================== --- sys/kern/kern_racct.c +++ sys/kern/kern_racct.c @@ -1214,94 +1214,107 @@ } static void -racctd(void) +_racctd(void) { struct thread *td; struct proc *p; struct timeval wallclock; uint64_t pct, pct_estimate, runtime; - ASSERT_RACCT_ENABLED(); - - for (;;) { - racct_decay(); + sx_slock(&V_allproc_lock); - sx_slock(&allproc_lock); + LIST_FOREACH(p, &V_zombproc, p_list) { + PROC_LOCK(p); + racct_set(p, RACCT_PCTCPU, 0); + PROC_UNLOCK(p); + } - LIST_FOREACH(p, &zombproc, p_list) { - 
PROC_LOCK(p); - racct_set(p, RACCT_PCTCPU, 0); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); + continue; } - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_state != PRS_NORMAL) { - PROC_UNLOCK(p); - continue; - } - - microuptime(&wallclock); - timevalsub(&wallclock, &p->p_stats->p_start); - PROC_STATLOCK(p); - FOREACH_THREAD_IN_PROC(p, td) - ruxagg(p, td); - runtime = cputick2usec(p->p_rux.rux_runtime); - PROC_STATUNLOCK(p); + microuptime(&wallclock); + timevalsub(&wallclock, &p->p_stats->p_start); + PROC_STATLOCK(p); + FOREACH_THREAD_IN_PROC(p, td) + ruxagg(p, td); + runtime = cputick2usec(p->p_rux.rux_runtime); + PROC_STATUNLOCK(p); #ifdef notyet - KASSERT(runtime >= p->p_prev_runtime, - ("runtime < p_prev_runtime")); + KASSERT(runtime >= p->p_prev_runtime, + ("runtime < p_prev_runtime")); #else - if (runtime < p->p_prev_runtime) - runtime = p->p_prev_runtime; + if (runtime < p->p_prev_runtime) + runtime = p->p_prev_runtime; #endif - p->p_prev_runtime = runtime; - if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { - pct_estimate = (1000000 * runtime * 100) / - ((uint64_t)wallclock.tv_sec * 1000000 + - wallclock.tv_usec); - } else - pct_estimate = 0; - pct = racct_getpcpu(p, pct_estimate); - RACCT_LOCK(); + p->p_prev_runtime = runtime; + if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) { + pct_estimate = (1000000 * runtime * 100) / + ((uint64_t)wallclock.tv_sec * 1000000 + + wallclock.tv_usec); + } else + pct_estimate = 0; + pct = racct_getpcpu(p, pct_estimate); + RACCT_LOCK(); #ifdef RCTL - rctl_throttle_decay(p->p_racct, RACCT_READBPS); - rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); - rctl_throttle_decay(p->p_racct, RACCT_READIOPS); - rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); + rctl_throttle_decay(p->p_racct, RACCT_READBPS); + rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS); + rctl_throttle_decay(p->p_racct, RACCT_READIOPS); + rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS); #endif - 
racct_set_locked(p, RACCT_PCTCPU, pct, 1); - racct_set_locked(p, RACCT_CPU, runtime, 0); - racct_set_locked(p, RACCT_WALLCLOCK, - (uint64_t)wallclock.tv_sec * 1000000 + - wallclock.tv_usec, 0); - RACCT_UNLOCK(); + racct_set_locked(p, RACCT_PCTCPU, pct, 1); + racct_set_locked(p, RACCT_CPU, runtime, 0); + racct_set_locked(p, RACCT_WALLCLOCK, + (uint64_t)wallclock.tv_sec * 1000000 + + wallclock.tv_usec, 0); + RACCT_UNLOCK(); + PROC_UNLOCK(p); + } + + /* + * To ensure that processes are throttled in a fair way, we need + * to iterate over all processes again and check the limits + * for %cpu resource only after ucred racct containers have been + * properly filled. + */ + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state != PRS_NORMAL) { PROC_UNLOCK(p); + continue; } - /* - * To ensure that processes are throttled in a fair way, we need - * to iterate over all processes again and check the limits - * for %cpu resource only after ucred racct containers have been - * properly filled. 
- */ - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_state != PRS_NORMAL) { - PROC_UNLOCK(p); - continue; - } + if (racct_pcpu_available(p) <= 0) { + if (p->p_racct->r_resources[RACCT_PCTCPU] > + pcpu_threshold) + racct_proc_throttle(p, -1); + } else if (p->p_throttled == -1) { + racct_proc_wakeup(p); + } + PROC_UNLOCK(p); + } + sx_sunlock(&V_allproc_lock); +} - if (racct_pcpu_available(p) <= 0) { - if (p->p_racct->r_resources[RACCT_PCTCPU] > - pcpu_threshold) - racct_proc_throttle(p, -1); - } else if (p->p_throttled == -1) { - racct_proc_wakeup(p); - } - PROC_UNLOCK(p); +static void +racctd(void) +{ + VPS_ITERATOR_DECL(vps_iter); + + ASSERT_RACCT_ENABLED(); + + for (;;) { + racct_decay(); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + _racctd(); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); pause("-", hz); } } Index: sys/kern/kern_rctl.c =================================================================== --- sys/kern/kern_rctl.c +++ sys/kern/kern_rctl.c @@ -1175,7 +1175,7 @@ error = str2id(subject_idstr, &id); if (error != 0) goto out; - sx_assert(&allproc_lock, SA_LOCKED); + sx_assert(&V_allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; @@ -1266,6 +1266,7 @@ int rctl_rule_add(struct rctl_rule *rule) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; struct ucred *cred; struct uidinfo *uip; @@ -1357,37 +1358,51 @@ * Now go through all the processes and add the new rule to the ones * it applies to. 
*/ - sx_assert(&allproc_lock, SA_LOCKED); - FOREACH_PROC_IN_SYSTEM(p) { - cred = p->p_ucred; - switch (rule->rr_subject_type) { - case RCTL_SUBJECT_TYPE_USER: - if (cred->cr_uidinfo == rule->rr_subject.rs_uip || - cred->cr_ruidinfo == rule->rr_subject.rs_uip) - break; - continue; - case RCTL_SUBJECT_TYPE_LOGINCLASS: - if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) - break; - continue; - case RCTL_SUBJECT_TYPE_JAIL: - match = 0; - for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { - if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { - match = 1; + sx_assert(&V_allproc_lock, SA_LOCKED); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); +#ifdef VIMAGE + if (saved_vps != vps_iter) + sx_slock(&V_allproc_lock); +#endif + FOREACH_PROC_IN_SYSTEM(p) { + cred = p->p_ucred; + switch (rule->rr_subject_type) { + case RCTL_SUBJECT_TYPE_USER: + if (cred->cr_uidinfo == rule->rr_subject.rs_uip || + cred->cr_ruidinfo == rule->rr_subject.rs_uip) + break; + continue; + case RCTL_SUBJECT_TYPE_LOGINCLASS: + if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; + continue; + case RCTL_SUBJECT_TYPE_JAIL: + match = 0; + for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { + if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { + match = 1; + break; + } } + if (match) + break; + continue; + default: + panic("rctl_rule_add: unknown subject type %d", + rule->rr_subject_type); } - if (match) - break; - continue; - default: - panic("rctl_rule_add: unknown subject type %d", - rule->rr_subject_type); - } - rctl_racct_add_rule(p->p_racct, rule); + rctl_racct_add_rule(p->p_racct, rule); + } +#ifdef VIMAGE + if (saved_vps != vps_iter) + sx_sunlock(&V_allproc_lock); +#endif + CURVPS_RESTORE(); } + VPS_LIST_RUNLOCK(); return (0); } @@ -1426,6 +1441,7 @@ int rctl_rule_remove(struct rctl_rule *filter) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; int found = 0; @@ -1452,11 +1468,25 @@ rctl_rule_pre_callback, 
rctl_rule_post_callback, filter, (void *)&found); - sx_assert(&allproc_lock, SA_LOCKED); + sx_assert(&V_allproc_lock, SA_LOCKED); RACCT_LOCK(); - FOREACH_PROC_IN_SYSTEM(p) { - found += rctl_racct_remove_rules(p->p_racct, filter); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); +#ifdef VIMAGE + if (saved_vps != vps_iter) + sx_slock(&V_allproc_lock); +#endif + FOREACH_PROC_IN_SYSTEM(p) { + found += rctl_racct_remove_rules(p->p_racct, filter); + } +#ifdef VIMAGE + if (saved_vps != vps_iter) + sx_sunlock(&V_allproc_lock); +#endif + CURVPS_RESTORE(); } + VPS_LIST_RUNLOCK(); RACCT_UNLOCK(); if (found) @@ -1623,11 +1653,11 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } @@ -1669,7 +1699,7 @@ } out: rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (error != 0) return (error); @@ -1699,6 +1729,7 @@ int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { + VPS_ITERATOR_DECL(vps_iter); struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; @@ -1718,41 +1749,56 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); - - FOREACH_PROC_IN_SYSTEM(p) { - RACCT_LOCK(); - LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { - /* - * Non-process rules will be added to the buffer later. 
- * Adding them here would result in duplicated output. - */ - if (link->rrl_rule->rr_subject_type != - RCTL_SUBJECT_TYPE_PROCESS) - continue; - if (!rctl_rule_matches(link->rrl_rule, filter)) - continue; - rctl_rule_to_sbuf(sb, link->rrl_rule); - sbuf_printf(sb, ","); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); +#ifdef VIMAGE + if (saved_vps != vps_iter) + sx_slock(&V_allproc_lock); +#endif + FOREACH_PROC_IN_SYSTEM(p) { + RACCT_LOCK(); + LIST_FOREACH(link, &p->p_racct->r_rule_links, + rrl_next) { + /* + * Non-process rules will be added to the + * buffer later. Adding them here would result + * in duplicated output. + */ + if (link->rrl_rule->rr_subject_type != + RCTL_SUBJECT_TYPE_PROCESS) + continue; + if (!rctl_rule_matches(link->rrl_rule, filter)) + continue; + rctl_rule_to_sbuf(sb, link->rrl_rule); + sbuf_printf(sb, ","); + } + RACCT_UNLOCK(); } - RACCT_UNLOCK(); +#ifdef VIMAGE + if (saved_vps != vps_iter) + sx_sunlock(&V_allproc_lock); +#endif + CURVPS_RESTORE(); } + VPS_LIST_RUNLOCK(); loginclass_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, @@ -1777,7 +1823,7 @@ error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); free(buf, M_RCTL); return (error); } @@ -1803,34 +1849,34 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == 
NULL) { rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (E2BIG); } @@ -1860,7 +1906,7 @@ error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); free(buf, M_RCTL); return (error); } @@ -1883,11 +1929,11 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } /* @@ -1906,7 +1952,7 @@ out: rctl_rule_release(rule); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } @@ -1928,17 +1974,17 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } Index: sys/kern/kern_resource.c =================================================================== --- sys/kern/kern_resource.c +++ sys/kern/kern_resource.c @@ -69,10 +69,15 @@ static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); -#define UIHASH(uid) (&uihashtbl[(uid) & uihash]) + static struct rwlock uihashtbl_lock; -static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; -static u_long uihash; /* size of hash table - 1 */ + +LIST_HEAD(uihashhead, uidinfo); +static VPS_DEFINE(struct uihashhead *, uihashtbl); +#define V_uihashtbl VPS(uihashtbl) +static VPS_DEFINE(u_long, uihash); /* size of hash table 
- 1 */ +#define V_uihash VPS(uihash) +#define UIHASH(uid) (&V_uihashtbl[(uid) & V_uihash]) static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); @@ -114,18 +119,18 @@ break; case PRIO_PGRP: - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); break; } } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && @@ -141,7 +146,8 @@ case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; - sx_slock(&allproc_lock); + /* Operate on current vps instance only. */ + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && @@ -152,7 +158,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); break; default: @@ -199,18 +205,18 @@ break; case PRIO_PGRP: - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); break; } } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && @@ -226,7 +232,8 @@ case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; - sx_slock(&allproc_lock); + /* Operate on current vps instance only. 
*/ + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && @@ -237,7 +244,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); break; default: @@ -1214,13 +1221,43 @@ p->p_sysent->sv_fixlimit(rlp, which); } -void +static void uihashinit() { - uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); rw_init(&uihashtbl_lock, "uidinfo hash"); } +SYSINIT(uihashinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit, NULL); + +static void +uihashinit_vps() +{ + + V_uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &V_uihash); +} +VPS_SYSINIT(uihashinit_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit_vps, + NULL); + +#ifdef VIMAGE +static void +uihashdestroy_vps(void *ident __unused) +{ + struct uidinfo *uip; + struct uihashhead *uih; + int i; + + i = 0; + for (uih = &V_uihashtbl[V_uihash]; uih >= V_uihashtbl; uih--) + LIST_FOREACH(uip, uih, ui_hash) + i++; + if (i == 0) + hashdestroy(V_uihashtbl, M_UIDINFO, V_uihash); + else + printf("%s: leaking %d uihash entries\n", __func__, i); +} +VPS_SYSUNINIT(uihashdestroy_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND, + uihashdestroy_vps, NULL); +#endif /* * Look up a uidinfo struct for the parameter uid. 
@@ -1368,7 +1405,7 @@ rw_rlock(&uihashtbl_lock); if (pre != NULL) (pre)(); - for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) { + for (uih = &V_uihashtbl[V_uihash]; uih >= V_uihashtbl; uih--) { LIST_FOREACH(uip, uih, ui_hash) { (callback)(uip->ui_racct, arg2, arg3); } @@ -1392,7 +1429,8 @@ return (0); } } else if (new < 0) - printf("negative %s for uid = %d\n", name, uip->ui_uid); + printf("%s: curthread %p uip %p negative %s for uid = %d\n", + __func__, curthread, uip, name, uip->ui_uid); return (1); } Index: sys/kern/kern_shutdown.c =================================================================== --- sys/kern/kern_shutdown.c +++ sys/kern/kern_shutdown.c @@ -204,6 +204,10 @@ int dumping; /* system is dumping */ int rebooting; /* system is rebooting */ +#ifdef VIMAGE +VPS_DEFINE(int, vrebooting); /* vps is rebooting */ +#define V_vrebooting VPS(vrebooting) +#endif static struct dumperinfo dumper; /* our selected dumper */ /* Context information for dump-debuggers. */ @@ -276,29 +280,42 @@ if (error == 0) { if (uap->opt & RB_REROOT) error = kern_reroot(); - else + else { +#ifdef VIMAGE + /* XXX Can argue that we should never make it here. */ + /* Init will want to _exit() in this case. */ + if (!IS_DEFAULT_VPS(TD_TO_VPS(curthread))) { + V_vrebooting = 1; + return (error); + } +#endif kern_reboot(uap->opt); + } } return (error); } +static VPS_DEFINE(int, vhowto); +#define V_vhowto VPS(vhowto) + static void shutdown_nice_task_fn(void *arg, int pending __unused) { - int howto; - howto = (uintptr_t)arg; + CURVPS_SET((struct vps *)arg); /* Send a signal to init(8) and have it shutdown the world. 
*/ - PROC_LOCK(initproc); - if (howto & RB_POWEROFF) - kern_psignal(initproc, SIGUSR2); - else if (howto & RB_POWERCYCLE) - kern_psignal(initproc, SIGWINCH); - else if (howto & RB_HALT) - kern_psignal(initproc, SIGUSR1); + PROC_LOCK(V_initproc); + if (V_vhowto & RB_POWEROFF) + kern_psignal(V_initproc, SIGUSR2); + else if (V_vhowto & RB_POWERCYCLE) + kern_psignal(V_initproc, SIGWINCH); + else if (V_vhowto & RB_HALT) + kern_psignal(V_initproc, SIGUSR1); else - kern_psignal(initproc, SIGINT); - PROC_UNLOCK(initproc); + kern_psignal(V_initproc, SIGINT); + PROC_UNLOCK(V_initproc); + V_vhowto = 0; + CURVPS_RESTORE(); } static struct task shutdown_nice_task = TASK_INITIALIZER(0, @@ -311,10 +328,22 @@ shutdown_nice(int howto) { - if (initproc != NULL && !SCHEDULER_STOPPED()) { - shutdown_nice_task.ta_context = (void *)(uintptr_t)howto; + if (V_initproc != NULL && !SCHEDULER_STOPPED()) { + + KASSERT(V_vhowto == 0, ("%s: vps %p howto not 0: %d\n", + __func__, curvps, V_vhowto)); + V_vhowto = howto; + shutdown_nice_task.ta_context = (void *)curvps; taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task); } else { +#ifdef VIMAGE + /* XXX Can argue that we should never make it here. */ + /* Init will want to _exit() in this case. */ + if (!IS_DEFAULT_VPS(TD_TO_VPS(curthread))) { + V_vrebooting = 1; + return; + } +#endif /* * No init(8) running, or scheduler would not allow it * to run, so simply reboot. @@ -462,7 +491,7 @@ struct mount *mp, *devmp; int error; - if (curproc != initproc) + if (curproc != V_initproc) return (EPERM); /* Index: sys/kern/kern_sig.c =================================================================== --- sys/kern/kern_sig.c +++ sys/kern/kern_sig.c @@ -1669,9 +1669,9 @@ ret = ESRCH; if (all) { /* - * broadcast + * broadcast; current vps context only. 
*/ - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { @@ -1688,9 +1688,9 @@ ret = err; PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); } else { - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. @@ -1700,11 +1700,11 @@ } else { pgrp = pgfind(pgid); if (pgrp == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (ESRCH); } } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || @@ -1891,9 +1891,9 @@ struct pgrp *pgrp; if (pgid != 0) { - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); pgrp = pgfind(pgid); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); @@ -3279,7 +3279,7 @@ /* * Protect the access to corefilename[] by allproc_lock. 
*/ -#define corefilename_lock allproc_lock +#define corefilename_lock V_allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); Index: sys/kern/kern_sysctl.c =================================================================== --- sys/kern/kern_sysctl.c +++ sys/kern/kern_sysctl.c @@ -60,6 +60,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -498,6 +499,7 @@ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && #ifdef VIMAGE (oidp->oid_kind & CTLFLAG_VNET) == 0 && + (oidp->oid_kind & CTLFLAG_VPS) == 0 && #endif (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { @@ -1998,6 +2000,9 @@ else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; + else if ((oid->oid_kind & CTLFLAG_VPS) && + prison_owns_vps(req->td->td_ucred)) + priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; @@ -2025,8 +2030,13 @@ goto out; #endif #ifdef VIMAGE + KASSERT(((oid->oid_kind & (CTLFLAG_VNET|CTLFLAG_VPS)) != + (CTLFLAG_VNET|CTLFLAG_VPS)), + ("CTLFLAG VNET and VPS set oid %p", oid)); if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); + if ((oid->oid_kind & CTLFLAG_VPS) && arg1 != NULL) + arg1 = (void *)(curvps->vps_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); @@ -2118,6 +2128,7 @@ memlocked = 1; sx_xlock(&sysctlmemlock); } + CURVPS_SET(TD_TO_VPS(td)); CURVNET_SET(TD_TO_VNET(td)); for (;;) { @@ -2130,6 +2141,7 @@ } CURVNET_RESTORE(); + CURVPS_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); Index: sys/kern/kern_thr.c =================================================================== --- sys/kern/kern_thr.c +++ sys/kern/kern_thr.c @@ -32,6 +32,7 @@ #include "opt_posix.h" #include "opt_hwpmc_hooks.h" #include +#include #include #include 
#include @@ -56,6 +57,7 @@ #include #include #include +#include #ifdef HWPMC_HOOKS #include #endif @@ -238,6 +240,9 @@ bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; +#ifdef VIMAGE + newtd->td_vps = TD_TO_VPS(td); +#endif newtd->td_rb_list = newtd->td_rbp_list = newtd->td_rb_inact = 0; thread_cow_get(newtd, td); Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -58,6 +58,9 @@ #ifdef HWPMC_HOOKS #include #endif +#ifdef VIMAGE +#include +#endif #include @@ -83,7 +86,7 @@ "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x470, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x518, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x528, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); @@ -103,7 +106,7 @@ "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x2e8, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x334, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x33c, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); @@ -451,6 +454,10 @@ PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); +#ifdef VIMAGE + /* Make sure the cached vps stays correct. */ + newtd->td_vps = p->p_ucred->cr_prison->pr_vps; +#endif newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } @@ -460,6 +467,10 @@ { newtd->td_ucred = crhold(td->td_ucred); +#ifdef VIMAGE + /* Make sure to inherit the cached vps as well. 
*/ + newtd->td_vps = td->td_vps; +#endif newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } @@ -489,6 +500,11 @@ oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } +#ifdef VIMAGE + /* Make sure the cached vps stays correct. */ + if (td->td_vps != p->p_ucred->cr_prison->pr_vps) + td->td_vps = p->p_ucred->cr_prison->pr_vps; +#endif if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); Index: sys/kern/kern_vps.c =================================================================== --- /dev/null +++ sys/kern/kern_vps.c @@ -0,0 +1,835 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2004-2009 University of Zagreb + * Copyright (c) 2006-2009 FreeBSD Foundation + * Copyright (c) 2018 iXsystems, Inc. + * All rights reserved. + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Portions of this software were developed by Bjoern Zeeb + * under sponsorship from iXsystems, Inc. + * + * Copyright (c) 2009 Jeffrey Roberson + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_kdb.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef DDB +#include +#include +#endif + + +/*- + * This file implements core functions for virtual process spaces: + * + * - Virtual process space management functions. + * + * - Virtual process space memory allocator, which virtualizes global + * variables in the process space. + * + * - Virtualized SYSINIT's/SYSUNINIT's, which allow process spaces + * to register startup/shutdown events to be run for each virtual process + * space instance. + */ + +static MALLOC_DEFINE(M_VPS, "vps", "process space control block"); + +/* + * The virtual process space list has two read-write locks, one sleepable and + * the other not, so that the list can be stablized and walked in a variety + * of process space contexts. Both must be acquired exclusively to modify + * the list, but a read lock of either lock is sufficient to walk the list. 
+ */ +struct rwlock vps_rwlock; +struct sx vps_sxlock; + +#define VPS_LIST_WLOCK() do { \ + sx_xlock(&vps_sxlock); \ + rw_wlock(&vps_rwlock); \ +} while (0) + +#define VPS_LIST_WUNLOCK() do { \ + rw_wunlock(&vps_rwlock); \ + sx_xunlock(&vps_sxlock); \ +} while (0) + +struct vps_list_head vps_head; +struct vps *vps0; + +/* + * The virtual process space allocator provides storage for virtualized + * global variables. These variables are defined/declared using the + * VPS_DEFINE()/VPS_DECLARE() macros, which place them in the 'set_vps' + * linker set. The details of the implementation are somewhat subtle, but + * allow the majority of process subsystems to remain + * virtualization-agnostic. + * + * The virtual process space allocator handles variables in the base kernel + * vs. modules in similar but different ways. In both cases, virtualized + * global variables are marked as such by being declared to be part of the + * vps linker set. These "primary" copies of global variables serve two + * functions: + * + * (1) They contain static initialization or "default" values for global + * variables which will be propagated to each virtual process space + * instance when created. As with normal global variables, they default + * to zero-filled. + * + * (2) They act as unique global names by which the variable can be referred + * to, regardless of process space instance. The single global symbol + * will be used to calculate the location of a per-virtual instance + * variable at run-time. + * + * Each virtual process space instance has a complete copy of each + * virtualized global variable, stored in a malloc'd block of memory + * referred to by vps->vps_data_mem. Critical to the design is that each + * per-instance memory block is laid out identically to the primary block so + * that the offset of each global variable is the same across all blocks. 
+ * To optimize run-time access, a precalculated 'base' address, + * vps->vps_data_base, is stored in each vps, and is the amount that can + * be added to the address of a 'primary' instance of a variable to get to the + * per-vps instance. + * + * Virtualized global variables are handled in a similar manner, but as each + * module has its own 'set_vps' linker set, and we want to keep all + * virtualized globals together, we reserve space in the kernel's linker set + * for potential module variables using a per-vps character array, + * 'modspace'. The virtual process space allocator maintains a free list to + * track what space in the array is free (all, initially) and as modules are + * linked, allocates portions of the space to specific globals. The kernel + * module linker queries the virtual process space allocator and will + * bind references of the global to the location during linking. It also + * calls into the virtual process space allocator, once the memory is + * initialized, in order to propagate the new static initializations to all + * existing virtual process space instances so that the soon-to-be executing + * module will find every process space instance with proper default values. + */ + +/* + * Number of bytes of data in the 'set_vps' linker set, and hence the total + * size of all kernel virtualized global variables, and the malloc(9) type + * that will be used to allocate it. + */ +#define VPS_BYTES (VPS_STOP - VPS_START) + +static MALLOC_DEFINE(M_VPS_DATA, "vps_data", "VPS data"); + +/* + * VPS_MODMIN is the minimum number of bytes we will reserve for the sum of + * global variables across all loaded modules. As this actually sizes an + * array declared as a virtualized global variable in the kernel itself, and + * we want the virtualized global variable space to be page-sized, we may + * have more space than that in practice. 
+ */ +#define VPS_MODMIN 8192 +#define VPS_SIZE roundup2(VPS_BYTES, PAGE_SIZE) + +/* + * Space to store virtualized global variables from loadable kernel modules, + * and the free list to manage it. + */ +static VPS_DEFINE(char, modspace[VPS_MODMIN]); + +/* + * Global lists of subsystem constructor and destructors for vpss. They are + * registered via VPS_SYSINIT() and VPS_SYSUNINIT(). Both lists are + * protected by the vps_sysinit_sxlock global lock. + */ +static TAILQ_HEAD(vps_sysinit_head, vps_sysinit) vps_constructors = + TAILQ_HEAD_INITIALIZER(vps_constructors); +static TAILQ_HEAD(vps_sysuninit_head, vps_sysinit) vps_destructors = + TAILQ_HEAD_INITIALIZER(vps_destructors); + +struct sx vps_sysinit_sxlock; + +#define VPS_SYSINIT_WLOCK() sx_xlock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_WUNLOCK() sx_xunlock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_RLOCK() sx_slock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_RUNLOCK() sx_sunlock(&vps_sysinit_sxlock); + +/* XXX-BZ should probably be vpd_* instead of vnd_* but in the hope to + * harmonize most of this later on keep the names the same for now. */ +struct vps_data_free { + uintptr_t vnd_start; + int vnd_len; + TAILQ_ENTRY(vps_data_free) vnd_link; +}; + +static MALLOC_DEFINE(M_VPS_DATA_FREE, "vps_data_free", + "VPS resource accounting"); +static TAILQ_HEAD(, vps_data_free) vps_data_free_head = + TAILQ_HEAD_INITIALIZER(vps_data_free_head); +static struct sx vps_data_free_lock; + +SDT_PROVIDER_DEFINE(vps); +SDT_PROBE_DEFINE1(vps, functions, vps_alloc, entry, "int"); +SDT_PROBE_DEFINE2(vps, functions, vps_alloc, alloc, "int", "struct vps *"); +SDT_PROBE_DEFINE2(vps, functions, vps_alloc, return, "int", "struct vps *"); +SDT_PROBE_DEFINE2(vps, functions, vps_destroy, entry, "int", "struct vps *"); +SDT_PROBE_DEFINE1(vps, functions, vps_destroy, return, "int"); + +#ifdef DDB +static void db_show_vps_print_vs(struct vps_sysinit *, int); +#endif + +/* + * Allocate a virtual process space. 
+ */ +struct vps * +vps_alloc(struct prison *pr) +{ + struct vps *vps; + + SDT_PROBE1(vps, functions, vps_alloc, entry, __LINE__); + vps = malloc(sizeof(struct vps), M_VPS, M_WAITOK | M_ZERO); + vps->vps_magic_n = VPS_MAGIC_N; + vps->vps_state = 0; + vps->vps_pr = pr; + /* Cheat for vps_sysinit() to get creds right. */ + pr->pr_vps = vps; + SDT_PROBE2(vps, functions, vps_alloc, alloc, __LINE__, vps); + + /* + * Allocate storage for virtualized global variables and copy in + * initial values from our 'primary' copy. + */ + vps->vps_data_mem = malloc(VPS_SIZE, M_VPS_DATA, M_WAITOK); + memcpy(vps->vps_data_mem, (void *)VPS_START, VPS_BYTES); + + /* + * All use of vps-specific data will immediately subtract VPS_START + * from the base memory pointer, so pre-calculate that now to avoid + * it on each use. + */ + vps->vps_data_base = (uintptr_t)vps->vps_data_mem - VPS_START; + + /* Initialize / attach vps module instances. */ + CURVPS_SET_QUIET(vps); + vps_sysinit(); + CURVPS_RESTORE(); + + VPS_LIST_WLOCK(); + LIST_INSERT_HEAD(&vps_head, vps, vps_le); + VPS_LIST_WUNLOCK(); + + SDT_PROBE2(vps, functions, vps_alloc, return, __LINE__, vps); + return (vps); +} + +/* + * Destroy a virtual process space. + */ +void +vps_destroy(struct vps *vps) +{ + + SDT_PROBE2(vps, functions, vps_destroy, entry, __LINE__, vps); + + VPS_LIST_WLOCK(); + if (vps->vps_le.le_prev == NULL && vps->vps_le.le_next == NULL) { + VPS_LIST_WUNLOCK(); + DELAY(10000); + return; + } + LIST_REMOVE(vps, vps_le); + vps->vps_le.le_prev = NULL; + vps->vps_le.le_next = NULL; + VPS_LIST_WUNLOCK(); + + CURVPS_SET_QUIET(vps); + vps_sysuninit(); + CURVPS_RESTORE(); + + /* + * Release storage for the virtual process space instance. 
+ */ + free(vps->vps_data_mem, M_VPS_DATA); + vps->vps_data_mem = NULL; + vps->vps_data_base = 0; + vps->vps_pr->pr_vps = NULL; + vps->vps_pr = NULL; + vps->vps_magic_n = 0xdeadbeef; + free(vps, M_VPS); + SDT_PROBE1(vps, functions, vps_destroy, return, __LINE__); +} + +/* + * Boot time initialization and allocation of virtual process space. + */ +static void +vps_init_prelink(void *arg __unused) +{ + + rw_init(&vps_rwlock, "vps_rwlock"); + sx_init(&vps_sxlock, "vps_sxlock"); + sx_init(&vps_sysinit_sxlock, "vps_sysinit_sxlock"); + LIST_INIT(&vps_head); +} +SYSINIT(vps_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST, + vps_init_prelink, NULL); + +static void +vps0_init(void *arg __unused) +{ + + if (bootverbose) + printf("VIMAGE (virtualized process space) enabled\n"); + + /* + * We MUST clear curvps in vi_init_done() before going SMP, + * otherwise CURVPS_SET() macros would scream about unnecessary + * curvps recursions. + */ + curvps = prison0.pr_vps = vps0 = vps_alloc(&prison0); +} +SYSINIT(vps0_init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, vps0_init, NULL); + +#if 0 +/* Compared to vnets, nuking the vps of the current thread does not go down well. */ +static void +vps_init_done(void *unused __unused) +{ + + curvps = NULL; +} +SYSINIT(vps_init_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, vps_init_done, NULL); +#endif + +/* + * Once on boot, initialize the modspace freelist to entirely cover modspace. + */ +static void +vps_data_startup(void *dummy __unused) +{ + struct vps_data_free *df; + + df = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO); + df->vnd_start = (uintptr_t)&VPS_NAME(modspace); + df->vnd_len = VPS_MODMIN; + TAILQ_INSERT_HEAD(&vps_data_free_head, df, vnd_link); + sx_init(&vps_data_free_lock, "vps_data alloc lock"); +} +SYSINIT(vps_data, SI_SUB_KLD, SI_ORDER_FIRST, vps_data_startup, 0); + +/* Dummy VPS_SYSINIT to make sure we always reach the final end state. 
*/ +static void +vps_sysinit_done(void *unused __unused) +{ + + return; +} +VPS_SYSINIT(vps_sysinit_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, + vps_sysinit_done, NULL); + +/* + * When a module is loaded and requires storage for a virtualized global + * variable, allocate space from the modspace free list. This interface + * should be used only by the kernel linker. + */ +void * +vps_data_alloc(int size) +{ + struct vps_data_free *df; + void *s; + + s = NULL; + size = roundup2(size, sizeof(void *)); + sx_xlock(&vps_data_free_lock); + TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) { + if (df->vnd_len < size) + continue; + if (df->vnd_len == size) { + s = (void *)df->vnd_start; + TAILQ_REMOVE(&vps_data_free_head, df, vnd_link); + free(df, M_VPS_DATA_FREE); + break; + } + s = (void *)df->vnd_start; + df->vnd_len -= size; + df->vnd_start = df->vnd_start + size; + break; + } + sx_xunlock(&vps_data_free_lock); + + return (s); +} + +/* + * Free space for a virtualized global variable on module unload. + */ +void +vps_data_free(void *start_arg, int size) +{ + struct vps_data_free *df; + struct vps_data_free *dn; + uintptr_t start; + uintptr_t end; + + size = roundup2(size, sizeof(void *)); + start = (uintptr_t)start_arg; + end = start + size; + /* + * Free a region of space and merge it with as many neighbors as + * possible. Keeping the list sorted simplifies this operation. + */ + sx_xlock(&vps_data_free_lock); + TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) { + if (df->vnd_start > end) + break; + /* + * If we expand at the end of an entry we may have to merge + * it with the one following it as well. 
+ */ + if (df->vnd_start + df->vnd_len == start) { + df->vnd_len += size; + dn = TAILQ_NEXT(df, vnd_link); + if (dn != NULL && df->vnd_start + df->vnd_len == dn->vnd_start) { + df->vnd_len += dn->vnd_len; + TAILQ_REMOVE(&vps_data_free_head, dn, + vnd_link); + free(dn, M_VPS_DATA_FREE); + } + sx_xunlock(&vps_data_free_lock); + return; + } + if (df->vnd_start == end) { + df->vnd_start = start; + df->vnd_len += size; + sx_xunlock(&vps_data_free_lock); + return; + } + } + dn = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO); + dn->vnd_start = start; + dn->vnd_len = size; + if (df) + TAILQ_INSERT_BEFORE(df, dn, vnd_link); + else + TAILQ_INSERT_TAIL(&vps_data_free_head, dn, vnd_link); + sx_xunlock(&vps_data_free_lock); +} + +/* + * When a new virtualized global variable has been allocated, propagate its + * initial value to each already-allocated virtual process space instance. + */ +void +vps_data_copy(void *start, int size) +{ + struct vps *vps; + + VPS_LIST_RLOCK(); + LIST_FOREACH(vps, &vps_head, vps_le) + memcpy((void *)((uintptr_t)vps->vps_data_base + + (uintptr_t)start), start, size); + VPS_LIST_RUNLOCK(); +} + +/* + * Support for special SYSINIT handlers registered via VPS_SYSINIT() + * and VPS_SYSUNINIT(). + */ +void +vps_register_sysinit(void *arg) +{ + struct vps_sysinit *vs, *vs2; + struct vps *vps; + + vs = arg; + KASSERT(vs->subsystem >= SI_SUB_INTRINSIC, ("vps sysinit too early")); + + /* Add the constructor to the global list of vps constructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_FOREACH(vs2, &vps_constructors, link) { + if (vs2->subsystem > vs->subsystem) + break; + if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) + break; + } + if (vs2 != NULL) + TAILQ_INSERT_BEFORE(vs2, vs, link); + else + TAILQ_INSERT_TAIL(&vps_constructors, vs, link); + + /* + * Invoke the constructor on all the existing vpss when it is + * registered.
+ */ + VPS_FOREACH(vps) { + CURVPS_SET_QUIET(vps); + vs->func(vs->arg); + CURVPS_RESTORE(); + } + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_deregister_sysinit(void *arg) +{ + struct vps_sysinit *vs; + + vs = arg; + + /* Remove the constructor from the global list of vps constructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_REMOVE(&vps_constructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_register_sysuninit(void *arg) +{ + struct vps_sysinit *vs, *vs2; + + vs = arg; + + /* Add the destructor to the global list of vps destructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_FOREACH(vs2, &vps_destructors, link) { + if (vs2->subsystem > vs->subsystem) + break; + if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) + break; + } + if (vs2 != NULL) + TAILQ_INSERT_BEFORE(vs2, vs, link); + else + TAILQ_INSERT_TAIL(&vps_destructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_deregister_sysuninit(void *arg) +{ + struct vps_sysinit *vs; + struct vps *vps; + + vs = arg; + + /* + * Invoke the destructor on all the existing vpss when it is + * deregistered. + */ + VPS_SYSINIT_WLOCK(); + VPS_FOREACH(vps) { + CURVPS_SET_QUIET(vps); + vs->func(vs->arg); + CURVPS_RESTORE(); + } + + /* Remove the destructor from the global list of vps destructors. */ + TAILQ_REMOVE(&vps_destructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +/* + * Invoke all registered vps constructors on the current vps. Used during + * vps construction. The caller is responsible for ensuring the new vps is + * the current vps and that the vps_sysinit_sxlock lock is locked. 
+ */ +void +vps_sysinit(void) +{ + struct vps_sysinit *vs; + struct vps *vps; + + vps = curvps; + VPS_SYSINIT_RLOCK(); + TAILQ_FOREACH(vs, &vps_constructors, link) { + curvps->vps_state = vs->subsystem; + vs->func(vs->arg); + KASSERT((curvps == vps), + ("%s: vs %p subsystem %u order %u func %p returned " + "with curvps altered: curvps %p should be %p\n", + __func__, vs, vs->subsystem, vs->order, vs->func, + curvps, vps)); + } + VPS_SYSINIT_RUNLOCK(); +} + +/* + * Invoke all registered vps destructors on the current vps. Used during + * vps destruction. The caller is responsible for ensuring the dying vps + * is the current vps and that the vps_sysinit_sxlock lock is locked. + */ +void +vps_sysuninit(void) +{ + struct vps_sysinit *vs; + + VPS_SYSINIT_RLOCK(); + TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head, + link) { + curvps->vps_state = vs->subsystem; + vs->func(vs->arg); + } + VPS_SYSINIT_RUNLOCK(); +} + +/* + * EVENTHANDLER(9) extensions. + */ +/* + * Invoke the eventhandler function originally registered with the possibly + * registered argument for all virtual process space instances. + * + * This iterator can only be used for eventhandlers that do not take any + * additional arguments, as we do ignore the variadic arguments from the + * EVENTHANDLER_INVOKE() call. + */ +void +vps_global_eventhandler_iterator_func(void *arg, ...) +{ + VPS_ITERATOR_DECL(vps_iter); + struct eventhandler_entry_vimage *v_ee; + + /* + * There is a bug here in that we should actually cast things to + * (struct eventhandler_entry_ ## name *) but that's not easily + * possible in here so just re-using the variadic version we + * defined for the generic vimage case.
+ */ + v_ee = arg; + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg); + CURVPS_RESTORE(); + } + VPS_LIST_RUNLOCK(); +} + +#ifdef VPS_DEBUG +struct vps_recursion { + SLIST_ENTRY(vps_recursion) vnr_le; + const char *prev_fn; + const char *where_fn; + int where_line; + struct vps *old_vps; + struct vps *new_vps; +}; + +static SLIST_HEAD(, vps_recursion) vps_recursions = + SLIST_HEAD_INITIALIZER(vps_recursions); + +static void +vps_print_recursion(struct vps_recursion *vnr, int brief) +{ + + if (!brief) + printf("CURVPS_SET() recursion in "); + printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line, + vnr->prev_fn); + if (brief) + printf(", "); + else + printf("\n "); + printf("%p -> %p\n", vnr->old_vps, vnr->new_vps); +} + +void +vps_log_recursion(struct vps *old_vps, const char *old_fn, int line) +{ + struct vps_recursion *vnr; + + /* Skip already logged recursion events. */ + SLIST_FOREACH(vnr, &vps_recursions, vnr_le) + if (vnr->prev_fn == old_fn && + vnr->where_fn == curthread->td_vps_lpush && + vnr->where_line == line && + (vnr->old_vps == vnr->new_vps) == (curvps == old_vps)) + return; + + vnr = malloc(sizeof(*vnr), M_VPS, M_NOWAIT | M_ZERO); + if (vnr == NULL) + panic("%s: malloc failed", __func__); + vnr->prev_fn = old_fn; + vnr->where_fn = curthread->td_vps_lpush; + vnr->where_line = line; + vnr->old_vps = old_vps; + vnr->new_vps = curvps; + + SLIST_INSERT_HEAD(&vps_recursions, vnr, vnr_le); + + vps_print_recursion(vnr, 0); +#ifdef KDB + kdb_backtrace(); +#endif +} +#endif /* VPS_DEBUG */ + +/* + * DDB(4). + */ +#ifdef DDB +static void +db_vps_print(struct vps *vps) +{ + + db_printf("vps = %p\n", vps); + db_printf(" vps_magic_n = %#08x (%s, orig %#08x)\n", + vps->vps_magic_n, + (vps->vps_magic_n == VPS_MAGIC_N) ? 
+ "ok" : "mismatch", VPS_MAGIC_N); + db_printf(" vps_data_mem = %p\n", vps->vps_data_mem); + db_printf(" vps_data_base = %#jx\n", + (uintmax_t)vps->vps_data_base); + db_printf(" vps_state = %#08x\n", vps->vps_state); + db_printf("\n"); +} + +DB_SHOW_ALL_COMMAND(vpss, db_show_all_vpss) +{ + VPS_ITERATOR_DECL(vps_iter); + + VPS_FOREACH(vps_iter) { + db_vps_print(vps_iter); + if (db_pager_quit) + break; + } +} + +DB_SHOW_COMMAND(vps, db_show_vps) +{ + + if (!have_addr) { + db_printf("usage: show vps \n"); + return; + } + + db_vps_print((struct vps *)addr); +} + +static void +db_show_vps_print_vs(struct vps_sysinit *vs, int ddb) +{ + const char *vsname, *funcname; + c_db_sym_t sym; + db_expr_t offset; + +#define xprint(...) \ + if (ddb) \ + db_printf(__VA_ARGS__); \ + else \ + printf(__VA_ARGS__) + + if (vs == NULL) { + xprint("%s: no vps_sysinit * given\n", __func__); + return; + } + + sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset); + db_symbol_values(sym, &vsname, NULL); + sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset); + db_symbol_values(sym, &funcname, NULL); + xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs); + xprint(" %#08x %#08x\n", vs->subsystem, vs->order); + xprint(" %p(%s)(%p)\n", + vs->func, (funcname != NULL) ? 
funcname : "", vs->arg); +#undef xprint +} + +DB_SHOW_COMMAND(vps_sysinit, db_show_vps_sysinit) +{ + struct vps_sysinit *vs; + + db_printf("VPS_SYSINIT vs Name(Ptr)\n"); + db_printf(" Subsystem Order\n"); + db_printf(" Function(Name)(Arg)\n"); + TAILQ_FOREACH(vs, &vps_constructors, link) { + db_show_vps_print_vs(vs, 1); + if (db_pager_quit) + break; + } +} + +DB_SHOW_COMMAND(vps_sysuninit, db_show_vps_sysuninit) +{ + struct vps_sysinit *vs; + + db_printf("VPS_SYSUNINIT vs Name(Ptr)\n"); + db_printf(" Subsystem Order\n"); + db_printf(" Function(Name)(Arg)\n"); + TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head, + link) { + db_show_vps_print_vs(vs, 1); + if (db_pager_quit) + break; + } +} + +DB_COMMAND(setcurvps, db_setcurvps) +{ + struct vps *vps; + + if (!have_addr) { + db_printf("usage: setcurvps \n"); + return; + } + + vps = (struct vps *)addr; + db_printf("curvps %p -> %p\n", curvps, vps); + curvps = vps; + db_vps_print(vps); +} + +#ifdef VPS_DEBUG +DB_SHOW_COMMAND(vpsrcrs, db_show_vpsrcrs) +{ + struct vps_recursion *vnr; + + SLIST_FOREACH(vnr, &vps_recursions, vnr_le) + vps_print_recursion(vnr, 1); +} +#endif +#endif /* DDB */ Index: sys/kern/sched_4bsd.c =================================================================== --- sys/kern/sched_4bsd.c +++ sys/kern/sched_4bsd.c @@ -454,17 +454,15 @@ * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. 
*/ -/* ARGSUSED */ -static void -schedcpu(void) +static __inline void +_schedcpu(fixpt_t loadfac) { - fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct td_sched *ts; int awake; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -550,7 +548,22 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); +} + +static void +schedcpu(void) +{ + VPS_ITERATOR_DECL(vps_iter); + fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + _schedcpu(loadfac); + CURVPS_RESTORE(); + } + VPS_LIST_RUNLOCK(); } /* Index: sys/kern/subr_pcpu.c =================================================================== --- sys/kern/subr_pcpu.c +++ sys/kern/subr_pcpu.c @@ -378,6 +378,7 @@ #ifdef VIMAGE db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); + db_printf("curvps = %p\n", pc->pc_curthread->td_vps); #endif #ifdef WITNESS Index: sys/kern/subr_prf.c =================================================================== --- sys/kern/subr_prf.c +++ sys/kern/subr_prf.c @@ -165,12 +165,12 @@ if (TD_IS_IDLETHREAD(td)) return (0); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); p = td->td_proc; PROC_LOCK(p); if ((p->p_flag & P_CONTROLT) == 0) { PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (0); } SESS_LOCK(p->p_session); @@ -178,14 +178,14 @@ SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (pca.tty == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (0); } pca.flags = TOTTY; pca.p_bufr = NULL; va_start(ap, fmt); tty_lock(pca.tty); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); retval = kvprintf(fmt, putchar, &pca, 10, ap); tty_unlock(pca.tty); va_end(ap); @@ -214,7 +214,7 @@ struct putchar_arg pca; struct session *sess = NULL; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pri != 
-1) flags |= TOLOG; if (p != NULL) { @@ -237,7 +237,7 @@ pca.p_bufr = NULL; if (pca.tty != NULL) tty_lock(pca.tty); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); kvprintf(fmt, putchar, &pca, 10, ap); if (pca.tty != NULL) tty_unlock(pca.tty); Index: sys/kern/subr_turnstile.c =================================================================== --- sys/kern/subr_turnstile.c +++ sys/kern/subr_turnstile.c @@ -1212,22 +1212,32 @@ DB_SHOW_ALL_COMMAND(chains, db_show_allchains) { + VPS_ITERATOR_DECL(vps_iter); struct thread *td; struct proc *p; int i; i = 1; - FOREACH_PROC_IN_SYSTEM(p) { - FOREACH_THREAD_IN_PROC(p, td) { - if ((TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) - || (TD_IS_INHIBITED(td) && TD_ON_SLEEPQ(td))) { - db_printf("chain %d:\n", i++); - print_lockchain(td, " "); + + /* VPS_LIST_RLOCK(); */ + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + FOREACH_PROC_IN_SYSTEM(p) { + FOREACH_THREAD_IN_PROC(p, td) { + if ((TD_ON_LOCK(td) && + LIST_EMPTY(&td->td_contested)) + || (TD_IS_INHIBITED(td) && + TD_ON_SLEEPQ(td))) { + db_printf("chain %d:\n", i++); + print_lockchain(td, " "); + } + if (db_pager_quit) + return; } - if (db_pager_quit) - return; } + CURVPS_RESTORE(); } + /* VPS_LIST_RUNLOCK(); */ } DB_SHOW_ALIAS(allchains, db_show_allchains) Index: sys/kern/subr_witness.c =================================================================== --- sys/kern/subr_witness.c +++ sys/kern/subr_witness.c @@ -2534,6 +2534,7 @@ DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { + VPS_ITERATOR_DECL(vps_iter); struct thread *td; struct proc *p; @@ -2542,19 +2543,25 @@ * held sleep locks, but that information is currently not exported * by WITNESS. 
*/ - FOREACH_PROC_IN_SYSTEM(p) { - if (!witness_proc_has_locks(p)) - continue; - FOREACH_THREAD_IN_PROC(p, td) { - if (!witness_thread_has_locks(td)) + /* VPS_LIST_RLOCK(); */ + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + FOREACH_PROC_IN_SYSTEM(p) { + if (!witness_proc_has_locks(p)) continue; - db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, - p->p_comm, td, td->td_tid); - witness_ddb_list(td); - if (db_pager_quit) - return; + FOREACH_THREAD_IN_PROC(p, td) { + if (!witness_thread_has_locks(td)) + continue; + db_printf("Process %d (%s) thread %p (%d)\n", + p->p_pid, p->p_comm, td, td->td_tid); + witness_ddb_list(td); + if (db_pager_quit) + return; + } } + CURVPS_RESTORE(); } + /* VPS_LIST_RUNLOCK(); */ } DB_SHOW_ALIAS(alllocks, db_witness_list_all) Index: sys/kern/sys_procdesc.c =================================================================== --- sys/kern/sys_procdesc.c +++ sys/kern/sys_procdesc.c @@ -153,13 +153,13 @@ goto out; } pd = fp->f_data; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pd->pd_proc != NULL) { *p = pd->pd_proc; PROC_LOCK(*p); } else error = ESRCH; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); out: fdrop(fp, td); return (error); @@ -305,14 +305,14 @@ { struct procdesc *pd; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); pd = p->p_procdesc; PROCDESC_LOCK(pd); - KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc, + KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == V_initproc, ("procdesc_exit: closed && parent not init")); pd->pd_flags |= PDF_EXITED; @@ -349,7 +349,7 @@ { struct procdesc *pd; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); pd = p->p_procdesc; @@ -375,7 +375,7 @@ fp->f_ops = &badfileops; fp->f_data = NULL; - 
sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PROCDESC_LOCK(pd); pd->pd_flags |= PDF_CLOSED; PROCDESC_UNLOCK(pd); @@ -385,7 +385,7 @@ * This is the case where process' exit status was already * collected and procdesc_reap() was already called. */ - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } else { PROC_LOCK(p); AUDIT_ARG_PROCESS(p); @@ -415,11 +415,11 @@ * prejudice. */ p->p_sigparent = SIGCHLD; - proc_reparent(p, initproc); + proc_reparent(p, V_initproc); if ((pd->pd_flags & PDF_DAEMON) == 0) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } } @@ -531,7 +531,7 @@ */ bzero(sb, sizeof(*sb)); pd = fp->f_data; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pd->pd_proc != NULL) { PROC_LOCK(pd->pd_proc); AUDIT_ARG_PROCESS(pd->pd_proc); @@ -553,7 +553,7 @@ PROC_UNLOCK(pd->pd_proc); } else sb->st_mode = S_IFREG; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (0); } Index: sys/kern/sys_process.c =================================================================== --- sys/kern/sys_process.c +++ sys/kern/sys_process.c @@ -688,7 +688,7 @@ proc_set_traced(struct proc *p, bool stop) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_TRACED; if (stop) @@ -733,7 +733,7 @@ case PT_SET_EVENT_MASK: case PT_DETACH: case PT_GET_SC_ARGS: - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); proctree_locked = 1; break; default: @@ -747,14 +747,14 @@ if (pid <= PID_MAX) { if ((p = pfind(pid)) == NULL) { if (proctree_locked) - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (ESRCH); } } else { td2 = tdfind(pid, -1); if (td2 == NULL) { if (proctree_locked) - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (ESRCH); } p = td2->td_proc; @@ -816,7 +816,7 @@ error = EBUSY; goto fail; } - if (p->p_pptr == initproc) { + if (p->p_pptr == V_initproc) { error 
= EPERM; goto fail; } @@ -923,7 +923,7 @@ CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid, p->p_oppid); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); proctree_locked = 0; MPASS(p->p_xthread == NULL); MPASS((p->p_flag & P_STOPPED_TRACE) == 0); @@ -1113,7 +1113,7 @@ pp = proc_realparent(p); proc_reparent(p, pp); - if (pp == initproc) + if (pp == V_initproc) p->p_sigparent = SIGCHLD; CTR3(KTR_PTRACE, "PT_DETACH: pid %d reparented to pid %d, sig %d", @@ -1142,7 +1142,7 @@ break; } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); proctree_locked = 0; sendsig: @@ -1456,7 +1456,7 @@ fail: PROC_UNLOCK(p); if (proctree_locked) - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (error); } #undef PROC_READ Index: sys/kern/tty.c =================================================================== --- sys/kern/tty.c +++ sys/kern/tty.c @@ -1703,18 +1703,18 @@ /* XXX: This looks awful. */ tty_unlock(tp); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); tty_lock(tp); if (!SESS_LEADER(p)) { /* Only the session leader may do this. */ - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (EPERM); } if (tp->t_session != NULL && tp->t_session == p->p_session) { /* This is already our controlling TTY. */ - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (0); } @@ -1732,7 +1732,7 @@ * TTYs of which the session leader has been * killed or the TTY revoked. */ - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (EPERM); } @@ -1740,7 +1740,7 @@ tp->t_session = p->p_session; tp->t_session->s_ttyp = tp; tp->t_sessioncnt++; - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); /* Assign foreground process group. */ tp->t_pgrp = p->p_pgrp; @@ -1759,12 +1759,12 @@ * decompose proctree_lock. 
*/ tty_unlock(tp); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); pg = pgfind(*(int *)data); if (pg != NULL) PGRP_UNLOCK(pg); if (pg == NULL || pg->pg_session != td->td_proc->p_session) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); tty_lock(tp); return (EPERM); } @@ -1775,11 +1775,11 @@ * relocking the TTY. */ if (!tty_is_ctty(tp, td->td_proc)) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (ENOTTY); } tp->t_pgrp = pg; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* Wake up the background process groups. */ cv_broadcast(&tp->t_bgwait); Index: sys/kern/tty_tty.c =================================================================== --- sys/kern/tty_tty.c +++ sys/kern/tty_tty.c @@ -68,7 +68,7 @@ return; p = curproc; sx_sunlock(&clone_drain_lock); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); sx_slock(&clone_drain_lock); dev_lock(); if (!(p->p_flag & P_CONTROLT)) @@ -83,7 +83,7 @@ *dev = p->p_session->s_ttyvp->v_rdev; dev_refl(*dev); dev_unlock(); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } static void Index: sys/net/vnet.c =================================================================== --- sys/net/vnet.c +++ sys/net/vnet.c @@ -80,8 +80,6 @@ * stack instance. 
*/ -FEATURE(vimage, "VIMAGE kernel virtualization"); - static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); /* @@ -307,7 +305,7 @@ sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock"); LIST_INIT(&vnet_head); } -SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, +SYSINIT(vnet_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST, vnet_init_prelink, NULL); static void Index: sys/sys/jail.h =================================================================== --- sys/sys/jail.h +++ sys/sys/jail.h @@ -166,6 +166,7 @@ struct osd pr_osd; /* (p) additional data */ struct cpuset *pr_cpuset; /* (p) cpuset */ struct vnet *pr_vnet; /* (c) network stack */ + struct vps *pr_vps; /* (c) process space */ struct vnode *pr_root; /* (c) vnode to rdir */ int pr_ip4s; /* (p) number of v4 IPs */ int pr_ip6s; /* (p) number of v6 IPs */ @@ -209,6 +210,7 @@ /* primary jail address. */ #define PR_IP6_SADDRSEL 0x00000100 /* Do IPv6 src addr sel. or use the */ /* primary jail address. */ +#define PR_VPS 0x00000200 /* Virtual process space */ /* Internal flag bits */ #define PR_IP4 0x02000000 /* IPv4 restricted or disabled */ @@ -370,6 +372,7 @@ int prison_allow(struct ucred *, unsigned); int prison_check(struct ucred *cred1, struct ucred *cred2); int prison_owns_vnet(struct ucred *); +int prison_owns_vps(struct ucred *); int prison_canseemount(struct ucred *cred, struct mount *mp); void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp); Index: sys/sys/kernel.h =================================================================== --- sys/sys/kernel.h +++ sys/sys/kernel.h @@ -102,7 +102,7 @@ SI_SUB_MTX_POOL_DYNAMIC = 0x1AC0000, /* dynamic mutex pool */ SI_SUB_LOCK = 0x1B00000, /* various locks */ SI_SUB_EVENTHANDLER = 0x1C00000, /* eventhandler init */ - SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */ + SI_SUB_VIMAGE_PRELINK = 0x1E00000, /* VIMAGE init before modules */ SI_SUB_KLD = 0x2000000, /* KLD and module setup */ 
SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/ SI_SUB_RACCT = 0x2110000, /* resource accounting */ @@ -159,7 +159,7 @@ SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */ SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/ SI_SUB_SYSCALLS = 0xd800000, /* register system calls */ - SI_SUB_VNET_DONE = 0xdc00000, /* vnet registration complete */ + SI_SUB_VNET_DONE = 0xdc00000, /* VNET registration complete */ SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/ SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/ SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/ @@ -170,6 +170,7 @@ SI_SUB_SMP = 0xf000000, /* start the APs*/ #endif SI_SUB_RACCTD = 0xf100000, /* start racctd*/ + SI_SUB_VIMAGE_DONE = 0xf800000, /* VIMAGE initialization done */ SI_SUB_LAST = 0xfffffff /* final initialization */ }; Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -68,6 +68,9 @@ #include #include #include +#ifdef _KERNEL +#include +#endif #include /* Machine-dependent proc substruct. */ #ifdef _KERNEL @@ -351,6 +354,8 @@ /* LP64 hole */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ + struct vps *td_vps; /* (k) Effective vps. */ + const char *td_vps_lpush; /* (k) Debugging vps push / pop. 
*/ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ @@ -809,7 +814,7 @@ #endif #define FOREACH_PROC_IN_SYSTEM(p) \ - LIST_FOREACH((p), &allproc, p_list) + LIST_FOREACH((p), &V_allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) @@ -939,38 +944,61 @@ #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) -#define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) -extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; -extern u_long pidhash; -#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) +LIST_HEAD(pidhashhead, proc); +VPS_DECLARE(struct pidhashhead *, pidhashtbl); +#define V_pidhashtbl VPS(pidhashtbl) +VPS_DECLARE(u_long, pidhash); +#define V_pidhash VPS(pidhash) +#define PIDHASH(pid) (&V_pidhashtbl[(pid) & V_pidhash]) + extern LIST_HEAD(tidhashhead, thread) *tidhashtbl; extern u_long tidhash; +#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern struct rwlock tidhash_lock; -#define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) -extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; -extern u_long pgrphash; +LIST_HEAD(pgrphashhead, pgrp); +VPS_DECLARE(struct pgrphashhead *, pgrphashtbl); +#define V_pgrphashtbl VPS(pgrphashtbl) +VPS_DECLARE(u_long, pgrphash); +#define V_pgrphash VPS(pgrphash) +#define PGRPHASH(pgid) (&V_pgrphashtbl[(pgid) & V_pgrphash]) -extern struct sx allproc_lock; +VPS_DECLARE(struct sx, allproc_lock); +#define V_allproc_lock VPS(allproc_lock) extern int allproc_gen; -extern struct sx proctree_lock; -extern struct mtx ppeers_lock; +VPS_DECLARE(struct sx, proctree_lock); +#define V_proctree_lock VPS(proctree_lock) +VPS_DECLARE(struct mtx, ppeers_lock); +#define V_ppeers_lock VPS(ppeers_lock) extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. 
*/ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ +VPS_DECLARE(struct proc *, vproc0); +#define V_vproc0 VPS(vproc0) +#ifdef VIMAGE +VPS_DECLARE(int, vpsdying); +#define V_vpsdying VPS(vpsdying) +#endif extern int hogticks; /* Limit on kernel cpu hogs. */ -extern int lastpid; -extern int nprocs, maxproc; /* Current and max number of procs. */ +VPS_DECLARE(int, lastpid); +#define V_lastpid VPS(lastpid) +VPS_DECLARE(int, nprocs); /* Current number of procs. */ +#define V_nprocs VPS(nprocs) +extern int maxproc; /* Max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); -extern struct proclist allproc; /* List of all processes. */ -extern struct proclist zombproc; /* List of zombie processes. */ -extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ +VPS_DECLARE(struct proclist, allproc); /* List of all processes. */ +#define V_allproc VPS(allproc) +VPS_DECLARE(struct proclist, zombproc); /* List of zombie processes. */ +#define V_zombproc VPS(zombproc) +VPS_DECLARE(struct proc *, initproc); /* Process slots for init. */ +#define V_initproc VPS(initproc) +extern struct proc *pageproc; /* Process slot for pager. 
*/ extern struct uma_zone *proc_zone; @@ -1021,6 +1049,7 @@ int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); +int fork_findpid(int); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry, @@ -1043,7 +1072,6 @@ int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); -void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); Index: sys/sys/resourcevar.h =================================================================== --- sys/sys/resourcevar.h +++ sys/sys/resourcevar.h @@ -154,7 +154,6 @@ struct uidinfo *uifind(uid_t uid); void uifree(struct uidinfo *uip); -void uihashinit(void); void uihold(struct uidinfo *uip); #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, Index: sys/sys/sysctl.h =================================================================== --- sys/sys/sysctl.h +++ sys/sys/sysctl.h @@ -104,6 +104,7 @@ #define CTLFLAG_CAPWR 0x00004000 /* Can be written in capability mode */ #define CTLFLAG_STATS 0x00002000 /* Statistics, not a tuneable */ #define CTLFLAG_NOFETCH 0x00001000 /* Don't fetch tunable from getenv() */ +#define CTLFLAG_VPS 0x00000800 /* Prisons with vps can fiddle */ #define CTLFLAG_CAPRW (CTLFLAG_CAPRD|CTLFLAG_CAPWR) /* Index: sys/sys/vps.h =================================================================== --- /dev/null +++ sys/sys/vps.h @@ -0,0 +1,381 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2006-2009 University of Zagreb + * Copyright (c) 2006-2009 FreeBSD Foundation + * Copyright (c) 2018 iXsystems, Inc. 
+ * All rights reserved. + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Portions of this software were developed by Bjoern Zeeb + * under sponsorship from iXsystems, Inc. + * + * Copyright (c) 2009 Jeffrey Roberson + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * This header file defines several sets of interfaces supporting virtualized + * process space: + * + * - Definition of 'struct vps' and functions and macros to allocate/free/ + * manipulate it. 
+ * + * - A virtual process stack memory allocator, which provides support for + * virtualized global variables via a special linker set, set_vps. + * + * - Virtualized sysinits/sysuninits, which allow constructors and + * destructors to be run for each process space as virtual + * instances are created and destroyed. + * + * If VIMAGE isn't compiled into the kernel, virtualized global variables + * compile to normal global variables, and virtualized sysinits to regular + * sysinits. + */ + +#ifndef _SYS_VPS_H_ +#define _SYS_VPS_H_ + +/* + * struct vps describes a virtualized process space, and is primarily a + * pointer to storage for virtualized global variables. Expose to userspace + * as required for libkvm. + */ +#if defined(_KERNEL) || defined(_WANT_VPS) +#include + +struct vps { + LIST_ENTRY(vps) vps_le; /* all vps list */ + u_int vps_magic_n; + u_int vps_state; /* SI_SUB_* */ + void *vps_data_mem; + uintptr_t vps_data_base; + struct prison *vps_pr; /* Put init on this if set. */ +}; +#define VPS_MAGIC_N 0x0f0307e2 + +/* + * These two virtual process space allocator definitions are also required + * for libkvm so that it can evaluate virtualized global variables. + */ +#define VPS_SETNAME "set_vps" +#define VPS_SYMPREFIX "vps_entry_" +#endif + +#ifdef _KERNEL +#ifdef VIMAGE +#include +#include /* for struct thread */ +#include +#include + +/* + * Location of the kernel's 'set_vps' linker set. + */ +extern uintptr_t *__start_set_vps; +__GLOBL(__start_set_vps); +extern uintptr_t *__stop_set_vps; +__GLOBL(__stop_set_vps); + +#define VPS_START (uintptr_t)&__start_set_vps +#define VPS_STOP (uintptr_t)&__stop_set_vps + +/* + * Functions to allocate and destroy virtual process spaces. + */ +struct vps *vps_alloc(struct prison *); +void vps_destroy(struct vps *); + +/* + * The current virtual process space -- we may wish to move this to struct + * pcpu in the future. 
+ */ +#define curvps curthread->td_vps + +/* + * Various macros -- get and set the current process space, but also + * assertions. + */ +#if defined(INVARIANTS) || defined(VPS_DEBUG) +#define VPS_ASSERT(exp, msg) do { \ + if (!(exp)) \ + panic msg; \ +} while (0) +#else +#define VPS_ASSERT(exp, msg) do { \ +} while (0) +#endif + +#ifdef VPS_DEBUG +void vps_log_recursion(struct vps *, const char *, int); + +#define CURVPS_SET_QUIET(arg) \ + VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \ + ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \ + __FILE__, __LINE__, __func__, curvps, (arg))); \ + struct vps *saved_vps = curvps; \ + const char *saved_vps_lpush = curthread->td_vps_lpush; \ + curvps = arg; \ + curthread->td_vps_lpush = __func__; + +#define CURVPS_SET_VERBOSE(arg) \ + CURVPS_SET_QUIET(arg) \ + if (saved_vps) \ + vps_log_recursion(saved_vps, saved_vps_lpush, __LINE__); + +#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg) + +#define CURVPS_RESTORE() \ + VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \ + saved_vps->vps_magic_n == VPS_MAGIC_N), \ + ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \ + __FILE__, __LINE__, __func__, curvps, saved_vps)); \ + curvps = saved_vps; \ + curthread->td_vps_lpush = saved_vps_lpush; +#else /* !VPS_DEBUG */ + +#define CURVPS_SET_QUIET(arg) \ + VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \ + ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \ + __FILE__, __LINE__, __func__, curvps, (arg))); \ + struct vps *saved_vps = curvps; \ + curvps = arg; + +#define CURVPS_SET_VERBOSE(arg) \ + CURVPS_SET_QUIET(arg) + +#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg) + +#define CURVPS_RESTORE() \ + VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \ + saved_vps->vps_magic_n == VPS_MAGIC_N), \ + ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \ + __FILE__, __LINE__, __func__, curvps, saved_vps)); \ + curvps = saved_vps; +#endif /* VPS_DEBUG */ + +extern struct vps *vps0; +#define 
IS_DEFAULT_VPS(arg) ((arg) == vps0) + +#define CRED_TO_VPS(cr) (cr)->cr_prison->pr_vps +#define TD_TO_VPS(td) CRED_TO_VPS((td)->td_ucred) +#define P_TO_VPS(p) CRED_TO_VPS((p)->p_ucred) + +/* + * Global linked list of all virtual process spaces, along with read locks to + * access it. If a caller may sleep while accessing the list, it must use + * the sleepable lock macros. + */ +LIST_HEAD(vps_list_head, vps); +extern struct vps_list_head vps_head; +extern struct rwlock vps_rwlock; +extern struct sx vps_sxlock; + +#define VPS_LIST_RLOCK() sx_slock(&vps_sxlock) +#define VPS_LIST_RLOCK_NOSLEEP() rw_rlock(&vps_rwlock) +#define VPS_LIST_RUNLOCK() sx_sunlock(&vps_sxlock) +#define VPS_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vps_rwlock) + +/* + * Iteration macros to walk the global list of virtual process spaces. + */ +#define VPS_ITERATOR_DECL(arg) struct vps *arg +#define VPS_FOREACH(arg) LIST_FOREACH((arg), &vps_head, vps_le) + +/* + * Virtual process space memory allocator, which allows global variables to + * be automatically instantiated for each process space instance. + */ +#define VPS_NAME(n) vps_entry_##n +#define VPS_DECLARE(t, n) extern t VPS_NAME(n) +#define VPS_DEFINE(t, n) t VPS_NAME(n) __section(VPS_SETNAME) __used +#define _VPS_PTR(b, n) (__typeof(VPS_NAME(n))*) \ + ((b) + (uintptr_t)&VPS_NAME(n)) + +#define _VPS(b, n) (*_VPS_PTR(b, n)) + +/* + * Virtualized global variable accessor macros. + */ +#define VPS_VPS_PTR(vps, n) _VPS_PTR((vps)->vps_data_base, n) +#define VPS_VPS(vps, n) (*VPS_VPS_PTR((vps), n)) + +#define VPS_PTR(n) VPS_VPS_PTR(curvps, n) +#define VPS(n) VPS_VPS(curvps, n) + +/* + * Virtual process space allocator interfaces from the kernel linker. 
+ */ +void *vps_data_alloc(int size); +void vps_data_copy(void *start, int size); +void vps_data_free(void *start_arg, int size); + +/* + * Virtual sysinit mechanism, allowing process space components to declare + * startup and shutdown methods to be run when virtual process space + * instances are created and destroyed. + */ +#include + +/* + * SYSINIT/SYSUNINIT variants that provide per-vps constructors and + * destructors. + */ +struct vps_sysinit { + enum sysinit_sub_id subsystem; + enum sysinit_elem_order order; + sysinit_cfunc_t func; + const void *arg; + TAILQ_ENTRY(vps_sysinit) link; +}; + +#define VPS_SYSINIT(ident, subsystem, order, func, arg) \ + static struct vps_sysinit ident ## _vps_init = { \ + subsystem, \ + order, \ + (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ + (arg) \ + }; \ + SYSINIT(vps_init_ ## ident, subsystem, order, \ + vps_register_sysinit, &ident ## _vps_init); \ + SYSUNINIT(vps_init_ ## ident, subsystem, order, \ + vps_deregister_sysinit, &ident ## _vps_init) + +#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \ + static struct vps_sysinit ident ## _vps_uninit = { \ + subsystem, \ + order, \ + (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ + (arg) \ + }; \ + SYSINIT(vps_uninit_ ## ident, subsystem, order, \ + vps_register_sysuninit, &ident ## _vps_uninit); \ + SYSUNINIT(vps_uninit_ ## ident, subsystem, order, \ + vps_deregister_sysuninit, &ident ## _vps_uninit) + +/* + * Run per-vps sysinits or sysuninits during vps creation/destruction. + */ +void vps_sysinit(void); +void vps_sysuninit(void); + +/* + * Interfaces for managing per-vps constructors and destructors. + */ +void vps_register_sysinit(void *arg); +void vps_register_sysuninit(void *arg); +void vps_deregister_sysinit(void *arg); +void vps_deregister_sysuninit(void *arg); + +/* + * EVENTHANDLER(9) extensions. 
+ */ +#include + +void vps_global_eventhandler_iterator_func(void *, ...); +#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ +do { \ + if (IS_DEFAULT_VPS(curvps)) { \ + (tag) = vimage_eventhandler_register(NULL, #name, func, \ + arg, priority, \ + vps_global_eventhandler_iterator_func); \ + } \ +} while(0) +#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ +do { \ + if (IS_DEFAULT_VPS(curvps)) { \ + vimage_eventhandler_register(NULL, #name, func, \ + arg, priority, \ + vps_global_eventhandler_iterator_func); \ + } \ +} while(0) + +#else /* !VIMAGE */ + +/* + * Various virtual process space macros compile to no-ops without VIMAGE. + */ +#define curvps NULL + +#define VPS_ASSERT(exp, msg) +#define CURVPS_SET(arg) +#define CURVPS_SET_QUIET(arg) +#define CURVPS_RESTORE() + +#define VPS_LIST_RLOCK() +#define VPS_LIST_RLOCK_NOSLEEP() +#define VPS_LIST_RUNLOCK() +#define VPS_LIST_RUNLOCK_NOSLEEP() +#define VPS_ITERATOR_DECL(arg) +#define VPS_FOREACH(arg) + +#define IS_DEFAULT_VPS(arg) 1 +#define CRED_TO_VPS(cr) NULL +#define TD_TO_VPS(td) NULL +#define P_TO_VPS(p) NULL + +/* + * Versions of the vps macros that compile to normal global variables and + * standard sysctl definitions. + */ +#define VPS_NAME(n) n +#define VPS_DECLARE(t, n) extern t n +#define VPS_DEFINE(t, n) t n +#define _VPS_PTR(b, n) &VPS_NAME(n) + +/* + * Virtualized global variable accessor macros. + */ +#define VPS_VPS_PTR(vps, n) (&(n)) +#define VPS_VPS(vps, n) (n) + +#define VPS_PTR(n) (&(n)) +#define VPS(n) (n) + +/* + * When VIMAGE isn't compiled into the kernel, VPS_SYSINIT/VPS_SYSUNINIT + * map into normal sysinits, which have the same ordering properties. + */ +#define VPS_SYSINIT(ident, subsystem, order, func, arg) \ + SYSINIT(ident, subsystem, order, func, arg) +#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \ + SYSUNINIT(ident, subsystem, order, func, arg) + +/* + * Without VIMAGE revert to the default implementation. 
+ */ +#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ + (tag) = eventhandler_register(NULL, #name, func, arg, priority) +#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ + eventhandler_register(NULL, #name, func, arg, priority) +#endif /* VIMAGE */ +#endif /* _KERNEL */ + +#endif /* !_SYS_VPS_H_ */ Index: sys/vm/vm_meter.c =================================================================== --- sys/vm/vm_meter.c +++ sys/vm/vm_meter.c @@ -177,6 +177,7 @@ static int vmtotal(SYSCTL_HANDLER_ARGS) { + VPS_ITERATOR_DECL(vps_iter); struct vmtotal total; #if defined(COMPAT_FREEBSD11) struct vmtotal11 total11; @@ -197,41 +198,48 @@ /* * Calculate process statistics. */ - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - if ((p->p_flag & P_SYSTEM) != 0) - continue; - PROC_LOCK(p); - if (p->p_state != PRS_NEW) { - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - switch (td->td_state) { - case TDS_INHIBITED: - if (TD_IS_SWAPPED(td)) + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if ((p->p_flag & P_SYSTEM) != 0) + continue; + PROC_LOCK(p); + if (p->p_state != PRS_NEW) { + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + switch (td->td_state) { + case TDS_INHIBITED: + if (TD_IS_SWAPPED(td)) + total.t_sw++; + else if (TD_IS_SLEEPING(td)) { + if (td->td_priority <= + PZERO) + total.t_dw++; + else + total.t_sl++; + } + break; + case TDS_CAN_RUN: total.t_sw++; - else if (TD_IS_SLEEPING(td)) { - if (td->td_priority <= PZERO) - total.t_dw++; - else - total.t_sl++; + break; + case TDS_RUNQ: + case TDS_RUNNING: + total.t_rq++; + break; + default: + break; } - break; - case TDS_CAN_RUN: - total.t_sw++; - break; - case TDS_RUNQ: - case TDS_RUNNING: - total.t_rq++; - break; - default: - break; + thread_unlock(td); } - thread_unlock(td); } + PROC_UNLOCK(p); } - PROC_UNLOCK(p); + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - 
sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); /* * Calculate object memory usage statistics. */ Index: sys/vm/vm_object.c =================================================================== --- sys/vm/vm_object.c +++ sys/vm/vm_object.c @@ -2507,18 +2507,27 @@ static int vm_object_in_map(vm_object_t object) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; - /* sx_slock(&allproc_lock); */ - FOREACH_PROC_IN_SYSTEM(p) { - if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) - continue; - if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { - /* sx_sunlock(&allproc_lock); */ - return 1; + /* VPS_LIST_RLOCK(); */ + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + /* sx_slock(&V_allproc_lock); */ + FOREACH_PROC_IN_SYSTEM(p) { + if (!p->p_vmspace + /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) + continue; + if (_vm_object_in_map(&p->p_vmspace->vm_map, object, + 0)) { + /* sx_sunlock(&V_allproc_lock); */ + return 1; + } } + /* sx_sunlock(&V_allproc_lock); */ + CURVPS_RESTORE(); } - /* sx_sunlock(&allproc_lock); */ + /* VPS_LIST_RUNLOCK(); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; Index: sys/vm/vm_pageout.c =================================================================== --- sys/vm/vm_pageout.c +++ sys/vm/vm_pageout.c @@ -1744,6 +1744,7 @@ void vm_pageout_oom(int shortage) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; @@ -1760,80 +1761,88 @@ */ bigproc = NULL; bigsize = 0; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - /* - * If this is a system, protected or killed process, skip it. - */ - if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | - P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || - p->p_pid == 1 || P_KILLED(p) || - (p->p_pid < 48 && swap_pager_avail != 0)) { - PROC_UNLOCK(p); - continue; - } - /* - * If the process is in a non-running type state, - * don't touch it. Check all the threads individually. 
- */ - breakout = false; - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - if (!TD_ON_RUNQ(td) && - !TD_IS_RUNNING(td) && - !TD_IS_SLEEPING(td) && - !TD_IS_SUSPENDED(td) && - !TD_IS_SWAPPED(td)) { + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + + /* + * If this is a system, protected or killed process, + * skip it. + */ + if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | + P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || + p->p_pid == 1 || P_KILLED(p) || + (p->p_pid < 48 && swap_pager_avail != 0)) { + PROC_UNLOCK(p); + continue; + } + /* + * If the process is in a non-running type state, + * don't touch it. Check all the threads individually. + */ + breakout = false; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + if (!TD_ON_RUNQ(td) && + !TD_IS_RUNNING(td) && + !TD_IS_SLEEPING(td) && + !TD_IS_SUSPENDED(td) && + !TD_IS_SWAPPED(td)) { + thread_unlock(td); + breakout = true; + break; + } thread_unlock(td); - breakout = true; - break; } - thread_unlock(td); - } - if (breakout) { - PROC_UNLOCK(p); - continue; - } - /* - * get the process size - */ - vm = vmspace_acquire_ref(p); - if (vm == NULL) { + if (breakout) { + PROC_UNLOCK(p); + continue; + } + /* + * get the process size + */ + vm = vmspace_acquire_ref(p); + if (vm == NULL) { + PROC_UNLOCK(p); + continue; + } + _PHOLD_LITE(p); PROC_UNLOCK(p); - continue; - } - _PHOLD_LITE(p); - PROC_UNLOCK(p); - sx_sunlock(&allproc_lock); - if (!vm_map_trylock_read(&vm->vm_map)) { + sx_sunlock(&V_allproc_lock); + if (!vm_map_trylock_read(&vm->vm_map)) { + vmspace_free(vm); + sx_slock(&V_allproc_lock); + PRELE(p); + continue; + } + size = vmspace_swap_count(vm); + if (shortage == VM_OOM_MEM) + size += vm_pageout_oom_pagecount(vm); + vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); - sx_slock(&allproc_lock); - PRELE(p); - continue; - } - size = vmspace_swap_count(vm); - if (shortage == VM_OOM_MEM) - size += 
vm_pageout_oom_pagecount(vm); - vm_map_unlock_read(&vm->vm_map); - vmspace_free(vm); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); - /* - * If this process is bigger than the biggest one, - * remember it. - */ - if (size > bigsize) { - if (bigproc != NULL) - PRELE(bigproc); - bigproc = p; - bigsize = size; - } else { - PRELE(p); + /* + * If this process is bigger than the biggest one, + * remember it. + */ + if (size > bigsize) { + if (bigproc != NULL) + PRELE(bigproc); + bigproc = p; + bigsize = size; + } else { + PRELE(p); + } } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); if (bigproc != NULL) { if (vm_panic_on_oom != 0) panic("out of swap space"); Index: sys/vm/vm_swapout.c =================================================================== --- sys/vm/vm_swapout.c +++ sys/vm/vm_swapout.c @@ -378,6 +378,7 @@ static void vm_daemon(void) { + VPS_ITERATOR_DECL(vps_iter); struct rlimit rsslim; struct proc *p; struct thread *td; @@ -417,114 +418,129 @@ attempts = 0; again: attempts++; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - vm_pindex_t limit, size; - /* - * if this is a system process or if we have already - * looked at this process, skip it. - */ - PROC_LOCK(p); - if (p->p_state != PRS_NORMAL || - p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { - PROC_UNLOCK(p); - continue; - } - /* - * if the process is in a non-running type state, - * don't touch it. - */ - breakout = 0; - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - if (!TD_ON_RUNQ(td) && - !TD_IS_RUNNING(td) && - !TD_IS_SLEEPING(td) && - !TD_IS_SUSPENDED(td)) { + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + vm_pindex_t limit, size; + + /* + * If this is a system process or if we have + * already looked at this process, skip it. 
+ */ + PROC_LOCK(p); + if (p->p_state != PRS_NORMAL || p->p_flag & + (P_INEXEC | P_SYSTEM | P_WEXIT)) { + PROC_UNLOCK(p); + continue; + } + /* + * If the process is in a non-running type + * state, don't touch it. + */ + breakout = 0; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + if (!TD_ON_RUNQ(td) && + !TD_IS_RUNNING(td) && + !TD_IS_SLEEPING(td) && + !TD_IS_SUSPENDED(td)) { + thread_unlock(td); + breakout = 1; + break; + } thread_unlock(td); - breakout = 1; - break; } - thread_unlock(td); - } - if (breakout) { - PROC_UNLOCK(p); - continue; - } - /* - * get a limit - */ - lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); - limit = OFF_TO_IDX( - qmin(rsslim.rlim_cur, rsslim.rlim_max)); + if (breakout) { + PROC_UNLOCK(p); + continue; + } + /* + * get a limit + */ + lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); + limit = OFF_TO_IDX( + qmin(rsslim.rlim_cur, rsslim.rlim_max)); - /* - * let processes that are swapped out really be - * swapped out set the limit to nothing (will force a - * swap-out.) - */ - if ((p->p_flag & P_INMEM) == 0) - limit = 0; /* XXX */ - vm = vmspace_acquire_ref(p); - _PHOLD_LITE(p); - PROC_UNLOCK(p); - if (vm == NULL) { - PRELE(p); - continue; - } - sx_sunlock(&allproc_lock); + /* + * let processes that are swapped out really be + * swapped out set the limit to nothing + * (will force a swap-out.) 
+ */ + if ((p->p_flag & P_INMEM) == 0) + limit = 0; /* XXX */ + vm = vmspace_acquire_ref(p); + _PHOLD_LITE(p); + PROC_UNLOCK(p); + if (vm == NULL) { + PRELE(p); + continue; + } + sx_sunlock(&V_allproc_lock); - size = vmspace_resident_count(vm); - if (size >= limit) { - vm_swapout_map_deactivate_pages( - &vm->vm_map, limit); size = vmspace_resident_count(vm); - } -#ifdef RACCT - if (racct_enable) { - rsize = IDX_TO_OFF(size); - PROC_LOCK(p); - if (p->p_state == PRS_NORMAL) - racct_set(p, RACCT_RSS, rsize); - ravailable = racct_get_available(p, RACCT_RSS); - PROC_UNLOCK(p); - if (rsize > ravailable) { - /* - * Don't be overly aggressive; this - * might be an innocent process, - * and the limit could've been exceeded - * by some memory hog. Don't try - * to deactivate more than 1/4th - * of process' resident set size. - */ - if (attempts <= 8) { - if (ravailable < rsize - - (rsize / 4)) { - ravailable = rsize - - (rsize / 4); - } - } + if (size >= limit) { vm_swapout_map_deactivate_pages( - &vm->vm_map, - OFF_TO_IDX(ravailable)); - /* Update RSS usage after paging out. */ + &vm->vm_map, limit); size = vmspace_resident_count(vm); + } +#ifdef RACCT + if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); if (p->p_state == PRS_NORMAL) racct_set(p, RACCT_RSS, rsize); + ravailable = racct_get_available(p, + RACCT_RSS); PROC_UNLOCK(p); - if (rsize > ravailable) - tryagain = 1; + if (rsize > ravailable) { + /* + * Don't be overly aggressive; + * this might be an innocent + * process, and the limit + * could've been exceeded by + * some memory hog. Don't try to + * deactivate more than 1/4th of + * process' resident set size. + */ + if (attempts <= 8) { + if (ravailable < rsize - + (rsize / 4)) { + ravailable = + rsize - + (rsize / 4); + } + } + vm_swapout_map_deactivate_pages( + &vm->vm_map, + OFF_TO_IDX(ravailable)); + /* + * Update RSS usage after + * paging out. 
+ */ + size = vmspace_resident_count( + vm); + rsize = IDX_TO_OFF(size); + PROC_LOCK(p); + if (p->p_state == PRS_NORMAL) + racct_set(p, RACCT_RSS, + rsize); + PROC_UNLOCK(p); + if (rsize > ravailable) + tryagain = 1; + } } - } #endif - vmspace_free(vm); - sx_slock(&allproc_lock); - PRELE(p); + vmspace_free(vm); + sx_slock(&V_allproc_lock); + PRELE(p); + } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); if (tryagain != 0 && attempts <= 10) { maybe_yield(); goto again; @@ -656,6 +672,13 @@ int ppri, pri, slptime, swtime; loop: +#ifdef VIMAGE + if (!IS_DEFAULT_VPS(curvps) && V_vpsdying > 0) { + V_vproc0 = NULL; + return; + } +#endif + if (vm_page_count_min()) { vm_wait_min(); goto loop; @@ -663,7 +686,7 @@ pp = NULL; ppri = INT_MIN; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW || @@ -698,13 +721,13 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { - tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2); + tsleep(V_vproc0, PVM, "swapin", MAXSLP * hz / 2); goto loop; } PROC_LOCK(p); @@ -738,6 +761,7 @@ static void swapout_procs(int action) { + VPS_ITERATOR_DECL(vps_iter); struct proc *p; struct thread *td; int slptime; @@ -746,74 +770,81 @@ MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0); didswap = false; - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - /* - * Filter out not yet fully constructed processes. Do - * not swap out held processes. Avoid processes which - * are system, exiting, execing, traced, already swapped - * out or are in the process of being swapped in or out. 
- */ - PROC_LOCK(p); - if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag & - (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE | - P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) != - P_INMEM) { - PROC_UNLOCK(p); - continue; - } - /* - * Further consideration of this process for swap out - * requires iterating over its threads. We release - * allproc_lock here so that process creation and - * destruction are not blocked while we iterate. - * - * To later reacquire allproc_lock and resume - * iteration over the allproc list, we will first have - * to release the lock on the process. We place a - * hold on the process so that it remains in the - * allproc list while it is unlocked. - */ - _PHOLD_LITE(p); - sx_sunlock(&allproc_lock); + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + sx_slock(&V_allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + /* + * Filter out not yet fully constructed processes. Do + * not swap out held processes. Avoid processes which + * are system, exiting, execing, traced, already swapped + * out or are in the process of being swapped in or out. + */ + PROC_LOCK(p); + if (p->p_state != PRS_NORMAL || p->p_lock != 0 || + (p->p_flag & (P_SYSTEM | P_WEXIT | P_INEXEC | + P_STOPPED_SINGLE | P_TRACED | P_SWAPPINGOUT | + P_SWAPPINGIN | P_INMEM)) != P_INMEM) { + PROC_UNLOCK(p); + continue; + } - /* - * Do not swapout a realtime process. - * Guarantee swap_idle_threshold1 time in memory. - * If the system is under memory stress, or if we are - * swapping idle processes >= swap_idle_threshold2, - * then swap the process out. 
- */ - doswap = true; - FOREACH_THREAD_IN_PROC(p, td) { - thread_lock(td); - slptime = (ticks - td->td_slptick) / hz; - if (PRI_IS_REALTIME(td->td_pri_class) || - slptime < swap_idle_threshold1 || - !thread_safetoswapout(td) || - ((action & VM_SWAP_NORMAL) == 0 && - slptime < swap_idle_threshold2)) - doswap = false; - thread_unlock(td); - if (!doswap) - break; - } - if (doswap && swapout(p) == 0) - didswap = true; + /* + * Further consideration of this process for swap out + * requires iterating over its threads. We release + * allproc_lock here so that process creation and + * destruction are not blocked while we iterate. + * + * To later reacquire allproc_lock and resume + * iteration over the allproc list, we will first have + * to release the lock on the process. We place a + * hold on the process so that it remains in the + * allproc list while it is unlocked. + */ + _PHOLD_LITE(p); + sx_sunlock(&V_allproc_lock); - PROC_UNLOCK(p); - sx_slock(&allproc_lock); - PRELE(p); + /* + * Do not swapout a realtime process. + * Guarantee swap_idle_threshold1 time in memory. + * If the system is under memory stress, or if we are + * swapping idle processes >= swap_idle_threshold2, + * then swap the process out. + */ + doswap = true; + FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); + slptime = (ticks - td->td_slptick) / hz; + if (PRI_IS_REALTIME(td->td_pri_class) || + slptime < swap_idle_threshold1 || + !thread_safetoswapout(td) || + ((action & VM_SWAP_NORMAL) == 0 && + slptime < swap_idle_threshold2)) + doswap = false; + thread_unlock(td); + if (!doswap) + break; + } + if (doswap && swapout(p) == 0) + didswap = true; + + PROC_UNLOCK(p); + sx_slock(&V_allproc_lock); + PRELE(p); + } + sx_sunlock(&V_allproc_lock); + CURVPS_RESTORE(); } - sx_sunlock(&allproc_lock); + VPS_LIST_RUNLOCK(); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) - wakeup(&proc0); + wakeup(V_vproc0); } static void