Index: sys/arm/arm/pmap-v6.c =================================================================== --- sys/arm/arm/pmap-v6.c +++ sys/arm/arm/pmap-v6.c @@ -6577,7 +6577,7 @@ int npte2 = 0; int i, j, index; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid || p->p_vmspace == NULL) continue; @@ -6605,7 +6605,7 @@ index = 0; printf("\n"); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (npte2); } pte2p = pmap_pte2(pmap, va); @@ -6632,7 +6632,7 @@ } } } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (npte2); } Index: sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c +++ sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c @@ -1022,11 +1022,11 @@ mutex_enter(pid_mtx); #else pp = p; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); while (pp->p_vmspace == pp->p_pptr->p_vmspace) pp = pp->p_pptr; pid = pp->p_pid; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); pp = NULL; rm_rlock(&fasttrap_tp_lock, &tracker); Index: sys/compat/linprocfs/linprocfs.c =================================================================== --- sys/compat/linprocfs/linprocfs.c +++ sys/compat/linprocfs/linprocfs.c @@ -689,8 +689,8 @@ (int)(averunnable.ldavg[2] / averunnable.fscale), (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), 1, /* number of running tasks */ - nprocs, /* number of tasks */ - lastpid /* the last pid */ + V_nprocs, /* number of tasks */ + V_lastpid /* the last pid */ ); return (0); } @@ -708,10 +708,10 @@ vm_offset_t startcode, startdata; getboottime(&boottime); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); if (p->p_vmspace) { startcode = (vm_offset_t)p->p_vmspace->vm_taddr; startdata = 
(vm_offset_t)p->p_vmspace->vm_daddr; @@ -787,11 +787,11 @@ struct kinfo_proc kp; segsz_t lsize; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); fill_kinfo_proc(p, &kp); PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* * See comments in linprocfs_doprocstatus() regarding the @@ -825,7 +825,7 @@ l_sigset_t siglist, sigignore, sigcatch; int i; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */ @@ -864,7 +864,7 @@ } fill_kinfo_proc(p, &kp); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */ sbuf_printf(sb, "State:\t%s\n", state); Index: sys/compat/linux/linux_file.c =================================================================== --- sys/compat/linux/linux_file.c +++ sys/compat/linux/linux_file.c @@ -149,17 +149,17 @@ fdrop(fp, td); goto done; } - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); if (SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* XXXPJD: Verify if TIOCSCTTY is allowed. */ (void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred, td); } else { PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } fdrop(fp, td); } Index: sys/compat/linux/linux_fork.c =================================================================== --- sys/compat/linux/linux_fork.c +++ sys/compat/linux/linux_fork.c @@ -233,11 +233,11 @@ * the same as that of the calling process. 
*/ if (args->flags & LINUX_CLONE_PARENT) { - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PROC_LOCK(p2); proc_reparent(p2, td->td_proc->p_pptr); PROC_UNLOCK(p2); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } #ifdef DEBUG Index: sys/compat/linux/linux_misc.c =================================================================== --- sys/compat/linux/linux_misc.c +++ sys/compat/linux/linux_misc.c @@ -181,7 +181,7 @@ sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; - sysinfo.procs = nprocs; + sysinfo.procs = V_nprocs; /* The following are only present in newer Linux kernels. */ sysinfo.totalbig = 0; Index: sys/compat/linuxkpi/common/src/linux_current.c =================================================================== --- sys/compat/linuxkpi/common/src/linux_current.c +++ sys/compat/linuxkpi/common/src/linux_current.c @@ -230,7 +230,7 @@ struct task_struct *ts; struct thread *td; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td) { @@ -241,7 +241,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); EVENTHANDLER_DEREGISTER(thread_dtor, linuxkpi_thread_dtor_tag); } Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -3852,6 +3852,7 @@ kern/kern_tslog.c optional tslog kern/kern_umtx.c standard kern/kern_uuid.c standard +kern/kern_vps.c optional vimage kern/kern_xxx.c standard kern/link_elf.c standard kern/linker_if.m standard Index: sys/ddb/db_command.c =================================================================== --- sys/ddb/db_command.c +++ sys/ddb/db_command.c @@ -693,11 +693,11 @@ * Find the process in question. allproc_lock is not needed * since we're in DDB. 
*/ - /* sx_slock(&allproc_lock); */ + /* sx_slock(&V_allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) if (p->p_pid == pid) break; - /* sx_sunlock(&allproc_lock); */ + /* sx_sunlock(&V_allproc_lock); */ if (p == NULL) DB_ERROR(("Can't find process with pid %ld\n", (long) pid)); Index: sys/ddb/db_ps.c =================================================================== --- sys/ddb/db_ps.c +++ sys/ddb/db_ps.c @@ -90,10 +90,10 @@ char state[9]; int np, rflag, sflag, dflag, lflag, wflag; - np = nprocs; + np = V_nprocs; - if (!LIST_EMPTY(&allproc)) - p = LIST_FIRST(&allproc); + if (!LIST_EMPTY(&V_allproc)) + p = LIST_FIRST(&V_allproc); else p = &proc0; @@ -217,8 +217,9 @@ p = LIST_NEXT(p, p_list); if (p == NULL && np > 0) - p = LIST_FIRST(&zombproc); + p = LIST_FIRST(&V_zombproc); } + db_printf("nprocs = %d, np = %d\n", V_nprocs, np); } static void Index: sys/ddb/db_thread.c =================================================================== --- sys/ddb/db_thread.c +++ sys/ddb/db_thread.c @@ -139,7 +139,7 @@ if (p->p_pid == decaddr) return (FIRST_THREAD_IN_PROC(p)); } - LIST_FOREACH(p, &zombproc, p_list) { + LIST_FOREACH(p, &V_zombproc, p_list) { if (p->p_pid == decaddr) return (FIRST_THREAD_IN_PROC(p)); } @@ -165,7 +165,7 @@ if (p->p_pid == decaddr) return (p); } - LIST_FOREACH(p, &zombproc, p_list) { + LIST_FOREACH(p, &V_zombproc, p_list) { if (p->p_pid == decaddr) return (p); } Index: sys/dev/filemon/filemon.c =================================================================== --- sys/dev/filemon/filemon.c +++ sys/dev/filemon/filemon.c @@ -223,7 +223,7 @@ * filemon_event_process_exit() will lock on filemon->lock * which we hold. 
*/ - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * No PROC_LOCK is needed to compare here since it is @@ -234,7 +234,7 @@ if (p->p_filemon == filemon) filemon_proc_drop(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); /* * It's possible some references were acquired but will be Index: sys/dev/hwpmc/hwpmc_mod.c =================================================================== --- sys/dev/hwpmc/hwpmc_mod.c +++ sys/dev/hwpmc/hwpmc_mod.c @@ -1203,7 +1203,7 @@ * this PMC. */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); top = p; @@ -1227,7 +1227,7 @@ (void) pmc_detach_process(top, pm); done: - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return error; } @@ -1312,7 +1312,7 @@ * partially attached proc tree. */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); top = p; @@ -1333,7 +1333,7 @@ } done: - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); if (LIST_EMPTY(&pm->pm_targets)) pm->pm_flags &= ~PMC_F_ATTACH_DONE; @@ -2025,7 +2025,7 @@ PROC_UNLOCK(p); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); top = p; @@ -2044,7 +2044,7 @@ } } done: - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } /* @@ -5371,7 +5371,7 @@ po = pm->pm_owner; if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) return; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { pmclog_process_proccreate(po, p, 0 /* sync */); PROC_LOCK(p); @@ -5379,7 +5379,7 @@ pmclog_process_threadcreate(po, td, 0 /* sync */); PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); pmclog_flush(po, 0); } Index: sys/fs/devfs/devfs_vnops.c =================================================================== --- sys/fs/devfs/devfs_vnops.c +++ sys/fs/devfs/devfs_vnops.c @@ -596,7 +596,7 @@ if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); if (vp == p->p_session->s_ttyvp) { 
SESS_LOCK(p->p_session); VI_LOCK(vp); @@ -609,7 +609,7 @@ VI_UNLOCK(vp); SESS_UNLOCK(p->p_session); } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (oldvp != NULL) vrele(oldvp); } else @@ -813,9 +813,9 @@ if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (td->td_proc->p_session->s_ttyvp == vp) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (0); } @@ -826,7 +826,7 @@ td->td_proc->p_session->s_ttydp = cdev2priv(dev); SESS_UNLOCK(td->td_proc->p_session); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* Get rid of reference to old control tty */ if (vpold) Index: sys/fs/nfs/nfsport.h =================================================================== --- sys/fs/nfs/nfsport.h +++ sys/fs/nfs/nfsport.h @@ -692,8 +692,8 @@ #define NFSUNLOCKMNT(m) mtx_unlock(&((m)->nm_mtx)) #define NFSLOCKREQUEST(r) mtx_lock(&((r)->r_mtx)) #define NFSUNLOCKREQUEST(r) mtx_unlock(&((r)->r_mtx)) -#define NFSPROCLISTLOCK() sx_slock(&allproc_lock) -#define NFSPROCLISTUNLOCK() sx_sunlock(&allproc_lock) +#define NFSPROCLISTLOCK() sx_slock(&V_allproc_lock) +#define NFSPROCLISTUNLOCK() sx_sunlock(&V_allproc_lock) #define NFSLOCKSOCKREQ(r) mtx_lock(&((r)->nr_mtx)) #define NFSUNLOCKSOCKREQ(r) mtx_unlock(&((r)->nr_mtx)) #define NFSLOCKDS(d) mtx_lock(&((d)->nfsclds_mtx)) Index: sys/fs/pseudofs/pseudofs_vnops.c =================================================================== --- sys/fs/pseudofs/pseudofs_vnops.c +++ sys/fs/pseudofs/pseudofs_vnops.c @@ -705,7 +705,7 @@ { int visible; - sx_assert(&allproc_lock, SX_SLOCKED); + sx_assert(&V_allproc_lock, SX_SLOCKED); pfs_assert_owned(pd); again: if (*pn == NULL) { @@ -718,7 +718,7 @@ if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) { /* next process */ if (*p == NULL) - *p = LIST_FIRST(&allproc); + *p = LIST_FIRST(&V_allproc); else *p = LIST_NEXT(*p, p_list); /* out of processes: next node */ @@ 
-791,12 +791,12 @@ if (resid == 0) PFS_RETURN (0); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); pfs_lock(pd); /* check if the directory is visible to the caller */ if (!pfs_visible(curthread, pd, pid, true, &proc)) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); pfs_unlock(pd); PFS_RETURN (ENOENT); } @@ -810,7 +810,7 @@ if (proc != NULL) PROC_UNLOCK(proc); pfs_unlock(pd); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); PFS_RETURN (0); } } @@ -860,7 +860,7 @@ if (proc != NULL) PROC_UNLOCK(proc); pfs_unlock(pd); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); i = 0; STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) { if (error == 0) Index: sys/i386/i386/pmap.c =================================================================== --- sys/i386/i386/pmap.c +++ sys/i386/i386/pmap.c @@ -5799,7 +5799,7 @@ int npte = 0; int index; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; @@ -5822,7 +5822,7 @@ index = 0; printf("\n"); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (npte); } pte = pmap_pte(pmap, va); @@ -5847,7 +5847,7 @@ } } } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (npte); } #endif Index: sys/i386/ibcs2/ibcs2_sysvec.c =================================================================== --- sys/i386/ibcs2/ibcs2_sysvec.c +++ sys/i386/ibcs2/ibcs2_sysvec.c @@ -117,14 +117,14 @@ break; case MOD_UNLOAD: /* if this was an ELF module we'd use elf_brand_inuse()... 
*/ - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == &ibcs2_svr3_sysvec) { rval = EBUSY; break; } } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); break; default: rval = EOPNOTSUPP; Index: sys/kern/imgact_elf.c =================================================================== --- sys/kern/imgact_elf.c +++ sys/kern/imgact_elf.c @@ -241,14 +241,14 @@ struct proc *p; int rval = FALSE; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (rval); } @@ -2106,10 +2106,10 @@ KASSERT(*sizep == size, ("invalid size")); structsize = sizeof(elf_kinfo_proc_t); sbuf_bcat(sb, &structsize, sizeof(structsize)); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); kern_proc_out(p, sb, ELF_KERN_PROC_MASK); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } *sizep = size; } Index: sys/kern/init_main.c =================================================================== --- sys/kern/init_main.c +++ sys/kern/init_main.c @@ -103,7 +103,9 @@ struct proc proc0; struct thread0_storage thread0_st __aligned(32); struct vmspace vmspace0; -struct proc *initproc; +VPS_DEFINE(struct proc *, initproc); + +VPS_DEFINE(struct proc *, vproc0); #ifndef BOOTHOWTO #define BOOTHOWTO 0 @@ -461,9 +463,8 @@ p->p_osrel = osreldate; /* - * Initialize thread and process structures. + * Initialize thread structures. */ - procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* @@ -475,7 +476,8 @@ /* * Create process 0 (the swapper). 
*/ - LIST_INSERT_HEAD(&allproc, p, p_list); + V_vproc0 = p; + LIST_INSERT_HEAD(&V_allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; @@ -549,7 +551,7 @@ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ - siginit(&proc0); + siginit(V_vproc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); @@ -629,7 +631,7 @@ * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -649,7 +651,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); @@ -841,18 +843,18 @@ bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; - fr.fr_procp = &initproc; + fr.fr_procp = &V_initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); - KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); + KASSERT(V_initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); - sx_xlock(&proctree_lock); - PROC_LOCK(initproc); - initproc->p_flag |= P_SYSTEM | P_INMEM; - initproc->p_treeflag |= P_TREE_REAPER; - oldcred = initproc->p_ucred; + sx_xlock(&V_proctree_lock); + PROC_LOCK(V_initproc); + V_initproc->p_flag |= P_SYSTEM | P_INMEM; + V_initproc->p_treeflag |= P_TREE_REAPER; + oldcred = V_initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); @@ -860,14 +862,14 @@ #ifdef AUDIT audit_cred_proc1(newcred); #endif - proc_set_cred(initproc, newcred); - td = FIRST_THREAD_IN_PROC(initproc); + proc_set_cred(V_initproc, newcred); + td = FIRST_THREAD_IN_PROC(V_initproc); crfree(td->td_ucred); - td->td_ucred = 
crhold(initproc->p_ucred); - PROC_UNLOCK(initproc); - sx_xunlock(&proctree_lock); + td->td_ucred = crhold(V_initproc->p_ucred); + PROC_UNLOCK(V_initproc); + sx_xunlock(&V_proctree_lock); crfree(oldcred); - cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), + cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(V_initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); @@ -880,7 +882,7 @@ { struct thread *td; - td = FIRST_THREAD_IN_PROC(initproc); + td = FIRST_THREAD_IN_PROC(V_initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); Index: sys/kern/kern_acct.c =================================================================== --- sys/kern/kern_acct.c +++ sys/kern/kern_acct.c @@ -378,7 +378,7 @@ * Get process accounting information. */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); /* (1) The terminal from which the process was started */ @@ -386,7 +386,7 @@ acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp); else acct.ac_tty = NODEV; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* (2) The name of the command that ran */ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); Index: sys/kern/kern_clock.c =================================================================== --- sys/kern/kern_clock.c +++ sys/kern/kern_clock.c @@ -189,7 +189,7 @@ { int tticks; - sx_assert(&allproc_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* @@ -214,7 +214,7 @@ void *wchan; int i, slptype, tticks; - sx_assert(&allproc_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* @@ -261,7 +261,7 @@ * starvation. * If the lock can't be held after 100 tries, panic. 
*/ - if (!sx_try_slock(&allproc_lock)) { + if (!sx_try_slock(&V_allproc_lock)) { if (tryl > 100) panic("%s: possible deadlock detected " "on allproc_lock\n", __func__); @@ -289,7 +289,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); /* Sleep for sleepfreq seconds. */ pause("-", sleepfreq * hz); Index: sys/kern/kern_cpuset.c =================================================================== --- sys/kern/kern_cpuset.c +++ sys/kern/kern_cpuset.c @@ -513,7 +513,7 @@ struct thread *td; struct proc *p; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -527,7 +527,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); kernel_object->domain.dr_policy = cpuset_kernel->cs_domain; } Index: sys/kern/kern_descrip.c =================================================================== --- sys/kern/kern_descrip.c +++ sys/kern/kern_descrip.c @@ -1063,7 +1063,7 @@ sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { @@ -1131,14 +1131,14 @@ sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); return (ret); @@ -3198,7 +3198,7 @@ if (vrefcnt(olddp) == 1) return; nrele = 0; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); fdp = fdhold(p); @@ -3224,7 +3224,7 @@ FILEDESC_XUNLOCK(fdp); fddrop(fdp); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; @@ -3318,7 +3318,7 @@ return (error); if (req->oldptr == NULL) { n = 0; - sx_slock(&allproc_lock); + 
sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -3334,13 +3334,13 @@ n += fdp->fd_lastfile; fddrop(fdp); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -3379,7 +3379,7 @@ if (error) break; } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } Index: sys/kern/kern_exit.c =================================================================== --- sys/kern/kern_exit.c +++ sys/kern/kern_exit.c @@ -106,13 +106,13 @@ { struct proc *p, *parent; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); if ((child->p_treeflag & P_TREE_ORPHANED) == 0) { if (child->p_oppid == 0 || child->p_pptr->p_pid == child->p_oppid) parent = child->p_pptr; else - parent = initproc; + parent = V_initproc; return (parent); } for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) { @@ -132,8 +132,8 @@ { struct proc *p1, *p2, *ptmp; - sx_assert(&proctree_lock, SX_LOCKED); - KASSERT(p != initproc, ("reaper_abandon_children for initproc")); + sx_assert(&V_proctree_lock, SX_LOCKED); + KASSERT(p != V_initproc, ("reaper_abandon_children for initproc")); if ((p->p_treeflag & P_TREE_REAPER) == 0) return; p1 = p->p_reaper; @@ -157,7 +157,7 @@ { struct proc *p1; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); if ((p->p_treeflag & P_TREE_ORPHANED) == 0) return; if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) { @@ -203,7 +203,7 @@ * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. 
*/ - if (p == initproc && rebooting == 0) { + if (p == V_initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", signo, rval); panic("Going nowhere without my init!"); } @@ -313,7 +313,7 @@ /* Are we a task leader with peers? */ if (p->p_peers != NULL && p == p->p_leader) { - mtx_lock(&ppeers_lock); + mtx_lock(&V_ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); @@ -322,8 +322,8 @@ q = q->p_peers; } while (p->p_peers != NULL) - msleep(p, &ppeers_lock, PWAIT, "exit1", 0); - mtx_unlock(&ppeers_lock); + msleep(p, &V_ppeers_lock, PWAIT, "exit1", 0); + mtx_unlock(&V_ppeers_lock); } /* @@ -388,7 +388,7 @@ * Remove ourself from our leader's peer list and wake our leader. */ if (p->p_leader->p_peers != NULL) { - mtx_lock(&ppeers_lock); + mtx_lock(&V_ppeers_lock); if (p->p_leader->p_peers != NULL) { q = p->p_leader; while (q->p_peers != p) @@ -396,7 +396,7 @@ q->p_peers = p->p_peers; wakeup(p->p_leader); } - mtx_unlock(&ppeers_lock); + mtx_unlock(&V_ppeers_lock); } vmspace_exit(td); @@ -432,16 +432,16 @@ WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. 
*/ - sx_xlock(&allproc_lock); + sx_xlock(&V_allproc_lock); LIST_REMOVE(p, p_list); - LIST_INSERT_HEAD(&zombproc, p, p_list); + LIST_INSERT_HEAD(&V_zombproc, p, p_list); LIST_REMOVE(p, p_hash); - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); /* * Reparent all children processes: @@ -602,7 +602,7 @@ } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); - if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) { + if (p->p_pptr == p->p_reaper || p->p_pptr == V_initproc) { signal_parent = 1; } else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) { @@ -613,7 +613,7 @@ } } else PROC_LOCK(p->p_pptr); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (signal_parent == 1) { childproc_exited(p); @@ -827,7 +827,7 @@ { struct proc *q, *t; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); @@ -843,7 +843,7 @@ * release the proc struct just yet. */ PROC_UNLOCK(p); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return; } @@ -870,7 +870,7 @@ wakeup(t); cv_broadcast(&p->p_pwait); PROC_UNLOCK(t); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return; } p->p_oppid = 0; @@ -880,9 +880,9 @@ * Remove other references to this process to ensure we have an * exclusive reference. 
*/ - sx_xlock(&allproc_lock); + sx_xlock(&V_allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); LIST_REMOVE(p, p_reapsibling); @@ -892,7 +892,7 @@ leavepgrp(p); if (p->p_procdesc != NULL) procdesc_reap(p); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); PROC_LOCK(p); knlist_detach(p->p_klist); @@ -955,7 +955,7 @@ KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); uma_zfree(proc_zone, p); - atomic_add_int(&nprocs, -1); + atomic_add_int(&V_nprocs, -1); } static int @@ -965,7 +965,7 @@ { struct rusage *rup; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); PROC_LOCK(p); @@ -1156,7 +1156,7 @@ bool cont; PROC_LOCK_ASSERT(p, MA_OWNED); - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); MPASS(si_code == CLD_TRAPPED || si_code == CLD_STOPPED || si_code == CLD_CONTINUED); @@ -1170,7 +1170,7 @@ sigqueue_take(p->p_ksi); PROC_UNLOCK(td->td_proc); } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (siginfo != NULL) { siginfo->si_code = si_code; siginfo->si_status = cont ? 
SIGCONT : p->p_xsig; @@ -1223,7 +1223,7 @@ q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); loop_locked: nfound = 0; LIST_FOREACH(p, &q->p_children, p_sibling) { @@ -1307,11 +1307,11 @@ } } if (nfound == 0) { - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (ECHILD); } if (options & WNOHANG) { - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); td->td_retval[0] = 0; return (0); } @@ -1321,7 +1321,7 @@ PROC_UNLOCK(q); goto loop_locked; } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); error = msleep(q, &q->p_mtx, PWAIT | PCATCH | PDROP, "wait", 0); if (error) return (error); @@ -1336,7 +1336,7 @@ proc_reparent(struct proc *child, struct proc *parent) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; Index: sys/kern/kern_fork.c =================================================================== --- sys/kern/kern_fork.c +++ sys/kern/kern_fork.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -184,10 +185,10 @@ return (error); } -int nprocs = 1; /* process 0 */ -int lastpid = 0; -SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, - "Last used PID"); +VPS_DEFINE(int, nprocs) = 1; /* process 0 */ +VPS_DEFINE(int, lastpid) = 0; +SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD|CTLFLAG_VPS, + &VPS_NAME(lastpid), 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make @@ -197,7 +198,8 @@ * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. 
*/ -static int randompid = 0; +static VPS_DEFINE(int, randompid) = 0; +#define V_randompid VPS(randompid) static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) @@ -207,44 +209,46 @@ error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); - sx_xlock(&allproc_lock); - pid = randompid; + sx_xlock(&V_allproc_lock); + pid = V_randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid == 0) - randompid = 0; + V_randompid = 0; else if (pid == 1) /* generate a random PID modulus between 100 and 1123 */ - randompid = 100 + arc4random() % 1024; + V_randompid = 100 + arc4random() % 1024; else if (pid < 0 || pid > pid_max - 100) /* out of range */ - randompid = pid_max - 100; + V_randompid = pid_max - 100; else if (pid < 100) /* Make it reasonable */ - randompid = 100; + V_randompid = 100; else - randompid = pid; + V_randompid = pid; } - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus. Special values: 0: disable, 1: choose random value"); +static VPS_DEFINE(int, pidchecked) = 0; +#define V_pidchecked VPS(pidchecked) + static int fork_findpid(int flags) { struct proc *p; int trypid; - static int pidchecked = 0; /* * Requires allproc_lock in order to iterate over the list * of processes, and proctree_lock to access p_pgrp. */ - sx_assert(&allproc_lock, SX_LOCKED); - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); /* * Find an unused process ID. We remember a range of unused IDs @@ -253,13 +257,13 @@ * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. 
*/ - trypid = lastpid + 1; + trypid = V_lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { - if (randompid) - trypid += arc4random() % randompid; + if (V_randompid) + trypid += arc4random() % V_randompid; } retry: /* @@ -271,12 +275,12 @@ trypid = trypid % pid_max; if (trypid < 100) trypid += 100; - pidchecked = 0; + V_pidchecked = 0; } - if (trypid >= pidchecked) { + if (trypid >= V_pidchecked) { int doingzomb = 0; - pidchecked = PID_MAX; + V_pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater @@ -291,7 +295,7 @@ * reserved pids is limited by process limit times * two. */ - p = LIST_FIRST(&allproc); + p = LIST_FIRST(&V_allproc); again: for (; p != NULL; p = LIST_NEXT(p, p_list)) { while (p->p_pid == trypid || @@ -301,24 +305,24 @@ (p->p_session != NULL && p->p_session->s_sid == trypid)))) { trypid++; - if (trypid >= pidchecked) + if (trypid >= V_pidchecked) goto retry; } - if (p->p_pid > trypid && pidchecked > p->p_pid) - pidchecked = p->p_pid; + if (p->p_pid > trypid && V_pidchecked > p->p_pid) + V_pidchecked = p->p_pid; if (p->p_pgrp != NULL) { if (p->p_pgrp->pg_id > trypid && - pidchecked > p->p_pgrp->pg_id) - pidchecked = p->p_pgrp->pg_id; + V_pidchecked > p->p_pgrp->pg_id) + V_pidchecked = p->p_pgrp->pg_id; if (p->p_session != NULL && p->p_session->s_sid > trypid && - pidchecked > p->p_session->s_sid) - pidchecked = p->p_session->s_sid; + V_pidchecked > p->p_session->s_sid) + V_pidchecked = p->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; - p = LIST_FIRST(&zombproc); + p = LIST_FIRST(&V_zombproc); goto again; } } @@ -327,9 +331,9 @@ * RFHIGHPID does not mess with the lastpid counter during boot. 
*/ if (flags & RFHIGHPID) - pidchecked = 0; + V_pidchecked = 0; else - lastpid = trypid; + V_lastpid = trypid; return (trypid); } @@ -394,8 +398,8 @@ struct filedesc_to_leader *fdtol; struct sigacts *newsigacts; - sx_assert(&proctree_lock, SX_LOCKED); - sx_assert(&allproc_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_XLOCKED); p1 = td->td_proc; @@ -404,14 +408,14 @@ p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG_PID(p2->p_pid); - LIST_INSERT_HEAD(&allproc, p2, p_list); + LIST_INSERT_HEAD(&V_allproc, p2, p_list); allproc_gen++; LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); PROC_LOCK(p2); PROC_LOCK(p1); - sx_xunlock(&allproc_lock); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_allproc_lock); + sx_xunlock(&V_proctree_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); @@ -554,11 +558,11 @@ * Set up linkage for kernel based threading. */ if ((fr->fr_flags & RFTHREAD) != 0) { - mtx_lock(&ppeers_lock); + mtx_lock(&V_ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; - mtx_unlock(&ppeers_lock); + mtx_unlock(&V_ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); @@ -585,7 +589,7 @@ p2->p_leader = p2; } - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); @@ -648,7 +652,7 @@ LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1) p2->p_reapsubtree = p2->p_pid; - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; @@ -751,7 +755,7 @@ * if being set atm. 
*/ if ((p1->p_ptevents & PTRACE_FORK) != 0) { - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PROC_LOCK(p2); /* @@ -777,7 +781,7 @@ proc_reparent(p2, p1->p_pptr); } PROC_UNLOCK(p2); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } if ((fr->fr_flags & RFSTOPPED) == 0) { @@ -801,6 +805,11 @@ PROC_UNLOCK(p2); } +static VPS_DEFINE(int, curfail); +#define V_curfail VPS(curfail) +static VPS_DEFINE(struct timeval, lastfail); +#define V_lastfail VPS(lastfail) + int fork1(struct thread *td, struct fork_req *fr) { @@ -810,8 +819,6 @@ struct file *fp_procdesc; vm_ooffset_t mem_charged; int error, nprocs_new, ok; - static int curfail; - static struct timeval lastfail; int flags, pages; flags = fr->fr_flags; @@ -881,17 +888,17 @@ * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ - nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; + nprocs_new = atomic_fetchadd_int(&V_nprocs, 1) + 1; if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) { error = EAGAIN; - sx_xlock(&allproc_lock); - if (ppsratecheck(&lastfail, &curfail, 1)) { + sx_xlock(&V_allproc_lock); + if (ppsratecheck(&V_lastfail, &V_curfail, 1)) { printf("maxproc limit exceeded by uid %u (pid %d); " "see tuning(7) and login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); goto fail2; } @@ -973,8 +980,8 @@ STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ - sx_xlock(&proctree_lock); - sx_xlock(&allproc_lock); + sx_xlock(&V_proctree_lock); + sx_xlock(&V_allproc_lock); /* * Increment the count of procs running with this uid. 
Don't allow @@ -995,8 +1002,8 @@ } error = EAGAIN; - sx_xunlock(&allproc_lock); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_allproc_lock); + sx_xunlock(&V_proctree_lock); #ifdef MAC mac_proc_destroy(newproc); #endif @@ -1012,7 +1019,7 @@ fdclose(td, fp_procdesc, *fr->fr_pd_fd); fdrop(fp_procdesc, td); } - atomic_add_int(&nprocs, -1); + atomic_add_int(&V_nprocs, -1); pause("fork", hz / 2); return (error); } Index: sys/kern/kern_jail.c =================================================================== --- sys/kern/kern_jail.c +++ sys/kern/kern_jail.c @@ -62,6 +62,10 @@ #include #include #include +#include +#ifdef VIMAGE +#include +#endif #include #include @@ -74,6 +78,10 @@ #include +#ifdef VIMAGE +FEATURE(vimage, "VIMAGE kernel virtualization"); +#endif + #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); @@ -107,7 +115,7 @@ .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE - .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, + .pr_flags = PR_HOST|PR_VNET|PR_VPS|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif @@ -171,6 +179,9 @@ {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, +#ifdef ENABLE_VPS + {"vps", 0, PR_VPS }, +#endif #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, @@ -627,6 +638,11 @@ vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } + if ((flags & JAIL_UPDATE) && (ch_flags & PR_VPS)) { + error = EINVAL; + vfs_opterror(opts, "vps cannot be changed after creation"); + goto done_errmsg; + } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { @@ -1801,6 +1817,43 @@ goto done_errmsg; } +#ifdef VIMAGE + /* Allocate a new vps if specified. 
*/ +#ifdef ENABLE_VPS + if (pr_flags & PR_VPS) { +#else + if (0) { +#endif + vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); + if ((error = change_dir(pr->pr_root, td)) != 0) + goto c_unlock; +#ifdef MAC + if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) + goto c_unlock; +#endif +c_unlock: + VOP_UNLOCK(pr->pr_root, 0); + if (error || (error = pwd_chroot(td, pr->pr_root))) { + vfs_opterror(opts, "vps chroot failed"); + if (!created) + prison_deref(pr, PD_DEREF); + goto done_errmsg; + } + + /* We temporarily need a ref as otherwise a prhold will panic. */ + mtx_lock(&pr->pr_mtx); + pr->pr_ref++; + pr->pr_uref++; + mtx_unlock(&pr->pr_mtx); + pr->pr_vps = vps_alloc(pr); + mtx_lock(&pr->pr_mtx); + pr->pr_ref--; + pr->pr_uref--; + mtx_unlock(&pr->pr_mtx); + } else { + pr->pr_vps = ppr->pr_vps; + } +#endif /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { mtx_lock(&pr->pr_mtx); @@ -2285,7 +2338,29 @@ /* * Kill all processes unfortunate enough to be attached to this prison. */ - sx_slock(&allproc_lock); +#ifdef VIMAGE +#ifdef ENABLE_VPS + if (pr->pr_vps) { + /* + * Send signal to init and let init do its job. + * This should run rc.shutdown and processes should go away. + * All but init? We need to catch the tail-end of reboot(2) + * and handle appropriately for the non-default vpss. + * vps_destroy() will ensure init and swapper will also go + * away and might sleep. If they do not go something will + * hold refs on cred and prisons. + * XXX There are other places which might do that for a long + * time as well. 
+ */ + CURVPS_SET(pr->pr_vps); + shutdown_nice(RB_HALT|RB_POWEROFF); + vps_destroy(pr->pr_vps); + CURVPS_RESTORE(); + } else +#endif +#endif + { + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && @@ -2293,7 +2368,8 @@ kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); + } /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, deuref | PD_DEREF); } @@ -2348,6 +2424,24 @@ struct ucred *newcred, *oldcred; int error; +#ifdef VIMAGE + /* + * Do not allow to migrate a process between virtual process spaces. + * Use the console to attach to it. Getting all process spaces things + * right, including a new pid, process group, session, terminal, + * tracing is one thing (with a lot of work) and may break apps if the + * pid changes, the pgrp no longer has the same (p)id; getting things + * restored to original state and properly re-parented is virtually + * impossible. So do what we do on a normal machine, present a terminal + * to login to. + */ + if (pr->pr_flags & PR_VPS) { + mtx_unlock(&pr->pr_mtx); + sx_sunlock(&allprison_lock); + return (EPERM); + } +#endif + /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two @@ -2628,6 +2722,8 @@ #ifdef VIMAGE if (pr->pr_vnet != ppr->pr_vnet) vnet_destroy(pr->pr_vnet); + KASSERT(pr->pr_vps == NULL, ("%s: pr %p pr_vps %p != NULL\n", + __func__, pr, pr->pr_vps)); #endif if (pr->pr_root != NULL) vrele(pr->pr_root); @@ -2912,9 +3008,9 @@ #ifdef VIMAGE /* * Determine whether the prison represented by cred owns - * its vnet rather than having it inherited. + * its vnet/vps rather than having it inherited. * - * Returns 1 in case the prison owns the vnet, 0 otherwise. 
*/ int prison_owns_vnet(struct ucred *cred) @@ -2926,6 +3022,17 @@ */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } + +int +prison_owns_vps(struct ucred *cred) +{ + + /* + * vps cannot be added/removed after jail creation, + * so no need to lock here. + */ + return (cred->cr_prison->pr_flags & PR_VPS ? 1 : 0); +} #endif /* @@ -3542,6 +3649,26 @@ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns vnet?"); +static int +sysctl_jail_vps(SYSCTL_HANDLER_ARGS) +{ + int error, havevps; +#ifdef VIMAGE + struct ucred *cred = req->td->td_ucred; + + havevps = jailed(cred) && prison_owns_vps(cred); +#else + havevps = 0; +#endif + error = SYSCTL_OUT(req, &havevps, sizeof(havevps)); + + return (error); +} + +SYSCTL_PROC(_security_jail, OID_AUTO, vps, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + sysctl_jail_vps, "I", "Jail owns vps?"); + #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, @@ -3697,6 +3824,10 @@ #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); +#ifdef ENABLE_VPS +SYSCTL_JAIL_PARAM(, vps, CTLTYPE_INT | CTLFLAG_RDTUN, + "E,jailsys", "Virtual process space"); +#endif #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); @@ -4023,12 +4154,12 @@ ASSERT_RACCT_ENABLED(); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return; } @@ -4055,7 +4186,7 @@ } #endif - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } @@ -4103,6 +4234,7 @@ ? 
pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); + db_printf(" vps = %p\n", pr->pr_vps); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); Index: sys/kern/kern_kthread.c =================================================================== --- sys/kern/kern_kthread.c +++ sys/kern/kern_kthread.c @@ -89,7 +89,7 @@ struct thread *td; struct proc *p2; - if (!proc0.p_stats) + if (!V_vproc0->p_stats) panic("kproc_create called too soon"); bzero(&fr, sizeof(fr)); @@ -164,11 +164,11 @@ * Reparent curthread from proc0 to init so that the zombie * is harvested. */ - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PROC_LOCK(p); - proc_reparent(p, initproc); + proc_reparent(p, V_initproc); PROC_UNLOCK(p); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); /* * Wakeup anyone waiting for us to exit. @@ -266,12 +266,12 @@ va_list ap; struct thread *newtd, *oldtd; - if (!proc0.p_stats) + if (!V_vproc0->p_stats) panic("kthread_add called too soon"); /* If no process supplied, put it on proc0 */ if (p == NULL) - p = &proc0; + p = V_vproc0; /* Initialize our new td */ newtd = thread_alloc(pages); Index: sys/kern/kern_ktrace.c =================================================================== --- sys/kern/kern_ktrace.c +++ sys/kern/kern_ktrace.c @@ -955,7 +955,7 @@ int vrele_count; vrele_count = 0; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_tracevp == vp) { @@ -970,7 +970,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (vrele_count > 0) { while (vrele_count-- > 0) vrele(vp); @@ -980,14 +980,14 @@ /* * do it */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (uap->pid < 0) { /* * by process group */ pg = pgfind(-uap->pid); if (pg == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); error = ESRCH; goto done; } @@ -1011,7 +1011,7 @@ ret |= ktrops(td, p, 
ops, facs, vp); } if (nfound == 0) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); error = ESRCH; goto done; } @@ -1027,7 +1027,7 @@ if (error) { if (p != NULL) PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); goto done; } if (descend) @@ -1035,7 +1035,7 @@ else ret |= ktrops(td, p, ops, facs, vp); } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); if (!ret) error = EPERM; done: @@ -1143,7 +1143,7 @@ p = top; PROC_LOCK_ASSERT(p, MA_OWNED); - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); for (;;) { ret |= ktrops(td, p, ops, facs, vp); /* @@ -1270,7 +1270,7 @@ * credentials for the operation. */ cred = NULL; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_tracevp == vp) { @@ -1285,7 +1285,7 @@ cred = NULL; } } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); while (vrele_count-- > 0) vrele(vp); Index: sys/kern/kern_mib.c =================================================================== --- sys/kern/kern_mib.c +++ sys/kern/kern_mib.c @@ -556,8 +556,8 @@ error = sysctl_handle_int(oidp, &pm, 0, req); if (error || !req->newptr) return (error); - sx_xlock(&proctree_lock); - sx_xlock(&allproc_lock); + sx_xlock(&V_proctree_lock); + sx_xlock(&V_allproc_lock); /* * Only permit the values less then PID_MAX. 
@@ -567,8 +567,8 @@ error = EINVAL; else pid_max = pm; - sx_xunlock(&allproc_lock); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_allproc_lock); + sx_xunlock(&V_proctree_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT | Index: sys/kern/kern_proc.c =================================================================== --- sys/kern/kern_proc.c +++ sys/kern/kern_proc.c @@ -126,15 +126,21 @@ /* * Other process lists */ -struct pidhashhead *pidhashtbl; -u_long pidhash; -struct pgrphashhead *pgrphashtbl; -u_long pgrphash; -struct proclist allproc; -struct proclist zombproc; +VPS_DEFINE(struct pidhashhead *, pidhashtbl); +VPS_DEFINE(u_long, pidhash); +VPS_DEFINE(struct pgrphashhead *, pgrphashtbl); +VPS_DEFINE(u_long, pgrphash); +VPS_DEFINE(struct proclist, allproc); +VPS_DEFINE(struct proclist, zombproc); +#ifndef VIMAGE struct sx __exclusive_cache_line allproc_lock; struct sx __exclusive_cache_line proctree_lock; struct mtx __exclusive_cache_line ppeers_lock; +#else +VPS_DEFINE(struct sx, allproc_lock); +VPS_DEFINE(struct sx, proctree_lock); +VPS_DEFINE(struct mtx, ppeers_lock); +#endif uma_zone_t proc_zone; /* @@ -179,22 +185,22 @@ /* * Initialize global process hashing structures. 
*/ -void +static void procinit(void) { - sx_init(&allproc_lock, "allproc"); - sx_init(&proctree_lock, "proctree"); - mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF); - LIST_INIT(&allproc); - LIST_INIT(&zombproc); - pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); - pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); + sx_init(&V_allproc_lock, "allproc"); + sx_init(&V_proctree_lock, "proctree"); + mtx_init(&V_ppeers_lock, "p_peers", NULL, MTX_DEF); + LIST_INIT(&V_allproc); + LIST_INIT(&V_zombproc); + V_pidhashtbl = hashinit(maxproc / 4, M_PROC, &V_pidhash); + V_pgrphashtbl = hashinit(maxproc / 4, M_PROC, &V_pgrphash); proc_zone = uma_zcreate("PROC", sched_sizeof_proc(), proc_ctor, proc_dtor, proc_init, proc_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uihashinit(); } +VPS_SYSINIT(procinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procinit, NULL); /* * Prepare a proc for use. @@ -303,7 +309,7 @@ inferior(struct proc *p) { - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); for (; p != curproc; p = proc_realparent(p)) { if (p->p_pid == 0) @@ -317,7 +323,7 @@ { struct proc *p; - sx_assert(&allproc_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); LIST_FOREACH(p, PIDHASH(pid), p_hash) { if (p->p_pid == pid) { PROC_LOCK(p); @@ -347,9 +353,9 @@ PROC_LOCK(p); return (p); } - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); p = pfind_locked(pid); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (p); } @@ -361,11 +367,11 @@ { struct proc *p; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); p = pfind_locked(pid); if (p == NULL) p = zpfind_locked(pid); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (p); } @@ -376,7 +382,7 @@ struct proc *p; struct thread *td; - sx_assert(&allproc_lock, SX_LOCKED); + sx_assert(&V_allproc_lock, SX_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -402,7 +408,7 @@ { struct pgrp *pgrp; - 
sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) { if (pgrp->pg_id == pgid) { @@ -426,7 +432,7 @@ if (p->p_pid == pid) { PROC_LOCK(p); } else { - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); if (pid <= PID_MAX) { p = pfind_locked(pid); if (p == NULL && (flags & PGET_NOTWEXIT) == 0) @@ -436,7 +442,7 @@ } else { p = NULL; } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (p == NULL) return (ESRCH); if ((flags & PGET_CANSEE) != 0) { @@ -486,7 +492,7 @@ enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL")); KASSERT(p->p_pid == pgid, @@ -547,7 +553,7 @@ enterthispgrp(struct proc *p, struct pgrp *pgrp) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); @@ -573,7 +579,7 @@ { struct pgrp *savepgrp; - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED); @@ -610,7 +616,7 @@ { struct pgrp *savepgrp; - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); savepgrp = p->p_pgrp; PGRP_LOCK(savepgrp); PROC_LOCK(p); @@ -632,7 +638,7 @@ struct session *savesess; struct tty *tp; - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); @@ -691,7 +697,7 @@ struct session *mysession; struct proc *q; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_NOTOWNED); PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED); SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED); @@ -744,7 
+750,7 @@ } PROC_UNLOCK(p); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); if (SESS_LEADER(p)) { sp = p->p_session; @@ -781,17 +787,17 @@ } if (ttyvp != NULL) { - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) { VOP_REVOKE(ttyvp, REVOKEALL); VOP_UNLOCK(ttyvp, 0); } vrele(ttyvp); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); } } fixjobc(p, p->p_pgrp, 0); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } /* @@ -851,10 +857,10 @@ struct proc *p; int i; - for (i = 0; i <= pgrphash; i++) { - if (!LIST_EMPTY(&pgrphashtbl[i])) { + for (i = 0; i <= V_pgrphash; i++) { + if (!LIST_EMPTY(&V_pgrphashtbl[i])) { printf("\tindx %d\n", i); - LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { + LIST_FOREACH(pgrp, &V_pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, @@ -910,7 +916,7 @@ struct timeval boottime; /* For proc_realparent. */ - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); bzero(kp, sizeof(*kp)); @@ -1019,7 +1025,7 @@ kp->ki_kiflag |= KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; - /* XXX proctree_lock */ + /* XXX V_proctree_lock */ tp = sp->s_ttyp; SESS_UNLOCK(sp); } @@ -1209,8 +1215,8 @@ { struct proc *p; - sx_assert(&allproc_lock, SX_LOCKED); - LIST_FOREACH(p, &zombproc, p_list) { + sx_assert(&V_allproc_lock, SX_LOCKED); + LIST_FOREACH(p, &V_zombproc, p_list) { if (p->p_pid == pid) { PROC_LOCK(p); break; @@ -1227,9 +1233,9 @@ { struct proc *p; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); p = zpfind_locked(pid); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (p); } @@ -1465,11 +1471,11 @@ error = sysctl_wire_old_buffer(req, 0); if (error) return (error); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); error = pget((pid_t)name[0], PGET_CANSEE, &p); if (error == 0) error = sysctl_out_proc(p, req, 
flags); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (error); } @@ -1502,14 +1508,14 @@ * traced process. Only grab it if we are producing any * data to begin with. */ - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); } - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) - p = LIST_FIRST(&allproc); + p = LIST_FIRST(&V_allproc); else - p = LIST_FIRST(&zombproc); + p = LIST_FIRST(&V_zombproc); for (; p != NULL; p = LIST_NEXT(p, p_list)) { /* * Skip embryonic processes. @@ -1569,7 +1575,7 @@ PROC_UNLOCK(p); continue; } - /* XXX proctree_lock */ + /* XXX V_proctree_lock */ SESS_LOCK(p->p_session); if (p->p_session->s_ttyp == NULL || tty_udev(p->p_session->s_ttyp) != @@ -1609,9 +1615,9 @@ } } out: - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (req->oldptr != NULL) - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (error); } @@ -3101,11 +3107,11 @@ cp = curproc; allproc_loop: - sx_xlock(&allproc_lock); + sx_xlock(&V_allproc_lock); gen = allproc_gen; seen_exiting = seen_stopped = stopped_some = restart = false; LIST_REMOVE(cp, p_list); - LIST_INSERT_HEAD(&allproc, cp, p_list); + LIST_INSERT_HEAD(&V_allproc, cp, p_list); for (;;) { p = LIST_NEXT(cp, p_list); if (p == NULL) @@ -3135,7 +3141,7 @@ continue; } _PHOLD(p); - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); r = thread_single(p, SINGLE_ALLPROC); if (r != 0) restart = true; @@ -3143,12 +3149,12 @@ stopped_some = true; _PRELE(p); PROC_UNLOCK(p); - sx_xlock(&allproc_lock); + sx_xlock(&V_allproc_lock); } /* Catch forked children we did not see in iteration. 
*/ if (gen != allproc_gen) restart = true; - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); if (restart || stopped_some || seen_exiting || seen_stopped) { kern_yield(PRI_USER); goto allproc_loop; @@ -3161,10 +3167,10 @@ struct proc *cp, *p; cp = curproc; - sx_xlock(&allproc_lock); + sx_xlock(&V_allproc_lock); again: LIST_REMOVE(cp, p_list); - LIST_INSERT_HEAD(&allproc, cp, p_list); + LIST_INSERT_HEAD(&V_allproc, cp, p_list); for (;;) { p = LIST_NEXT(cp, p_list); if (p == NULL) @@ -3173,12 +3179,12 @@ LIST_INSERT_AFTER(p, cp, p_list); PROC_LOCK(p); if ((p->p_flag & P_TOTAL_STOP) != 0) { - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); _PHOLD(p); thread_single_end(p, SINGLE_ALLPROC); _PRELE(p); PROC_UNLOCK(p); - sx_xlock(&allproc_lock); + sx_xlock(&V_allproc_lock); } else { PROC_UNLOCK(p); } @@ -3189,7 +3195,7 @@ if ((p->p_flag & P_TOTAL_STOP) != 0) goto again; } - sx_xunlock(&allproc_lock); + sx_xunlock(&V_allproc_lock); } /* #define TOTAL_STOP_DEBUG 1 */ Index: sys/kern/kern_procctl.c =================================================================== --- sys/kern/kern_procctl.c +++ sys/kern/kern_procctl.c @@ -69,7 +69,7 @@ p = top; ret = 0; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); for (;;) { ret |= protect_setchild(td, p, flags); PROC_UNLOCK(p); @@ -128,7 +128,7 @@ reap_acquire(struct thread *td, struct proc *p) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); if (p != curproc) return (EPERM); if ((p->p_treeflag & P_TREE_REAPER) != 0) @@ -145,10 +145,10 @@ reap_release(struct thread *td, struct proc *p) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); if (p != curproc) return (EPERM); - if (p == initproc) + if (p == V_initproc) return (EINVAL); if ((p->p_treeflag & P_TREE_REAPER) == 0) return (EINVAL); @@ -162,7 +162,7 @@ { struct proc *reap, *p2, *first_p; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, 
SX_LOCKED); bzero(rs, sizeof(*rs)); if ((p->p_treeflag & P_TREE_REAPER) == 0) { reap = p->p_reaper; @@ -170,7 +170,7 @@ reap = p; rs->rs_flags |= REAPER_STATUS_OWNED; } - if (reap == initproc) + if (reap == V_initproc) rs->rs_flags |= REAPER_STATUS_REALINIT; rs->rs_reaper = reap->p_pid; rs->rs_descendants = 0; @@ -199,18 +199,18 @@ u_int i, n; int error; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); PROC_UNLOCK(p); reap = (p->p_treeflag & P_TREE_REAPER) == 0 ? p->p_reaper : p; n = i = 0; error = 0; LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling) n++; - sx_unlock(&proctree_lock); + sx_unlock(&V_proctree_lock); if (rp->rp_count < n) n = rp->rp_count; pi = malloc(n * sizeof(*pi), M_TEMP, M_WAITOK); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling) { if (i == n) break; @@ -225,10 +225,10 @@ pip->pi_flags |= REAPER_PIDINFO_REAPER; i++; } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); error = copyout(pi, rp->rp_pids, i * sizeof(*pi)); free(pi, M_TEMP); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); PROC_LOCK(p); return (error); } @@ -278,7 +278,7 @@ struct reap_kill_tracker *t; int error; - sx_assert(&proctree_lock, SX_LOCKED); + sx_assert(&V_proctree_lock, SX_LOCKED); if (IN_CAPABILITY_MODE(td)) return (ECAPMODE); if (rk->rk_sig <= 0 || rk->rk_sig > _SIG_MAXSIG || @@ -585,12 +585,12 @@ case PROC_REAP_KILL: case PROC_TRACE_CTL: case PROC_TRAPCAP_CTL: - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); tree_locked = true; break; case PROC_REAP_ACQUIRE: case PROC_REAP_RELEASE: - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); tree_locked = true; break; case PROC_TRACE_STATUS: @@ -657,6 +657,6 @@ break; } if (tree_locked) - sx_unlock(&proctree_lock); + sx_unlock(&V_proctree_lock); return (error); } Index: sys/kern/kern_prot.c =================================================================== --- sys/kern/kern_prot.c +++ 
sys/kern/kern_prot.c @@ -133,10 +133,10 @@ PROC_UNLOCK(p); } else { PROC_UNLOCK(p); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); pp = proc_realparent(p); ppid = pp->p_pid; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } return (ppid); @@ -340,7 +340,7 @@ newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) @@ -353,7 +353,7 @@ newsess = NULL; } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); if (newpgrp != NULL) free(newpgrp, M_PGRP); @@ -399,7 +399,7 @@ newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; @@ -457,7 +457,7 @@ error = enterthispgrp(targp, pgrp); } done: - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); KASSERT((error == 0) || (newpgrp != NULL), ("setpgid failed and newpgrp is NULL")); if (newpgrp != NULL) @@ -1738,7 +1738,7 @@ } /* Can't trace init when securelevel > 0. 
*/ - if (p == initproc) { + if (p == V_initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); Index: sys/kern/kern_racct.c =================================================================== --- sys/kern/kern_racct.c +++ sys/kern/kern_racct.c @@ -1226,9 +1226,9 @@ for (;;) { racct_decay(); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); - LIST_FOREACH(p, &zombproc, p_list) { + LIST_FOREACH(p, &V_zombproc, p_list) { PROC_LOCK(p); racct_set(p, RACCT_PCTCPU, 0); PROC_UNLOCK(p); @@ -1301,7 +1301,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); pause("-", hz); } } Index: sys/kern/kern_rctl.c =================================================================== --- sys/kern/kern_rctl.c +++ sys/kern/kern_rctl.c @@ -1175,7 +1175,7 @@ error = str2id(subject_idstr, &id); if (error != 0) goto out; - sx_assert(&allproc_lock, SA_LOCKED); + sx_assert(&V_allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; @@ -1357,7 +1357,7 @@ * Now go through all the processes and add the new rule to the ones * it applies to. 
*/ - sx_assert(&allproc_lock, SA_LOCKED); + sx_assert(&V_allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { @@ -1452,7 +1452,7 @@ rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); - sx_assert(&allproc_lock, SA_LOCKED); + sx_assert(&V_allproc_lock, SA_LOCKED); RACCT_LOCK(); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); @@ -1623,11 +1623,11 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } @@ -1669,7 +1669,7 @@ } out: rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (error != 0) return (error); @@ -1718,17 +1718,17 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (E2BIG); } @@ -1777,7 +1777,7 @@ error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); free(buf, M_RCTL); return (error); } @@ -1803,34 +1803,34 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { 
rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (E2BIG); } @@ -1860,7 +1860,7 @@ error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); free(buf, M_RCTL); return (error); } @@ -1883,11 +1883,11 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } /* @@ -1906,7 +1906,7 @@ out: rctl_rule_release(rule); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } @@ -1928,17 +1928,17 @@ if (error != 0) return (error); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); return (error); } Index: sys/kern/kern_resource.c =================================================================== --- sys/kern/kern_resource.c +++ sys/kern/kern_resource.c @@ -69,10 +69,15 @@ static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); -#define UIHASH(uid) (&uihashtbl[(uid) & uihash]) + static struct rwlock uihashtbl_lock; -static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; -static u_long uihash; /* size of hash table - 1 */ + +LIST_HEAD(uihashhead, uidinfo); 
+static VPS_DEFINE(struct uihashhead *, uihashtbl); +#define V_uihashtbl VPS(uihashtbl) +static VPS_DEFINE(u_long, uihash); /* size of hash table - 1 */ +#define V_uihash VPS(uihash) +#define UIHASH(uid) (&V_uihashtbl[(uid) & V_uihash]) static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); @@ -114,18 +119,18 @@ break; case PRIO_PGRP: - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); break; } } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && @@ -141,7 +146,7 @@ case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && @@ -152,7 +157,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); break; default: @@ -199,18 +204,18 @@ break; case PRIO_PGRP: - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); break; } } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && @@ -226,7 +231,7 @@ case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && @@ -237,7 +242,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); break; default: @@ -1214,13 +1219,22 @@ p->p_sysent->sv_fixlimit(rlp, which); } -void +static void uihashinit() { - uihashtbl = 
hashinit(maxproc / 16, M_UIDINFO, &uihash); rw_init(&uihashtbl_lock, "uidinfo hash"); } +SYSINIT(uihashinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit, NULL); + +static void +uihashinit_vps() +{ + + V_uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &V_uihash); +} +VPS_SYSINIT(uihashinit_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit_vps, + NULL); /* * Look up a uidinfo struct for the parameter uid. @@ -1368,7 +1382,7 @@ rw_rlock(&uihashtbl_lock); if (pre != NULL) (pre)(); - for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) { + for (uih = &V_uihashtbl[V_uihash]; uih >= V_uihashtbl; uih--) { LIST_FOREACH(uip, uih, ui_hash) { (callback)(uip->ui_racct, arg2, arg3); } Index: sys/kern/kern_shutdown.c =================================================================== --- sys/kern/kern_shutdown.c +++ sys/kern/kern_shutdown.c @@ -289,16 +289,16 @@ howto = (uintptr_t)arg; /* Send a signal to init(8) and have it shutdown the world. */ - PROC_LOCK(initproc); + PROC_LOCK(V_initproc); if (howto & RB_POWEROFF) - kern_psignal(initproc, SIGUSR2); + kern_psignal(V_initproc, SIGUSR2); else if (howto & RB_POWERCYCLE) - kern_psignal(initproc, SIGWINCH); + kern_psignal(V_initproc, SIGWINCH); else if (howto & RB_HALT) - kern_psignal(initproc, SIGUSR1); + kern_psignal(V_initproc, SIGUSR1); else - kern_psignal(initproc, SIGINT); - PROC_UNLOCK(initproc); + kern_psignal(V_initproc, SIGINT); + PROC_UNLOCK(V_initproc); } static struct task shutdown_nice_task = TASK_INITIALIZER(0, @@ -311,7 +311,7 @@ shutdown_nice(int howto) { - if (initproc != NULL && !SCHEDULER_STOPPED()) { + if (V_initproc != NULL && !SCHEDULER_STOPPED()) { shutdown_nice_task.ta_context = (void *)(uintptr_t)howto; taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task); } else { @@ -462,7 +462,7 @@ struct mount *mp, *devmp; int error; - if (curproc != initproc) + if (curproc != V_initproc) return (EPERM); /* Index: sys/kern/kern_sig.c =================================================================== --- 
sys/kern/kern_sig.c +++ sys/kern/kern_sig.c @@ -1671,7 +1671,7 @@ /* * broadcast */ - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { @@ -1688,9 +1688,9 @@ ret = err; PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); } else { - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. @@ -1700,11 +1700,11 @@ } else { pgrp = pgfind(pgid); if (pgrp == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (ESRCH); } } - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || @@ -1891,9 +1891,9 @@ struct pgrp *pgrp; if (pgid != 0) { - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); pgrp = pgfind(pgid); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); @@ -3279,7 +3279,7 @@ /* * Protect the access to corefilename[] by allproc_lock. 
*/ -#define corefilename_lock allproc_lock +#define corefilename_lock V_allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); Index: sys/kern/kern_sysctl.c =================================================================== --- sys/kern/kern_sysctl.c +++ sys/kern/kern_sysctl.c @@ -60,6 +60,7 @@ #include #include #include +#include #ifdef KTRACE #include #endif @@ -498,6 +499,7 @@ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && #ifdef VIMAGE (oidp->oid_kind & CTLFLAG_VNET) == 0 && + (oidp->oid_kind & CTLFLAG_VPS) == 0 && #endif (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { @@ -1998,6 +2000,9 @@ else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; + else if ((oid->oid_kind & CTLFLAG_VPS) && + prison_owns_vps(req->td->td_ucred)) + priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; @@ -2025,8 +2030,13 @@ goto out; #endif #ifdef VIMAGE + KASSERT(((oid->oid_kind & (CTLFLAG_VNET|CTLFLAG_VPS)) != + (CTLFLAG_VNET|CTLFLAG_VPS)), + ("CTLFLAG VNET and VPS set oid %p", oid)); if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); + if ((oid->oid_kind & CTLFLAG_VPS) && arg1 != NULL) + arg1 = (void *)(curvps->vps_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -83,7 +83,7 @@ "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x470, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x518, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x528, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); 
@@ -103,7 +103,7 @@ "struct thread KBI td_pflags"); _Static_assert(offsetof(struct thread, td_frame) == 0x2e8, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x334, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x33c, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); Index: sys/kern/kern_vps.c =================================================================== --- /dev/null +++ sys/kern/kern_vps.c @@ -0,0 +1,828 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2004-2009 University of Zagreb + * Copyright (c) 2006-2009 FreeBSD Foundation + * Copyright (c) 2018 iXsystems, Inc. + * All rights reserved. + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Portions of this software were developed by Bjoern Zeeb + * under sponsorship from iXsystems, Inc. + * + * Copyright (c) 2009 Jeffrey Roberson + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_kdb.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef DDB +#include +#include +#endif + + +/*- + * This file implements core functions for virtual process spaces: + * + * - Virtual process space management functions. + * + * - Virtual process space memory allocator, which virtualizes global + * variables in the process space. + * + * - Virtualized SYSINIT's/SYSUNINIT's, which allow process spaces + * to register startup/shutdown events to be run for each virtual process + * space instance. + */ + +static MALLOC_DEFINE(M_VPS, "vps", "process space control block"); + +/* + * The virtual process space list has two read-write locks, one sleepable and + * the other not, so that the list can be stablized and walked in a variety + * of process space contexts. Both must be acquired exclusively to modify + * the list, but a read lock of either lock is sufficient to walk the list. 
+ */ +struct rwlock vps_rwlock; +struct sx vps_sxlock; + +#define VPS_LIST_WLOCK() do { \ + sx_xlock(&vps_sxlock); \ + rw_wlock(&vps_rwlock); \ +} while (0) + +#define VPS_LIST_WUNLOCK() do { \ + rw_wunlock(&vps_rwlock); \ + sx_xunlock(&vps_sxlock); \ +} while (0) + +struct vps_list_head vps_head; +struct vps *vps0; + +/* + * The virtual process space allocator provides storage for virtualized + * global variables. These variables are defined/declared using the + * VPS_DEFINE()/VPS_DECLARE() macros, which place them in the 'set_vps' + * linker set. The details of the implementation are somewhat subtle, but + * allow the majority of most process subsystems to maintain + * virtualization-agnostic. + * + * The virtual process space allocator handles variables in the base kernel + * vs. modules in similar but different ways. In both cases, virtualized + * global variables are marked as such by being declared to be part of the + * vps linker set. These "primary" copies of global variables serve two + * functions: + * + * (1) They contain static initialization or "default" values for global + * variables which will be propagated to each virtual process space + * instance when created. As with normal global variables, they default + * to zero-filled. + * + * (2) They act as unique global names by which the variable can be referred + * to, regardless of process space instance. The single global symbol + * will be used to calculate the location of a per-virtual instance + * variable at run-time. + * + * Each virtual process space instance has a complete copy of each + * virtualized global variable, stored in a malloc'd block of memory + * referred to by vps->vps_data_mem. Critical to the design is that each + * per-instance memory block is laid out identically to the primary block so + * that the offset of each global variable is the same across all blocks. 
+ * To optimize run-time access, a precalculated 'base' address, + * vps->vps_data_base, is stored in each vps, and is the amount that can + * be added to the address of a 'primary' instance of a variable to get to the + * per-vps instance. + * + * Virtualized global variables are handled in a similar manner, but as each + * module has its own 'set_vps' linker set, and we want to keep all + * virtualized globals together, we reserve space in the kernel's linker set + * for potential module variables using a per-vps character array, + * 'modspace'. The virtual process space allocator maintains a free list to + * track what space in the array is free (all, initially) and as modules are + * linked, allocates portions of the space to specific globals. The kernel + * module linker queries the virtual process space allocator and will + * bind references of the global to the location during linking. It also + * calls into the virtual process space allocator, once the memory is + * initialized, in order to propagate the new static initializations to all + * existing virtual process space instances so that the soon-to-be executing + * module will find every process space instance with proper default values. + */ + +/* + * Number of bytes of data in the 'set_vps' linker set, and hence the total + * size of all kernel virtualized global variables, and the malloc(9) type + * that will be used to allocate it. + */ +#define VPS_BYTES (VPS_STOP - VPS_START) + +static MALLOC_DEFINE(M_VPS_DATA, "vps_data", "VPS data"); + +/* + * VPS_MODMIN is the minimum number of bytes we will reserve for the sum of + * global variables across all loaded modules. As this actually sizes an + * array declared as a virtualized global variable in the kernel itself, and + * we want the virtualized global variable space to be page-sized, we may + * have more space than that in practice.
+ */ +#define VPS_MODMIN 8192 +#define VPS_SIZE roundup2(VPS_BYTES, PAGE_SIZE) + +/* + * Space to store virtualized global variables from loadable kernel modules, + * and the free list to manage it. + */ +static VPS_DEFINE(char, modspace[VPS_MODMIN]); + +/* + * Global lists of subsystem constructor and destructors for vpss. They are + * registered via VPS_SYSINIT() and VPS_SYSUNINIT(). Both lists are + * protected by the vps_sysinit_sxlock global lock. + */ +static TAILQ_HEAD(vps_sysinit_head, vps_sysinit) vps_constructors = + TAILQ_HEAD_INITIALIZER(vps_constructors); +static TAILQ_HEAD(vps_sysuninit_head, vps_sysinit) vps_destructors = + TAILQ_HEAD_INITIALIZER(vps_destructors); + +struct sx vps_sysinit_sxlock; + +#define VPS_SYSINIT_WLOCK() sx_xlock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_WUNLOCK() sx_xunlock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_RLOCK() sx_slock(&vps_sysinit_sxlock); +#define VPS_SYSINIT_RUNLOCK() sx_sunlock(&vps_sysinit_sxlock); + +/* XXX-BZ should probably be vpd_* instead of vnd_* but in the hope to + * harmonize most of this later on keep the names the same for now. */ +struct vps_data_free { + uintptr_t vnd_start; + int vnd_len; + TAILQ_ENTRY(vps_data_free) vnd_link; +}; + +static MALLOC_DEFINE(M_VPS_DATA_FREE, "vps_data_free", + "VPS resource accounting"); +static TAILQ_HEAD(, vps_data_free) vps_data_free_head = + TAILQ_HEAD_INITIALIZER(vps_data_free_head); +static struct sx vps_data_free_lock; + +SDT_PROVIDER_DEFINE(vps); +SDT_PROBE_DEFINE1(vps, functions, vps_alloc, entry, "int"); +SDT_PROBE_DEFINE2(vps, functions, vps_alloc, alloc, "int", "struct vps *"); +SDT_PROBE_DEFINE2(vps, functions, vps_alloc, return, "int", "struct vps *"); +SDT_PROBE_DEFINE2(vps, functions, vps_destroy, entry, "int", "struct vps *"); +SDT_PROBE_DEFINE1(vps, functions, vps_destroy, return, "int"); + +#ifdef DDB +static void db_show_vps_print_vs(struct vps_sysinit *, int); +#endif + +/* + * Allocate a virtual process space. 
+ */ +struct vps * +vps_alloc(struct prison *pr) +{ + struct vps *vps; + + SDT_PROBE1(vps, functions, vps_alloc, entry, __LINE__); + vps = malloc(sizeof(struct vps), M_VPS, M_WAITOK | M_ZERO); + vps->vps_magic_n = VPS_MAGIC_N; + vps->vps_state = 0; + vps->vps_pr = pr; + /* Cheat for vps_sysinit() to get creds right. */ + pr->pr_vps = vps; + SDT_PROBE2(vps, functions, vps_alloc, alloc, __LINE__, vps); + + /* + * Allocate storage for virtualized global variables and copy in + * initial values from our 'primary' copy. + */ + vps->vps_data_mem = malloc(VPS_SIZE, M_VPS_DATA, M_WAITOK); + memcpy(vps->vps_data_mem, (void *)VPS_START, VPS_BYTES); + + /* + * All use of vps-specific data will immediately subtract VPS_START + * from the base memory pointer, so pre-calculate that now to avoid + * it on each use. + */ + vps->vps_data_base = (uintptr_t)vps->vps_data_mem - VPS_START; + + /* Initialize / attach vps module instances. */ + CURVPS_SET_QUIET(vps); + vps_sysinit(); + CURVPS_RESTORE(); + + VPS_LIST_WLOCK(); + LIST_INSERT_HEAD(&vps_head, vps, vps_le); + VPS_LIST_WUNLOCK(); + + SDT_PROBE2(vps, functions, vps_alloc, return, __LINE__, vps); + return (vps); +} + +/* + * Destroy a virtual process space. + */ +void +vps_destroy(struct vps *vps) +{ + + SDT_PROBE2(vps, functions, vps_destroy, entry, __LINE__, vps); + + VPS_LIST_WLOCK(); + LIST_REMOVE(vps, vps_le); + VPS_LIST_WUNLOCK(); + + CURVPS_SET_QUIET(vps); + vps_sysuninit(); + CURVPS_RESTORE(); + + /* + * Release storage for the virtual process space instance. + */ + free(vps->vps_data_mem, M_VPS_DATA); + vps->vps_data_mem = NULL; + vps->vps_data_base = 0; + vps->vps_pr->pr_vps = NULL; + vps->vps_pr = NULL; + vps->vps_magic_n = 0xdeadbeef; + free(vps, M_VPS); + SDT_PROBE1(vps, functions, vps_destroy, return, __LINE__); +} + +/* + * Boot time initialization and allocation of virtual process space.
+ */ +static void +vps_init_prelink(void *arg __unused) +{ + + rw_init(&vps_rwlock, "vps_rwlock"); + sx_init(&vps_sxlock, "vps_sxlock"); + sx_init(&vps_sysinit_sxlock, "vps_sysinit_sxlock"); + LIST_INIT(&vps_head); +} +SYSINIT(vps_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST, + vps_init_prelink, NULL); + +static void +vps0_init(void *arg __unused) +{ + + if (bootverbose) + printf("VIMAGE (virtualized process space) enabled\n"); + + /* + * We MUST clear curvps in vi_init_done() before going SMP, + * otherwise CURVPS_SET() macros would scream about unnecessary + * curvps recursions. + */ + curvps = prison0.pr_vps = vps0 = vps_alloc(&prison0); +} +SYSINIT(vps0_init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, vps0_init, NULL); + +#if 0 +/* Compared to vnets, nuking the vps of the current thread does not go down well. */ +static void +vps_init_done(void *unused __unused) +{ + + curvps = NULL; +} +SYSINIT(vps_init_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, vps_init_done, NULL); +#endif + +/* + * Once on boot, initialize the modspace freelist to entirely cover modspace. + */ +static void +vps_data_startup(void *dummy __unused) +{ + struct vps_data_free *df; + + df = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO); + df->vnd_start = (uintptr_t)&VPS_NAME(modspace); + df->vnd_len = VPS_MODMIN; + TAILQ_INSERT_HEAD(&vps_data_free_head, df, vnd_link); + sx_init(&vps_data_free_lock, "vps_data alloc lock"); +} +SYSINIT(vps_data, SI_SUB_KLD, SI_ORDER_FIRST, vps_data_startup, 0); + +/* Dummy VPS_SYSINIT to make sure we always reach the final end state. */ +static void +vps_sysinit_done(void *unused __unused) +{ + + return; +} +VPS_SYSINIT(vps_sysinit_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, + vps_sysinit_done, NULL); + +/* + * When a module is loaded and requires storage for a virtualized global + * variable, allocate space from the modspace free list. This interface + * should be used only by the kernel linker. 
+ */ +void * +vps_data_alloc(int size) +{ + struct vps_data_free *df; + void *s; + + s = NULL; + size = roundup2(size, sizeof(void *)); + sx_xlock(&vps_data_free_lock); + TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) { + if (df->vnd_len < size) + continue; + if (df->vnd_len == size) { + s = (void *)df->vnd_start; + TAILQ_REMOVE(&vps_data_free_head, df, vnd_link); + free(df, M_VPS_DATA_FREE); + break; + } + s = (void *)df->vnd_start; + df->vnd_len -= size; + df->vnd_start = df->vnd_start + size; + break; + } + sx_xunlock(&vps_data_free_lock); + + return (s); +} + +/* + * Free space for a virtualized global variable on module unload. + */ +void +vps_data_free(void *start_arg, int size) +{ + struct vps_data_free *df; + struct vps_data_free *dn; + uintptr_t start; + uintptr_t end; + + size = roundup2(size, sizeof(void *)); + start = (uintptr_t)start_arg; + end = start + size; + /* + * Free a region of space and merge it with as many neighbors as + * possible. Keeping the list sorted simplifies this operation. + */ + sx_xlock(&vps_data_free_lock); + TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) { + if (df->vnd_start > end) + break; + /* + * If we expand at the end of an entry we may have to merge + * it with the one following it as well. 
+ */ + if (df->vnd_start + df->vnd_len == start) { + df->vnd_len += size; + dn = TAILQ_NEXT(df, vnd_link); + if (dn != NULL && df->vnd_start + df->vnd_len == dn->vnd_start) { + df->vnd_len += dn->vnd_len; + TAILQ_REMOVE(&vps_data_free_head, dn, + vnd_link); + free(dn, M_VPS_DATA_FREE); + } + sx_xunlock(&vps_data_free_lock); + return; + } + if (df->vnd_start == end) { + df->vnd_start = start; + df->vnd_len += size; + sx_xunlock(&vps_data_free_lock); + return; + } + } + dn = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO); + dn->vnd_start = start; + dn->vnd_len = size; + if (df) + TAILQ_INSERT_BEFORE(df, dn, vnd_link); + else + TAILQ_INSERT_TAIL(&vps_data_free_head, dn, vnd_link); + sx_xunlock(&vps_data_free_lock); +} + +/* + * When a new virtualized global variable has been allocated, propagate its + * initial value to each already-allocated virtual process space instance. + */ +void +vps_data_copy(void *start, int size) +{ + struct vps *vps; + + VPS_LIST_RLOCK(); + LIST_FOREACH(vps, &vps_head, vps_le) + memcpy((void *)((uintptr_t)vps->vps_data_base + + (uintptr_t)start), start, size); + VPS_LIST_RUNLOCK(); +} + +/* + * Support for special SYSINIT handlers registered via VPS_SYSINIT() + * and VPS_SYSUNINIT(). + */ +void +vps_register_sysinit(void *arg) +{ + struct vps_sysinit *vs, *vs2; + struct vps *vps; + + vs = arg; + KASSERT(vs->subsystem >= SI_SUB_INTRINSIC, ("vps sysinit too early")); + + /* Add the constructor to the global list of vps constructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_FOREACH(vs2, &vps_constructors, link) { + if (vs2->subsystem > vs->subsystem) + break; + if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) + break; + } + if (vs2 != NULL) + TAILQ_INSERT_BEFORE(vs2, vs, link); + else + TAILQ_INSERT_TAIL(&vps_constructors, vs, link); + + /* + * Invoke the constructor on all the existing vpss when it is + * registered.
+ */ + VPS_FOREACH(vps) { + CURVPS_SET_QUIET(vps); + vs->func(vs->arg); + CURVPS_RESTORE(); + } + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_deregister_sysinit(void *arg) +{ + struct vps_sysinit *vs; + + vs = arg; + + /* Remove the constructor from the global list of vps constructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_REMOVE(&vps_constructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_register_sysuninit(void *arg) +{ + struct vps_sysinit *vs, *vs2; + + vs = arg; + + /* Add the destructor to the global list of vps destructors. */ + VPS_SYSINIT_WLOCK(); + TAILQ_FOREACH(vs2, &vps_destructors, link) { + if (vs2->subsystem > vs->subsystem) + break; + if (vs2->subsystem == vs->subsystem && vs2->order > vs->order) + break; + } + if (vs2 != NULL) + TAILQ_INSERT_BEFORE(vs2, vs, link); + else + TAILQ_INSERT_TAIL(&vps_destructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +void +vps_deregister_sysuninit(void *arg) +{ + struct vps_sysinit *vs; + struct vps *vps; + + vs = arg; + + /* + * Invoke the destructor on all the existing vpss when it is + * deregistered. + */ + VPS_SYSINIT_WLOCK(); + VPS_FOREACH(vps) { + CURVPS_SET_QUIET(vps); + vs->func(vs->arg); + CURVPS_RESTORE(); + } + + /* Remove the destructor from the global list of vps destructors. */ + TAILQ_REMOVE(&vps_destructors, vs, link); + VPS_SYSINIT_WUNLOCK(); +} + +/* + * Invoke all registered vps constructors on the current vps. Used during + * vps construction. The caller is responsible for ensuring the new vps is + * the current vps and that the vps_sysinit_sxlock lock is locked. 
+ */ +void +vps_sysinit(void) +{ + struct vps_sysinit *vs; + struct vps *vps; + + vps = curvps; + VPS_SYSINIT_RLOCK(); + TAILQ_FOREACH(vs, &vps_constructors, link) { + curvps->vps_state = vs->subsystem; + vs->func(vs->arg); + KASSERT((curvps == vps), + ("%s: vs %p subsystem %u order %u func %p returned " + "with curvps altered: curvps %p should be %p\n", + __func__, vs, vs->subsystem, vs->order, vs->func, + curvps, vps)); + } + VPS_SYSINIT_RUNLOCK(); +} + +/* + * Invoke all registered vps destructors on the current vps. Used during + * vps destruction. The caller is responsible for ensuring the dying vps + * the current vps and that the vps_sysinit_sxlock lock is locked. + */ +void +vps_sysuninit(void) +{ + struct vps_sysinit *vs; + + VPS_SYSINIT_RLOCK(); + TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head, + link) { + curvps->vps_state = vs->subsystem; + vs->func(vs->arg); + } + VPS_SYSINIT_RUNLOCK(); +} + +/* + * EVENTHANDLER(9) extensions. + */ +/* + * Invoke the eventhandler function originally registered with the possibly + * registered argument for all virtual process space instances. + * + * This iterator can only be used for eventhandlers that do not take any + * additional arguments, as we do ignore the variadic arguments from the + * EVENTHANDLER_INVOKE() call. + */ +void +vps_global_eventhandler_iterator_func(void *arg, ...) +{ + VPS_ITERATOR_DECL(vps_iter); + struct eventhandler_entry_vimage *v_ee; + + /* + * There is a bug here in that we should actually cast things to + * (struct eventhandler_entry_ ## name *) but that's not easily + * possible in here so just re-using the variadic version we + * defined for the generic vimage case. 
+ */ + v_ee = arg; + VPS_LIST_RLOCK(); + VPS_FOREACH(vps_iter) { + CURVPS_SET(vps_iter); + ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg); + CURVPS_RESTORE(); + } + VPS_LIST_RUNLOCK(); +} + +#ifdef VPS_DEBUG +struct vps_recursion { + SLIST_ENTRY(vps_recursion) vnr_le; + const char *prev_fn; + const char *where_fn; + int where_line; + struct vps *old_vps; + struct vps *new_vps; +}; + +static SLIST_HEAD(, vps_recursion) vps_recursions = + SLIST_HEAD_INITIALIZER(vps_recursions); + +static void +vps_print_recursion(struct vps_recursion *vnr, int brief) +{ + + if (!brief) + printf("CURVPS_SET() recursion in "); + printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line, + vnr->prev_fn); + if (brief) + printf(", "); + else + printf("\n "); + printf("%p -> %p\n", vnr->old_vps, vnr->new_vps); +} + +void +vps_log_recursion(struct vps *old_vps, const char *old_fn, int line) +{ + struct vps_recursion *vnr; + + /* Skip already logged recursion events. */ + SLIST_FOREACH(vnr, &vps_recursions, vnr_le) + if (vnr->prev_fn == old_fn && + vnr->where_fn == curthread->td_vps_lpush && + vnr->where_line == line && + (vnr->old_vps == vnr->new_vps) == (curvps == old_vps)) + return; + + vnr = malloc(sizeof(*vnr), M_VPS, M_NOWAIT | M_ZERO); + if (vnr == NULL) + panic("%s: malloc failed", __func__); + vnr->prev_fn = old_fn; + vnr->where_fn = curthread->td_vps_lpush; + vnr->where_line = line; + vnr->old_vps = old_vps; + vnr->new_vps = curvps; + + SLIST_INSERT_HEAD(&vps_recursions, vnr, vnr_le); + + vps_print_recursion(vnr, 0); +#ifdef KDB + kdb_backtrace(); +#endif +} +#endif /* VPS_DEBUG */ + +/* + * DDB(4). + */ +#ifdef DDB +static void +db_vps_print(struct vps *vps) +{ + + db_printf("vps = %p\n", vps); + db_printf(" vps_magic_n = %#08x (%s, orig %#08x)\n", + vps->vps_magic_n, + (vps->vps_magic_n == VPS_MAGIC_N) ? 
+ "ok" : "mismatch", VPS_MAGIC_N); + db_printf(" vps_data_mem = %p\n", vps->vps_data_mem); + db_printf(" vps_data_base = %#jx\n", + (uintmax_t)vps->vps_data_base); + db_printf(" vps_state = %#08x\n", vps->vps_state); + db_printf("\n"); +} + +DB_SHOW_ALL_COMMAND(vpss, db_show_all_vpss) +{ + VPS_ITERATOR_DECL(vps_iter); + + VPS_FOREACH(vps_iter) { + db_vps_print(vps_iter); + if (db_pager_quit) + break; + } +} + +DB_SHOW_COMMAND(vps, db_show_vps) +{ + + if (!have_addr) { + db_printf("usage: show vps \n"); + return; + } + + db_vps_print((struct vps *)addr); +} + +static void +db_show_vps_print_vs(struct vps_sysinit *vs, int ddb) +{ + const char *vsname, *funcname; + c_db_sym_t sym; + db_expr_t offset; + +#define xprint(...) \ + if (ddb) \ + db_printf(__VA_ARGS__); \ + else \ + printf(__VA_ARGS__) + + if (vs == NULL) { + xprint("%s: no vps_sysinit * given\n", __func__); + return; + } + + sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset); + db_symbol_values(sym, &vsname, NULL); + sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset); + db_symbol_values(sym, &funcname, NULL); + xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs); + xprint(" %#08x %#08x\n", vs->subsystem, vs->order); + xprint(" %p(%s)(%p)\n", + vs->func, (funcname != NULL) ? 
funcname : "", vs->arg); +#undef xprint +} + +DB_SHOW_COMMAND(vps_sysinit, db_show_vps_sysinit) +{ + struct vps_sysinit *vs; + + db_printf("VPS_SYSINIT vs Name(Ptr)\n"); + db_printf(" Subsystem Order\n"); + db_printf(" Function(Name)(Arg)\n"); + TAILQ_FOREACH(vs, &vps_constructors, link) { + db_show_vps_print_vs(vs, 1); + if (db_pager_quit) + break; + } +} + +DB_SHOW_COMMAND(vps_sysuninit, db_show_vps_sysuninit) +{ + struct vps_sysinit *vs; + + db_printf("VPS_SYSUNINIT vs Name(Ptr)\n"); + db_printf(" Subsystem Order\n"); + db_printf(" Function(Name)(Arg)\n"); + TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head, + link) { + db_show_vps_print_vs(vs, 1); + if (db_pager_quit) + break; + } +} + +DB_COMMAND(setcurvps, db_setcurvps) +{ + struct vps *vps; + + if (!have_addr) { + db_printf("usage: setcurvps \n"); + return; + } + + vps = (struct vps *)addr; + db_printf("curvps %p -> %p\n", curvps, vps); + curvps = vps; + db_vps_print(vps); +} + +#ifdef VPS_DEBUG +DB_SHOW_COMMAND(vpsrcrs, db_show_vpsrcrs) +{ + struct vps_recursion *vnr; + + SLIST_FOREACH(vnr, &vps_recursions, vnr_le) + vps_print_recursion(vnr, 1); +} +#endif +#endif /* DDB */ Index: sys/kern/sched_4bsd.c =================================================================== --- sys/kern/sched_4bsd.c +++ sys/kern/sched_4bsd.c @@ -464,7 +464,7 @@ struct td_sched *ts; int awake; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { @@ -550,7 +550,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); } /* Index: sys/kern/subr_prf.c =================================================================== --- sys/kern/subr_prf.c +++ sys/kern/subr_prf.c @@ -165,12 +165,12 @@ if (TD_IS_IDLETHREAD(td)) return (0); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); p = td->td_proc; PROC_LOCK(p); if ((p->p_flag & P_CONTROLT) == 0) { PROC_UNLOCK(p); - sx_sunlock(&proctree_lock); + 
sx_sunlock(&V_proctree_lock); return (0); } SESS_LOCK(p->p_session); @@ -178,14 +178,14 @@ SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (pca.tty == NULL) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (0); } pca.flags = TOTTY; pca.p_bufr = NULL; va_start(ap, fmt); tty_lock(pca.tty); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); retval = kvprintf(fmt, putchar, &pca, 10, ap); tty_unlock(pca.tty); va_end(ap); @@ -214,7 +214,7 @@ struct putchar_arg pca; struct session *sess = NULL; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pri != -1) flags |= TOLOG; if (p != NULL) { @@ -237,7 +237,7 @@ pca.p_bufr = NULL; if (pca.tty != NULL) tty_lock(pca.tty); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); kvprintf(fmt, putchar, &pca, 10, ap); if (pca.tty != NULL) tty_unlock(pca.tty); Index: sys/kern/sys_procdesc.c =================================================================== --- sys/kern/sys_procdesc.c +++ sys/kern/sys_procdesc.c @@ -153,13 +153,13 @@ goto out; } pd = fp->f_data; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pd->pd_proc != NULL) { *p = pd->pd_proc; PROC_LOCK(*p); } else error = ESRCH; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); out: fdrop(fp, td); return (error); @@ -305,14 +305,14 @@ { struct procdesc *pd; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); pd = p->p_procdesc; PROCDESC_LOCK(pd); - KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc, + KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == V_initproc, ("procdesc_exit: closed && parent not init")); pd->pd_flags |= PDF_EXITED; @@ -349,7 +349,7 @@ { struct procdesc *pd; - sx_assert(&proctree_lock, SA_XLOCKED); + sx_assert(&V_proctree_lock, SA_XLOCKED); KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); pd = p->p_procdesc; @@ 
-375,7 +375,7 @@ fp->f_ops = &badfileops; fp->f_data = NULL; - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); PROCDESC_LOCK(pd); pd->pd_flags |= PDF_CLOSED; PROCDESC_UNLOCK(pd); @@ -385,7 +385,7 @@ * This is the case where process' exit status was already * collected and procdesc_reap() was already called. */ - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } else { PROC_LOCK(p); AUDIT_ARG_PROCESS(p); @@ -415,11 +415,11 @@ * prejudice. */ p->p_sigparent = SIGCHLD; - proc_reparent(p, initproc); + proc_reparent(p, V_initproc); if ((pd->pd_flags & PDF_DAEMON) == 0) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); } } @@ -531,7 +531,7 @@ */ bzero(sb, sizeof(*sb)); pd = fp->f_data; - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); if (pd->pd_proc != NULL) { PROC_LOCK(pd->pd_proc); AUDIT_ARG_PROCESS(pd->pd_proc); @@ -553,7 +553,7 @@ PROC_UNLOCK(pd->pd_proc); } else sb->st_mode = S_IFREG; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (0); } Index: sys/kern/sys_process.c =================================================================== --- sys/kern/sys_process.c +++ sys/kern/sys_process.c @@ -688,7 +688,7 @@ proc_set_traced(struct proc *p, bool stop) { - sx_assert(&proctree_lock, SX_XLOCKED); + sx_assert(&V_proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_TRACED; if (stop) @@ -733,7 +733,7 @@ case PT_SET_EVENT_MASK: case PT_DETACH: case PT_GET_SC_ARGS: - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); proctree_locked = 1; break; default: @@ -747,14 +747,14 @@ if (pid <= PID_MAX) { if ((p = pfind(pid)) == NULL) { if (proctree_locked) - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (ESRCH); } } else { td2 = tdfind(pid, -1); if (td2 == NULL) { if (proctree_locked) - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (ESRCH); } p = td2->td_proc; @@ -816,7 +816,7 @@ error = EBUSY; goto fail; } - if 
(p->p_pptr == initproc) { + if (p->p_pptr == V_initproc) { error = EPERM; goto fail; } @@ -923,7 +923,7 @@ CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid, p->p_oppid); - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); proctree_locked = 0; MPASS(p->p_xthread == NULL); MPASS((p->p_flag & P_STOPPED_TRACE) == 0); @@ -1113,7 +1113,7 @@ pp = proc_realparent(p); proc_reparent(p, pp); - if (pp == initproc) + if (pp == V_initproc) p->p_sigparent = SIGCHLD; CTR3(KTR_PTRACE, "PT_DETACH: pid %d reparented to pid %d, sig %d", @@ -1142,7 +1142,7 @@ break; } - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); proctree_locked = 0; sendsig: @@ -1456,7 +1456,7 @@ fail: PROC_UNLOCK(p); if (proctree_locked) - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (error); } #undef PROC_READ Index: sys/kern/tty.c =================================================================== --- sys/kern/tty.c +++ sys/kern/tty.c @@ -1703,18 +1703,18 @@ /* XXX: This looks awful. */ tty_unlock(tp); - sx_xlock(&proctree_lock); + sx_xlock(&V_proctree_lock); tty_lock(tp); if (!SESS_LEADER(p)) { /* Only the session leader may do this. */ - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (EPERM); } if (tp->t_session != NULL && tp->t_session == p->p_session) { /* This is already our controlling TTY. */ - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (0); } @@ -1732,7 +1732,7 @@ * TTYs of which the session leader has been * killed or the TTY revoked. */ - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); return (EPERM); } @@ -1740,7 +1740,7 @@ tp->t_session = p->p_session; tp->t_session->s_ttyp = tp; tp->t_sessioncnt++; - sx_xunlock(&proctree_lock); + sx_xunlock(&V_proctree_lock); /* Assign foreground process group. */ tp->t_pgrp = p->p_pgrp; @@ -1759,12 +1759,12 @@ * decompose proctree_lock. 
*/ tty_unlock(tp); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); pg = pgfind(*(int *)data); if (pg != NULL) PGRP_UNLOCK(pg); if (pg == NULL || pg->pg_session != td->td_proc->p_session) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); tty_lock(tp); return (EPERM); } @@ -1775,11 +1775,11 @@ * relocking the TTY. */ if (!tty_is_ctty(tp, td->td_proc)) { - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); return (ENOTTY); } tp->t_pgrp = pg; - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); /* Wake up the background process groups. */ cv_broadcast(&tp->t_bgwait); Index: sys/kern/tty_tty.c =================================================================== --- sys/kern/tty_tty.c +++ sys/kern/tty_tty.c @@ -68,7 +68,7 @@ return; p = curproc; sx_sunlock(&clone_drain_lock); - sx_slock(&proctree_lock); + sx_slock(&V_proctree_lock); sx_slock(&clone_drain_lock); dev_lock(); if (!(p->p_flag & P_CONTROLT)) @@ -83,7 +83,7 @@ *dev = p->p_session->s_ttyvp->v_rdev; dev_refl(*dev); dev_unlock(); - sx_sunlock(&proctree_lock); + sx_sunlock(&V_proctree_lock); } static void Index: sys/net/vnet.c =================================================================== --- sys/net/vnet.c +++ sys/net/vnet.c @@ -80,8 +80,6 @@ * stack instance. 
*/ -FEATURE(vimage, "VIMAGE kernel virtualization"); - static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block"); /* @@ -307,7 +305,7 @@ sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock"); LIST_INIT(&vnet_head); } -SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, +SYSINIT(vnet_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST, vnet_init_prelink, NULL); static void Index: sys/sys/jail.h =================================================================== --- sys/sys/jail.h +++ sys/sys/jail.h @@ -166,6 +166,7 @@ struct osd pr_osd; /* (p) additional data */ struct cpuset *pr_cpuset; /* (p) cpuset */ struct vnet *pr_vnet; /* (c) network stack */ + struct vps *pr_vps; /* (c) process space */ struct vnode *pr_root; /* (c) vnode to rdir */ int pr_ip4s; /* (p) number of v4 IPs */ int pr_ip6s; /* (p) number of v6 IPs */ @@ -209,6 +210,7 @@ /* primary jail address. */ #define PR_IP6_SADDRSEL 0x00000100 /* Do IPv6 src addr sel. or use the */ /* primary jail address. */ +#define PR_VPS 0x00000200 /* Virtual process space */ /* Internal flag bits */ #define PR_IP4 0x02000000 /* IPv4 restricted or disabled */ @@ -370,6 +372,7 @@ int prison_allow(struct ucred *, unsigned); int prison_check(struct ucred *cred1, struct ucred *cred2); int prison_owns_vnet(struct ucred *); +int prison_owns_vps(struct ucred *); int prison_canseemount(struct ucred *cred, struct mount *mp); void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp); Index: sys/sys/kernel.h =================================================================== --- sys/sys/kernel.h +++ sys/sys/kernel.h @@ -102,7 +102,7 @@ SI_SUB_MTX_POOL_DYNAMIC = 0x1AC0000, /* dynamic mutex pool */ SI_SUB_LOCK = 0x1B00000, /* various locks */ SI_SUB_EVENTHANDLER = 0x1C00000, /* eventhandler init */ - SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */ + SI_SUB_VIMAGE_PRELINK = 0x1E00000, /* VIMAGE init before modules */ SI_SUB_KLD = 0x2000000, /* KLD and module setup */ 
SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/ SI_SUB_RACCT = 0x2110000, /* resource accounting */ @@ -159,7 +159,7 @@ SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */ SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/ SI_SUB_SYSCALLS = 0xd800000, /* register system calls */ - SI_SUB_VNET_DONE = 0xdc00000, /* vnet registration complete */ + SI_SUB_VNET_DONE = 0xdc00000, /* VNET registration complete */ SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/ SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/ SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/ @@ -170,6 +170,7 @@ SI_SUB_SMP = 0xf000000, /* start the APs*/ #endif SI_SUB_RACCTD = 0xf100000, /* start racctd*/ + SI_SUB_VIMAGE_DONE = 0xf800000, /* VIMAGE initialization done */ SI_SUB_LAST = 0xfffffff /* final initialization */ }; Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -68,6 +68,9 @@ #include #include #include +#ifdef _KERNEL +#include +#endif #include /* Machine-dependent proc substruct. */ #ifdef _KERNEL @@ -351,6 +354,8 @@ /* LP64 hole */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ + struct vps *td_vps; /* (k) Effective vps. */ + const char *td_vps_lpush; /* (k) Debugging vps push / pop. 
*/ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ @@ -809,7 +814,7 @@ #endif #define FOREACH_PROC_IN_SYSTEM(p) \ - LIST_FOREACH((p), &allproc, p_list) + LIST_FOREACH((p), &V_allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) @@ -939,38 +944,61 @@ #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) -#define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) -extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; -extern u_long pidhash; -#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) +LIST_HEAD(pidhashhead, proc); +VPS_DECLARE(struct pidhashhead *, pidhashtbl); +#define V_pidhashtbl VPS(pidhashtbl) +VPS_DECLARE(u_long, pidhash); +#define V_pidhash VPS(pidhash) +#define PIDHASH(pid) (&V_pidhashtbl[(pid) & V_pidhash]) + extern LIST_HEAD(tidhashhead, thread) *tidhashtbl; extern u_long tidhash; +#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern struct rwlock tidhash_lock; -#define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) -extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; -extern u_long pgrphash; +LIST_HEAD(pgrphashhead, pgrp); +VPS_DECLARE(struct pgrphashhead *, pgrphashtbl); +#define V_pgrphashtbl VPS(pgrphashtbl) +VPS_DECLARE(u_long, pgrphash); +#define V_pgrphash VPS(pgrphash) +#define PGRPHASH(pgid) (&V_pgrphashtbl[(pgid) & V_pgrphash]) -extern struct sx allproc_lock; +VPS_DECLARE(struct sx, allproc_lock); +#define V_allproc_lock VPS(allproc_lock) extern int allproc_gen; -extern struct sx proctree_lock; -extern struct mtx ppeers_lock; +VPS_DECLARE(struct sx, proctree_lock); +#define V_proctree_lock VPS(proctree_lock) +VPS_DECLARE(struct mtx, ppeers_lock); +#define V_ppeers_lock VPS(ppeers_lock) extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. 
*/ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ +VPS_DECLARE(struct proc *, vproc0); +#define V_vproc0 VPS(vproc0) +#ifdef VIMAGE +VPS_DECLARE(int, vpsdying); +#define V_vpsdying VPS(vpsdying) +#endif extern int hogticks; /* Limit on kernel cpu hogs. */ -extern int lastpid; -extern int nprocs, maxproc; /* Current and max number of procs. */ +VPS_DECLARE(int, lastpid); +#define V_lastpid VPS(lastpid) +VPS_DECLARE(int, nprocs); /* Current number of procs. */ +#define V_nprocs VPS(nprocs) +extern int maxproc; /* Max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); -extern struct proclist allproc; /* List of all processes. */ -extern struct proclist zombproc; /* List of zombie processes. */ -extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ +VPS_DECLARE(struct proclist, allproc); /* List of all processes. */ +#define V_allproc VPS(allproc) +VPS_DECLARE(struct proclist, zombproc); /* List of zombie processes. */ +#define V_zombproc VPS(zombproc) +VPS_DECLARE(struct proc *, initproc); /* Process slots for init. */ +#define V_initproc VPS(initproc) +extern struct proc *pageproc; /* Process slot for pager. 
*/ extern struct uma_zone *proc_zone; @@ -1043,7 +1071,6 @@ int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); -void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); Index: sys/sys/resourcevar.h =================================================================== --- sys/sys/resourcevar.h +++ sys/sys/resourcevar.h @@ -154,7 +154,6 @@ struct uidinfo *uifind(uid_t uid); void uifree(struct uidinfo *uip); -void uihashinit(void); void uihold(struct uidinfo *uip); #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, Index: sys/sys/sysctl.h =================================================================== --- sys/sys/sysctl.h +++ sys/sys/sysctl.h @@ -104,6 +104,7 @@ #define CTLFLAG_CAPWR 0x00004000 /* Can be written in capability mode */ #define CTLFLAG_STATS 0x00002000 /* Statistics, not a tuneable */ #define CTLFLAG_NOFETCH 0x00001000 /* Don't fetch tunable from getenv() */ +#define CTLFLAG_VPS 0x00000800 /* Prisons with vps can fiddle */ #define CTLFLAG_CAPRW (CTLFLAG_CAPRD|CTLFLAG_CAPWR) /* Index: sys/sys/vps.h =================================================================== --- /dev/null +++ sys/sys/vps.h @@ -0,0 +1,381 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2006-2009 University of Zagreb + * Copyright (c) 2006-2009 FreeBSD Foundation + * Copyright (c) 2018 iXsystems, Inc. + * All rights reserved. + * + * This software was developed by the University of Zagreb and the + * FreeBSD Foundation under sponsorship by the Stichting NLnet and the + * FreeBSD Foundation. + * + * Portions of this software were developed by Bjoern Zeeb + * under sponsorship from iXsystems, Inc. 
+ * + * Copyright (c) 2009 Jeffrey Roberson + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/*- + * This header file defines several sets of interfaces supporting virtualized + * process space: + * + * - Definition of 'struct vps' and functions and macros to allocate/free/ + * manipulate it. + * + * - A virtual process stack memory allocator, which provides support for + * virtualized global variables via a special linker set, set_vps. + * + * - Virtualized sysinits/sysuninits, which allow constructors and + * destructors to be run for each process space as virtual + * instances are created and destroyed. 
+ * + * If VIMAGE isn't compiled into the kernel, virtualized global variables + * compile to normal global variables, and virtualized sysinits to regular + * sysinits. + */ + +#ifndef _SYS_VPS_H_ +#define _SYS_VPS_H_ + +/* + * struct vps describes a virtualized process space, and is primarily a + * pointer to storage for virtualized global variables. Expose to userspace + * as required for libkvm. + */ +#if defined(_KERNEL) || defined(_WANT_VPS) +#include + +struct vps { + LIST_ENTRY(vps) vps_le; /* all vps list */ + u_int vps_magic_n; + u_int vps_state; /* SI_SUB_* */ + void *vps_data_mem; + uintptr_t vps_data_base; + struct prison *vps_pr; /* Put init on this if set. */ +}; +#define VPS_MAGIC_N 0x0f0307e2 + +/* + * These two virtual process space allocator definitions are also required + * for libkvm so that it can evaluate virtualized global variables. + */ +#define VPS_SETNAME "set_vps" +#define VPS_SYMPREFIX "vps_entry_" +#endif + +#ifdef _KERNEL +#ifdef VIMAGE +#include +#include /* for struct thread */ +#include +#include + +/* + * Location of the kernel's 'set_vps' linker set. + */ +extern uintptr_t *__start_set_vps; +__GLOBL(__start_set_vps); +extern uintptr_t *__stop_set_vps; +__GLOBL(__stop_set_vps); + +#define VPS_START (uintptr_t)&__start_set_vps +#define VPS_STOP (uintptr_t)&__stop_set_vps + +/* + * Functions to allocate and destroy virtual process spaces. + */ +struct vps *vps_alloc(struct prison *); +void vps_destroy(struct vps *); + +/* + * The current virtual process space -- we may wish to move this to struct + * pcpu in the future. + */ +#define curvps curthread->td_vps + +/* + * Various macros -- get and set the current process space, but also + * assertions. 
+ */ +#if defined(INVARIANTS) || defined(VPS_DEBUG) +#define VPS_ASSERT(exp, msg) do { \ + if (!(exp)) \ + panic msg; \ +} while (0) +#else +#define VPS_ASSERT(exp, msg) do { \ +} while (0) +#endif + +#ifdef VPS_DEBUG +void vps_log_recursion(struct vps *, const char *, int); + +#define CURVPS_SET_QUIET(arg) \ + VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \ + ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \ + __FILE__, __LINE__, __func__, curvps, (arg))); \ + struct vps *saved_vps = curvps; \ + const char *saved_vps_lpush = curthread->td_vps_lpush; \ + curvps = arg; \ + curthread->td_vps_lpush = __func__; + +#define CURVPS_SET_VERBOSE(arg) \ + CURVPS_SET_QUIET(arg) \ + if (saved_vps) \ + vps_log_recursion(saved_vps, saved_vps_lpush, __LINE__); + +#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg) + +#define CURVPS_RESTORE() \ + VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \ + saved_vps->vps_magic_n == VPS_MAGIC_N), \ + ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \ + __FILE__, __LINE__, __func__, curvps, saved_vps)); \ + curvps = saved_vps; \ + curthread->td_vps_lpush = saved_vps_lpush; +#else /* !VPS_DEBUG */ + +#define CURVPS_SET_QUIET(arg) \ + VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \ + ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \ + __FILE__, __LINE__, __func__, curvps, (arg))); \ + struct vps *saved_vps = curvps; \ + curvps = arg; + +#define CURVPS_SET_VERBOSE(arg) \ + CURVPS_SET_QUIET(arg) + +#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg) + +#define CURVPS_RESTORE() \ + VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \ + saved_vps->vps_magic_n == VPS_MAGIC_N), \ + ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \ + __FILE__, __LINE__, __func__, curvps, saved_vps)); \ + curvps = saved_vps; +#endif /* VPS_DEBUG */ + +extern struct vps *vps0; +#define IS_DEFAULT_VPS(arg) ((arg) == vps0) + +#define CRED_TO_VPS(cr) (cr)->cr_prison->pr_vps +#define TD_TO_VPS(td) CRED_TO_VPS((td)->td_ucred) +#define 
P_TO_VPS(p) CRED_TO_VPS((p)->p_ucred) + +/* + * Global linked list of all virtual process spaces, along with read locks to + * access it. If a caller may sleep while accessing the list, it must use + * the sleepable lock macros. + */ +LIST_HEAD(vps_list_head, vps); +extern struct vps_list_head vps_head; +extern struct rwlock vps_rwlock; +extern struct sx vps_sxlock; + +#define VPS_LIST_RLOCK() sx_slock(&vps_sxlock) +#define VPS_LIST_RLOCK_NOSLEEP() rw_rlock(&vps_rwlock) +#define VPS_LIST_RUNLOCK() sx_sunlock(&vps_sxlock) +#define VPS_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vps_rwlock) + +/* + * Iteration macros to walk the global list of virtual process spaces. + */ +#define VPS_ITERATOR_DECL(arg) struct vps *arg +#define VPS_FOREACH(arg) LIST_FOREACH((arg), &vps_head, vps_le) + +/* + * Virtual process space memory allocator, which allows global variables to + * be automatically instantiated for each process space instance. + */ +#define VPS_NAME(n) vps_entry_##n +#define VPS_DECLARE(t, n) extern t VPS_NAME(n) +#define VPS_DEFINE(t, n) t VPS_NAME(n) __section(VPS_SETNAME) __used +#define _VPS_PTR(b, n) (__typeof(VPS_NAME(n))*) \ + ((b) + (uintptr_t)&VPS_NAME(n)) + +#define _VPS(b, n) (*_VPS_PTR(b, n)) + +/* + * Virtualized global variable accessor macros. + */ +#define VPS_VPS_PTR(vps, n) _VPS_PTR((vps)->vps_data_base, n) +#define VPS_VPS(vps, n) (*VPS_VPS_PTR((vps), n)) + +#define VPS_PTR(n) VPS_VPS_PTR(curvps, n) +#define VPS(n) VPS_VPS(curvps, n) + +/* + * Virtual process space allocator interfaces from the kernel linker. + */ +void *vps_data_alloc(int size); +void vps_data_copy(void *start, int size); +void vps_data_free(void *start_arg, int size); + +/* + * Virtual sysinit mechanism, allowing process space components to declare + * startup and shutdown methods to be run when virtual process space + * instances are created and destroyed. + */ +#include + +/* + * SYSINIT/SYSUNINIT variants that provide per-vps constructors and + * destructors. 
+ */ +struct vps_sysinit { + enum sysinit_sub_id subsystem; + enum sysinit_elem_order order; + sysinit_cfunc_t func; + const void *arg; + TAILQ_ENTRY(vps_sysinit) link; +}; + +#define VPS_SYSINIT(ident, subsystem, order, func, arg) \ + static struct vps_sysinit ident ## _vps_init = { \ + subsystem, \ + order, \ + (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ + (arg) \ + }; \ + SYSINIT(vps_init_ ## ident, subsystem, order, \ + vps_register_sysinit, &ident ## _vps_init); \ + SYSUNINIT(vps_init_ ## ident, subsystem, order, \ + vps_deregister_sysinit, &ident ## _vps_init) + +#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \ + static struct vps_sysinit ident ## _vps_uninit = { \ + subsystem, \ + order, \ + (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ + (arg) \ + }; \ + SYSINIT(vps_uninit_ ## ident, subsystem, order, \ + vps_register_sysuninit, &ident ## _vps_uninit); \ + SYSUNINIT(vps_uninit_ ## ident, subsystem, order, \ + vps_deregister_sysuninit, &ident ## _vps_uninit) + +/* + * Run per-vps sysinits or sysuninits during vps creation/destruction. + */ +void vps_sysinit(void); +void vps_sysuninit(void); + +/* + * Interfaces for managing per-vps constructors and destructors. + */ +void vps_register_sysinit(void *arg); +void vps_register_sysuninit(void *arg); +void vps_deregister_sysinit(void *arg); +void vps_deregister_sysuninit(void *arg); + +/* + * EVENTHANDLER(9) extensions. 
+ */ +#include + +void vps_global_eventhandler_iterator_func(void *, ...); +#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ +do { \ + if (IS_DEFAULT_VPS(curvps)) { \ + (tag) = vimage_eventhandler_register(NULL, #name, func, \ + arg, priority, \ + vps_global_eventhandler_iterator_func); \ + } \ +} while(0) +#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ +do { \ + if (IS_DEFAULT_VPS(curvps)) { \ + vimage_eventhandler_register(NULL, #name, func, \ + arg, priority, \ + vps_global_eventhandler_iterator_func); \ + } \ +} while(0) + +#else /* !VIMAGE */ + +/* + * Various virtual process space macros compile to no-ops without VIMAGE. + */ +#define curvps NULL + +#define VPS_ASSERT(exp, msg) +#define CURVPS_SET(arg) +#define CURVPS_SET_QUIET(arg) +#define CURVPS_RESTORE() + +#define VPS_LIST_RLOCK() +#define VPS_LIST_RLOCK_NOSLEEP() +#define VPS_LIST_RUNLOCK() +#define VPS_LIST_RUNLOCK_NOSLEEP() +#define VPS_ITERATOR_DECL(arg) +#define VPS_FOREACH(arg) + +#define IS_DEFAULT_VPS(arg) 1 +#define CRED_TO_VPS(cr) NULL +#define TD_TO_VPS(td) NULL +#define P_TO_VPS(p) NULL + +/* + * Versions of the vps macros that compile to normal global variables and + * standard sysctl definitions. + */ +#define VPS_NAME(n) n +#define VPS_DECLARE(t, n) extern t n +#define VPS_DEFINE(t, n) t n +#define _VPS_PTR(b, n) &VPS_NAME(n) + +/* + * Virtualized global variable accessor macros. + */ +#define VPS_VPS_PTR(vps, n) (&(n)) +#define VPS_VPS(vps, n) (n) + +#define VPS_PTR(n) (&(n)) +#define VPS(n) (n) + +/* + * When VIMAGE isn't compiled into the kernel, VPS_SYSINIT/VPS_SYSUNINIT + * map into normal sysinits, which have the same ordering properties. + */ +#define VPS_SYSINIT(ident, subsystem, order, func, arg) \ + SYSINIT(ident, subsystem, order, func, arg) +#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \ + SYSUNINIT(ident, subsystem, order, func, arg) + +/* + * Without VIMAGE revert to the default implementation. 
+ */ +#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ + (tag) = eventhandler_register(NULL, #name, func, arg, priority) +#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ + eventhandler_register(NULL, #name, func, arg, priority) +#endif /* VIMAGE */ +#endif /* _KERNEL */ + +#endif /* !_SYS_VPS_H_ */ Index: sys/vm/vm_meter.c =================================================================== --- sys/vm/vm_meter.c +++ sys/vm/vm_meter.c @@ -197,7 +197,7 @@ /* * Calculate process statistics. */ - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if ((p->p_flag & P_SYSTEM) != 0) continue; @@ -231,7 +231,7 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); /* * Calculate object memory usage statistics. */ Index: sys/vm/vm_object.c =================================================================== --- sys/vm/vm_object.c +++ sys/vm/vm_object.c @@ -2509,16 +2509,16 @@ { struct proc *p; - /* sx_slock(&allproc_lock); */ + /* sx_slock(&V_allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { - /* sx_sunlock(&allproc_lock); */ + /* sx_sunlock(&V_allproc_lock); */ return 1; } } - /* sx_sunlock(&allproc_lock); */ + /* sx_sunlock(&V_allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; Index: sys/vm/vm_pageout.c =================================================================== --- sys/vm/vm_pageout.c +++ sys/vm/vm_pageout.c @@ -1760,7 +1760,7 @@ */ bigproc = NULL; bigsize = 0; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); @@ -1806,10 +1806,10 @@ } _PHOLD_LITE(p); PROC_UNLOCK(p); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (!vm_map_trylock_read(&vm->vm_map)) { vmspace_free(vm); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); PRELE(p); 
continue; } @@ -1818,7 +1818,7 @@ size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); /* * If this process is bigger than the biggest one, @@ -1833,7 +1833,7 @@ PRELE(p); } } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (bigproc != NULL) { if (vm_panic_on_oom != 0) panic("out of swap space"); Index: sys/vm/vm_swapout.c =================================================================== --- sys/vm/vm_swapout.c +++ sys/vm/vm_swapout.c @@ -417,7 +417,7 @@ attempts = 0; again: attempts++; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; @@ -473,7 +473,7 @@ PRELE(p); continue; } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); size = vmspace_resident_count(vm); if (size >= limit) { @@ -521,10 +521,10 @@ } #endif vmspace_free(vm); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); PRELE(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); if (tryagain != 0 && attempts <= 10) { maybe_yield(); goto again; @@ -663,7 +663,7 @@ pp = NULL; ppri = INT_MIN; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW || @@ -698,13 +698,13 @@ } PROC_UNLOCK(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { - tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2); + tsleep(V_vproc0, PVM, "swapin", MAXSLP * hz / 2); goto loop; } PROC_LOCK(p); @@ -746,7 +746,7 @@ MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0); didswap = false; - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * Filter out not yet fully constructed processes. Do @@ -776,7 +776,7 @@ * allproc list while it is unlocked. */ _PHOLD_LITE(p); - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); /* * Do not swapout a realtime process. 
@@ -803,17 +803,17 @@ didswap = true; PROC_UNLOCK(p); - sx_slock(&allproc_lock); + sx_slock(&V_allproc_lock); PRELE(p); } - sx_sunlock(&allproc_lock); + sx_sunlock(&V_allproc_lock); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) - wakeup(&proc0); + wakeup(V_vproc0); } static void