Page MenuHomeFreeBSD

D15865.diff
No OneTemporary

D15865.diff

Index: sys/arm/arm/pmap-v6.c
===================================================================
--- sys/arm/arm/pmap-v6.c
+++ sys/arm/arm/pmap-v6.c
@@ -6577,7 +6577,7 @@
int npte2 = 0;
int i, j, index;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid != pid || p->p_vmspace == NULL)
continue;
@@ -6605,7 +6605,7 @@
index = 0;
printf("\n");
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (npte2);
}
pte2p = pmap_pte2(pmap, va);
@@ -6632,7 +6632,7 @@
}
}
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (npte2);
}
Index: sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
+++ sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
@@ -1022,11 +1022,11 @@
mutex_enter(pid_mtx);
#else
pp = p;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
while (pp->p_vmspace == pp->p_pptr->p_vmspace)
pp = pp->p_pptr;
pid = pp->p_pid;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
pp = NULL;
rm_rlock(&fasttrap_tp_lock, &tracker);
Index: sys/compat/linprocfs/linprocfs.c
===================================================================
--- sys/compat/linprocfs/linprocfs.c
+++ sys/compat/linprocfs/linprocfs.c
@@ -689,8 +689,8 @@
(int)(averunnable.ldavg[2] / averunnable.fscale),
(int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100),
1, /* number of running tasks */
- nprocs, /* number of tasks */
- lastpid /* the last pid */
+ V_nprocs, /* number of tasks */
+ V_lastpid /* the last pid */
);
return (0);
}
@@ -708,10 +708,10 @@
vm_offset_t startcode, startdata;
getboottime(&boottime);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
if (p->p_vmspace) {
startcode = (vm_offset_t)p->p_vmspace->vm_taddr;
startdata = (vm_offset_t)p->p_vmspace->vm_daddr;
@@ -787,11 +787,11 @@
struct kinfo_proc kp;
segsz_t lsize;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/*
* See comments in linprocfs_doprocstatus() regarding the
@@ -825,7 +825,7 @@
l_sigset_t siglist, sigignore, sigcatch;
int i;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */
@@ -864,7 +864,7 @@
}
fill_kinfo_proc(p, &kp);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */
sbuf_printf(sb, "State:\t%s\n", state);
Index: sys/compat/linux/linux_file.c
===================================================================
--- sys/compat/linux/linux_file.c
+++ sys/compat/linux/linux_file.c
@@ -149,17 +149,17 @@
fdrop(fp, td);
goto done;
}
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
if (SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) {
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/* XXXPJD: Verify if TIOCSCTTY is allowed. */
(void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0,
td->td_ucred, td);
} else {
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
fdrop(fp, td);
}
Index: sys/compat/linux/linux_fork.c
===================================================================
--- sys/compat/linux/linux_fork.c
+++ sys/compat/linux/linux_fork.c
@@ -233,11 +233,11 @@
* the same as that of the calling process.
*/
if (args->flags & LINUX_CLONE_PARENT) {
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PROC_LOCK(p2);
proc_reparent(p2, td->td_proc->p_pptr);
PROC_UNLOCK(p2);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
}
#ifdef DEBUG
Index: sys/compat/linux/linux_misc.c
===================================================================
--- sys/compat/linux/linux_misc.c
+++ sys/compat/linux/linux_misc.c
@@ -181,7 +181,7 @@
sysinfo.totalswap = i * PAGE_SIZE;
sysinfo.freeswap = (i - j) * PAGE_SIZE;
- sysinfo.procs = nprocs;
+ sysinfo.procs = V_nprocs;
/* The following are only present in newer Linux kernels. */
sysinfo.totalbig = 0;
Index: sys/compat/linuxkpi/common/src/linux_current.c
===================================================================
--- sys/compat/linuxkpi/common/src/linux_current.c
+++ sys/compat/linuxkpi/common/src/linux_current.c
@@ -226,22 +226,29 @@
static void
linux_current_uninit(void *arg __unused)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct task_struct *ts;
struct thread *td;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- FOREACH_THREAD_IN_PROC(p, td) {
- if ((ts = td->td_lkpi_task) != NULL) {
- td->td_lkpi_task = NULL;
- put_task_struct(ts);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if ((ts = td->td_lkpi_task) != NULL) {
+ td->td_lkpi_task = NULL;
+ put_task_struct(ts);
+ }
}
+ PROC_UNLOCK(p);
}
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
EVENTHANDLER_DEREGISTER(thread_dtor, linuxkpi_thread_dtor_tag);
}
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3852,6 +3852,7 @@
kern/kern_tslog.c optional tslog
kern/kern_umtx.c standard
kern/kern_uuid.c standard
+kern/kern_vps.c optional vimage
kern/kern_xxx.c standard
kern/link_elf.c standard
kern/linker_if.m standard
Index: sys/ddb/db_command.c
===================================================================
--- sys/ddb/db_command.c
+++ sys/ddb/db_command.c
@@ -693,11 +693,12 @@
* Find the process in question. allproc_lock is not needed
* since we're in DDB.
*/
- /* sx_slock(&allproc_lock); */
+ /* Operate on current vps instance only. */
+ /* sx_slock(&V_allproc_lock); */
FOREACH_PROC_IN_SYSTEM(p)
if (p->p_pid == pid)
break;
- /* sx_sunlock(&allproc_lock); */
+ /* sx_sunlock(&V_allproc_lock); */
if (p == NULL)
DB_ERROR(("Can't find process with pid %ld\n", (long) pid));
@@ -875,12 +876,26 @@
}
}
+static void
+_db_stack_trace_all_v(bool active_only)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ _db_stack_trace_all(active_only);
+ CURVPS_RESTORE();
+ }
+ /* VPS_LIST_RUNLOCK(); */
+}
+
static void
db_stack_trace_active(db_expr_t dummy, bool dummy2, db_expr_t dummy3,
char *dummy4)
{
- _db_stack_trace_all(true);
+ _db_stack_trace_all_v(true);
}
static void
@@ -888,7 +903,7 @@
char *dummy4)
{
- _db_stack_trace_all(false);
+ _db_stack_trace_all_v(false);
}
/*
Index: sys/ddb/db_expr.c
===================================================================
--- sys/ddb/db_expr.c
+++ sys/ddb/db_expr.c
@@ -58,7 +58,8 @@
if (t == tIDENT) {
if (!db_value_of_name(db_tok_string, valuep) &&
!db_value_of_name_pcpu(db_tok_string, valuep) &&
- !db_value_of_name_vnet(db_tok_string, valuep)) {
+ !db_value_of_name_vnet(db_tok_string, valuep) &&
+ !db_value_of_name_vps(db_tok_string, valuep)) {
db_printf("Symbol '%s' not found\n", db_tok_string);
db_error(NULL);
/*NOTREACHED*/
Index: sys/ddb/db_ps.c
===================================================================
--- sys/ddb/db_ps.c
+++ sys/ddb/db_ps.c
@@ -90,10 +90,11 @@
char state[9];
int np, rflag, sflag, dflag, lflag, wflag;
- np = nprocs;
+ np = V_nprocs;
- if (!LIST_EMPTY(&allproc))
- p = LIST_FIRST(&allproc);
+ /* Operate on current vps instance only. */
+ if (!LIST_EMPTY(&V_allproc))
+ p = LIST_FIRST(&V_allproc);
else
p = &proc0;
@@ -217,8 +218,9 @@
p = LIST_NEXT(p, p_list);
if (p == NULL && np > 0)
- p = LIST_FIRST(&zombproc);
+ p = LIST_FIRST(&V_zombproc);
}
+ db_printf("nprocs = %d, np = %d\n", V_nprocs, np);
}
static void
@@ -397,6 +399,9 @@
db_printf(" last involuntary switch: %d ms ago\n",
1000 * delta / hz);
}
+#ifdef VIMAGE
+ db_printf(" vnet: %p vps: %p\n", td->td_vnet, td->td_vps);
+#endif
}
DB_SHOW_COMMAND(proc, db_show_proc)
@@ -475,6 +480,7 @@
db_findstack_cmd(db_expr_t addr, bool have_addr, db_expr_t dummy3 __unused,
char *dummy4 __unused)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct thread *td;
struct kstack_cache_entry *ks_ce;
@@ -487,15 +493,22 @@
return;
}
- FOREACH_PROC_IN_SYSTEM(p) {
- FOREACH_THREAD_IN_PROC(p, td) {
- if (td->td_kstack <= saddr && saddr < td->td_kstack +
- PAGE_SIZE * td->td_kstack_pages) {
- db_printf("Thread %p\n", td);
- return;
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_kstack <= saddr &&
+ saddr < td->td_kstack +
+ PAGE_SIZE * td->td_kstack_pages) {
+ db_printf("Thread %p\n", td);
+ return;
+ }
}
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
for (ks_ce = kstack_cache; ks_ce != NULL;
ks_ce = ks_ce->next_ks_entry) {
Index: sys/ddb/db_sym.c
===================================================================
--- sys/ddb/db_sym.c
+++ sys/ddb/db_sym.c
@@ -37,8 +37,10 @@
#include <sys/param.h>
#include <sys/pcpu.h>
+#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/systm.h>
+#include <sys/vps.h>
#include <net/vnet.h>
@@ -69,6 +71,7 @@
#ifdef VIMAGE
static void *db_vnet = NULL;
+static void *db_vps = NULL;
#endif
/*
@@ -168,6 +171,53 @@
return (0);
}
}
+
+/*
+ * Validate the virtual process space pointer used to interpret per-vps global
+ * variable expansion. Right now we don't do much here, really we should
+ * walk the global vps list to check it's an OK pointer.
+ */
+int
+db_var_db_vps(struct db_variable *vp, db_expr_t *valuep, int op)
+{
+
+ switch (op) {
+ case DB_VAR_GET:
+ *valuep = (db_expr_t)db_vps;
+ return (1);
+
+ case DB_VAR_SET:
+ db_vps = *(void **)valuep;
+ return (1);
+
+ default:
+ db_printf("db_var_db_vps: unknown operation\n");
+ return (0);
+ }
+}
+
+/*
+ * Read-only variable reporting the current vps, which is what we use when
+ * db_vps is set to NULL.
+ */
+int
+db_var_curvps(struct db_variable *vp, db_expr_t *valuep, int op)
+{
+
+ switch (op) {
+ case DB_VAR_GET:
+ *valuep = (db_expr_t)curvps;
+ return (1);
+
+ case DB_VAR_SET:
+ db_printf("Read-only variable.\n");
+ return (0);
+
+ default:
+ db_printf("db_var_curvps: unknown operation\n");
+ return (0);
+ }
+}
#endif
/*
@@ -278,6 +328,33 @@
#endif
}
+bool
+db_value_of_name_vps(const char *name, db_expr_t *valuep)
+{
+#ifdef VIMAGE
+ static char tmp[256];
+ db_expr_t value;
+ c_db_sym_t sym;
+ struct vps *vps;
+
+ if (db_vps != NULL)
+ vps = db_vps;
+ else
+ vps = curvps;
+ snprintf(tmp, sizeof(tmp), "vps_entry_%s", name);
+ sym = db_lookup(tmp);
+ if (sym == C_DB_SYM_NULL)
+ return (false);
+ db_symbol_values(sym, &name, &value);
+ if (value < VPS_START || value >= VPS_STOP)
+ return (false);
+ *valuep = (db_expr_t)((uintptr_t)value + vps->vps_data_base);
+ return (true);
+#else
+ return (false);
+#endif
+}
+
/*
* Lookup a symbol.
* If the symbol has a qualifier (e.g., ux:vm_map),
Index: sys/ddb/db_thread.c
===================================================================
--- sys/ddb/db_thread.c
+++ sys/ddb/db_thread.c
@@ -135,11 +135,12 @@
if (td != NULL)
return (td);
if (check_pid) {
+ /* Operate on current vps instance only. */
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid == decaddr)
return (FIRST_THREAD_IN_PROC(p));
}
- LIST_FOREACH(p, &zombproc, p_list) {
+ LIST_FOREACH(p, &V_zombproc, p_list) {
if (p->p_pid == decaddr)
return (FIRST_THREAD_IN_PROC(p));
}
@@ -161,11 +162,12 @@
decaddr = db_hex2dec(addr);
if (decaddr != -1) {
+ /* Operate on current vps instance only. */
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid == decaddr)
return (p);
}
- LIST_FOREACH(p, &zombproc, p_list) {
+ LIST_FOREACH(p, &V_zombproc, p_list) {
if (p->p_pid == decaddr)
return (p);
}
Index: sys/ddb/db_variables.h
===================================================================
--- sys/ddb/db_variables.h
+++ sys/ddb/db_variables.h
@@ -56,8 +56,10 @@
extern db_varfcn_t db_var_curcpu; /* DPCPU default CPU */
extern db_varfcn_t db_var_curvnet; /* Default vnet */
+extern db_varfcn_t db_var_curvps; /* Default vps */
extern db_varfcn_t db_var_db_cpu; /* DPCPU active CPU */
extern db_varfcn_t db_var_db_vnet; /* Active vnet */
+extern db_varfcn_t db_var_db_vps; /* Active vps */
int db_read_variable(struct db_variable *, db_expr_t *);
int db_write_variable(struct db_variable *, db_expr_t);
Index: sys/ddb/db_variables.c
===================================================================
--- sys/ddb/db_variables.c
+++ sys/ddb/db_variables.c
@@ -53,6 +53,8 @@
#ifdef VIMAGE
{ "curvnet", NULL, db_var_curvnet },
{ "db_vnet", NULL, db_var_db_vnet },
+ { "curvps", NULL, db_var_curvps },
+ { "db_vps", NULL, db_var_db_vps },
#endif
};
static struct db_variable *db_evars = db_vars + nitems(db_vars);
Index: sys/ddb/ddb.h
===================================================================
--- sys/ddb/ddb.h
+++ sys/ddb/ddb.h
@@ -229,6 +229,7 @@
bool db_value_of_name(const char *name, db_expr_t *valuep);
bool db_value_of_name_pcpu(const char *name, db_expr_t *valuep);
bool db_value_of_name_vnet(const char *name, db_expr_t *valuep);
+bool db_value_of_name_vps(const char *name, db_expr_t *valuep);
int db_write_bytes(vm_offset_t addr, size_t size, char *data);
void db_command_register(struct command_table *, struct command *);
void db_command_unregister(struct command_table *, struct command *);
Index: sys/dev/filemon/filemon.c
===================================================================
--- sys/dev/filemon/filemon.c
+++ sys/dev/filemon/filemon.c
@@ -210,6 +210,7 @@
static void
filemon_untrack_processes(struct filemon *filemon)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
sx_assert(&filemon->lock, SA_XLOCKED);
@@ -223,18 +224,24 @@
* filemon_event_process_exit() will lock on filemon->lock
* which we hold.
*/
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- /*
- * No PROC_LOCK is needed to compare here since it is
- * guaranteed to not change since we have its filemon
- * locked. Everything that changes this p_filemon will
- * be locked on it.
- */
- if (p->p_filemon == filemon)
- filemon_proc_drop(p);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ /*
+ * No PROC_LOCK is needed to compare here since it is
+ * guaranteed to not change since we have its filemon
+ * locked. Everything that changes this p_filemon will
+ * be locked on it.
+ */
+ if (p->p_filemon == filemon)
+ filemon_proc_drop(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
/*
* It's possible some references were acquired but will be
Index: sys/dev/hwpmc/hwpmc_mod.c
===================================================================
--- sys/dev/hwpmc/hwpmc_mod.c
+++ sys/dev/hwpmc/hwpmc_mod.c
@@ -1203,7 +1203,7 @@
* this PMC.
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
top = p;
@@ -1227,7 +1227,7 @@
(void) pmc_detach_process(top, pm);
done:
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return error;
}
@@ -1312,7 +1312,7 @@
* partially attached proc tree.
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
top = p;
@@ -1333,7 +1333,7 @@
}
done:
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
if (LIST_EMPTY(&pm->pm_targets))
pm->pm_flags &= ~PMC_F_ATTACH_DONE;
@@ -2025,7 +2025,7 @@
PROC_UNLOCK(p);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
top = p;
@@ -2044,7 +2044,7 @@
}
}
done:
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
/*
@@ -5364,6 +5364,7 @@
static void
pmc_process_allproc(struct pmc *pm)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct pmc_owner *po;
struct thread *td;
struct proc *p;
@@ -5371,15 +5372,22 @@
po = pm->pm_owner;
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
return;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- pmclog_process_proccreate(po, p, 0 /* sync */);
- PROC_LOCK(p);
- FOREACH_THREAD_IN_PROC(p, td)
- pmclog_process_threadcreate(po, td, 0 /* sync */);
- PROC_UNLOCK(p);
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ pmclog_process_proccreate(po, p, 0 /* sync */);
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td)
+ pmclog_process_threadcreate(po, td, 0 /* sync */);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
pmclog_flush(po, 0);
}
Index: sys/fs/devfs/devfs_vnops.c
===================================================================
--- sys/fs/devfs/devfs_vnops.c
+++ sys/fs/devfs/devfs_vnops.c
@@ -596,7 +596,7 @@
if (vp == p->p_session->s_ttyvp) {
PROC_UNLOCK(p);
oldvp = NULL;
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
if (vp == p->p_session->s_ttyvp) {
SESS_LOCK(p->p_session);
VI_LOCK(vp);
@@ -609,7 +609,7 @@
VI_UNLOCK(vp);
SESS_UNLOCK(p->p_session);
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (oldvp != NULL)
vrele(oldvp);
} else
@@ -813,9 +813,9 @@
if (error == 0 && com == TIOCSCTTY) {
/* Do nothing if reassigning same control tty */
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (td->td_proc->p_session->s_ttyvp == vp) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (0);
}
@@ -826,7 +826,7 @@
td->td_proc->p_session->s_ttydp = cdev2priv(dev);
SESS_UNLOCK(td->td_proc->p_session);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/* Get rid of reference to old control tty */
if (vpold)
Index: sys/fs/nfs/nfsport.h
===================================================================
--- sys/fs/nfs/nfsport.h
+++ sys/fs/nfs/nfsport.h
@@ -692,8 +692,8 @@
#define NFSUNLOCKMNT(m) mtx_unlock(&((m)->nm_mtx))
#define NFSLOCKREQUEST(r) mtx_lock(&((r)->r_mtx))
#define NFSUNLOCKREQUEST(r) mtx_unlock(&((r)->r_mtx))
-#define NFSPROCLISTLOCK() sx_slock(&allproc_lock)
-#define NFSPROCLISTUNLOCK() sx_sunlock(&allproc_lock)
+#define NFSPROCLISTLOCK() sx_slock(&V_allproc_lock)
+#define NFSPROCLISTUNLOCK() sx_sunlock(&V_allproc_lock)
#define NFSLOCKSOCKREQ(r) mtx_lock(&((r)->nr_mtx))
#define NFSUNLOCKSOCKREQ(r) mtx_unlock(&((r)->nr_mtx))
#define NFSLOCKDS(d) mtx_lock(&((d)->nfsclds_mtx))
Index: sys/fs/pseudofs/pseudofs_vnops.c
===================================================================
--- sys/fs/pseudofs/pseudofs_vnops.c
+++ sys/fs/pseudofs/pseudofs_vnops.c
@@ -705,7 +705,7 @@
{
int visible;
- sx_assert(&allproc_lock, SX_SLOCKED);
+ sx_assert(&V_allproc_lock, SX_SLOCKED);
pfs_assert_owned(pd);
again:
if (*pn == NULL) {
@@ -716,9 +716,14 @@
*pn = (*pn)->pn_next;
}
if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) {
+ /*
+ * Operate on current vps instance only.
+ * We must not iterate over all vps as duplicate process space
+ * would not work at all and leak a lot of information.
+ */
/* next process */
if (*p == NULL)
- *p = LIST_FIRST(&allproc);
+ *p = LIST_FIRST(&V_allproc);
else
*p = LIST_NEXT(*p, p_list);
/* out of processes: next node */
@@ -791,12 +796,12 @@
if (resid == 0)
PFS_RETURN (0);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
pfs_lock(pd);
/* check if the directory is visible to the caller */
if (!pfs_visible(curthread, pd, pid, true, &proc)) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
pfs_unlock(pd);
PFS_RETURN (ENOENT);
}
@@ -810,7 +815,7 @@
if (proc != NULL)
PROC_UNLOCK(proc);
pfs_unlock(pd);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
PFS_RETURN (0);
}
}
@@ -860,7 +865,7 @@
if (proc != NULL)
PROC_UNLOCK(proc);
pfs_unlock(pd);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
i = 0;
STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) {
if (error == 0)
Index: sys/i386/i386/pmap.c
===================================================================
--- sys/i386/i386/pmap.c
+++ sys/i386/i386/pmap.c
@@ -5799,7 +5799,7 @@
int npte = 0;
int index;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid != pid)
continue;
@@ -5822,7 +5822,7 @@
index = 0;
printf("\n");
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (npte);
}
pte = pmap_pte(pmap, va);
@@ -5847,7 +5847,7 @@
}
}
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (npte);
}
#endif
Index: sys/i386/ibcs2/ibcs2_sysvec.c
===================================================================
--- sys/i386/ibcs2/ibcs2_sysvec.c
+++ sys/i386/ibcs2/ibcs2_sysvec.c
@@ -109,6 +109,7 @@
static int
ibcs2_modevent(module_t mod, int type, void *unused)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p = NULL;
int rval = 0;
@@ -117,14 +118,20 @@
break;
case MOD_UNLOAD:
/* if this was an ELF module we'd use elf_brand_inuse()... */
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_sysent == &ibcs2_svr3_sysvec) {
- rval = EBUSY;
- break;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_sysent == &ibcs2_svr3_sysvec) {
+ rval = EBUSY;
+ break;
+ }
}
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
break;
default:
rval = EOPNOTSUPP;
Index: sys/kern/imgact_elf.c
===================================================================
--- sys/kern/imgact_elf.c
+++ sys/kern/imgact_elf.c
@@ -238,17 +238,24 @@
int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
int rval = FALSE;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_sysent == entry->sysvec) {
- rval = TRUE;
- break;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_sysent == entry->sysvec) {
+ rval = TRUE;
+ break;
+ }
}
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
return (rval);
}
@@ -2106,10 +2113,10 @@
KASSERT(*sizep == size, ("invalid size"));
structsize = sizeof(elf_kinfo_proc_t);
sbuf_bcat(sb, &structsize, sizeof(structsize));
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
kern_proc_out(p, sb, ELF_KERN_PROC_MASK);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
*sizep = size;
}
Index: sys/kern/init_main.c
===================================================================
--- sys/kern/init_main.c
+++ sys/kern/init_main.c
@@ -56,6 +56,7 @@
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
+#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
@@ -79,6 +80,7 @@
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/cpuset.h>
+#include <sys/vps.h>
#include <machine/cpu.h>
@@ -103,7 +105,18 @@
struct proc proc0;
struct thread0_storage thread0_st __aligned(32);
struct vmspace vmspace0;
-struct proc *initproc;
+VPS_DEFINE(struct proc *, initproc);
+
+VPS_DEFINE(struct proc *, vproc0);
+#ifdef VIMAGE
+/*
+ * Initialize to -2; after kproc_create() our thread will still be
+ * forked from thread0 and in the wrong vps. Once that is fixed it will
+ * see the local copy and not the DEFAULT_VPS one. Make sure we have
+ * a value that we can spin on until this happens.
+ */
+VPS_DEFINE(int, vpsdying) = -2;
+#endif
#ifndef BOOTHOWTO
#define BOOTHOWTO 0
@@ -461,9 +474,8 @@
p->p_osrel = osreldate;
/*
- * Initialize thread and process structures.
+ * Initialize thread structures.
*/
- procinit(); /* set up proc zone */
threadinit(); /* set up UMA zones */
/*
@@ -475,7 +487,8 @@
/*
* Create process 0 (the swapper).
*/
- LIST_INSERT_HEAD(&allproc, p, p_list);
+ V_vproc0 = p;
+ LIST_INSERT_HEAD(&V_allproc, p, p_list);
LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
p->p_pgrp = &pgrp0;
@@ -511,6 +524,9 @@
td->td_cpuset = cpuset_thread0();
td->td_domain.dr_policy = td->td_cpuset->cs_domain;
prison0_init();
+#ifdef VIMAGE
+ td->td_vps = vps0;
+#endif
p->p_peers = 0;
p->p_leader = p;
p->p_reaper = p;
@@ -549,7 +565,7 @@
p->p_sigacts = sigacts_alloc();
/* Initialize signal state for process 0. */
- siginit(&proc0);
+ siginit(V_vproc0);
/* Create the file descriptor table. */
p->p_fd = fdinit(NULL, false);
@@ -614,7 +630,184 @@
racct_add_force(p, RACCT_NPROC, 1);
PROC_UNLOCK(p);
}
-SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
+SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_init, NULL);
+
+#ifdef VIMAGE
+static void
+vps_swapper(void *dummy __unused)
+{
+
+ /*
+ * Make sure the surgical changes to V_vproc0 are done before
+ * entering the long-lasting loop. Otherwise we may start
+ * acquiring locks and accessing variables based on the wrong
+ * credential leading to, e.g., panics when trying to unlock a
+ * lock from a different context which may not be locked.
+ * When entering the function our credentials might still point
+ * to the DEFAULT_VPS; see comment for V_vpsdying declaration above.
+ */
+ while (V_vpsdying < 0)
+ pause("wswvps", hz/2);
+
+ /*
+ * Now hand over this thread to swapper.
+ */
+ swapper();
+
+ if (V_vpsdying < 1)
+ panic("%s: swapper curtd %p ended but V_vpsdying %d\n",
+ __func__, curthread, V_vpsdying);
+
+ kproc_exit(0);
+}
+
+static void
+proc0_init_vps(void *dummy __unused)
+{
+ struct ucred *newcred, *savecred;
+ struct thread *td;
+ struct prison *pr;
+ struct uidinfo tmpuinfo;
+ struct loginclass tmplc = {
+ .lc_name = "",
+ };
+ int error;
+
+ /* vps0 is handled normally in p0init. */
+ if (IS_DEFAULT_VPS(curvps))
+ return;
+
+ KASSERT((curvps->vps_pr != NULL && curvps != vps0),
+ ("%s: curvps %p has vps_pr %p or is vps0 %p\n",
+ __func__, curvps, curvps->vps_pr, vps0));
+ KASSERT((curvps == curvps->vps_pr->pr_vps),
+ ("%s: curvps %p != curvps->vps_pr %p ->pr_vps %p\n",
+ __func__, curvps, curvps->vps_pr, curvps->vps_pr->pr_vps));
+
+ /*
+	 * Initialize the non-default VPS version to < 0 so vps_swapper()
+ * will spin once the credential is changed before all other surgery
+ * has happened.
+ */
+ V_vpsdying = -1;
+
+ /*
+ * Default is nprocs = 1 for vps0; need to set it to 0 here as our
+ * "proc0" and with that initproc are forked and not manually constructed.
+ */
+ V_nprocs = 0;
+
+ /*
+ * Set lastpid to -1 so that our swapper gets 0.
+ */
+ V_lastpid = -1;
+
+ error = kproc_create(vps_swapper, NULL, &V_vproc0, 0, 0, "vps%u",
+ curvps->vps_pr->pr_id);
+ if (error)
+ panic("%s: cannot create vps %p swapper: %d\n",
+ __func__, curvps, error);
+
+ /* Create credentials. Copied from proc0. Just using vps_pr. */
+ newcred = crget();
+ newcred->cr_ngroups = 1; /* group 0 */
+ /* A hack to prevent uifind from tripping over NULL pointers. */
+ savecred = curthread->td_ucred;
+ curthread->td_ucred = newcred;
+ tmpuinfo.ui_uid = 1;
+ newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo;
+ newcred->cr_uidinfo = uifind(0);
+ newcred->cr_ruidinfo = uifind(0);
+ newcred->cr_loginclass = &tmplc;
+ newcred->cr_loginclass = loginclass_find("default");
+ /* End hack. creds get properly set later with thread_cow_get_proc */
+ curthread->td_ucred = savecred;
+ PROC_LOCK(V_vproc0);
+ newcred->cr_prison = curvps->vps_pr;
+ prison_hold(newcred->cr_prison);
+ /* The kernel process was accounted to thread0's prison. */
+ prison_proc_hold(newcred->cr_prison);
+ prison_proc_free(savecred->cr_prison);
+ V_vproc0->p_treeflag |= P_TREE_REAPER;
+ savecred = proc_set_cred(V_vproc0, newcred);
+ PROC_UNLOCK(V_vproc0);
+#ifdef AUDIT
+ audit_cred_kproc0(newcred);
+#endif
+#ifdef MAC
+ mac_cred_create_swapper(newcred);
+#endif
+ crfree(savecred);
+
+ PROC_LOCK(V_vproc0);
+ td = FIRST_THREAD_IN_PROC(V_vproc0);
+ thread_cow_get_proc(td, V_vproc0);
+ PROC_UNLOCK(V_vproc0);
+ KASSERT(curvps->vps_pr ==
+ FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred->cr_prison,
+ ("%s:%d: curvps %p vps_pr %p != FTIP(V_vproc0 %p)->td_ucred %p "
+ "cr_prison %p\n", __func__, __LINE__,
+ curvps, curvps->vps_pr, V_vproc0,
+ FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred,
+ FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred->cr_prison));
+ KASSERT(curvps == TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)),
+ ("%s:%d: curvps %p != TD_TO_VPS(..(V_vproc0 %p)) %p\n",
+ __func__, __LINE__,
+ curvps, V_vproc0, TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0))));
+
+ /* Chroot it. */
+ td = FIRST_THREAD_IN_PROC(V_vproc0);
+ pr = curvps->vps_pr;
+ vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = change_dir(pr->pr_root, td)) != 0) {
+ printf("%s: td %p change_dir %p failed: %d\n",
+ __func__, td, pr->pr_root, error);
+ goto err;
+ }
+#ifdef MAC
+ if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) {
+ printf("%s: td %p mac_vnode_check_chroot %p failed: %d\n",
+ __func__, td, pr->pr_root, error);
+ goto err;
+ }
+#endif
+ VOP_UNLOCK(pr->pr_root, 0);
+ if ((error = pwd_chroot(td, pr->pr_root))) {
+ printf("%s: td %p pwd_chroot %p failed: %d\n",
+ __func__, td, pr->pr_root, error);
+ goto err;
+ }
+
+ V_vpsdying = 0;
+ return;
+
+err:
+	/* XXX could panic or signal the jail to abort; cannot really stop. */
+ return;
+}
+VPS_SYSINIT(p0init_vps, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_init_vps, NULL);
+
+static void
+proc0_uninit_vps(void *dummy __unused)
+{
+
+ if (IS_DEFAULT_VPS(curvps))
+ return;
+
+ /*
+ * XXX ideally we want to get that state from elsewhere;
+	 * neither prison, nor vps state, .. lends itself though.
+ */
+ V_vpsdying = 1;
+ wakeup(V_vproc0);
+
+ /* Operate on current vps instance only. */
+ while (V_vproc0 != NULL ||
+ !LIST_EMPTY(&V_zombproc) || !LIST_EMPTY(&V_allproc))
+ pause("p0uvps", hz/2);
+}
+VPS_SYSUNINIT(p0uninit_vps, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_uninit_vps, NULL);
+#endif
/* ARGSUSED*/
static void
@@ -628,8 +821,9 @@
/*
* Now we can look at the time, having had a chance to verify the
* time from the filesystem. Pretend that proc0 started now.
+ * Operate on vps0 instance only.
*/
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
@@ -649,7 +843,7 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
PCPU_SET(switchtime, cpu_ticks());
PCPU_SET(switchticks, ticks);
@@ -729,7 +923,8 @@
td = curthread;
p = td->td_proc;
- vfs_mountroot();
+ if (IS_DEFAULT_VPS(curvps))
+ vfs_mountroot();
/* Wipe GELI passphrase from the environment. */
kern_unsetenv("kern.geom.eli.passphrase");
@@ -753,8 +948,8 @@
while ((path = strsep(&tmp_init_path, ":")) != NULL) {
pathlen = strlen(path) + 1;
if (bootverbose)
- printf("start_init: trying %s\n", path);
-
+ printf("%s: trying %s\n", __func__, path);
+
/*
* Move out the boot flag argument.
*/
@@ -839,38 +1034,60 @@
struct thread *td;
int error;
+ KASSERT(curvps == FIRST_THREAD_IN_PROC(V_vproc0)->td_vps,
+ ("%s: curvps %p != V_vproc0 %p first td %p td_vps %p\n",
+ __func__, curvps, V_vproc0, FIRST_THREAD_IN_PROC(V_vproc0),
+ FIRST_THREAD_IN_PROC(V_vproc0)->td_vps));
+ KASSERT(curvps == TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)),
+ ("%s: curvps %p != TD_TO_VPS(..(V_vproc0 %p)) %p\n",
+ __func__, curvps, V_vproc0,
+ TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0))));
+
bzero(&fr, sizeof(fr));
fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
- fr.fr_procp = &initproc;
- error = fork1(&thread0, &fr);
+ fr.fr_procp = &V_initproc;
+ td = FIRST_THREAD_IN_PROC(V_vproc0);
+ error = fork1(td, &fr);
if (error)
panic("cannot fork init: %d\n", error);
- KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
+ KASSERT(V_initproc->p_pid == 1, ("%s: initproc->p_pid(%d) != 1",
+ __func__, V_initproc->p_pid));
+ KASSERT(curvps == FIRST_THREAD_IN_PROC(V_initproc)->td_vps,
+ ("%s: curvps %p != V_initproc %p first td %p td_vps %p\n",
+ __func__, curvps, V_initproc, FIRST_THREAD_IN_PROC(V_initproc),
+ FIRST_THREAD_IN_PROC(V_initproc)->td_vps));
+
/* divorce init's credentials from the kernel's */
newcred = crget();
- sx_xlock(&proctree_lock);
- PROC_LOCK(initproc);
- initproc->p_flag |= P_SYSTEM | P_INMEM;
- initproc->p_treeflag |= P_TREE_REAPER;
- oldcred = initproc->p_ucred;
+ sx_xlock(&V_proctree_lock);
+ PROC_LOCK(V_initproc);
+ V_initproc->p_flag |= P_SYSTEM | P_INMEM;
+ V_initproc->p_treeflag |= P_TREE_REAPER;
+ oldcred = V_initproc->p_ucred;
crcopy(newcred, oldcred);
+#ifdef VIMAGE
+ /* Swap to the correct prison. */
+ /* XXX is this really needed or was this related to a V_vproc0 bug? */
+ prison_free(newcred->cr_prison);
+ newcred->cr_prison = curvps->vps_pr;
+ prison_hold(newcred->cr_prison);
+#endif
#ifdef MAC
mac_cred_create_init(newcred);
#endif
#ifdef AUDIT
audit_cred_proc1(newcred);
#endif
- proc_set_cred(initproc, newcred);
- td = FIRST_THREAD_IN_PROC(initproc);
- crfree(td->td_ucred);
- td->td_ucred = crhold(initproc->p_ucred);
- PROC_UNLOCK(initproc);
- sx_xunlock(&proctree_lock);
+ /* This will also update cowgen. */
+ proc_set_cred(V_initproc, newcred);
+ PROC_UNLOCK(V_initproc);
+ sx_xunlock(&V_proctree_lock);
crfree(oldcred);
- cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc),
+
+ cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(V_initproc),
start_init, NULL);
}
-SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
+VPS_SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
/*
* Make it runnable now.
@@ -880,10 +1097,49 @@
{
struct thread *td;
- td = FIRST_THREAD_IN_PROC(initproc);
+ td = FIRST_THREAD_IN_PROC(V_initproc);
thread_lock(td);
TD_SET_CAN_RUN(td);
sched_add(td, SRQ_BORING);
thread_unlock(td);
}
-SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);
+VPS_SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);
+
+#ifdef VIMAGE
+static void
+reapinit(void *ident __unused)
+{
+ struct proc *p, *p2;
+
+ while (V_nprocs > 2) {
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_pid <= 1)
+ continue;
+ PROC_LOCK(p);
+ kern_psignal(p, SIGKILL);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ pause("reapin1t", hz/2);
+ }
+
+ /* Operate on current vps instance only. */
+ sx_xlock(&V_proctree_lock);
+ LIST_FOREACH_SAFE(p, &V_zombproc, p_list, p2) {
+ PROC_LOCK(p);
+ proc_reap(FIRST_THREAD_IN_PROC(V_vproc0), p, NULL, 0);
+ sx_xlock(&V_proctree_lock);
+ }
+ sx_xunlock(&V_proctree_lock);
+
+ while (V_nprocs > 1)
+ pause("reapinit", hz/2);
+
+ /* Only our "swapper" left. */
+ KASSERT(V_nprocs == 1, ("%s: vps %p V_nprocs %d != 1",
+ __func__, curvps, V_nprocs));
+}
+/* Run very first. */
+VPS_SYSUNINIT(reapinit, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, reapinit, NULL);
+#endif
Index: sys/kern/kern_acct.c
===================================================================
--- sys/kern/kern_acct.c
+++ sys/kern/kern_acct.c
@@ -378,7 +378,7 @@
* Get process accounting information.
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
/* (1) The terminal from which the process was started */
@@ -386,7 +386,7 @@
acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp);
else
acct.ac_tty = NODEV;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/* (2) The name of the command that ran */
bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
Index: sys/kern/kern_clock.c
===================================================================
--- sys/kern/kern_clock.c
+++ sys/kern/kern_clock.c
@@ -184,12 +184,78 @@
static int blktime_threshold = 900;
static int sleepfreq = 3;
+static __inline void
+_deadlres_td_on_lock(struct proc *p, struct thread *td, int blkticks)
+{
+ int tticks;
+
+ /*
+ * The thread should be blocked on a turnstile, simply check
+ * if the turnstile channel is in good state.
+ */
+ MPASS(td->td_blocked != NULL);
+
+ tticks = ticks - td->td_blktick;
+ thread_unlock(td);
+ if (tticks > blkticks) {
+
+ /*
+ * Accordingly with provided thresholds, this thread is stuck
+ * for too long on a turnstile.
+ */
+ PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ panic("%s: possible deadlock detected for %p, "
+ "blocked for %d ticks\n", __func__, td, tticks);
+ }
+}
+
+static __inline void
+_deadlres_td_sleep_q(struct proc *p, struct thread *td, int slpticks)
+{
+ void *wchan;
+ int i, slptype, tryl, tticks;
+
+ /*
+ * Check if the thread is sleeping on a lock, otherwise skip the check.
+ * Drop the thread lock in order to avoid a LOR with the sleepqueue
+ * spinlock.
+ */
+ wchan = td->td_wchan;
+ tticks = ticks - td->td_slptick;
+ thread_unlock(td);
+ slptype = sleepq_type(wchan);
+ if ((slptype == SLEEPQ_SX || slptype == SLEEPQ_LK) &&
+ tticks > slpticks) {
+
+ /*
+ * Accordingly with provided thresholds, this thread is stuck
+ * for too long on a sleepqueue.
+ * However, being on a sleepqueue, we might still check for the
+ * blessed list.
+ */
+ tryl = 0;
+ for (i = 0; blessed[i] != NULL; i++) {
+ if (!strcmp(blessed[i], td->td_wmesg)) {
+ tryl = 1;
+ break;
+ }
+ }
+ if (tryl != 0)
+ return;
+ PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ panic("%s: possible deadlock detected for %p, "
+ "blocked for %d ticks\n", __func__, td, tticks);
+ }
+}
+
static void
deadlres_td_on_lock(struct proc *p, struct thread *td, int blkticks)
{
int tticks;
- sx_assert(&allproc_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
@@ -214,7 +280,7 @@
void *wchan;
int i, slptype, tticks;
- sx_assert(&allproc_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
@@ -246,6 +312,7 @@
static void
deadlkres(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct thread *td;
int blkticks, slpticks, tryl;
@@ -255,41 +322,49 @@
blkticks = blktime_threshold * hz;
slpticks = slptime_threshold * hz;
- /*
- * Avoid to sleep on the sx_lock in order to avoid a
- * possible priority inversion problem leading to
- * starvation.
- * If the lock can't be held after 100 tries, panic.
- */
- if (!sx_try_slock(&allproc_lock)) {
- if (tryl > 100)
- panic("%s: possible deadlock detected "
- "on allproc_lock\n", __func__);
- tryl++;
- pause("allproc", sleepfreq * hz);
- continue;
- }
- tryl = 0;
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW) {
- PROC_UNLOCK(p);
- continue;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+again:
+ CURVPS_SET_QUIET(vps_iter);
+ /*
+ * Avoid to sleep on the sx_lock in order to avoid a
+ * possible priority inversion problem leading to
+ * starvation.
+ * If the lock can't be held after 100 tries, panic.
+ */
+ if (!sx_try_slock(&V_allproc_lock)) {
+ if (tryl > 100)
+ panic("%s: possible deadlock detected "
+ "on allproc_lock\n", __func__);
+ tryl++;
+ CURVPS_RESTORE();
+ pause("allproc", sleepfreq * hz);
+ goto again;
}
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (TD_ON_LOCK(td))
- deadlres_td_on_lock(p, td,
- blkticks);
- else if (TD_IS_SLEEPING(td) &&
- TD_ON_SLEEPQ(td))
- deadlres_td_sleep_q(p, td,
- slpticks);
- thread_unlock(td);
+ tryl = 0;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_ON_LOCK(td))
+ deadlres_td_on_lock(p, td,
+ blkticks);
+ else if (TD_IS_SLEEPING(td) &&
+ TD_ON_SLEEPQ(td))
+ deadlres_td_sleep_q(p, td,
+ slpticks);
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
}
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
/* Sleep for sleepfreq seconds. */
pause("-", sleepfreq * hz);
Index: sys/kern/kern_cpuset.c
===================================================================
--- sys/kern/kern_cpuset.c
+++ sys/kern/kern_cpuset.c
@@ -510,24 +510,32 @@
static void
domainset_notify(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct thread *td;
struct proc *p;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW) {
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ td->td_domain.dr_policy =
+ td->td_cpuset->cs_domain;
+ thread_unlock(td);
+ }
PROC_UNLOCK(p);
- continue;
}
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- td->td_domain.dr_policy = td->td_cpuset->cs_domain;
- thread_unlock(td);
- }
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
kernel_object->domain.dr_policy = cpuset_kernel->cs_domain;
}
Index: sys/kern/kern_descrip.c
===================================================================
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -1063,7 +1063,7 @@
sigio->sio_ucred = crhold(curthread->td_ucred);
sigio->sio_myref = sigiop;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pgid > 0) {
proc = pfind(pgid);
if (proc == NULL) {
@@ -1131,14 +1131,14 @@
sigio->sio_pgrp = pgrp;
PGRP_UNLOCK(pgrp);
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
SIGIO_LOCK();
*sigiop = sigio;
SIGIO_UNLOCK();
return (0);
fail:
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO);
return (ret);
@@ -3190,6 +3190,7 @@
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct filedesc *fdp;
struct prison *pr;
struct proc *p;
@@ -3198,33 +3199,40 @@
if (vrefcnt(olddp) == 1)
return;
nrele = 0;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- fdp = fdhold(p);
- PROC_UNLOCK(p);
- if (fdp == NULL)
- continue;
- FILEDESC_XLOCK(fdp);
- if (fdp->fd_cdir == olddp) {
- vrefact(newdp);
- fdp->fd_cdir = newdp;
- nrele++;
- }
- if (fdp->fd_rdir == olddp) {
- vrefact(newdp);
- fdp->fd_rdir = newdp;
- nrele++;
- }
- if (fdp->fd_jdir == olddp) {
- vrefact(newdp);
- fdp->fd_jdir = newdp;
- nrele++;
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ continue;
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_cdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_cdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_rdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_rdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_jdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_jdir = newdp;
+ nrele++;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ fddrop(fdp);
}
- FILEDESC_XUNLOCK(fdp);
- fddrop(fdp);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
if (rootvnode == olddp) {
vrefact(newdp);
rootvnode = newdp;
@@ -3307,6 +3315,7 @@
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct xfile xf;
struct filedesc *fdp;
struct file *fp;
@@ -3318,68 +3327,82 @@
return (error);
if (req->oldptr == NULL) {
n = 0;
- sx_slock(&allproc_lock);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ continue;
+ /* overestimates sparse tables. */
+ if (fdp->fd_lastfile > 0)
+ n += fdp->fd_lastfile;
+ fddrop(fdp);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
+ }
+ VPS_LIST_RUNLOCK();
+ return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
+ }
+ error = 0;
+ bzero(&xf, sizeof(xf));
+ xf.xf_size = sizeof(xf);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
+ if (p_cansee(req->td, p) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ xf.xf_pid = p->p_pid;
+ xf.xf_uid = p->p_ucred->cr_uid;
fdp = fdhold(p);
PROC_UNLOCK(p);
if (fdp == NULL)
continue;
- /* overestimates sparse tables. */
- if (fdp->fd_lastfile > 0)
- n += fdp->fd_lastfile;
+ FILEDESC_SLOCK(fdp);
+ for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ xf.xf_fd = n;
+ xf.xf_file = (kvaddr_t)(uintptr_t)fp;
+ xf.xf_data = (kvaddr_t)(uintptr_t)fp->f_data;
+ xf.xf_vnode = (kvaddr_t)(uintptr_t)fp->f_vnode;
+ xf.xf_type = (kvaddr_t)(uintptr_t)fp->f_type;
+ xf.xf_count = fp->f_count;
+ xf.xf_msgcount = 0;
+ xf.xf_offset = foffset_get(fp);
+ xf.xf_flag = fp->f_flag;
+ error = SYSCTL_OUT(req, &xf, sizeof(xf));
+ if (error)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
fddrop(fdp);
- }
- sx_sunlock(&allproc_lock);
- return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
- }
- error = 0;
- bzero(&xf, sizeof(xf));
- xf.xf_size = sizeof(xf);
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW) {
- PROC_UNLOCK(p);
- continue;
- }
- if (p_cansee(req->td, p) != 0) {
- PROC_UNLOCK(p);
- continue;
- }
- xf.xf_pid = p->p_pid;
- xf.xf_uid = p->p_ucred->cr_uid;
- fdp = fdhold(p);
- PROC_UNLOCK(p);
- if (fdp == NULL)
- continue;
- FILEDESC_SLOCK(fdp);
- for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
- if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
- continue;
- xf.xf_fd = n;
- xf.xf_file = (kvaddr_t)(uintptr_t)fp;
- xf.xf_data = (kvaddr_t)(uintptr_t)fp->f_data;
- xf.xf_vnode = (kvaddr_t)(uintptr_t)fp->f_vnode;
- xf.xf_type = (kvaddr_t)(uintptr_t)fp->f_type;
- xf.xf_count = fp->f_count;
- xf.xf_msgcount = 0;
- xf.xf_offset = foffset_get(fp);
- xf.xf_flag = fp->f_flag;
- error = SYSCTL_OUT(req, &xf, sizeof(xf));
if (error)
break;
}
- FILEDESC_SUNLOCK(fdp);
- fddrop(fdp);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
if (error)
break;
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
return (error);
}
@@ -3930,21 +3953,28 @@
static struct proc *
file_to_first_proc(struct file *fp)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct filedesc *fdp;
struct proc *p;
int n;
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_state == PRS_NEW)
- continue;
- fdp = p->p_fd;
- if (fdp == NULL)
- continue;
- for (n = 0; n <= fdp->fd_lastfile; n++) {
- if (fp == fdp->fd_ofiles[n].fde_file)
- return (p);
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ fdp = p->p_fd;
+ if (fdp == NULL)
+ continue;
+ for (n = 0; n <= fdp->fd_lastfile; n++) {
+ if (fp == fdp->fd_ofiles[n].fde_file)
+ return (p);
+ }
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
return (NULL);
}
@@ -3982,6 +4012,7 @@
DB_SHOW_COMMAND(files, db_show_files)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct filedesc *fdp;
struct file *fp;
struct proc *p;
@@ -3989,18 +4020,24 @@
int n;
header = 1;
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_state == PRS_NEW)
- continue;
- if ((fdp = p->p_fd) == NULL)
- continue;
- for (n = 0; n <= fdp->fd_lastfile; ++n) {
- if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ if ((fdp = p->p_fd) == NULL)
continue;
- db_print_file(fp, header);
- header = 0;
+ for (n = 0; n <= fdp->fd_lastfile; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ db_print_file(fp, header);
+ header = 0;
+ }
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
}
#endif
Index: sys/kern/kern_exit.c
===================================================================
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -96,6 +96,11 @@
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE1(proc, , , exit, "int");
+#ifdef VIMAGE
+VPS_DECLARE(int, vrebooting); /* kern_reboot() has been called. */
+#define V_vrebooting VPS(vrebooting)
+#endif
+
/* Hook for NFS teardown procedure. */
void (*nlminfo_release_p)(struct proc *p);
@@ -106,13 +111,13 @@
{
struct proc *p, *parent;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
if ((child->p_treeflag & P_TREE_ORPHANED) == 0) {
if (child->p_oppid == 0 ||
child->p_pptr->p_pid == child->p_oppid)
parent = child->p_pptr;
else
- parent = initproc;
+ parent = V_initproc;
return (parent);
}
for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) {
@@ -132,10 +137,16 @@
{
struct proc *p1, *p2, *ptmp;
- sx_assert(&proctree_lock, SX_LOCKED);
- KASSERT(p != initproc, ("reaper_abandon_children for initproc"));
- if ((p->p_treeflag & P_TREE_REAPER) == 0)
+ sx_assert(&V_proctree_lock, SX_LOCKED);
+ /* init inside a vps may die on prison_remove. */
+ KASSERT(!IS_DEFAULT_VPS(curvps) || p != V_initproc,
+ ("%s: for initproc %p", __func__, p));
+ if ((p->p_treeflag & P_TREE_REAPER) == 0) {
+ KASSERT((p != V_initproc && p->p_pid != 1 && p->p_pid != 0),
+ ("%s:%d curvps %p p %p pid %d p_treeflag %#x",
+ __func__, __LINE__, curvps, p, p->p_pid, p->p_treeflag));
return;
+ }
p1 = p->p_reaper;
LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) {
LIST_REMOVE(p2, p_reapsibling);
@@ -148,7 +159,8 @@
PROC_UNLOCK(p2);
}
}
- KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty"));
+ KASSERT(LIST_EMPTY(&p->p_reaplist),
+ ("%s: p %p p_reaplist not empty", __func__, p));
p->p_treeflag &= ~P_TREE_REAPER;
}
@@ -157,7 +169,7 @@
{
struct proc *p1;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
if ((p->p_treeflag & P_TREE_ORPHANED) == 0)
return;
if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) {
@@ -203,9 +215,19 @@
* work around an unsolved stack overflow seen very late during
* shutdown on sparc64 when the gmirror worker process exists.
*/
- if (p == initproc && rebooting == 0) {
+ if (p == V_initproc && (rebooting == 0
+#ifdef VIMAGE
+ || V_vrebooting
+#endif
+ )) {
printf("init died (signal %d, exit %d)\n", signo, rval);
- panic("Going nowhere without my init!");
+#ifdef VIMAGE
+ if (!IS_DEFAULT_VPS(TD_TO_VPS(td)))
+ /* XXX-BZ make this jail go away. */ ;
+ else
+#endif
+ panic("%s: Going nowhere without my init! td %p",
+ __func__, td);
}
/*
@@ -313,7 +335,7 @@
/* Are we a task leader with peers? */
if (p->p_peers != NULL && p == p->p_leader) {
- mtx_lock(&ppeers_lock);
+ mtx_lock(&V_ppeers_lock);
q = p->p_peers;
while (q != NULL) {
PROC_LOCK(q);
@@ -322,8 +344,8 @@
q = q->p_peers;
}
while (p->p_peers != NULL)
- msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
- mtx_unlock(&ppeers_lock);
+ msleep(p, &V_ppeers_lock, PWAIT, "exit1", 0);
+ mtx_unlock(&V_ppeers_lock);
}
/*
@@ -388,7 +410,7 @@
* Remove ourself from our leader's peer list and wake our leader.
*/
if (p->p_leader->p_peers != NULL) {
- mtx_lock(&ppeers_lock);
+ mtx_lock(&V_ppeers_lock);
if (p->p_leader->p_peers != NULL) {
q = p->p_leader;
while (q->p_peers != p)
@@ -396,7 +418,7 @@
q->p_peers = p->p_peers;
wakeup(p->p_leader);
}
- mtx_unlock(&ppeers_lock);
+ mtx_unlock(&V_ppeers_lock);
}
vmspace_exit(td);
@@ -432,16 +454,17 @@
WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
/*
* Remove proc from allproc queue and pidhash chain.
* Place onto zombproc. Unlink from parent's child list.
*/
- sx_xlock(&allproc_lock);
+ /* Operate on current vps instance only. */
+ sx_xlock(&V_allproc_lock);
LIST_REMOVE(p, p_list);
- LIST_INSERT_HEAD(&zombproc, p, p_list);
+ LIST_INSERT_HEAD(&V_zombproc, p, p_list);
LIST_REMOVE(p, p_hash);
- sx_xunlock(&allproc_lock);
+ sx_xunlock(&V_allproc_lock);
/*
* Reparent all children processes:
@@ -602,7 +625,7 @@
} else
mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
- if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) {
+ if (p->p_pptr == p->p_reaper || p->p_pptr == V_initproc) {
signal_parent = 1;
} else if (p->p_sigparent != 0) {
if (p->p_sigparent == SIGCHLD) {
@@ -613,7 +636,7 @@
}
} else
PROC_LOCK(p->p_pptr);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (signal_parent == 1) {
childproc_exited(p);
@@ -827,9 +850,9 @@
{
struct proc *q, *t;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
- KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE"));
+ KASSERT(p->p_state == PRS_ZOMBIE, ("%s: !PRS_ZOMBIE", __func__));
mtx_spin_wait_unlocked(&p->p_slock);
@@ -843,7 +866,7 @@
* release the proc struct just yet.
*/
PROC_UNLOCK(p);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return;
}
@@ -870,7 +893,7 @@
wakeup(t);
cv_broadcast(&p->p_pwait);
PROC_UNLOCK(t);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return;
}
p->p_oppid = 0;
@@ -880,9 +903,9 @@
* Remove other references to this process to ensure we have an
* exclusive reference.
*/
- sx_xlock(&allproc_lock);
+ sx_xlock(&V_allproc_lock);
LIST_REMOVE(p, p_list); /* off zombproc */
- sx_xunlock(&allproc_lock);
+ sx_xunlock(&V_allproc_lock);
LIST_REMOVE(p, p_sibling);
reaper_abandon_children(p, true);
LIST_REMOVE(p, p_reapsibling);
@@ -892,7 +915,7 @@
leavepgrp(p);
if (p->p_procdesc != NULL)
procdesc_reap(p);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
PROC_LOCK(p);
knlist_detach(p->p_klist);
@@ -953,9 +976,9 @@
#endif
KASSERT(FIRST_THREAD_IN_PROC(p),
- ("proc_reap: no residual thread!"));
+ ("%s: no residual thread!", __func__));
uma_zfree(proc_zone, p);
- atomic_add_int(&nprocs, -1);
+ atomic_add_int(&V_nprocs, -1);
}
static int
@@ -965,7 +988,7 @@
{
struct rusage *rup;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
PROC_LOCK(p);
@@ -1156,7 +1179,7 @@
bool cont;
PROC_LOCK_ASSERT(p, MA_OWNED);
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
MPASS(si_code == CLD_TRAPPED || si_code == CLD_STOPPED ||
si_code == CLD_CONTINUED);
@@ -1170,7 +1193,7 @@
sigqueue_take(p->p_ksi);
PROC_UNLOCK(td->td_proc);
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (siginfo != NULL) {
siginfo->si_code = si_code;
siginfo->si_status = cont ? SIGCONT : p->p_xsig;
@@ -1223,7 +1246,7 @@
q->p_flag &= ~P_STATCHILD;
PROC_UNLOCK(q);
}
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
loop_locked:
nfound = 0;
LIST_FOREACH(p, &q->p_children, p_sibling) {
@@ -1307,11 +1330,11 @@
}
}
if (nfound == 0) {
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (ECHILD);
}
if (options & WNOHANG) {
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
td->td_retval[0] = 0;
return (0);
}
@@ -1321,7 +1344,7 @@
PROC_UNLOCK(q);
goto loop_locked;
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
error = msleep(q, &q->p_mtx, PWAIT | PCATCH | PDROP, "wait", 0);
if (error)
return (error);
@@ -1336,7 +1359,7 @@
proc_reparent(struct proc *child, struct proc *parent)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(child, MA_OWNED);
if (child->p_pptr == parent)
return;
Index: sys/kern/kern_fork.c
===================================================================
--- sys/kern/kern_fork.c
+++ sys/kern/kern_fork.c
@@ -74,6 +74,7 @@
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/signalvar.h>
+#include <sys/vps.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
@@ -184,10 +185,10 @@
return (error);
}
-int nprocs = 1; /* process 0 */
-int lastpid = 0;
-SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
- "Last used PID");
+VPS_DEFINE(int, nprocs) = 1; /* process 0 */
+VPS_DEFINE(int, lastpid) = 0;
+SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD|CTLFLAG_VPS,
+ &VPS_NAME(lastpid), 0, "Last used PID");
/*
* Random component to lastpid generation. We mix in a random factor to make
@@ -197,7 +198,8 @@
* modulus that is too big causes a LOT more process table scans and slows
* down fork processing as the pidchecked caching is defeated.
*/
-static int randompid = 0;
+static VPS_DEFINE(int, randompid) = 0;
+#define V_randompid VPS(randompid)
static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
@@ -207,44 +209,46 @@
error = sysctl_wire_old_buffer(req, sizeof(int));
if (error != 0)
return(error);
- sx_xlock(&allproc_lock);
- pid = randompid;
+ sx_xlock(&V_allproc_lock);
+ pid = V_randompid;
error = sysctl_handle_int(oidp, &pid, 0, req);
if (error == 0 && req->newptr != NULL) {
if (pid == 0)
- randompid = 0;
+ V_randompid = 0;
else if (pid == 1)
/* generate a random PID modulus between 100 and 1123 */
- randompid = 100 + arc4random() % 1024;
+ V_randompid = 100 + arc4random() % 1024;
else if (pid < 0 || pid > pid_max - 100)
/* out of range */
- randompid = pid_max - 100;
+ V_randompid = pid_max - 100;
else if (pid < 100)
/* Make it reasonable */
- randompid = 100;
+ V_randompid = 100;
else
- randompid = pid;
+ V_randompid = pid;
}
- sx_xunlock(&allproc_lock);
+ sx_xunlock(&V_allproc_lock);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_kern_randompid, "I", "Random PID modulus. Special values: 0: disable, 1: choose random value");
-static int
+static VPS_DEFINE(int, pidchecked) = 0;
+#define V_pidchecked VPS(pidchecked)
+
+int
fork_findpid(int flags)
{
struct proc *p;
int trypid;
- static int pidchecked = 0;
/*
* Requires allproc_lock in order to iterate over the list
* of processes, and proctree_lock to access p_pgrp.
*/
- sx_assert(&allproc_lock, SX_LOCKED);
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
/*
* Find an unused process ID. We remember a range of unused IDs
@@ -253,13 +257,13 @@
* If RFHIGHPID is set (used during system boot), do not allocate
* low-numbered pids.
*/
- trypid = lastpid + 1;
+ trypid = V_lastpid + 1;
if (flags & RFHIGHPID) {
if (trypid < 10)
trypid = 10;
} else {
- if (randompid)
- trypid += arc4random() % randompid;
+ if (V_randompid)
+ trypid += arc4random() % V_randompid;
}
retry:
/*
@@ -271,12 +275,12 @@
trypid = trypid % pid_max;
if (trypid < 100)
trypid += 100;
- pidchecked = 0;
+ V_pidchecked = 0;
}
- if (trypid >= pidchecked) {
+ if (trypid >= V_pidchecked) {
int doingzomb = 0;
- pidchecked = PID_MAX;
+ V_pidchecked = PID_MAX;
/*
* Scan the active and zombie procs to check whether this pid
* is in use. Remember the lowest pid that's greater
@@ -291,7 +295,8 @@
* reserved pids is limited by process limit times
* two.
*/
- p = LIST_FIRST(&allproc);
+ /* Operate on current vps instance only. */
+ p = LIST_FIRST(&V_allproc);
again:
for (; p != NULL; p = LIST_NEXT(p, p_list)) {
while (p->p_pid == trypid ||
@@ -301,24 +306,25 @@
(p->p_session != NULL &&
p->p_session->s_sid == trypid)))) {
trypid++;
- if (trypid >= pidchecked)
+ if (trypid >= V_pidchecked)
goto retry;
}
- if (p->p_pid > trypid && pidchecked > p->p_pid)
- pidchecked = p->p_pid;
+ if (p->p_pid > trypid && V_pidchecked > p->p_pid)
+ V_pidchecked = p->p_pid;
if (p->p_pgrp != NULL) {
if (p->p_pgrp->pg_id > trypid &&
- pidchecked > p->p_pgrp->pg_id)
- pidchecked = p->p_pgrp->pg_id;
+ V_pidchecked > p->p_pgrp->pg_id)
+ V_pidchecked = p->p_pgrp->pg_id;
if (p->p_session != NULL &&
p->p_session->s_sid > trypid &&
- pidchecked > p->p_session->s_sid)
- pidchecked = p->p_session->s_sid;
+ V_pidchecked > p->p_session->s_sid)
+ V_pidchecked = p->p_session->s_sid;
}
}
if (!doingzomb) {
+ /* Operate on current vps instance only. */
doingzomb = 1;
- p = LIST_FIRST(&zombproc);
+ p = LIST_FIRST(&V_zombproc);
goto again;
}
}
@@ -327,9 +333,9 @@
* RFHIGHPID does not mess with the lastpid counter during boot.
*/
if (flags & RFHIGHPID)
- pidchecked = 0;
+ V_pidchecked = 0;
else
- lastpid = trypid;
+ V_lastpid = trypid;
return (trypid);
}
@@ -394,8 +400,8 @@
struct filedesc_to_leader *fdtol;
struct sigacts *newsigacts;
- sx_assert(&proctree_lock, SX_LOCKED);
- sx_assert(&allproc_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_XLOCKED);
p1 = td->td_proc;
@@ -404,14 +410,14 @@
p2->p_state = PRS_NEW; /* protect against others */
p2->p_pid = trypid;
AUDIT_ARG_PID(p2->p_pid);
- LIST_INSERT_HEAD(&allproc, p2, p_list);
+ LIST_INSERT_HEAD(&V_allproc, p2, p_list);
allproc_gen++;
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
PROC_LOCK(p2);
PROC_LOCK(p1);
- sx_xunlock(&allproc_lock);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_allproc_lock);
+ sx_xunlock(&V_proctree_lock);
bcopy(&p1->p_startcopy, &p2->p_startcopy,
__rangeof(struct proc, p_startcopy, p_endcopy));
@@ -490,6 +496,7 @@
td2->td_lend_user_pri = PRI_MAX;
#ifdef VIMAGE
+ td2->td_vps = TD_TO_VPS(td);
td2->td_vnet = NULL;
td2->td_vnet_lpush = NULL;
#endif
@@ -554,11 +561,11 @@
* Set up linkage for kernel based threading.
*/
if ((fr->fr_flags & RFTHREAD) != 0) {
- mtx_lock(&ppeers_lock);
+ mtx_lock(&V_ppeers_lock);
p2->p_peers = p1->p_peers;
p1->p_peers = p2;
p2->p_leader = p1->p_leader;
- mtx_unlock(&ppeers_lock);
+ mtx_unlock(&V_ppeers_lock);
PROC_LOCK(p1->p_leader);
if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
PROC_UNLOCK(p1->p_leader);
@@ -585,7 +592,7 @@
p2->p_leader = p2;
}
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PGRP_LOCK(p1->p_pgrp);
PROC_LOCK(p2);
PROC_LOCK(p1);
@@ -648,7 +655,7 @@
LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling);
if (p2->p_reaper == p1)
p2->p_reapsubtree = p2->p_pid;
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
/* Inform accounting that we have forked. */
p2->p_acflag = AFORK;
@@ -751,7 +758,7 @@
* if being set atm.
*/
if ((p1->p_ptevents & PTRACE_FORK) != 0) {
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PROC_LOCK(p2);
/*
@@ -777,7 +784,7 @@
proc_reparent(p2, p1->p_pptr);
}
PROC_UNLOCK(p2);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
}
if ((fr->fr_flags & RFSTOPPED) == 0) {
@@ -801,6 +808,11 @@
PROC_UNLOCK(p2);
}
+static VPS_DEFINE(int, curfail);
+#define V_curfail VPS(curfail)
+static VPS_DEFINE(struct timeval, lastfail);
+#define V_lastfail VPS(lastfail)
+
int
fork1(struct thread *td, struct fork_req *fr)
{
@@ -810,8 +822,6 @@
struct file *fp_procdesc;
vm_ooffset_t mem_charged;
int error, nprocs_new, ok;
- static int curfail;
- static struct timeval lastfail;
int flags, pages;
flags = fr->fr_flags;
@@ -881,17 +891,17 @@
* Don't allow a nonprivileged user to use the last ten
* processes; don't let root exceed the limit.
*/
- nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
+ nprocs_new = atomic_fetchadd_int(&V_nprocs, 1) + 1;
if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred,
PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) {
error = EAGAIN;
- sx_xlock(&allproc_lock);
- if (ppsratecheck(&lastfail, &curfail, 1)) {
+ sx_xlock(&V_allproc_lock);
+ if (ppsratecheck(&V_lastfail, &V_curfail, 1)) {
printf("maxproc limit exceeded by uid %u (pid %d); "
"see tuning(7) and login.conf(5)\n",
td->td_ucred->cr_ruid, p1->p_pid);
}
- sx_xunlock(&allproc_lock);
+ sx_xunlock(&V_allproc_lock);
goto fail2;
}
@@ -973,8 +983,8 @@
STAILQ_INIT(&newproc->p_ktr);
/* We have to lock the process tree while we look for a pid. */
- sx_xlock(&proctree_lock);
- sx_xlock(&allproc_lock);
+ sx_xlock(&V_proctree_lock);
+ sx_xlock(&V_allproc_lock);
/*
* Increment the count of procs running with this uid. Don't allow
@@ -995,8 +1005,8 @@
}
error = EAGAIN;
- sx_xunlock(&allproc_lock);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_allproc_lock);
+ sx_xunlock(&V_proctree_lock);
#ifdef MAC
mac_proc_destroy(newproc);
#endif
@@ -1012,7 +1022,7 @@
fdclose(td, fp_procdesc, *fr->fr_pd_fd);
fdrop(fp_procdesc, td);
}
- atomic_add_int(&nprocs, -1);
+ atomic_add_int(&V_nprocs, -1);
pause("fork", hz / 2);
return (error);
}
Index: sys/kern/kern_jail.c
===================================================================
--- sys/kern/kern_jail.c
+++ sys/kern/kern_jail.c
@@ -62,6 +62,10 @@
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
+#include <sys/vps.h>
+#ifdef VIMAGE
+#include <sys/reboot.h>
+#endif
#include <net/if.h>
#include <net/vnet.h>
@@ -107,7 +111,7 @@
.pr_hostuuid = DEFAULT_HOSTUUID,
.pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
#ifdef VIMAGE
- .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
+ .pr_flags = PR_HOST|PR_VNET|PR_VPS|_PR_IP_SADDRSEL,
#else
.pr_flags = PR_HOST|_PR_IP_SADDRSEL,
#endif
@@ -171,6 +175,7 @@
{"host", 0, PR_HOST},
#ifdef VIMAGE
{"vnet", 0, PR_VNET},
+ {"vps", 0, PR_VPS},
#endif
#ifdef INET
{"ip4", PR_IP4_USER, PR_IP4_USER},
@@ -627,6 +632,11 @@
vfs_opterror(opts, "vnet cannot be changed after creation");
goto done_errmsg;
}
+ if ((flags & JAIL_UPDATE) && (ch_flags & PR_VPS)) {
+ error = EINVAL;
+ vfs_opterror(opts, "vps cannot be changed after creation");
+ goto done_errmsg;
+ }
#endif
#ifdef INET
if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
@@ -1801,6 +1811,39 @@
goto done_errmsg;
}
+#ifdef VIMAGE
+ /* Allocate a new vps if specified. */
+ if (pr_flags & PR_VPS) {
+ vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = change_dir(pr->pr_root, td)) != 0)
+ goto c_unlock;
+#ifdef MAC
+ if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
+ goto c_unlock;
+#endif
+c_unlock:
+ VOP_UNLOCK(pr->pr_root, 0);
+ if (error || (error = pwd_chroot(td, pr->pr_root))) {
+ vfs_opterror(opts, "vps chroot failed");
+ if (!created)
+ prison_deref(pr, PD_DEREF);
+ goto done_errmsg;
+ }
+
+ /* We temporarily need a ref as otherwise prhold will panic. */
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_ref++;
+ pr->pr_uref++;
+ mtx_unlock(&pr->pr_mtx);
+ pr->pr_vps = vps_alloc(pr);
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_ref--;
+ pr->pr_uref--;
+ mtx_unlock(&pr->pr_mtx);
+ } else {
+ pr->pr_vps = ppr->pr_vps;
+ }
+#endif
/* Attach this process to the prison if requested. */
if (flags & JAIL_ATTACH) {
mtx_lock(&pr->pr_mtx);
@@ -2285,7 +2328,28 @@
/*
* Kill all processes unfortunate enough to be attached to this prison.
*/
- sx_slock(&allproc_lock);
+#ifdef VIMAGE
+ if (pr->pr_vps) {
+ /*
+ * Send a signal to init and let init do its job.
+ * This should run rc.shutdown and processes should go away.
+ * All but init? We need to catch the tail-end of reboot(2)
+ * and handle it appropriately for the non-default vps instances.
+ * vps_destroy() will ensure init and swapper will also go
+ * away and might sleep. If they do not go away, something will
+ * hold refs on cred and prisons.
+ * XXX There are other places which might do that for a long
+ * time as well.
+ */
+ CURVPS_SET(pr->pr_vps);
+ shutdown_nice(RB_HALT|RB_POWEROFF);
+ vps_destroy(pr->pr_vps);
+ CURVPS_RESTORE();
+ } else
+#endif
+ {
+ /* Operate on current vps instance only. */
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state != PRS_NEW && p->p_ucred &&
@@ -2293,7 +2357,8 @@
kern_psignal(p, SIGKILL);
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
+ }
/* Remove the temporary reference added by jail_remove. */
prison_deref(pr, deuref | PD_DEREF);
}
@@ -2348,6 +2413,24 @@
struct ucred *newcred, *oldcred;
int error;
+#ifdef VIMAGE
+ /*
+ * Do not allow migrating a process between virtual process spaces.
+ * Use the console to attach to it. Getting all the process space
+ * things right, including a new pid, process group, session, terminal,
+ * tracing is one thing (with a lot of work) and may break apps if the
+ * pid changes, the pgrp no longer has the same (p)id; getting things
+ * restored to original state and properly re-parented is virtually
+ * impossible. So do what we do on a normal machine, present a terminal
+ * to login to.
+ */
+ if (pr->pr_flags & PR_VPS) {
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ return (EPERM);
+ }
+#endif
+
/*
* XXX: Note that there is a slight race here if two threads
* in the same privileged process attempt to attach to two
@@ -2628,6 +2711,9 @@
#ifdef VIMAGE
if (pr->pr_vnet != ppr->pr_vnet)
vnet_destroy(pr->pr_vnet);
+ KASSERT((pr->pr_vps == ppr->pr_vps || pr->pr_vps == NULL),
+ ("%s: pr %p pr_vps %p != NULL\n",
+ __func__, pr, pr->pr_vps));
#endif
if (pr->pr_root != NULL)
vrele(pr->pr_root);
@@ -2912,9 +2998,9 @@
#ifdef VIMAGE
/*
* Determine whether the prison represented by cred owns
- * its vnet rather than having it inherited.
+ * its vnet/vps rather than having it inherited.
*
- * Returns 1 in case the prison owns the vnet, 0 otherwise.
+ * Returns 1 in case the prison owns the vnet/vps, 0 otherwise.
*/
int
prison_owns_vnet(struct ucred *cred)
@@ -2926,6 +3012,17 @@
*/
return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
}
+
+int
+prison_owns_vps(struct ucred *cred)
+{
+
+ /*
+ * vps cannot be added/removed after jail creation,
+ * so no need to lock here.
+ */
+ return (cred->cr_prison->pr_flags & PR_VPS ? 1 : 0);
+}
#endif
/*
@@ -3542,6 +3639,26 @@
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
sysctl_jail_vnet, "I", "Jail owns vnet?");
+static int
+sysctl_jail_vps(SYSCTL_HANDLER_ARGS)
+{
+ int error, havevps;
+#ifdef VIMAGE
+ struct ucred *cred = req->td->td_ucred;
+
+ havevps = jailed(cred) && prison_owns_vps(cred);
+#else
+ havevps = 0;
+#endif
+ error = SYSCTL_OUT(req, &havevps, sizeof(havevps));
+
+ return (error);
+}
+
+SYSCTL_PROC(_security_jail, OID_AUTO, vps,
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_jail_vps, "I", "Jail owns vps?");
+
#if defined(INET) || defined(INET6)
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
&jail_max_af_ips, 0,
@@ -3697,6 +3814,8 @@
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
"E,jailsys", "Virtual network stack");
+SYSCTL_JAIL_PARAM(, vps, CTLTYPE_INT | CTLFLAG_RDTUN,
+ "E,jailsys", "Virtual process space");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
"B", "Jail is in the process of shutting down");
@@ -4023,12 +4142,12 @@
ASSERT_RACCT_ENABLED();
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
sx_xlock(&allprison_lock);
if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
sx_xunlock(&allprison_lock);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return;
}
@@ -4046,6 +4165,7 @@
/*
* Force rctl to reattach rules to processes.
*/
+ /* XXX do we need to do this over all vps instances as well? */
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
cred = crhold(p->p_ucred);
@@ -4055,7 +4175,7 @@
}
#endif
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
prison_racct_free_locked(oldprr);
sx_xunlock(&allprison_lock);
}
@@ -4103,6 +4223,7 @@
? pr->pr_cpuset->cs_id : -1);
#ifdef VIMAGE
db_printf(" vnet = %p\n", pr->pr_vnet);
+ db_printf(" vps = %p\n", pr->pr_vps);
#endif
db_printf(" root = %p\n", pr->pr_root);
db_printf(" securelevel = %d\n", pr->pr_securelevel);
Index: sys/kern/kern_kthread.c
===================================================================
--- sys/kern/kern_kthread.c
+++ sys/kern/kern_kthread.c
@@ -32,6 +32,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
+#include <sys/jail.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -45,6 +46,8 @@
#include <sys/wait.h>
#include <sys/sched.h>
#include <sys/tslog.h>
+#include <sys/vps.h>
+
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -164,11 +167,30 @@
* Reparent curthread from proc0 to init so that the zombie
* is harvested.
*/
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PROC_LOCK(p);
- proc_reparent(p, initproc);
+#ifdef VIMAGE
+ /*
+ * In the VIMAGE case if the kproc is our virtual "swapper"
+ * do not reparent it to our init as otherwise it would create
+ * a circle and never go away. Let the parent vps reap it
+ * as it was setup. And it needs to be the init there and
+ * not the swapper(kernel).
+ */
+ if (!IS_DEFAULT_VPS(TD_TO_VPS(FIRST_THREAD_IN_PROC(p))) &&
+ p->p_pid == 0) {
+ struct proc *init0;
+
+ CURVPS_SET_QUIET(vps0);
+ init0 = V_initproc;
+ CURVPS_RESTORE();
+
+ proc_reparent(p, init0);
+ } else
+#endif
+ proc_reparent(p, V_initproc);
PROC_UNLOCK(p);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
/*
* Wakeup anyone waiting for us to exit.
@@ -271,7 +293,7 @@
/* If no process supplied, put it on proc0 */
if (p == NULL)
- p = &proc0;
+ p = V_vproc0;
/* Initialize our new td */
newtd = thread_alloc(pages);
@@ -294,6 +316,9 @@
TSTHREAD(newtd, newtd->td_name);
newtd->td_proc = p; /* needed for cpu_copy_thread */
+#ifdef VIMAGE
+ newtd->td_vps = TD_TO_VPS(oldtd);
+#endif
/* might be further optimized for kthread */
cpu_copy_thread(newtd, oldtd);
/* put the designated function(arg) as the resume context */
Index: sys/kern/kern_ktrace.c
===================================================================
--- sys/kern/kern_ktrace.c
+++ sys/kern/kern_ktrace.c
@@ -952,25 +952,33 @@
* Clear all uses of the tracefile.
*/
if (ops == KTROP_CLEARFILE) {
+ VPS_ITERATOR_DECL(vps_iter);
int vrele_count;
vrele_count = 0;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_tracevp == vp) {
- if (ktrcanset(td, p)) {
- mtx_lock(&ktrace_mtx);
- ktr_freeproc(p, &cred, NULL);
- mtx_unlock(&ktrace_mtx);
- vrele_count++;
- crfree(cred);
- } else
- error = EPERM;
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_tracevp == vp) {
+ if (ktrcanset(td, p)) {
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, NULL);
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ crfree(cred);
+ } else
+ error = EPERM;
+ }
+ PROC_UNLOCK(p);
}
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
if (vrele_count > 0) {
while (vrele_count-- > 0)
vrele(vp);
@@ -980,14 +988,14 @@
/*
* do it
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (uap->pid < 0) {
/*
* by process group
*/
pg = pgfind(-uap->pid);
if (pg == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
error = ESRCH;
goto done;
}
@@ -1011,7 +1019,7 @@
ret |= ktrops(td, p, ops, facs, vp);
}
if (nfound == 0) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
error = ESRCH;
goto done;
}
@@ -1027,7 +1035,7 @@
if (error) {
if (p != NULL)
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
goto done;
}
if (descend)
@@ -1035,7 +1043,7 @@
else
ret |= ktrops(td, p, ops, facs, vp);
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
if (!ret)
error = EPERM;
done:
@@ -1143,7 +1151,7 @@
p = top;
PROC_LOCK_ASSERT(p, MA_OWNED);
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
for (;;) {
ret |= ktrops(td, p, ops, facs, vp);
/*
@@ -1170,6 +1178,7 @@
static void
ktr_writerequest(struct thread *td, struct ktr_request *req)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct ktr_header *kth;
struct vnode *vp;
struct proc *p;
@@ -1270,22 +1279,28 @@
* credentials for the operation.
*/
cred = NULL;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_tracevp == vp) {
- mtx_lock(&ktrace_mtx);
- ktr_freeproc(p, &cred, NULL);
- mtx_unlock(&ktrace_mtx);
- vrele_count++;
- }
- PROC_UNLOCK(p);
- if (cred != NULL) {
- crfree(cred);
- cred = NULL;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_tracevp == vp) {
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, NULL);
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ }
+ PROC_UNLOCK(p);
+ if (cred != NULL) {
+ crfree(cred);
+ cred = NULL;
+ }
}
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
while (vrele_count-- > 0)
vrele(vp);
Index: sys/kern/kern_mib.c
===================================================================
--- sys/kern/kern_mib.c
+++ sys/kern/kern_mib.c
@@ -556,8 +556,8 @@
error = sysctl_handle_int(oidp, &pm, 0, req);
if (error || !req->newptr)
return (error);
- sx_xlock(&proctree_lock);
- sx_xlock(&allproc_lock);
+ sx_xlock(&V_proctree_lock);
+ sx_xlock(&V_allproc_lock);
/*
* Only permit the values less then PID_MAX.
@@ -567,8 +567,8 @@
error = EINVAL;
else
pid_max = pm;
- sx_xunlock(&allproc_lock);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_allproc_lock);
+ sx_xunlock(&V_proctree_lock);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT |
Index: sys/kern/kern_proc.c
===================================================================
--- sys/kern/kern_proc.c
+++ sys/kern/kern_proc.c
@@ -126,15 +126,21 @@
/*
* Other process lists
*/
-struct pidhashhead *pidhashtbl;
-u_long pidhash;
-struct pgrphashhead *pgrphashtbl;
-u_long pgrphash;
-struct proclist allproc;
-struct proclist zombproc;
+VPS_DEFINE(struct pidhashhead *, pidhashtbl);
+VPS_DEFINE(u_long, pidhash);
+VPS_DEFINE(struct pgrphashhead *, pgrphashtbl);
+VPS_DEFINE(u_long, pgrphash);
+VPS_DEFINE(struct proclist, allproc);
+VPS_DEFINE(struct proclist, zombproc);
+#ifndef VIMAGE
struct sx __exclusive_cache_line allproc_lock;
struct sx __exclusive_cache_line proctree_lock;
struct mtx __exclusive_cache_line ppeers_lock;
+#else
+VPS_DEFINE(struct sx, allproc_lock);
+VPS_DEFINE(struct sx, proctree_lock);
+VPS_DEFINE(struct mtx, ppeers_lock);
+#endif
uma_zone_t proc_zone;
/*
@@ -179,22 +185,46 @@
/*
* Initialize global process hashing structures.
*/
-void
+static void
procinit(void)
{
- sx_init(&allproc_lock, "allproc");
- sx_init(&proctree_lock, "proctree");
- mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
- LIST_INIT(&allproc);
- LIST_INIT(&zombproc);
- pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
- pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
- proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
- proc_ctor, proc_dtor, proc_init, proc_fini,
- UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- uihashinit();
+ sx_init(&V_allproc_lock, "allproc");
+ sx_init(&V_proctree_lock, "proctree");
+ mtx_init(&V_ppeers_lock, "p_peers", NULL, MTX_DEF);
+ LIST_INIT(&V_allproc);
+ LIST_INIT(&V_zombproc);
+ V_pidhashtbl = hashinit(maxproc / 4, M_PROC, &V_pidhash);
+ V_pgrphashtbl = hashinit(maxproc / 4, M_PROC, &V_pgrphash);
+ if (IS_DEFAULT_VPS(curvps)) {
+ proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
+ proc_ctor, proc_dtor, proc_init, proc_fini,
+ UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ }
}
+VPS_SYSINIT(procinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procinit, NULL);
+
+#ifdef VIMAGE
+static void
+procdestroy(void *ident __unused)
+{
+
+ KASSERT((LIST_EMPTY(&V_allproc)), ("%s: list allproc %p not empty\n",
+ __func__, &V_allproc));
+ KASSERT((LIST_EMPTY(&V_zombproc)), ("%s: list zombproc %p not empty\n",
+ __func__, &V_zombproc));
+
+ /* proc_zone */
+ hashdestroy(V_pgrphashtbl, M_PROC, V_pgrphash);
+ hashdestroy(V_pidhashtbl, M_PROC, V_pidhash);
+
+ mtx_destroy(&V_ppeers_lock);
+ sx_destroy(&V_proctree_lock);
+ sx_destroy(&V_allproc_lock);
+}
+VPS_SYSUNINIT(procdestroy, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procdestroy,
+ NULL);
+#endif
/*
* Prepare a proc for use.
@@ -303,7 +333,7 @@
inferior(struct proc *p)
{
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
for (; p != curproc; p = proc_realparent(p)) {
if (p->p_pid == 0)
@@ -317,7 +347,7 @@
{
struct proc *p;
- sx_assert(&allproc_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_LOCKED);
LIST_FOREACH(p, PIDHASH(pid), p_hash) {
if (p->p_pid == pid) {
PROC_LOCK(p);
@@ -347,9 +377,9 @@
PROC_LOCK(p);
return (p);
}
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
p = pfind_locked(pid);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (p);
}
@@ -361,11 +391,11 @@
{
struct proc *p;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
p = pfind_locked(pid);
if (p == NULL)
p = zpfind_locked(pid);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (p);
}
@@ -376,7 +406,8 @@
struct proc *p;
struct thread *td;
- sx_assert(&allproc_lock, SX_LOCKED);
+ /* Operate on current vps instance only. */
+ sx_assert(&V_allproc_lock, SX_LOCKED);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
@@ -402,7 +433,7 @@
{
struct pgrp *pgrp;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
if (pgrp->pg_id == pgid) {
@@ -426,7 +457,7 @@
if (p->p_pid == pid) {
PROC_LOCK(p);
} else {
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
if (pid <= PID_MAX) {
p = pfind_locked(pid);
if (p == NULL && (flags & PGET_NOTWEXIT) == 0)
@@ -436,7 +467,7 @@
} else {
p = NULL;
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
if (p == NULL)
return (ESRCH);
if ((flags & PGET_CANSEE) != 0) {
@@ -486,7 +517,7 @@
enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
KASSERT(p->p_pid == pgid,
@@ -547,7 +578,7 @@
enterthispgrp(struct proc *p, struct pgrp *pgrp)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
@@ -573,7 +604,7 @@
{
struct pgrp *savepgrp;
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
@@ -610,7 +641,7 @@
{
struct pgrp *savepgrp;
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
savepgrp = p->p_pgrp;
PGRP_LOCK(savepgrp);
PROC_LOCK(p);
@@ -632,7 +663,7 @@
struct session *savesess;
struct tty *tp;
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
@@ -691,7 +722,7 @@
struct session *mysession;
struct proc *q;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
@@ -744,7 +775,7 @@
}
PROC_UNLOCK(p);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
if (SESS_LEADER(p)) {
sp = p->p_session;
@@ -781,17 +812,17 @@
}
if (ttyvp != NULL) {
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) {
VOP_REVOKE(ttyvp, REVOKEALL);
VOP_UNLOCK(ttyvp, 0);
}
vrele(ttyvp);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
}
}
fixjobc(p, p->p_pgrp, 0);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
}
/*
@@ -851,10 +882,10 @@
struct proc *p;
int i;
- for (i = 0; i <= pgrphash; i++) {
- if (!LIST_EMPTY(&pgrphashtbl[i])) {
+ for (i = 0; i <= V_pgrphash; i++) {
+ if (!LIST_EMPTY(&V_pgrphashtbl[i])) {
printf("\tindx %d\n", i);
- LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
+ LIST_FOREACH(pgrp, &V_pgrphashtbl[i], pg_hash) {
printf(
"\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
(void *)pgrp, (long)pgrp->pg_id,
@@ -910,7 +941,7 @@
struct timeval boottime;
/* For proc_realparent. */
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
bzero(kp, sizeof(*kp));
@@ -1019,7 +1050,7 @@
kp->ki_kiflag |= KI_CTTY;
if (SESS_LEADER(p))
kp->ki_kiflag |= KI_SLEADER;
- /* XXX proctree_lock */
+ /* XXX V_proctree_lock */
tp = sp->s_ttyp;
SESS_UNLOCK(sp);
}
@@ -1209,8 +1240,9 @@
{
struct proc *p;
- sx_assert(&allproc_lock, SX_LOCKED);
- LIST_FOREACH(p, &zombproc, p_list) {
+ /* Operate on current vps instance only. */
+ sx_assert(&V_allproc_lock, SX_LOCKED);
+ LIST_FOREACH(p, &V_zombproc, p_list) {
if (p->p_pid == pid) {
PROC_LOCK(p);
break;
@@ -1227,9 +1259,9 @@
{
struct proc *p;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
p = zpfind_locked(pid);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (p);
}
@@ -1465,11 +1497,11 @@
error = sysctl_wire_old_buffer(req, 0);
if (error)
return (error);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
error = pget((pid_t)name[0], PGET_CANSEE, &p);
if (error == 0)
error = sysctl_out_proc(p, req, flags);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (error);
}
@@ -1502,14 +1534,15 @@
* traced process. Only grab it if we are producing any
* data to begin with.
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
}
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
+ /* Operate on current vps instance only. */
if (!doingzomb)
- p = LIST_FIRST(&allproc);
+ p = LIST_FIRST(&V_allproc);
else
- p = LIST_FIRST(&zombproc);
+ p = LIST_FIRST(&V_zombproc);
for (; p != NULL; p = LIST_NEXT(p, p_list)) {
/*
* Skip embryonic processes.
@@ -1569,7 +1602,7 @@
PROC_UNLOCK(p);
continue;
}
- /* XXX proctree_lock */
+ /* XXX V_proctree_lock */
SESS_LOCK(p->p_session);
if (p->p_session->s_ttyp == NULL ||
tty_udev(p->p_session->s_ttyp) !=
@@ -1609,9 +1642,9 @@
}
}
out:
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
if (req->oldptr != NULL)
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (error);
}
@@ -3095,101 +3128,133 @@
void
stop_all_proc(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *cp, *p;
int r, gen;
bool restart, seen_stopped, seen_exiting, stopped_some;
- cp = curproc;
+ KASSERT(IS_DEFAULT_VPS(curvps),
+ ("%s: called from non vps0 %p: vps %p\n", __func__, vps0, curvps));
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ cp = NULL;
+ else
+#endif
+ cp = curproc;
allproc_loop:
- sx_xlock(&allproc_lock);
- gen = allproc_gen;
- seen_exiting = seen_stopped = stopped_some = restart = false;
- LIST_REMOVE(cp, p_list);
- LIST_INSERT_HEAD(&allproc, cp, p_list);
- for (;;) {
- p = LIST_NEXT(cp, p_list);
- if (p == NULL)
- break;
+ sx_xlock(&V_allproc_lock);
+ if (cp == NULL)
+ cp = LIST_FIRST(&V_allproc);
+ gen = allproc_gen;
+ seen_exiting = seen_stopped = stopped_some = restart = false;
LIST_REMOVE(cp, p_list);
- LIST_INSERT_AFTER(p, cp, p_list);
- PROC_LOCK(p);
- if ((p->p_flag & (P_KPROC | P_SYSTEM | P_TOTAL_STOP)) != 0) {
- PROC_UNLOCK(p);
- continue;
- }
- if ((p->p_flag & P_WEXIT) != 0) {
- seen_exiting = true;
- PROC_UNLOCK(p);
- continue;
- }
- if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
- /*
- * Stopped processes are tolerated when there
- * are no other processes which might continue
- * them. P_STOPPED_SINGLE but not
- * P_TOTAL_STOP process still has at least one
- * thread running.
- */
- seen_stopped = true;
+ LIST_INSERT_HEAD(&V_allproc, cp, p_list);
+ for (;;) {
+ p = LIST_NEXT(cp, p_list);
+ if (p == NULL)
+ break;
+ LIST_REMOVE(cp, p_list);
+ LIST_INSERT_AFTER(p, cp, p_list);
+ PROC_LOCK(p);
+ if ((p->p_flag & (P_KPROC | P_SYSTEM | P_TOTAL_STOP)) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if ((p->p_flag & P_WEXIT) != 0) {
+ seen_exiting = true;
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ /*
+ * Stopped processes are tolerated when there
+ * are no other processes which might continue
+ * them. P_STOPPED_SINGLE but not
+ * P_TOTAL_STOP process still has at least one
+ * thread running.
+ */
+ seen_stopped = true;
+ PROC_UNLOCK(p);
+ continue;
+ }
+ _PHOLD(p);
+ sx_xunlock(&V_allproc_lock);
+ r = thread_single(p, SINGLE_ALLPROC);
+ if (r != 0)
+ restart = true;
+ else
+ stopped_some = true;
+ _PRELE(p);
PROC_UNLOCK(p);
- continue;
+ sx_xlock(&V_allproc_lock);
}
- _PHOLD(p);
- sx_xunlock(&allproc_lock);
- r = thread_single(p, SINGLE_ALLPROC);
- if (r != 0)
+ /* Catch forked children we did not see in iteration. */
+ if (gen != allproc_gen)
restart = true;
- else
- stopped_some = true;
- _PRELE(p);
- PROC_UNLOCK(p);
- sx_xlock(&allproc_lock);
- }
- /* Catch forked children we did not see in iteration. */
- if (gen != allproc_gen)
- restart = true;
- sx_xunlock(&allproc_lock);
- if (restart || stopped_some || seen_exiting || seen_stopped) {
- kern_yield(PRI_USER);
- goto allproc_loop;
+ sx_xunlock(&V_allproc_lock);
+ if (restart || stopped_some || seen_exiting || seen_stopped) {
+ kern_yield(PRI_USER);
+ goto allproc_loop;
+ }
+ CURVPS_RESTORE();
}
+ VPS_LIST_RUNLOCK();
}
void
resume_all_proc(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *cp, *p;
- cp = curproc;
- sx_xlock(&allproc_lock);
+ KASSERT(IS_DEFAULT_VPS(curvps),
+ ("%s: called from non vps0 %p: vps %p\n", __func__, vps0, curvps));
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ cp = NULL;
+ else
+#endif
+ cp = curproc;
+ sx_xlock(&V_allproc_lock);
again:
- LIST_REMOVE(cp, p_list);
- LIST_INSERT_HEAD(&allproc, cp, p_list);
- for (;;) {
- p = LIST_NEXT(cp, p_list);
- if (p == NULL)
- break;
LIST_REMOVE(cp, p_list);
- LIST_INSERT_AFTER(p, cp, p_list);
- PROC_LOCK(p);
- if ((p->p_flag & P_TOTAL_STOP) != 0) {
- sx_xunlock(&allproc_lock);
- _PHOLD(p);
- thread_single_end(p, SINGLE_ALLPROC);
- _PRELE(p);
- PROC_UNLOCK(p);
- sx_xlock(&allproc_lock);
- } else {
- PROC_UNLOCK(p);
+ LIST_INSERT_HEAD(&V_allproc, cp, p_list);
+ for (;;) {
+ p = LIST_NEXT(cp, p_list);
+ if (p == NULL)
+ break;
+ LIST_REMOVE(cp, p_list);
+ LIST_INSERT_AFTER(p, cp, p_list);
+ PROC_LOCK(p);
+ if ((p->p_flag & P_TOTAL_STOP) != 0) {
+ sx_xunlock(&V_allproc_lock);
+ _PHOLD(p);
+ thread_single_end(p, SINGLE_ALLPROC);
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ sx_xlock(&V_allproc_lock);
+ } else {
+ PROC_UNLOCK(p);
+ }
}
+ /* Did the loop above miss any stopped process? */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ /* No need for proc lock. */
+ if ((p->p_flag & P_TOTAL_STOP) != 0)
+ goto again;
+ }
+ sx_xunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- /* Did the loop above missed any stopped process ? */
- FOREACH_PROC_IN_SYSTEM(p) {
- /* No need for proc lock. */
- if ((p->p_flag & P_TOTAL_STOP) != 0)
- goto again;
- }
- sx_xunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
}
/* #define TOTAL_STOP_DEBUG 1 */
Index: sys/kern/kern_procctl.c
===================================================================
--- sys/kern/kern_procctl.c
+++ sys/kern/kern_procctl.c
@@ -69,7 +69,7 @@
p = top;
ret = 0;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
for (;;) {
ret |= protect_setchild(td, p, flags);
PROC_UNLOCK(p);
@@ -128,7 +128,7 @@
reap_acquire(struct thread *td, struct proc *p)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
if (p != curproc)
return (EPERM);
if ((p->p_treeflag & P_TREE_REAPER) != 0)
@@ -145,10 +145,10 @@
reap_release(struct thread *td, struct proc *p)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
if (p != curproc)
return (EPERM);
- if (p == initproc)
+ if (p == V_initproc)
return (EINVAL);
if ((p->p_treeflag & P_TREE_REAPER) == 0)
return (EINVAL);
@@ -162,7 +162,7 @@
{
struct proc *reap, *p2, *first_p;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
bzero(rs, sizeof(*rs));
if ((p->p_treeflag & P_TREE_REAPER) == 0) {
reap = p->p_reaper;
@@ -170,7 +170,7 @@
reap = p;
rs->rs_flags |= REAPER_STATUS_OWNED;
}
- if (reap == initproc)
+ if (reap == V_initproc)
rs->rs_flags |= REAPER_STATUS_REALINIT;
rs->rs_reaper = reap->p_pid;
rs->rs_descendants = 0;
@@ -199,18 +199,18 @@
u_int i, n;
int error;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
PROC_UNLOCK(p);
reap = (p->p_treeflag & P_TREE_REAPER) == 0 ? p->p_reaper : p;
n = i = 0;
error = 0;
LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling)
n++;
- sx_unlock(&proctree_lock);
+ sx_unlock(&V_proctree_lock);
if (rp->rp_count < n)
n = rp->rp_count;
pi = malloc(n * sizeof(*pi), M_TEMP, M_WAITOK);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling) {
if (i == n)
break;
@@ -225,10 +225,10 @@
pip->pi_flags |= REAPER_PIDINFO_REAPER;
i++;
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
error = copyout(pi, rp->rp_pids, i * sizeof(*pi));
free(pi, M_TEMP);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
return (error);
}
@@ -278,7 +278,7 @@
struct reap_kill_tracker *t;
int error;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
if (IN_CAPABILITY_MODE(td))
return (ECAPMODE);
if (rk->rk_sig <= 0 || rk->rk_sig > _SIG_MAXSIG ||
@@ -585,12 +585,12 @@
case PROC_REAP_KILL:
case PROC_TRACE_CTL:
case PROC_TRAPCAP_CTL:
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
tree_locked = true;
break;
case PROC_REAP_ACQUIRE:
case PROC_REAP_RELEASE:
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
tree_locked = true;
break;
case PROC_TRACE_STATUS:
@@ -657,6 +657,6 @@
break;
}
if (tree_locked)
- sx_unlock(&proctree_lock);
+ sx_unlock(&V_proctree_lock);
return (error);
}
Index: sys/kern/kern_prot.c
===================================================================
--- sys/kern/kern_prot.c
+++ sys/kern/kern_prot.c
@@ -52,6 +52,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/acct.h>
+#include <sys/filedesc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
@@ -133,10 +134,10 @@
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
pp = proc_realparent(p);
ppid = pp->p_pid;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
return (ppid);
@@ -340,7 +341,7 @@
newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
if (pgrp != NULL)
@@ -353,7 +354,7 @@
newsess = NULL;
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (newpgrp != NULL)
free(newpgrp, M_PGRP);
@@ -399,7 +400,7 @@
newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
if (uap->pid != 0 && uap->pid != curp->p_pid) {
if ((targp = pfind(uap->pid)) == NULL) {
error = ESRCH;
@@ -457,7 +458,7 @@
error = enterthispgrp(targp, pgrp);
}
done:
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
KASSERT((error == 0) || (newpgrp != NULL),
("setpgid failed and newpgrp is NULL"));
if (newpgrp != NULL)
@@ -1738,7 +1739,7 @@
}
/* Can't trace init when securelevel > 0. */
- if (p == initproc) {
+ if (p == V_initproc) {
error = securelevel_gt(td->td_ucred, 0);
if (error)
return (error);
@@ -1860,8 +1861,10 @@
crfree(struct ucred *cr)
{
- KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
- KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
+ KASSERT(cr->cr_ref > 0, ("%s: bad ucred %p refcount: %d",
+ __func__, cr, cr->cr_ref));
+ KASSERT(cr->cr_ref != 0xdeadc0de,
+ ("%s: dangling reference to ucred %p", __func__, cr));
if (refcount_release(&cr->cr_ref)) {
/*
* Some callers of crget(), such as nfs_statfs(),
Index: sys/kern/kern_racct.c
===================================================================
--- sys/kern/kern_racct.c
+++ sys/kern/kern_racct.c
@@ -1214,94 +1214,107 @@
}
static void
-racctd(void)
+_racctd(void)
{
struct thread *td;
struct proc *p;
struct timeval wallclock;
uint64_t pct, pct_estimate, runtime;
- ASSERT_RACCT_ENABLED();
-
- for (;;) {
- racct_decay();
+ sx_slock(&V_allproc_lock);
- sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &V_zombproc, p_list) {
+ PROC_LOCK(p);
+ racct_set(p, RACCT_PCTCPU, 0);
+ PROC_UNLOCK(p);
+ }
- LIST_FOREACH(p, &zombproc, p_list) {
- PROC_LOCK(p);
- racct_set(p, RACCT_PCTCPU, 0);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL) {
PROC_UNLOCK(p);
+ continue;
}
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL) {
- PROC_UNLOCK(p);
- continue;
- }
-
- microuptime(&wallclock);
- timevalsub(&wallclock, &p->p_stats->p_start);
- PROC_STATLOCK(p);
- FOREACH_THREAD_IN_PROC(p, td)
- ruxagg(p, td);
- runtime = cputick2usec(p->p_rux.rux_runtime);
- PROC_STATUNLOCK(p);
+ microuptime(&wallclock);
+ timevalsub(&wallclock, &p->p_stats->p_start);
+ PROC_STATLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td)
+ ruxagg(p, td);
+ runtime = cputick2usec(p->p_rux.rux_runtime);
+ PROC_STATUNLOCK(p);
#ifdef notyet
- KASSERT(runtime >= p->p_prev_runtime,
- ("runtime < p_prev_runtime"));
+ KASSERT(runtime >= p->p_prev_runtime,
+ ("runtime < p_prev_runtime"));
#else
- if (runtime < p->p_prev_runtime)
- runtime = p->p_prev_runtime;
+ if (runtime < p->p_prev_runtime)
+ runtime = p->p_prev_runtime;
#endif
- p->p_prev_runtime = runtime;
- if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
- pct_estimate = (1000000 * runtime * 100) /
- ((uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
- } else
- pct_estimate = 0;
- pct = racct_getpcpu(p, pct_estimate);
- RACCT_LOCK();
+ p->p_prev_runtime = runtime;
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+ RACCT_LOCK();
#ifdef RCTL
- rctl_throttle_decay(p->p_racct, RACCT_READBPS);
- rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
- rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
- rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
+ rctl_throttle_decay(p->p_racct, RACCT_READBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
#endif
- racct_set_locked(p, RACCT_PCTCPU, pct, 1);
- racct_set_locked(p, RACCT_CPU, runtime, 0);
- racct_set_locked(p, RACCT_WALLCLOCK,
- (uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec, 0);
- RACCT_UNLOCK();
+ racct_set_locked(p, RACCT_PCTCPU, pct, 1);
+ racct_set_locked(p, RACCT_CPU, runtime, 0);
+ racct_set_locked(p, RACCT_WALLCLOCK,
+ (uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec, 0);
+ RACCT_UNLOCK();
+ PROC_UNLOCK(p);
+ }
+
+ /*
+ * To ensure that processes are throttled in a fair way, we need
+ * to iterate over all processes again and check the limits
+ * for %cpu resource only after ucred racct containers have been
+ * properly filled.
+ */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL) {
PROC_UNLOCK(p);
+ continue;
}
- /*
- * To ensure that processes are throttled in a fair way, we need
- * to iterate over all processes again and check the limits
- * for %cpu resource only after ucred racct containers have been
- * properly filled.
- */
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL) {
- PROC_UNLOCK(p);
- continue;
- }
+ if (racct_pcpu_available(p) <= 0) {
+ if (p->p_racct->r_resources[RACCT_PCTCPU] >
+ pcpu_threshold)
+ racct_proc_throttle(p, -1);
+ } else if (p->p_throttled == -1) {
+ racct_proc_wakeup(p);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+}
- if (racct_pcpu_available(p) <= 0) {
- if (p->p_racct->r_resources[RACCT_PCTCPU] >
- pcpu_threshold)
- racct_proc_throttle(p, -1);
- } else if (p->p_throttled == -1) {
- racct_proc_wakeup(p);
- }
- PROC_UNLOCK(p);
+static void
+racctd(void)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+
+ ASSERT_RACCT_ENABLED();
+
+ for (;;) {
+ racct_decay();
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ _racctd();
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
pause("-", hz);
}
}
Index: sys/kern/kern_rctl.c
===================================================================
--- sys/kern/kern_rctl.c
+++ sys/kern/kern_rctl.c
@@ -1175,7 +1175,7 @@
error = str2id(subject_idstr, &id);
if (error != 0)
goto out;
- sx_assert(&allproc_lock, SA_LOCKED);
+ sx_assert(&V_allproc_lock, SA_LOCKED);
rule->rr_subject.rs_proc = pfind(id);
if (rule->rr_subject.rs_proc == NULL) {
error = ESRCH;
@@ -1266,6 +1266,7 @@
int
rctl_rule_add(struct rctl_rule *rule)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct ucred *cred;
struct uidinfo *uip;
@@ -1357,37 +1358,51 @@
* Now go through all the processes and add the new rule to the ones
* it applies to.
*/
- sx_assert(&allproc_lock, SA_LOCKED);
- FOREACH_PROC_IN_SYSTEM(p) {
- cred = p->p_ucred;
- switch (rule->rr_subject_type) {
- case RCTL_SUBJECT_TYPE_USER:
- if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
- cred->cr_ruidinfo == rule->rr_subject.rs_uip)
- break;
- continue;
- case RCTL_SUBJECT_TYPE_LOGINCLASS:
- if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
- break;
- continue;
- case RCTL_SUBJECT_TYPE_JAIL:
- match = 0;
- for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
- if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
- match = 1;
+ sx_assert(&V_allproc_lock, SA_LOCKED);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_slock(&V_allproc_lock);
+#endif
+ FOREACH_PROC_IN_SYSTEM(p) {
+ cred = p->p_ucred;
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_USER:
+ if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
+ cred->cr_ruidinfo == rule->rr_subject.rs_uip)
+ break;
+ continue;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
break;
+ continue;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ match = 0;
+ for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
+ if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
+ match = 1;
+ break;
+ }
}
+ if (match)
+ break;
+ continue;
+ default:
+ panic("rctl_rule_add: unknown subject type %d",
+ rule->rr_subject_type);
}
- if (match)
- break;
- continue;
- default:
- panic("rctl_rule_add: unknown subject type %d",
- rule->rr_subject_type);
- }
- rctl_racct_add_rule(p->p_racct, rule);
+ rctl_racct_add_rule(p->p_racct, rule);
+ }
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_sunlock(&V_allproc_lock);
+#endif
+ CURVPS_RESTORE();
}
+ VPS_LIST_RUNLOCK();
return (0);
}
@@ -1426,6 +1441,7 @@
int
rctl_rule_remove(struct rctl_rule *filter)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
int found = 0;
@@ -1452,11 +1468,25 @@
rctl_rule_pre_callback, rctl_rule_post_callback,
filter, (void *)&found);
- sx_assert(&allproc_lock, SA_LOCKED);
+ sx_assert(&V_allproc_lock, SA_LOCKED);
RACCT_LOCK();
- FOREACH_PROC_IN_SYSTEM(p) {
- found += rctl_racct_remove_rules(p->p_racct, filter);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_slock(&V_allproc_lock);
+#endif
+ FOREACH_PROC_IN_SYSTEM(p) {
+ found += rctl_racct_remove_rules(p->p_racct, filter);
+ }
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_sunlock(&V_allproc_lock);
+#endif
+ CURVPS_RESTORE();
}
+ VPS_LIST_RUNLOCK();
RACCT_UNLOCK();
if (found)
@@ -1623,11 +1653,11 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
@@ -1669,7 +1699,7 @@
}
out:
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
if (error != 0)
return (error);
@@ -1699,6 +1729,7 @@
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct sbuf *sb;
struct rctl_rule *filter;
struct rctl_rule_link *link;
@@ -1718,41 +1749,56 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
bufsize = uap->outbuflen;
if (bufsize > rctl_maxbufsize) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (E2BIG);
}
buf = malloc(bufsize, M_RCTL, M_WAITOK);
sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
KASSERT(sb != NULL, ("sbuf_new failed"));
-
- FOREACH_PROC_IN_SYSTEM(p) {
- RACCT_LOCK();
- LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
- /*
- * Non-process rules will be added to the buffer later.
- * Adding them here would result in duplicated output.
- */
- if (link->rrl_rule->rr_subject_type !=
- RCTL_SUBJECT_TYPE_PROCESS)
- continue;
- if (!rctl_rule_matches(link->rrl_rule, filter))
- continue;
- rctl_rule_to_sbuf(sb, link->rrl_rule);
- sbuf_printf(sb, ",");
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_slock(&V_allproc_lock);
+#endif
+ FOREACH_PROC_IN_SYSTEM(p) {
+ RACCT_LOCK();
+ LIST_FOREACH(link, &p->p_racct->r_rule_links,
+ rrl_next) {
+ /*
+ * Non-process rules will be added to the
+ * buffer later. Adding them here would result
+ * in duplicated output.
+ */
+ if (link->rrl_rule->rr_subject_type !=
+ RCTL_SUBJECT_TYPE_PROCESS)
+ continue;
+ if (!rctl_rule_matches(link->rrl_rule, filter))
+ continue;
+ rctl_rule_to_sbuf(sb, link->rrl_rule);
+ sbuf_printf(sb, ",");
+ }
+ RACCT_UNLOCK();
}
- RACCT_UNLOCK();
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_sunlock(&V_allproc_lock);
+#endif
+ CURVPS_RESTORE();
}
+ VPS_LIST_RUNLOCK();
loginclass_racct_foreach(rctl_get_rules_callback,
rctl_rule_pre_callback, rctl_rule_post_callback,
@@ -1777,7 +1823,7 @@
error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
out:
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
free(buf, M_RCTL);
return (error);
}
@@ -1803,34 +1849,34 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (EINVAL);
}
if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (EOPNOTSUPP);
}
if (filter->rr_subject.rs_proc == NULL) {
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (EINVAL);
}
bufsize = uap->outbuflen;
if (bufsize > rctl_maxbufsize) {
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (E2BIG);
}
@@ -1860,7 +1906,7 @@
error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
out:
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
free(buf, M_RCTL);
return (error);
}
@@ -1883,11 +1929,11 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &rule);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
/*
@@ -1906,7 +1952,7 @@
out:
rctl_rule_release(rule);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
@@ -1928,17 +1974,17 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
error = rctl_rule_remove(filter);
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
Index: sys/kern/kern_resource.c
===================================================================
--- sys/kern/kern_resource.c
+++ sys/kern/kern_resource.c
@@ -69,10 +69,15 @@
static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
-#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
+
static struct rwlock uihashtbl_lock;
-static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
-static u_long uihash; /* size of hash table - 1 */
+
+LIST_HEAD(uihashhead, uidinfo);
+static VPS_DEFINE(struct uihashhead *, uihashtbl);
+#define V_uihashtbl VPS(uihashtbl)
+static VPS_DEFINE(u_long, uihash); /* size of hash table - 1 */
+#define V_uihash VPS(uihash)
+#define UIHASH(uid) (&V_uihashtbl[(uid) & V_uihash])
static void calcru1(struct proc *p, struct rusage_ext *ruxp,
struct timeval *up, struct timeval *sp);
@@ -114,18 +119,18 @@
break;
case PRIO_PGRP:
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (uap->who == 0) {
pg = td->td_proc->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
break;
}
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
@@ -141,7 +146,8 @@
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
- sx_slock(&allproc_lock);
+ /* Operate on current vps instance only. */
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
@@ -152,7 +158,7 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
break;
default:
@@ -199,18 +205,18 @@
break;
case PRIO_PGRP:
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (uap->who == 0) {
pg = curp->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
break;
}
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
@@ -226,7 +232,8 @@
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
- sx_slock(&allproc_lock);
+ /* Operate on current vps instance only. */
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
@@ -237,7 +244,7 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
break;
default:
@@ -1214,13 +1221,43 @@
p->p_sysent->sv_fixlimit(rlp, which);
}
-void
+static void
uihashinit()
{
- uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
rw_init(&uihashtbl_lock, "uidinfo hash");
}
+SYSINIT(uihashinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit, NULL);
+
+static void
+uihashinit_vps()
+{
+
+ V_uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &V_uihash);
+}
+VPS_SYSINIT(uihashinit_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit_vps,
+ NULL);
+
+#ifdef VIMAGE
+static void
+uihashdestroy_vps(void *ident __unused)
+{
+ struct uidinfo *uip;
+ struct uihashhead *uih;
+ int i;
+
+ i = 0;
+ for (uih = &V_uihashtbl[V_uihash]; uih >= V_uihashtbl; uih--)
+ LIST_FOREACH(uip, uih, ui_hash)
+ i++;
+ if (i == 0)
+ hashdestroy(V_uihashtbl, M_UIDINFO, V_uihash);
+ else
+ printf("%s: leaking %d uihash entries\n", __func__, i);
+}
+VPS_SYSUNINIT(uihashdestroy_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND,
+ uihashdestroy_vps, NULL);
+#endif
/*
* Look up a uidinfo struct for the parameter uid.
@@ -1368,7 +1405,7 @@
rw_rlock(&uihashtbl_lock);
if (pre != NULL)
(pre)();
- for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) {
+ for (uih = &V_uihashtbl[V_uihash]; uih >= V_uihashtbl; uih--) {
LIST_FOREACH(uip, uih, ui_hash) {
(callback)(uip->ui_racct, arg2, arg3);
}
@@ -1392,7 +1429,8 @@
return (0);
}
} else if (new < 0)
- printf("negative %s for uid = %d\n", name, uip->ui_uid);
+ printf("%s: curthread %p uip %p negative %s for uid = %d\n",
+ __func__, curthread, uip, name, uip->ui_uid);
return (1);
}
Index: sys/kern/kern_shutdown.c
===================================================================
--- sys/kern/kern_shutdown.c
+++ sys/kern/kern_shutdown.c
@@ -204,6 +204,10 @@
int dumping; /* system is dumping */
int rebooting; /* system is rebooting */
+#ifdef VIMAGE
+VPS_DEFINE(int, vrebooting); /* vps is rebooting */
+#define V_vrebooting VPS(vrebooting)
+#endif
static struct dumperinfo dumper; /* our selected dumper */
/* Context information for dump-debuggers. */
@@ -276,29 +280,42 @@
if (error == 0) {
if (uap->opt & RB_REROOT)
error = kern_reroot();
- else
+ else {
+#ifdef VIMAGE
+ /* XXX Can argue that we should never make it here. */
+ /* Init will want to _exit() in this case. */
+ if (!IS_DEFAULT_VPS(TD_TO_VPS(curthread))) {
+ V_vrebooting = 1;
+ return (error);
+ }
+#endif
kern_reboot(uap->opt);
+ }
}
return (error);
}
+static VPS_DEFINE(int, vhowto);
+#define V_vhowto VPS(vhowto)
+
static void
shutdown_nice_task_fn(void *arg, int pending __unused)
{
- int howto;
- howto = (uintptr_t)arg;
+ CURVPS_SET((struct vps *)arg);
/* Send a signal to init(8) and have it shutdown the world. */
- PROC_LOCK(initproc);
- if (howto & RB_POWEROFF)
- kern_psignal(initproc, SIGUSR2);
- else if (howto & RB_POWERCYCLE)
- kern_psignal(initproc, SIGWINCH);
- else if (howto & RB_HALT)
- kern_psignal(initproc, SIGUSR1);
+ PROC_LOCK(V_initproc);
+ if (V_vhowto & RB_POWEROFF)
+ kern_psignal(V_initproc, SIGUSR2);
+ else if (V_vhowto & RB_POWERCYCLE)
+ kern_psignal(V_initproc, SIGWINCH);
+ else if (V_vhowto & RB_HALT)
+ kern_psignal(V_initproc, SIGUSR1);
else
- kern_psignal(initproc, SIGINT);
- PROC_UNLOCK(initproc);
+ kern_psignal(V_initproc, SIGINT);
+ PROC_UNLOCK(V_initproc);
+ V_vhowto = 0;
+ CURVPS_RESTORE();
}
static struct task shutdown_nice_task = TASK_INITIALIZER(0,
@@ -311,10 +328,22 @@
shutdown_nice(int howto)
{
- if (initproc != NULL && !SCHEDULER_STOPPED()) {
- shutdown_nice_task.ta_context = (void *)(uintptr_t)howto;
+ if (V_initproc != NULL && !SCHEDULER_STOPPED()) {
+
+ KASSERT(V_vhowto == 0, ("%s: vps %p howto not 0: %d\n",
+ __func__, curvps, V_vhowto));
+ V_vhowto = howto;
+ shutdown_nice_task.ta_context = (void *)curvps;
taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task);
} else {
+#ifdef VIMAGE
+ /* XXX Can argue that we should never make it here. */
+ /* Init will want to _exit() in this case. */
+ if (!IS_DEFAULT_VPS(TD_TO_VPS(curthread))) {
+ V_vrebooting = 1;
+ return;
+ }
+#endif
/*
* No init(8) running, or scheduler would not allow it
* to run, so simply reboot.
@@ -462,7 +491,7 @@
struct mount *mp, *devmp;
int error;
- if (curproc != initproc)
+ if (curproc != V_initproc)
return (EPERM);
/*
Index: sys/kern/kern_sig.c
===================================================================
--- sys/kern/kern_sig.c
+++ sys/kern/kern_sig.c
@@ -1669,9 +1669,9 @@
ret = ESRCH;
if (all) {
/*
- * broadcast
+ * broadcast; current vps context only.
*/
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
p == td->td_proc || p->p_state == PRS_NEW) {
@@ -1688,9 +1688,9 @@
ret = err;
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
} else {
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pgid == 0) {
/*
* zero pgid means send to my process group.
@@ -1700,11 +1700,11 @@
} else {
pgrp = pgfind(pgid);
if (pgrp == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (ESRCH);
}
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
@@ -1891,9 +1891,9 @@
struct pgrp *pgrp;
if (pgid != 0) {
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
pgrp = pgfind(pgid);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
if (pgrp != NULL) {
pgsignal(pgrp, sig, 0, ksi);
PGRP_UNLOCK(pgrp);
@@ -3279,7 +3279,7 @@
/*
* Protect the access to corefilename[] by allproc_lock.
*/
-#define corefilename_lock allproc_lock
+#define corefilename_lock V_allproc_lock
static char corefilename[MAXPATHLEN] = {"%N.core"};
TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
Index: sys/kern/kern_sysctl.c
===================================================================
--- sys/kern/kern_sysctl.c
+++ sys/kern/kern_sysctl.c
@@ -60,6 +60,7 @@
#include <sys/sx.h>
#include <sys/sysproto.h>
#include <sys/uio.h>
+#include <sys/vps.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
@@ -498,6 +499,7 @@
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
#ifdef VIMAGE
(oidp->oid_kind & CTLFLAG_VNET) == 0 &&
+ (oidp->oid_kind & CTLFLAG_VPS) == 0 &&
#endif
(oidp->oid_kind & CTLFLAG_TUN) != 0 &&
(oidp->oid_kind & CTLFLAG_NOFETCH) == 0) {
@@ -1998,6 +2000,9 @@
else if ((oid->oid_kind & CTLFLAG_VNET) &&
prison_owns_vnet(req->td->td_ucred))
priv = PRIV_SYSCTL_WRITEJAIL;
+ else if ((oid->oid_kind & CTLFLAG_VPS) &&
+ prison_owns_vps(req->td->td_ucred))
+ priv = PRIV_SYSCTL_WRITEJAIL;
#endif
else
priv = PRIV_SYSCTL_WRITE;
@@ -2025,8 +2030,13 @@
goto out;
#endif
#ifdef VIMAGE
+ KASSERT(((oid->oid_kind & (CTLFLAG_VNET|CTLFLAG_VPS)) !=
+ (CTLFLAG_VNET|CTLFLAG_VPS)),
+ ("CTLFLAG VNET and VPS set oid %p", oid));
if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL)
arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1);
+ if ((oid->oid_kind & CTLFLAG_VPS) && arg1 != NULL)
+ arg1 = (void *)(curvps->vps_data_base + (uintptr_t)arg1);
#endif
error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker);
@@ -2118,6 +2128,7 @@
memlocked = 1;
sx_xlock(&sysctlmemlock);
}
+ CURVPS_SET(TD_TO_VPS(td));
CURVNET_SET(TD_TO_VNET(td));
for (;;) {
@@ -2130,6 +2141,7 @@
}
CURVNET_RESTORE();
+ CURVPS_RESTORE();
if (req.lock == REQ_WIRED && req.validlen > 0)
vsunlock(req.oldptr, req.validlen);
Index: sys/kern/kern_thr.c
===================================================================
--- sys/kern/kern_thr.c
+++ sys/kern/kern_thr.c
@@ -32,6 +32,7 @@
#include "opt_posix.h"
#include "opt_hwpmc_hooks.h"
#include <sys/param.h>
+#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -56,6 +57,7 @@
#include <sys/rtprio.h>
#include <sys/umtx.h>
#include <sys/limits.h>
+#include <sys/vps.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
@@ -238,6 +240,9 @@
bcopy(&td->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
newtd->td_proc = td->td_proc;
+#ifdef VIMAGE
+ newtd->td_vps = TD_TO_VPS(td);
+#endif
newtd->td_rb_list = newtd->td_rbp_list = newtd->td_rb_inact = 0;
thread_cow_get(newtd, td);
Index: sys/kern/kern_thread.c
===================================================================
--- sys/kern/kern_thread.c
+++ sys/kern/kern_thread.c
@@ -58,6 +58,9 @@
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
+#ifdef VIMAGE
+#include <sys/jail.h>
+#endif
#include <security/audit/audit.h>
@@ -83,7 +86,7 @@
"struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x470,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x518,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x528,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb0,
"struct proc KBI p_flag");
@@ -103,7 +106,7 @@
"struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x2e8,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x334,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x33c,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0x68,
"struct proc KBI p_flag");
@@ -451,6 +454,10 @@
PROC_LOCK_ASSERT(p, MA_OWNED);
newtd->td_ucred = crhold(p->p_ucred);
+#ifdef VIMAGE
+ /* Make sure the cached vps stays correct. */
+ newtd->td_vps = p->p_ucred->cr_prison->pr_vps;
+#endif
newtd->td_limit = lim_hold(p->p_limit);
newtd->td_cowgen = p->p_cowgen;
}
@@ -460,6 +467,10 @@
{
newtd->td_ucred = crhold(td->td_ucred);
+#ifdef VIMAGE
+ /* Make sure to inherit the cached vps as well. */
+ newtd->td_vps = td->td_vps;
+#endif
newtd->td_limit = lim_hold(td->td_limit);
newtd->td_cowgen = td->td_cowgen;
}
@@ -489,6 +500,11 @@
oldcred = td->td_ucred;
td->td_ucred = crhold(p->p_ucred);
}
+#ifdef VIMAGE
+ /* Make sure the cached vps stays correct. */
+ if (td->td_vps != p->p_ucred->cr_prison->pr_vps)
+ td->td_vps = p->p_ucred->cr_prison->pr_vps;
+#endif
if (td->td_limit != p->p_limit) {
oldlimit = td->td_limit;
td->td_limit = lim_hold(p->p_limit);
Index: sys/kern/kern_vps.c
===================================================================
--- /dev/null
+++ sys/kern/kern_vps.c
@@ -0,0 +1,835 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2004-2009 University of Zagreb
+ * Copyright (c) 2006-2009 FreeBSD Foundation
+ * Copyright (c) 2018 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Bjoern Zeeb
+ * under sponsorship from iXsystems, Inc.
+ *
+ * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdb.h"
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/jail.h>
+#include <sys/sdt.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/vps.h>
+
+#include <machine/stdarg.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+#endif
+
+
+/*-
+ * This file implements core functions for virtual process spaces:
+ *
+ * - Virtual process space management functions.
+ *
+ * - Virtual process space memory allocator, which virtualizes global
+ * variables in the process space.
+ *
+ * - Virtualized SYSINIT's/SYSUNINIT's, which allow process spaces
+ * to register startup/shutdown events to be run for each virtual process
+ * space instance.
+ */
+
+static MALLOC_DEFINE(M_VPS, "vps", "process space control block");
+
+/*
+ * The virtual process space list has two read-write locks, one sleepable and
+ * the other not, so that the list can be stabilized and walked in a variety
+ * of process space contexts. Both must be acquired exclusively to modify
+ * the list, but a read lock of either lock is sufficient to walk the list.
+ */
+struct rwlock vps_rwlock;
+struct sx vps_sxlock;
+
+#define VPS_LIST_WLOCK() do { \
+ sx_xlock(&vps_sxlock); \
+ rw_wlock(&vps_rwlock); \
+} while (0)
+
+#define VPS_LIST_WUNLOCK() do { \
+ rw_wunlock(&vps_rwlock); \
+ sx_xunlock(&vps_sxlock); \
+} while (0)
+
+struct vps_list_head vps_head;
+struct vps *vps0;
+
+/*
+ * The virtual process space allocator provides storage for virtualized
+ * global variables. These variables are defined/declared using the
+ * VPS_DEFINE()/VPS_DECLARE() macros, which place them in the 'set_vps'
+ * linker set. The details of the implementation are somewhat subtle, but
+ * allow the majority of process subsystems to remain
+ * virtualization-agnostic.
+ *
+ * The virtual process space allocator handles variables in the base kernel
+ * vs. modules in similar but different ways. In both cases, virtualized
+ * global variables are marked as such by being declared to be part of the
+ * vps linker set. These "primary" copies of global variables serve two
+ * functions:
+ *
+ * (1) They contain static initialization or "default" values for global
+ * variables which will be propagated to each virtual process space
+ * instance when created. As with normal global variables, they default
+ * to zero-filled.
+ *
+ * (2) They act as unique global names by which the variable can be referred
+ * to, regardless of process space instance. The single global symbol
+ * will be used to calculate the location of a per-virtual instance
+ * variable at run-time.
+ *
+ * Each virtual process space instance has a complete copy of each
+ * virtualized global variable, stored in a malloc'd block of memory
+ * referred to by vps->vps_data_mem. Critical to the design is that each
+ * per-instance memory block is laid out identically to the primary block so
+ * that the offset of each global variable is the same across all blocks.
+ * To optimize run-time access, a precalculated 'base' address,
+ * vps->vps_data_base, is stored in each vps, and is the amount that can
+ * be added to the address of a 'primary' instance of a variable to get to the
+ * per-vps instance.
+ *
+ * Virtualized global variables in modules are handled in a similar manner,
+ * but as each module has its own 'set_vps' linker set, and we want to keep
+ * all virtualized globals together, we reserve space in the kernel's linker set
+ * for potential module variables using a per-vps character array,
+ * 'modspace'. The virtual process space allocator maintains a free list to
+ * track what space in the array is free (all, initially) and as modules are
+ * linked, allocates portions of the space to specific globals. The kernel
+ * module linker queries the virtual process space allocator and will
+ * bind references of the global to the location during linking. It also
+ * calls into the virtual process space allocator, once the memory is
+ * initialized, in order to propagate the new static initializations to all
+ * existing virtual process space instances so that the soon-to-be executing
+ * module will find every process space instance with proper default values.
+ */
+
+/*
+ * Number of bytes of data in the 'set_vps' linker set, and hence the total
+ * size of all kernel virtualized global variables, and the malloc(9) type
+ * that will be used to allocate it.
+ */
+#define VPS_BYTES (VPS_STOP - VPS_START)
+
+static MALLOC_DEFINE(M_VPS_DATA, "vps_data", "VPS data");
+
+/*
+ * VPS_MODMIN is the minimum number of bytes we will reserve for the sum of
+ * global variables across all loaded modules. As this actually sizes an
+ * array declared as a virtualized global variable in the kernel itself, and
+ * we want the virtualized global variable space to be page-sized, we may
+ * have more space than that in practice.
+ */
+#define VPS_MODMIN 8192
+#define VPS_SIZE roundup2(VPS_BYTES, PAGE_SIZE)
+
+/*
+ * Space to store virtualized global variables from loadable kernel modules,
+ * and the free list to manage it.
+ */
+static VPS_DEFINE(char, modspace[VPS_MODMIN]);
+
+/*
+ * Global lists of subsystem constructor and destructors for vpss. They are
+ * registered via VPS_SYSINIT() and VPS_SYSUNINIT(). Both lists are
+ * protected by the vps_sysinit_sxlock global lock.
+ */
+static TAILQ_HEAD(vps_sysinit_head, vps_sysinit) vps_constructors =
+ TAILQ_HEAD_INITIALIZER(vps_constructors);
+static TAILQ_HEAD(vps_sysuninit_head, vps_sysinit) vps_destructors =
+ TAILQ_HEAD_INITIALIZER(vps_destructors);
+
+struct sx vps_sysinit_sxlock;
+
+#define VPS_SYSINIT_WLOCK() sx_xlock(&vps_sysinit_sxlock);
+#define VPS_SYSINIT_WUNLOCK() sx_xunlock(&vps_sysinit_sxlock);
+#define VPS_SYSINIT_RLOCK() sx_slock(&vps_sysinit_sxlock);
+#define VPS_SYSINIT_RUNLOCK() sx_sunlock(&vps_sysinit_sxlock);
+
+/* XXX-BZ should probably be vpd_* instead of vnd_* but in the hope to
+ * harmonize most of this later on keep the names the same for now. */
+struct vps_data_free {
+ uintptr_t vnd_start;
+ int vnd_len;
+ TAILQ_ENTRY(vps_data_free) vnd_link;
+};
+
+static MALLOC_DEFINE(M_VPS_DATA_FREE, "vps_data_free",
+ "VPS resource accounting");
+static TAILQ_HEAD(, vps_data_free) vps_data_free_head =
+ TAILQ_HEAD_INITIALIZER(vps_data_free_head);
+static struct sx vps_data_free_lock;
+
+SDT_PROVIDER_DEFINE(vps);
+SDT_PROBE_DEFINE1(vps, functions, vps_alloc, entry, "int");
+SDT_PROBE_DEFINE2(vps, functions, vps_alloc, alloc, "int", "struct vps *");
+SDT_PROBE_DEFINE2(vps, functions, vps_alloc, return, "int", "struct vps *");
+SDT_PROBE_DEFINE2(vps, functions, vps_destroy, entry, "int", "struct vps *");
+SDT_PROBE_DEFINE1(vps, functions, vps_destroy, return, "int");
+
+#ifdef DDB
+static void db_show_vps_print_vs(struct vps_sysinit *, int);
+#endif
+
+/*
+ * Allocate a virtual process space.
+ */
+struct vps *
+vps_alloc(struct prison *pr)
+{
+ struct vps *vps;
+
+ SDT_PROBE1(vps, functions, vps_alloc, entry, __LINE__);
+ vps = malloc(sizeof(struct vps), M_VPS, M_WAITOK | M_ZERO);
+ vps->vps_magic_n = VPS_MAGIC_N;
+ vps->vps_state = 0;
+ vps->vps_pr = pr;
+ /* Cheat for vps_sysinit() to get creds right. */
+ pr->pr_vps = vps;
+ SDT_PROBE2(vps, functions, vps_alloc, alloc, __LINE__, vps);
+
+ /*
+ * Allocate storage for virtualized global variables and copy in
+	 * initial values from our 'primary' copy.
+ */
+ vps->vps_data_mem = malloc(VPS_SIZE, M_VPS_DATA, M_WAITOK);
+ memcpy(vps->vps_data_mem, (void *)VPS_START, VPS_BYTES);
+
+ /*
+ * All use of vps-specific data will immediately subtract VPS_START
+ * from the base memory pointer, so pre-calculate that now to avoid
+ * it on each use.
+ */
+ vps->vps_data_base = (uintptr_t)vps->vps_data_mem - VPS_START;
+
+ /* Initialize / attach vps module instances. */
+ CURVPS_SET_QUIET(vps);
+ vps_sysinit();
+ CURVPS_RESTORE();
+
+ VPS_LIST_WLOCK();
+ LIST_INSERT_HEAD(&vps_head, vps, vps_le);
+ VPS_LIST_WUNLOCK();
+
+ SDT_PROBE2(vps, functions, vps_alloc, return, __LINE__, vps);
+ return (vps);
+}
+
+/*
+ * Destroy a virtual process space.
+ */
+void
+vps_destroy(struct vps *vps)
+{
+
+ SDT_PROBE2(vps, functions, vps_destroy, entry, __LINE__, vps);
+
+ VPS_LIST_WLOCK();
+ if (vps->vps_le.le_prev == NULL && vps->vps_le.le_next == NULL) {
+ VPS_LIST_WUNLOCK();
+ DELAY(10000);
+ return;
+ }
+ LIST_REMOVE(vps, vps_le);
+ vps->vps_le.le_prev = NULL;
+ vps->vps_le.le_next = NULL;
+ VPS_LIST_WUNLOCK();
+
+ CURVPS_SET_QUIET(vps);
+ vps_sysuninit();
+ CURVPS_RESTORE();
+
+ /*
+ * Release storage for the virtual process space instance.
+ */
+ free(vps->vps_data_mem, M_VPS_DATA);
+ vps->vps_data_mem = NULL;
+ vps->vps_data_base = 0;
+ vps->vps_pr->pr_vps = NULL;
+ vps->vps_pr = NULL;
+ vps->vps_magic_n = 0xdeadbeef;
+ free(vps, M_VPS);
+ SDT_PROBE1(vps, functions, vps_destroy, return, __LINE__);
+}
+
+/*
+ * Boot time initialization and allocation of virtual process space.
+ */
+static void
+vps_init_prelink(void *arg __unused)
+{
+
+ rw_init(&vps_rwlock, "vps_rwlock");
+ sx_init(&vps_sxlock, "vps_sxlock");
+ sx_init(&vps_sysinit_sxlock, "vps_sysinit_sxlock");
+ LIST_INIT(&vps_head);
+}
+SYSINIT(vps_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST,
+ vps_init_prelink, NULL);
+
+static void
+vps0_init(void *arg __unused)
+{
+
+ if (bootverbose)
+ printf("VIMAGE (virtualized process space) enabled\n");
+
+ /*
+ * We MUST clear curvps in vi_init_done() before going SMP,
+ * otherwise CURVPS_SET() macros would scream about unnecessary
+ * curvps recursions.
+ */
+ curvps = prison0.pr_vps = vps0 = vps_alloc(&prison0);
+}
+SYSINIT(vps0_init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, vps0_init, NULL);
+
+#if 0
+/* Compared to vnets, nuking the vps of the current thread does not go down well. */
+static void
+vps_init_done(void *unused __unused)
+{
+
+ curvps = NULL;
+}
+SYSINIT(vps_init_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, vps_init_done, NULL);
+#endif
+
+/*
+ * Once on boot, initialize the modspace freelist to entirely cover modspace.
+ */
+static void
+vps_data_startup(void *dummy __unused)
+{
+ struct vps_data_free *df;
+
+ df = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO);
+ df->vnd_start = (uintptr_t)&VPS_NAME(modspace);
+ df->vnd_len = VPS_MODMIN;
+ TAILQ_INSERT_HEAD(&vps_data_free_head, df, vnd_link);
+ sx_init(&vps_data_free_lock, "vps_data alloc lock");
+}
+SYSINIT(vps_data, SI_SUB_KLD, SI_ORDER_FIRST, vps_data_startup, 0);
+
+/* Dummy VPS_SYSINIT to make sure we always reach the final end state. */
+static void
+vps_sysinit_done(void *unused __unused)
+{
+
+ return;
+}
+VPS_SYSINIT(vps_sysinit_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY,
+ vps_sysinit_done, NULL);
+
+/*
+ * When a module is loaded and requires storage for a virtualized global
+ * variable, allocate space from the modspace free list. This interface
+ * should be used only by the kernel linker.
+ */
+void *
+vps_data_alloc(int size)
+{
+ struct vps_data_free *df;
+ void *s;
+
+ s = NULL;
+ size = roundup2(size, sizeof(void *));
+ sx_xlock(&vps_data_free_lock);
+ TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) {
+ if (df->vnd_len < size)
+ continue;
+ if (df->vnd_len == size) {
+ s = (void *)df->vnd_start;
+ TAILQ_REMOVE(&vps_data_free_head, df, vnd_link);
+ free(df, M_VPS_DATA_FREE);
+ break;
+ }
+ s = (void *)df->vnd_start;
+ df->vnd_len -= size;
+ df->vnd_start = df->vnd_start + size;
+ break;
+ }
+ sx_xunlock(&vps_data_free_lock);
+
+ return (s);
+}
+
+/*
+ * Free space for a virtualized global variable on module unload.
+ */
+void
+vps_data_free(void *start_arg, int size)
+{
+	struct vps_data_free *df;
+	struct vps_data_free *dn;
+	uintptr_t start;
+	uintptr_t end;
+
+	size = roundup2(size, sizeof(void *));
+	start = (uintptr_t)start_arg;
+	end = start + size;
+	/*
+	 * Free a region of space and merge it with as many neighbors as
+	 * possible.  Keeping the list sorted simplifies this operation.
+	 */
+	sx_xlock(&vps_data_free_lock);
+	TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) {
+		if (df->vnd_start > end)
+			break;
+		/*
+		 * If we expand at the end of an entry we may have to merge
+		 * it with the one following it as well.  TAILQ_NEXT() is
+		 * NULL for the last entry, so guard the dereference.
+		 */
+		if (df->vnd_start + df->vnd_len == start) {
+			df->vnd_len += size;
+			dn = TAILQ_NEXT(df, vnd_link);
+			if (dn != NULL && df->vnd_start + df->vnd_len == dn->vnd_start) {
+				df->vnd_len += dn->vnd_len;
+				TAILQ_REMOVE(&vps_data_free_head, dn,
+				    vnd_link);
+				free(dn, M_VPS_DATA_FREE);
+			}
+			sx_xunlock(&vps_data_free_lock);
+			return;
+		}
+		if (df->vnd_start == end) {
+			df->vnd_start = start;
+			df->vnd_len += size;
+			sx_xunlock(&vps_data_free_lock);
+			return;
+		}
+	}
+	dn = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO);
+	dn->vnd_start = start;
+	dn->vnd_len = size;
+	if (df)
+		TAILQ_INSERT_BEFORE(df, dn, vnd_link);
+	else
+		TAILQ_INSERT_TAIL(&vps_data_free_head, dn, vnd_link);
+	sx_xunlock(&vps_data_free_lock);
+}
+
+/*
+ * When a new virtualized global variable has been allocated, propagate its
+ * initial value to each already-allocated virtual process space instance.
+ */
+void
+vps_data_copy(void *start, int size)
+{
+ struct vps *vps;
+
+ VPS_LIST_RLOCK();
+ LIST_FOREACH(vps, &vps_head, vps_le)
+ memcpy((void *)((uintptr_t)vps->vps_data_base +
+ (uintptr_t)start), start, size);
+ VPS_LIST_RUNLOCK();
+}
+
+/*
+ * Support for special SYSINIT handlers registered via VPS_SYSINIT()
+ * and VPS_SYSUNINIT().
+ */
+void
+vps_register_sysinit(void *arg)
+{
+ struct vps_sysinit *vs, *vs2;
+ struct vps *vps;
+
+ vs = arg;
+ KASSERT(vs->subsystem >= SI_SUB_INTRINSIC, ("vps sysinit too early"));
+
+ /* Add the constructor to the global list of vps constructors. */
+ VPS_SYSINIT_WLOCK();
+ TAILQ_FOREACH(vs2, &vps_constructors, link) {
+ if (vs2->subsystem > vs->subsystem)
+ break;
+ if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
+ break;
+ }
+ if (vs2 != NULL)
+ TAILQ_INSERT_BEFORE(vs2, vs, link);
+ else
+ TAILQ_INSERT_TAIL(&vps_constructors, vs, link);
+
+ /*
+ * Invoke the constructor on all the existing vpss when it is
+ * registered.
+ */
+ VPS_FOREACH(vps) {
+ CURVPS_SET_QUIET(vps);
+ vs->func(vs->arg);
+ CURVPS_RESTORE();
+ }
+ VPS_SYSINIT_WUNLOCK();
+}
+
+void
+vps_deregister_sysinit(void *arg)
+{
+ struct vps_sysinit *vs;
+
+ vs = arg;
+
+ /* Remove the constructor from the global list of vps constructors. */
+ VPS_SYSINIT_WLOCK();
+ TAILQ_REMOVE(&vps_constructors, vs, link);
+ VPS_SYSINIT_WUNLOCK();
+}
+
+void
+vps_register_sysuninit(void *arg)
+{
+ struct vps_sysinit *vs, *vs2;
+
+ vs = arg;
+
+ /* Add the destructor to the global list of vps destructors. */
+ VPS_SYSINIT_WLOCK();
+ TAILQ_FOREACH(vs2, &vps_destructors, link) {
+ if (vs2->subsystem > vs->subsystem)
+ break;
+ if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
+ break;
+ }
+ if (vs2 != NULL)
+ TAILQ_INSERT_BEFORE(vs2, vs, link);
+ else
+ TAILQ_INSERT_TAIL(&vps_destructors, vs, link);
+ VPS_SYSINIT_WUNLOCK();
+}
+
+void
+vps_deregister_sysuninit(void *arg)
+{
+ struct vps_sysinit *vs;
+ struct vps *vps;
+
+ vs = arg;
+
+ /*
+ * Invoke the destructor on all the existing vpss when it is
+ * deregistered.
+ */
+ VPS_SYSINIT_WLOCK();
+ VPS_FOREACH(vps) {
+ CURVPS_SET_QUIET(vps);
+ vs->func(vs->arg);
+ CURVPS_RESTORE();
+ }
+
+ /* Remove the destructor from the global list of vps destructors. */
+ TAILQ_REMOVE(&vps_destructors, vs, link);
+ VPS_SYSINIT_WUNLOCK();
+}
+
+/*
+ * Invoke all registered vps constructors on the current vps. Used during
+ * vps construction. The caller is responsible for ensuring the new vps is
+ * the current vps and that the vps_sysinit_sxlock lock is locked.
+ */
+void
+vps_sysinit(void)
+{
+ struct vps_sysinit *vs;
+ struct vps *vps;
+
+ vps = curvps;
+ VPS_SYSINIT_RLOCK();
+ TAILQ_FOREACH(vs, &vps_constructors, link) {
+ curvps->vps_state = vs->subsystem;
+ vs->func(vs->arg);
+ KASSERT((curvps == vps),
+ ("%s: vs %p subsystem %u order %u func %p returned "
+ "with curvps altered: curvps %p should be %p\n",
+ __func__, vs, vs->subsystem, vs->order, vs->func,
+ curvps, vps));
+ }
+ VPS_SYSINIT_RUNLOCK();
+}
+
+/*
+ * Invoke all registered vps destructors on the current vps. Used during
+ * vps destruction. The caller is responsible for ensuring the dying vps
+ * is the current vps and that the vps_sysinit_sxlock lock is locked.
+ */
+void
+vps_sysuninit(void)
+{
+ struct vps_sysinit *vs;
+
+ VPS_SYSINIT_RLOCK();
+ TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head,
+ link) {
+ curvps->vps_state = vs->subsystem;
+ vs->func(vs->arg);
+ }
+ VPS_SYSINIT_RUNLOCK();
+}
+
+/*
+ * EVENTHANDLER(9) extensions.
+ */
+/*
+ * Invoke the eventhandler function originally registered with the possibly
+ * registered argument for all virtual process space instances.
+ *
+ * This iterator can only be used for eventhandlers that do not take any
+ * additional arguments, as we do ignore the variadic arguments from the
+ * EVENTHANDLER_INVOKE() call.
+ */
+void
+vps_global_eventhandler_iterator_func(void *arg, ...)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+ struct eventhandler_entry_vimage *v_ee;
+
+ /*
+ * There is a bug here in that we should actually cast things to
+ * (struct eventhandler_entry_ ## name *) but that's not easily
+ * possible in here so just re-using the variadic version we
+ * defined for the generic vimage case.
+ */
+ v_ee = arg;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg);
+ CURVPS_RESTORE();
+ }
+ VPS_LIST_RUNLOCK();
+}
+
+#ifdef VPS_DEBUG
+struct vps_recursion {
+ SLIST_ENTRY(vps_recursion) vnr_le;
+ const char *prev_fn;
+ const char *where_fn;
+ int where_line;
+ struct vps *old_vps;
+ struct vps *new_vps;
+};
+
+static SLIST_HEAD(, vps_recursion) vps_recursions =
+ SLIST_HEAD_INITIALIZER(vps_recursions);
+
+static void
+vps_print_recursion(struct vps_recursion *vnr, int brief)
+{
+
+ if (!brief)
+ printf("CURVPS_SET() recursion in ");
+ printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line,
+ vnr->prev_fn);
+ if (brief)
+ printf(", ");
+ else
+ printf("\n ");
+ printf("%p -> %p\n", vnr->old_vps, vnr->new_vps);
+}
+
+void
+vps_log_recursion(struct vps *old_vps, const char *old_fn, int line)
+{
+ struct vps_recursion *vnr;
+
+ /* Skip already logged recursion events. */
+ SLIST_FOREACH(vnr, &vps_recursions, vnr_le)
+ if (vnr->prev_fn == old_fn &&
+ vnr->where_fn == curthread->td_vps_lpush &&
+ vnr->where_line == line &&
+ (vnr->old_vps == vnr->new_vps) == (curvps == old_vps))
+ return;
+
+ vnr = malloc(sizeof(*vnr), M_VPS, M_NOWAIT | M_ZERO);
+ if (vnr == NULL)
+ panic("%s: malloc failed", __func__);
+ vnr->prev_fn = old_fn;
+ vnr->where_fn = curthread->td_vps_lpush;
+ vnr->where_line = line;
+ vnr->old_vps = old_vps;
+ vnr->new_vps = curvps;
+
+ SLIST_INSERT_HEAD(&vps_recursions, vnr, vnr_le);
+
+ vps_print_recursion(vnr, 0);
+#ifdef KDB
+ kdb_backtrace();
+#endif
+}
+#endif /* VPS_DEBUG */
+
+/*
+ * DDB(4).
+ */
+#ifdef DDB
+static void
+db_vps_print(struct vps *vps)
+{
+
+ db_printf("vps = %p\n", vps);
+ db_printf(" vps_magic_n = %#08x (%s, orig %#08x)\n",
+ vps->vps_magic_n,
+ (vps->vps_magic_n == VPS_MAGIC_N) ?
+ "ok" : "mismatch", VPS_MAGIC_N);
+ db_printf(" vps_data_mem = %p\n", vps->vps_data_mem);
+ db_printf(" vps_data_base = %#jx\n",
+ (uintmax_t)vps->vps_data_base);
+ db_printf(" vps_state = %#08x\n", vps->vps_state);
+ db_printf("\n");
+}
+
+DB_SHOW_ALL_COMMAND(vpss, db_show_all_vpss)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+
+ VPS_FOREACH(vps_iter) {
+ db_vps_print(vps_iter);
+ if (db_pager_quit)
+ break;
+ }
+}
+
+DB_SHOW_COMMAND(vps, db_show_vps)
+{
+
+ if (!have_addr) {
+ db_printf("usage: show vps <struct vps *>\n");
+ return;
+ }
+
+ db_vps_print((struct vps *)addr);
+}
+
+static void
+db_show_vps_print_vs(struct vps_sysinit *vs, int ddb)
+{
+ const char *vsname, *funcname;
+ c_db_sym_t sym;
+ db_expr_t offset;
+
+#define xprint(...) \
+ if (ddb) \
+ db_printf(__VA_ARGS__); \
+ else \
+ printf(__VA_ARGS__)
+
+ if (vs == NULL) {
+ xprint("%s: no vps_sysinit * given\n", __func__);
+ return;
+ }
+
+ sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset);
+ db_symbol_values(sym, &vsname, NULL);
+ sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset);
+ db_symbol_values(sym, &funcname, NULL);
+ xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs);
+ xprint(" %#08x %#08x\n", vs->subsystem, vs->order);
+ xprint(" %p(%s)(%p)\n",
+ vs->func, (funcname != NULL) ? funcname : "", vs->arg);
+#undef xprint
+}
+
+DB_SHOW_COMMAND(vps_sysinit, db_show_vps_sysinit)
+{
+ struct vps_sysinit *vs;
+
+ db_printf("VPS_SYSINIT vs Name(Ptr)\n");
+ db_printf(" Subsystem Order\n");
+ db_printf(" Function(Name)(Arg)\n");
+ TAILQ_FOREACH(vs, &vps_constructors, link) {
+ db_show_vps_print_vs(vs, 1);
+ if (db_pager_quit)
+ break;
+ }
+}
+
+DB_SHOW_COMMAND(vps_sysuninit, db_show_vps_sysuninit)
+{
+ struct vps_sysinit *vs;
+
+ db_printf("VPS_SYSUNINIT vs Name(Ptr)\n");
+ db_printf(" Subsystem Order\n");
+ db_printf(" Function(Name)(Arg)\n");
+ TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head,
+ link) {
+ db_show_vps_print_vs(vs, 1);
+ if (db_pager_quit)
+ break;
+ }
+}
+
+DB_COMMAND(setcurvps, db_setcurvps)
+{
+	struct vps *vps;
+
+	if (!have_addr) {
+		/* Fixed typo: "stauct" -> "struct" in the usage message. */
+		db_printf("usage: setcurvps <struct vps *>\n");
+		return;
+	}
+
+	vps = (struct vps *)addr;
+	db_printf("curvps %p -> %p\n", curvps, vps);
+	curvps = vps;
+	db_vps_print(vps);
+}
+
+#ifdef VPS_DEBUG
+DB_SHOW_COMMAND(vpsrcrs, db_show_vpsrcrs)
+{
+ struct vps_recursion *vnr;
+
+ SLIST_FOREACH(vnr, &vps_recursions, vnr_le)
+ vps_print_recursion(vnr, 1);
+}
+#endif
+#endif /* DDB */
Index: sys/kern/sched_4bsd.c
===================================================================
--- sys/kern/sched_4bsd.c
+++ sys/kern/sched_4bsd.c
@@ -454,17 +454,15 @@
* Recompute process priorities, every hz ticks.
* MP-safe, called without the Giant mutex.
*/
-/* ARGSUSED */
-static void
-schedcpu(void)
+static __inline void
+_schedcpu(fixpt_t loadfac)
{
- fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
struct thread *td;
struct proc *p;
struct td_sched *ts;
int awake;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
@@ -550,7 +548,22 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
+}
+
+static void
+schedcpu(void)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+ fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ _schedcpu(loadfac);
+ CURVPS_RESTORE();
+ }
+ VPS_LIST_RUNLOCK();
}
/*
Index: sys/kern/subr_pcpu.c
===================================================================
--- sys/kern/subr_pcpu.c
+++ sys/kern/subr_pcpu.c
@@ -378,6 +378,7 @@
#ifdef VIMAGE
db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet);
+ db_printf("curvps = %p\n", pc->pc_curthread->td_vps);
#endif
#ifdef WITNESS
Index: sys/kern/subr_prf.c
===================================================================
--- sys/kern/subr_prf.c
+++ sys/kern/subr_prf.c
@@ -165,12 +165,12 @@
if (TD_IS_IDLETHREAD(td))
return (0);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
p = td->td_proc;
PROC_LOCK(p);
if ((p->p_flag & P_CONTROLT) == 0) {
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (0);
}
SESS_LOCK(p->p_session);
@@ -178,14 +178,14 @@
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
if (pca.tty == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (0);
}
pca.flags = TOTTY;
pca.p_bufr = NULL;
va_start(ap, fmt);
tty_lock(pca.tty);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
retval = kvprintf(fmt, putchar, &pca, 10, ap);
tty_unlock(pca.tty);
va_end(ap);
@@ -214,7 +214,7 @@
struct putchar_arg pca;
struct session *sess = NULL;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pri != -1)
flags |= TOLOG;
if (p != NULL) {
@@ -237,7 +237,7 @@
pca.p_bufr = NULL;
if (pca.tty != NULL)
tty_lock(pca.tty);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
kvprintf(fmt, putchar, &pca, 10, ap);
if (pca.tty != NULL)
tty_unlock(pca.tty);
Index: sys/kern/subr_turnstile.c
===================================================================
--- sys/kern/subr_turnstile.c
+++ sys/kern/subr_turnstile.c
@@ -1212,22 +1212,32 @@
DB_SHOW_ALL_COMMAND(chains, db_show_allchains)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct thread *td;
struct proc *p;
int i;
i = 1;
- FOREACH_PROC_IN_SYSTEM(p) {
- FOREACH_THREAD_IN_PROC(p, td) {
- if ((TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested))
- || (TD_IS_INHIBITED(td) && TD_ON_SLEEPQ(td))) {
- db_printf("chain %d:\n", i++);
- print_lockchain(td, " ");
+
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if ((TD_ON_LOCK(td) &&
+ LIST_EMPTY(&td->td_contested))
+ || (TD_IS_INHIBITED(td) &&
+ TD_ON_SLEEPQ(td))) {
+ db_printf("chain %d:\n", i++);
+ print_lockchain(td, " ");
+ }
+ if (db_pager_quit)
+ return;
}
- if (db_pager_quit)
- return;
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
}
DB_SHOW_ALIAS(allchains, db_show_allchains)
Index: sys/kern/subr_witness.c
===================================================================
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -2534,6 +2534,7 @@
DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct thread *td;
struct proc *p;
@@ -2542,19 +2543,25 @@
* held sleep locks, but that information is currently not exported
* by WITNESS.
*/
- FOREACH_PROC_IN_SYSTEM(p) {
- if (!witness_proc_has_locks(p))
- continue;
- FOREACH_THREAD_IN_PROC(p, td) {
- if (!witness_thread_has_locks(td))
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (!witness_proc_has_locks(p))
continue;
- db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
- p->p_comm, td, td->td_tid);
- witness_ddb_list(td);
- if (db_pager_quit)
- return;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (!witness_thread_has_locks(td))
+ continue;
+ db_printf("Process %d (%s) thread %p (%d)\n",
+ p->p_pid, p->p_comm, td, td->td_tid);
+ witness_ddb_list(td);
+ if (db_pager_quit)
+ return;
+ }
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
}
DB_SHOW_ALIAS(alllocks, db_witness_list_all)
Index: sys/kern/sys_procdesc.c
===================================================================
--- sys/kern/sys_procdesc.c
+++ sys/kern/sys_procdesc.c
@@ -153,13 +153,13 @@
goto out;
}
pd = fp->f_data;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pd->pd_proc != NULL) {
*p = pd->pd_proc;
PROC_LOCK(*p);
} else
error = ESRCH;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
out:
fdrop(fp, td);
return (error);
@@ -305,14 +305,14 @@
{
struct procdesc *pd;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL"));
pd = p->p_procdesc;
PROCDESC_LOCK(pd);
- KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc,
+ KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == V_initproc,
("procdesc_exit: closed && parent not init"));
pd->pd_flags |= PDF_EXITED;
@@ -349,7 +349,7 @@
{
struct procdesc *pd;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL"));
pd = p->p_procdesc;
@@ -375,7 +375,7 @@
fp->f_ops = &badfileops;
fp->f_data = NULL;
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PROCDESC_LOCK(pd);
pd->pd_flags |= PDF_CLOSED;
PROCDESC_UNLOCK(pd);
@@ -385,7 +385,7 @@
* This is the case where process' exit status was already
* collected and procdesc_reap() was already called.
*/
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
} else {
PROC_LOCK(p);
AUDIT_ARG_PROCESS(p);
@@ -415,11 +415,11 @@
* prejudice.
*/
p->p_sigparent = SIGCHLD;
- proc_reparent(p, initproc);
+ proc_reparent(p, V_initproc);
if ((pd->pd_flags & PDF_DAEMON) == 0)
kern_psignal(p, SIGKILL);
PROC_UNLOCK(p);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
}
}
@@ -531,7 +531,7 @@
*/
bzero(sb, sizeof(*sb));
pd = fp->f_data;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pd->pd_proc != NULL) {
PROC_LOCK(pd->pd_proc);
AUDIT_ARG_PROCESS(pd->pd_proc);
@@ -553,7 +553,7 @@
PROC_UNLOCK(pd->pd_proc);
} else
sb->st_mode = S_IFREG;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (0);
}
Index: sys/kern/sys_process.c
===================================================================
--- sys/kern/sys_process.c
+++ sys/kern/sys_process.c
@@ -688,7 +688,7 @@
proc_set_traced(struct proc *p, bool stop)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_flag |= P_TRACED;
if (stop)
@@ -733,7 +733,7 @@
case PT_SET_EVENT_MASK:
case PT_DETACH:
case PT_GET_SC_ARGS:
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
proctree_locked = 1;
break;
default:
@@ -747,14 +747,14 @@
if (pid <= PID_MAX) {
if ((p = pfind(pid)) == NULL) {
if (proctree_locked)
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (ESRCH);
}
} else {
td2 = tdfind(pid, -1);
if (td2 == NULL) {
if (proctree_locked)
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (ESRCH);
}
p = td2->td_proc;
@@ -816,7 +816,7 @@
error = EBUSY;
goto fail;
}
- if (p->p_pptr == initproc) {
+ if (p->p_pptr == V_initproc) {
error = EPERM;
goto fail;
}
@@ -923,7 +923,7 @@
CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid,
p->p_oppid);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
proctree_locked = 0;
MPASS(p->p_xthread == NULL);
MPASS((p->p_flag & P_STOPPED_TRACE) == 0);
@@ -1113,7 +1113,7 @@
pp = proc_realparent(p);
proc_reparent(p, pp);
- if (pp == initproc)
+ if (pp == V_initproc)
p->p_sigparent = SIGCHLD;
CTR3(KTR_PTRACE,
"PT_DETACH: pid %d reparented to pid %d, sig %d",
@@ -1142,7 +1142,7 @@
break;
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
proctree_locked = 0;
sendsig:
@@ -1456,7 +1456,7 @@
fail:
PROC_UNLOCK(p);
if (proctree_locked)
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (error);
}
#undef PROC_READ
Index: sys/kern/tty.c
===================================================================
--- sys/kern/tty.c
+++ sys/kern/tty.c
@@ -1703,18 +1703,18 @@
/* XXX: This looks awful. */
tty_unlock(tp);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
tty_lock(tp);
if (!SESS_LEADER(p)) {
/* Only the session leader may do this. */
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (EPERM);
}
if (tp->t_session != NULL && tp->t_session == p->p_session) {
/* This is already our controlling TTY. */
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (0);
}
@@ -1732,7 +1732,7 @@
* TTYs of which the session leader has been
* killed or the TTY revoked.
*/
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (EPERM);
}
@@ -1740,7 +1740,7 @@
tp->t_session = p->p_session;
tp->t_session->s_ttyp = tp;
tp->t_sessioncnt++;
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
/* Assign foreground process group. */
tp->t_pgrp = p->p_pgrp;
@@ -1759,12 +1759,12 @@
* decompose proctree_lock.
*/
tty_unlock(tp);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
pg = pgfind(*(int *)data);
if (pg != NULL)
PGRP_UNLOCK(pg);
if (pg == NULL || pg->pg_session != td->td_proc->p_session) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
tty_lock(tp);
return (EPERM);
}
@@ -1775,11 +1775,11 @@
* relocking the TTY.
*/
if (!tty_is_ctty(tp, td->td_proc)) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (ENOTTY);
}
tp->t_pgrp = pg;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/* Wake up the background process groups. */
cv_broadcast(&tp->t_bgwait);
Index: sys/kern/tty_tty.c
===================================================================
--- sys/kern/tty_tty.c
+++ sys/kern/tty_tty.c
@@ -68,7 +68,7 @@
return;
p = curproc;
sx_sunlock(&clone_drain_lock);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
sx_slock(&clone_drain_lock);
dev_lock();
if (!(p->p_flag & P_CONTROLT))
@@ -83,7 +83,7 @@
*dev = p->p_session->s_ttyvp->v_rdev;
dev_refl(*dev);
dev_unlock();
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
static void
Index: sys/net/vnet.c
===================================================================
--- sys/net/vnet.c
+++ sys/net/vnet.c
@@ -80,8 +80,6 @@
* stack instance.
*/
-FEATURE(vimage, "VIMAGE kernel virtualization");
-
static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block");
/*
@@ -307,7 +305,7 @@
sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock");
LIST_INIT(&vnet_head);
}
-SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST,
+SYSINIT(vnet_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST,
vnet_init_prelink, NULL);
static void
Index: sys/sys/jail.h
===================================================================
--- sys/sys/jail.h
+++ sys/sys/jail.h
@@ -166,6 +166,7 @@
struct osd pr_osd; /* (p) additional data */
struct cpuset *pr_cpuset; /* (p) cpuset */
struct vnet *pr_vnet; /* (c) network stack */
+ struct vps *pr_vps; /* (c) process space */
struct vnode *pr_root; /* (c) vnode to rdir */
int pr_ip4s; /* (p) number of v4 IPs */
int pr_ip6s; /* (p) number of v6 IPs */
@@ -209,6 +210,7 @@
/* primary jail address. */
#define PR_IP6_SADDRSEL 0x00000100 /* Do IPv6 src addr sel. or use the */
/* primary jail address. */
+#define PR_VPS 0x00000200 /* Virtual process space */
/* Internal flag bits */
#define PR_IP4 0x02000000 /* IPv4 restricted or disabled */
@@ -370,6 +372,7 @@
int prison_allow(struct ucred *, unsigned);
int prison_check(struct ucred *cred1, struct ucred *cred2);
int prison_owns_vnet(struct ucred *);
+int prison_owns_vps(struct ucred *);
int prison_canseemount(struct ucred *cred, struct mount *mp);
void prison_enforce_statfs(struct ucred *cred, struct mount *mp,
struct statfs *sp);
Index: sys/sys/kernel.h
===================================================================
--- sys/sys/kernel.h
+++ sys/sys/kernel.h
@@ -102,7 +102,7 @@
SI_SUB_MTX_POOL_DYNAMIC = 0x1AC0000, /* dynamic mutex pool */
SI_SUB_LOCK = 0x1B00000, /* various locks */
SI_SUB_EVENTHANDLER = 0x1C00000, /* eventhandler init */
- SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */
+ SI_SUB_VIMAGE_PRELINK = 0x1E00000, /* VIMAGE init before modules */
SI_SUB_KLD = 0x2000000, /* KLD and module setup */
SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/
SI_SUB_RACCT = 0x2110000, /* resource accounting */
@@ -159,7 +159,7 @@
SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */
SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/
SI_SUB_SYSCALLS = 0xd800000, /* register system calls */
- SI_SUB_VNET_DONE = 0xdc00000, /* vnet registration complete */
+ SI_SUB_VNET_DONE = 0xdc00000, /* VNET registration complete */
SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/
SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/
SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/
@@ -170,6 +170,7 @@
SI_SUB_SMP = 0xf000000, /* start the APs*/
#endif
SI_SUB_RACCTD = 0xf100000, /* start racctd*/
+ SI_SUB_VIMAGE_DONE = 0xf800000, /* VIMAGE initialization done */
SI_SUB_LAST = 0xfffffff /* final initialization */
};
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -68,6 +68,9 @@
#include <sys/ucred.h>
#include <sys/types.h>
#include <sys/_domainset.h>
+#ifdef _KERNEL
+#include <sys/vps.h>
+#endif
#include <machine/proc.h> /* Machine-dependent proc substruct. */
#ifdef _KERNEL
@@ -351,6 +354,8 @@
/* LP64 hole */
struct vnet *td_vnet; /* (k) Effective vnet. */
const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */
+ struct vps *td_vps; /* (k) Effective vps. */
+ const char *td_vps_lpush; /* (k) Debugging vps push / pop. */
struct trapframe *td_intr_frame;/* (k) Frame of the current irq */
struct proc *td_rfppwait_p; /* (k) The vforked child */
struct vm_page **td_ma; /* (k) uio pages held */
@@ -809,7 +814,7 @@
#endif
#define FOREACH_PROC_IN_SYSTEM(p) \
- LIST_FOREACH((p), &allproc, p_list)
+ LIST_FOREACH((p), &V_allproc, p_list)
#define FOREACH_THREAD_IN_PROC(p, td) \
TAILQ_FOREACH((td), &(p)->p_threads, td_plist)
@@ -939,38 +944,61 @@
#define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0)
-#define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash])
-extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
-extern u_long pidhash;
-#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash])
+LIST_HEAD(pidhashhead, proc);
+VPS_DECLARE(struct pidhashhead *, pidhashtbl);
+#define V_pidhashtbl VPS(pidhashtbl)
+VPS_DECLARE(u_long, pidhash);
+#define V_pidhash VPS(pidhash)
+#define PIDHASH(pid) (&V_pidhashtbl[(pid) & V_pidhash])
+
extern LIST_HEAD(tidhashhead, thread) *tidhashtbl;
extern u_long tidhash;
+#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash])
extern struct rwlock tidhash_lock;
-#define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash])
-extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
-extern u_long pgrphash;
+LIST_HEAD(pgrphashhead, pgrp);
+VPS_DECLARE(struct pgrphashhead *, pgrphashtbl);
+#define V_pgrphashtbl VPS(pgrphashtbl)
+VPS_DECLARE(u_long, pgrphash);
+#define V_pgrphash VPS(pgrphash)
+#define PGRPHASH(pgid) (&V_pgrphashtbl[(pgid) & V_pgrphash])
-extern struct sx allproc_lock;
+VPS_DECLARE(struct sx, allproc_lock);
+#define V_allproc_lock VPS(allproc_lock)
extern int allproc_gen;
-extern struct sx proctree_lock;
-extern struct mtx ppeers_lock;
+VPS_DECLARE(struct sx, proctree_lock);
+#define V_proctree_lock VPS(proctree_lock)
+VPS_DECLARE(struct mtx, ppeers_lock);
+#define V_ppeers_lock VPS(ppeers_lock)
extern struct proc proc0; /* Process slot for swapper. */
extern struct thread0_storage thread0_st; /* Primary thread in proc0. */
#define thread0 (thread0_st.t0st_thread)
extern struct vmspace vmspace0; /* VM space for proc0. */
+VPS_DECLARE(struct proc *, vproc0);
+#define V_vproc0 VPS(vproc0)
+#ifdef VIMAGE
+VPS_DECLARE(int, vpsdying);
+#define V_vpsdying VPS(vpsdying)
+#endif
extern int hogticks; /* Limit on kernel cpu hogs. */
-extern int lastpid;
-extern int nprocs, maxproc; /* Current and max number of procs. */
+VPS_DECLARE(int, lastpid);
+#define V_lastpid VPS(lastpid)
+VPS_DECLARE(int, nprocs); /* Current number of procs. */
+#define V_nprocs VPS(nprocs)
+extern int maxproc; /* Max number of procs. */
extern int maxprocperuid; /* Max procs per uid. */
extern u_long ps_arg_cache_limit;
LIST_HEAD(proclist, proc);
TAILQ_HEAD(procqueue, proc);
TAILQ_HEAD(threadqueue, thread);
-extern struct proclist allproc; /* List of all processes. */
-extern struct proclist zombproc; /* List of zombie processes. */
-extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
+VPS_DECLARE(struct proclist, allproc); /* List of all processes. */
+#define V_allproc VPS(allproc)
+VPS_DECLARE(struct proclist, zombproc); /* List of zombie processes. */
+#define V_zombproc VPS(zombproc)
+VPS_DECLARE(struct proc *, initproc); /* Process slots for init. */
+#define V_initproc VPS(initproc)
+extern struct proc *pageproc; /* Process slot for pager. */
extern struct uma_zone *proc_zone;
@@ -1021,6 +1049,7 @@
int fork1(struct thread *, struct fork_req *);
void fork_exit(void (*)(void *, struct trapframe *), void *,
struct trapframe *);
+int fork_findpid(int);
void fork_return(struct thread *, struct trapframe *);
int inferior(struct proc *p);
void kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry,
@@ -1043,7 +1072,6 @@
int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb);
int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb);
int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb);
-void procinit(void);
void proc_linkup0(struct proc *p, struct thread *td);
void proc_linkup(struct proc *p, struct thread *td);
struct proc *proc_realparent(struct proc *child);
Index: sys/sys/resourcevar.h
===================================================================
--- sys/sys/resourcevar.h
+++ sys/sys/resourcevar.h
@@ -154,7 +154,6 @@
struct uidinfo
*uifind(uid_t uid);
void uifree(struct uidinfo *uip);
-void uihashinit(void);
void uihold(struct uidinfo *uip);
#ifdef RACCT
void ui_racct_foreach(void (*callback)(struct racct *racct,
Index: sys/sys/sysctl.h
===================================================================
--- sys/sys/sysctl.h
+++ sys/sys/sysctl.h
@@ -104,6 +104,7 @@
#define CTLFLAG_CAPWR 0x00004000 /* Can be written in capability mode */
#define CTLFLAG_STATS 0x00002000 /* Statistics, not a tuneable */
#define CTLFLAG_NOFETCH 0x00001000 /* Don't fetch tunable from getenv() */
+#define CTLFLAG_VPS 0x00000800 /* Prisons with vps can fiddle */
#define CTLFLAG_CAPRW (CTLFLAG_CAPRD|CTLFLAG_CAPWR)
/*
Index: sys/sys/vps.h
===================================================================
--- /dev/null
+++ sys/sys/vps.h
@@ -0,0 +1,381 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2006-2009 University of Zagreb
+ * Copyright (c) 2006-2009 FreeBSD Foundation
+ * Copyright (c) 2018 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Bjoern Zeeb
+ * under sponsorship from iXsystems, Inc.
+ *
+ * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*-
+ * This header file defines several sets of interfaces supporting virtualized
+ * process space:
+ *
+ * - Definition of 'struct vps' and functions and macros to allocate/free/
+ * manipulate it.
+ *
+ * - A virtual process space memory allocator, which provides support for
+ * virtualized global variables via a special linker set, set_vps.
+ *
+ * - Virtualized sysinits/sysuninits, which allow constructors and
+ * destructors to be run for each process space as virtual
+ * instances are created and destroyed.
+ *
+ * If VIMAGE isn't compiled into the kernel, virtualized global variables
+ * compile to normal global variables, and virtualized sysinits to regular
+ * sysinits.
+ */
+
+#ifndef _SYS_VPS_H_
+#define _SYS_VPS_H_
+
+/*
+ * struct vps describes a virtualized process space, and is primarily a
+ * pointer to storage for virtualized global variables. Expose to userspace
+ * as required for libkvm.
+ */
+#if defined(_KERNEL) || defined(_WANT_VPS)
+#include <sys/queue.h>
+
+struct vps {
+ LIST_ENTRY(vps) vps_le; /* all vps list */
+ u_int vps_magic_n;
+ u_int vps_state; /* SI_SUB_* */
+ void *vps_data_mem;
+ uintptr_t vps_data_base;
+ struct prison *vps_pr; /* Put init on this if set. */
+};
+#define VPS_MAGIC_N 0x0f0307e2
+
+/*
+ * These two virtual process space allocator definitions are also required
+ * for libkvm so that it can evaluate virtualized global variables.
+ */
+#define VPS_SETNAME "set_vps"
+#define VPS_SYMPREFIX "vps_entry_"
+#endif
+
+#ifdef _KERNEL
+#ifdef VIMAGE
+#include <sys/lock.h>
+#include <sys/proc.h> /* for struct thread */
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+
+/*
+ * Location of the kernel's 'set_vps' linker set.
+ */
+extern uintptr_t *__start_set_vps;
+__GLOBL(__start_set_vps);
+extern uintptr_t *__stop_set_vps;
+__GLOBL(__stop_set_vps);
+
+#define VPS_START (uintptr_t)&__start_set_vps
+#define VPS_STOP (uintptr_t)&__stop_set_vps
+
+/*
+ * Functions to allocate and destroy virtual process spaces.
+ */
+struct vps *vps_alloc(struct prison *);
+void vps_destroy(struct vps *);
+
+/*
+ * The current virtual process space -- we may wish to move this to struct
+ * pcpu in the future.
+ */
+#define curvps curthread->td_vps
+
+/*
+ * Various macros -- get and set the current process space, but also
+ * assertions.
+ */
+#if defined(INVARIANTS) || defined(VPS_DEBUG)
+#define VPS_ASSERT(exp, msg) do { \
+ if (!(exp)) \
+ panic msg; \
+} while (0)
+#else
+#define VPS_ASSERT(exp, msg) do { \
+} while (0)
+#endif
+
+#ifdef VPS_DEBUG
+void vps_log_recursion(struct vps *, const char *, int);
+
+#define CURVPS_SET_QUIET(arg) \
+ VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \
+ ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \
+ __FILE__, __LINE__, __func__, curvps, (arg))); \
+ struct vps *saved_vps = curvps; \
+ const char *saved_vps_lpush = curthread->td_vps_lpush; \
+ curvps = arg; \
+ curthread->td_vps_lpush = __func__;
+
+#define CURVPS_SET_VERBOSE(arg) \
+ CURVPS_SET_QUIET(arg) \
+ if (saved_vps) \
+ vps_log_recursion(saved_vps, saved_vps_lpush, __LINE__);
+
+#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg)
+
+#define CURVPS_RESTORE() \
+ VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \
+ saved_vps->vps_magic_n == VPS_MAGIC_N), \
+ ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \
+ __FILE__, __LINE__, __func__, curvps, saved_vps)); \
+ curvps = saved_vps; \
+ curthread->td_vps_lpush = saved_vps_lpush;
+#else /* !VPS_DEBUG */
+
+#define CURVPS_SET_QUIET(arg) \
+ VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \
+ ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \
+ __FILE__, __LINE__, __func__, curvps, (arg))); \
+ struct vps *saved_vps = curvps; \
+ curvps = arg;
+
+#define CURVPS_SET_VERBOSE(arg) \
+ CURVPS_SET_QUIET(arg)
+
+#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg)
+
+#define CURVPS_RESTORE() \
+ VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \
+ saved_vps->vps_magic_n == VPS_MAGIC_N), \
+ ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \
+ __FILE__, __LINE__, __func__, curvps, saved_vps)); \
+ curvps = saved_vps;
+#endif /* VPS_DEBUG */
+
+extern struct vps *vps0;
+#define IS_DEFAULT_VPS(arg) ((arg) == vps0)
+
+#define CRED_TO_VPS(cr) (cr)->cr_prison->pr_vps
+#define TD_TO_VPS(td) CRED_TO_VPS((td)->td_ucred)
+#define P_TO_VPS(p) CRED_TO_VPS((p)->p_ucred)
+
+/*
+ * Global linked list of all virtual process spaces, along with read locks to
+ * access it. If a caller may sleep while accessing the list, it must use
+ * the sleepable lock macros.
+ */
+LIST_HEAD(vps_list_head, vps);
+extern struct vps_list_head vps_head;
+extern struct rwlock vps_rwlock;
+extern struct sx vps_sxlock;
+
+#define VPS_LIST_RLOCK() sx_slock(&vps_sxlock)
+#define VPS_LIST_RLOCK_NOSLEEP() rw_rlock(&vps_rwlock)
+#define VPS_LIST_RUNLOCK() sx_sunlock(&vps_sxlock)
+#define VPS_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vps_rwlock)
+
+/*
+ * Iteration macros to walk the global list of virtual process spaces.
+ */
+#define VPS_ITERATOR_DECL(arg) struct vps *arg
+#define VPS_FOREACH(arg) LIST_FOREACH((arg), &vps_head, vps_le)
+
+/*
+ * Virtual process space memory allocator, which allows global variables to
+ * be automatically instantiated for each process space instance.
+ */
+#define VPS_NAME(n) vps_entry_##n
+#define VPS_DECLARE(t, n) extern t VPS_NAME(n)
+#define VPS_DEFINE(t, n) t VPS_NAME(n) __section(VPS_SETNAME) __used
+#define _VPS_PTR(b, n) (__typeof(VPS_NAME(n))*) \
+ ((b) + (uintptr_t)&VPS_NAME(n))
+
+#define _VPS(b, n) (*_VPS_PTR(b, n))
+
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define VPS_VPS_PTR(vps, n) _VPS_PTR((vps)->vps_data_base, n)
+#define VPS_VPS(vps, n) (*VPS_VPS_PTR((vps), n))
+
+#define VPS_PTR(n) VPS_VPS_PTR(curvps, n)
+#define VPS(n) VPS_VPS(curvps, n)
+
+/*
+ * Virtual process space allocator interfaces from the kernel linker.
+ */
+void *vps_data_alloc(int size);
+void vps_data_copy(void *start, int size);
+void vps_data_free(void *start_arg, int size);
+
+/*
+ * Virtual sysinit mechanism, allowing process space components to declare
+ * startup and shutdown methods to be run when virtual process space
+ * instances are created and destroyed.
+ */
+#include <sys/kernel.h>
+
+/*
+ * SYSINIT/SYSUNINIT variants that provide per-vps constructors and
+ * destructors.
+ */
+struct vps_sysinit {
+ enum sysinit_sub_id subsystem;
+ enum sysinit_elem_order order;
+ sysinit_cfunc_t func;
+ const void *arg;
+ TAILQ_ENTRY(vps_sysinit) link;
+};
+
+#define VPS_SYSINIT(ident, subsystem, order, func, arg) \
+ static struct vps_sysinit ident ## _vps_init = { \
+ subsystem, \
+ order, \
+ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \
+ (arg) \
+ }; \
+ SYSINIT(vps_init_ ## ident, subsystem, order, \
+ vps_register_sysinit, &ident ## _vps_init); \
+ SYSUNINIT(vps_init_ ## ident, subsystem, order, \
+ vps_deregister_sysinit, &ident ## _vps_init)
+
+#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \
+ static struct vps_sysinit ident ## _vps_uninit = { \
+ subsystem, \
+ order, \
+ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \
+ (arg) \
+ }; \
+ SYSINIT(vps_uninit_ ## ident, subsystem, order, \
+ vps_register_sysuninit, &ident ## _vps_uninit); \
+ SYSUNINIT(vps_uninit_ ## ident, subsystem, order, \
+ vps_deregister_sysuninit, &ident ## _vps_uninit)
+
+/*
+ * Run per-vps sysinits or sysuninits during vps creation/destruction.
+ */
+void vps_sysinit(void);
+void vps_sysuninit(void);
+
+/*
+ * Interfaces for managing per-vps constructors and destructors.
+ */
+void vps_register_sysinit(void *arg);
+void vps_register_sysuninit(void *arg);
+void vps_deregister_sysinit(void *arg);
+void vps_deregister_sysuninit(void *arg);
+
+/*
+ * EVENTHANDLER(9) extensions.
+ */
+#include <sys/eventhandler.h>
+
+void vps_global_eventhandler_iterator_func(void *, ...);
+#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \
+do { \
+ if (IS_DEFAULT_VPS(curvps)) { \
+ (tag) = vimage_eventhandler_register(NULL, #name, func, \
+ arg, priority, \
+ vps_global_eventhandler_iterator_func); \
+ } \
+} while(0)
+#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \
+do { \
+ if (IS_DEFAULT_VPS(curvps)) { \
+ vimage_eventhandler_register(NULL, #name, func, \
+ arg, priority, \
+ vps_global_eventhandler_iterator_func); \
+ } \
+} while(0)
+
+#else /* !VIMAGE */
+
+/*
+ * Various virtual process space macros compile to no-ops without VIMAGE.
+ */
+#define curvps NULL
+
+#define VPS_ASSERT(exp, msg)
+#define CURVPS_SET(arg)
+#define CURVPS_SET_QUIET(arg)
+#define CURVPS_RESTORE()
+
+#define VPS_LIST_RLOCK()
+#define VPS_LIST_RLOCK_NOSLEEP()
+#define VPS_LIST_RUNLOCK()
+#define VPS_LIST_RUNLOCK_NOSLEEP()
+#define VPS_ITERATOR_DECL(arg)
+#define VPS_FOREACH(arg)
+
+#define IS_DEFAULT_VPS(arg) 1
+#define CRED_TO_VPS(cr) NULL
+#define TD_TO_VPS(td) NULL
+#define P_TO_VPS(p) NULL
+
+/*
+ * Versions of the vps macros that compile to normal global variables and
+ * standard sysctl definitions.
+ */
+#define VPS_NAME(n) n
+#define VPS_DECLARE(t, n) extern t n
+#define VPS_DEFINE(t, n) t n
+#define _VPS_PTR(b, n) &VPS_NAME(n)
+
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define VPS_VPS_PTR(vps, n) (&(n))
+#define VPS_VPS(vps, n) (n)
+
+#define VPS_PTR(n) (&(n))
+#define VPS(n) (n)
+
+/*
+ * When VIMAGE isn't compiled into the kernel, VPS_SYSINIT/VPS_SYSUNINIT
+ * map into normal sysinits, which have the same ordering properties.
+ */
+#define VPS_SYSINIT(ident, subsystem, order, func, arg) \
+ SYSINIT(ident, subsystem, order, func, arg)
+#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \
+ SYSUNINIT(ident, subsystem, order, func, arg)
+
+/*
+ * Without VIMAGE revert to the default implementation.
+ */
+#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \
+ (tag) = eventhandler_register(NULL, #name, func, arg, priority)
+#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \
+ eventhandler_register(NULL, #name, func, arg, priority)
+#endif /* VIMAGE */
+#endif /* _KERNEL */
+
+#endif /* !_SYS_VPS_H_ */
Index: sys/vm/vm_meter.c
===================================================================
--- sys/vm/vm_meter.c
+++ sys/vm/vm_meter.c
@@ -177,6 +177,7 @@
static int
vmtotal(SYSCTL_HANDLER_ARGS)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct vmtotal total;
#if defined(COMPAT_FREEBSD11)
struct vmtotal11 total11;
@@ -197,41 +198,48 @@
/*
* Calculate process statistics.
*/
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- if ((p->p_flag & P_SYSTEM) != 0)
- continue;
- PROC_LOCK(p);
- if (p->p_state != PRS_NEW) {
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- switch (td->td_state) {
- case TDS_INHIBITED:
- if (TD_IS_SWAPPED(td))
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if ((p->p_flag & P_SYSTEM) != 0)
+ continue;
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NEW) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ switch (td->td_state) {
+ case TDS_INHIBITED:
+ if (TD_IS_SWAPPED(td))
+ total.t_sw++;
+ else if (TD_IS_SLEEPING(td)) {
+ if (td->td_priority <=
+ PZERO)
+ total.t_dw++;
+ else
+ total.t_sl++;
+ }
+ break;
+ case TDS_CAN_RUN:
total.t_sw++;
- else if (TD_IS_SLEEPING(td)) {
- if (td->td_priority <= PZERO)
- total.t_dw++;
- else
- total.t_sl++;
+ break;
+ case TDS_RUNQ:
+ case TDS_RUNNING:
+ total.t_rq++;
+ break;
+ default:
+ break;
}
- break;
- case TDS_CAN_RUN:
- total.t_sw++;
- break;
- case TDS_RUNQ:
- case TDS_RUNNING:
- total.t_rq++;
- break;
- default:
- break;
+ thread_unlock(td);
}
- thread_unlock(td);
}
+ PROC_UNLOCK(p);
}
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
/*
* Calculate object memory usage statistics.
*/
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -2507,18 +2507,27 @@
static int
vm_object_in_map(vm_object_t object)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
- /* sx_slock(&allproc_lock); */
- FOREACH_PROC_IN_SYSTEM(p) {
- if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
- continue;
- if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
- /* sx_sunlock(&allproc_lock); */
- return 1;
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ /* sx_slock(&V_allproc_lock); */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (!p->p_vmspace
+ /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
+ continue;
+ if (_vm_object_in_map(&p->p_vmspace->vm_map, object,
+ 0)) {
+ /* sx_sunlock(&V_allproc_lock); */
+ return 1;
+ }
}
+ /* sx_sunlock(&V_allproc_lock); */
+ CURVPS_RESTORE();
}
- /* sx_sunlock(&allproc_lock); */
+ /* VPS_LIST_RUNLOCK(); */
if (_vm_object_in_map(kernel_map, object, 0))
return 1;
return 0;
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -1744,6 +1744,7 @@
void
vm_pageout_oom(int shortage)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
struct thread *td;
@@ -1760,80 +1761,88 @@
*/
bigproc = NULL;
bigsize = 0;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- /*
- * If this is a system, protected or killed process, skip it.
- */
- if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
- P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
- p->p_pid == 1 || P_KILLED(p) ||
- (p->p_pid < 48 && swap_pager_avail != 0)) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * If the process is in a non-running type state,
- * don't touch it. Check all the threads individually.
- */
- breakout = false;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (!TD_ON_RUNQ(td) &&
- !TD_IS_RUNNING(td) &&
- !TD_IS_SLEEPING(td) &&
- !TD_IS_SUSPENDED(td) &&
- !TD_IS_SWAPPED(td)) {
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+
+ /*
+ * If this is a system, protected or killed process,
+ * skip it.
+ */
+ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
+ P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
+ p->p_pid == 1 || P_KILLED(p) ||
+ (p->p_pid < 48 && swap_pager_avail != 0)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * If the process is in a non-running type state,
+ * don't touch it. Check all the threads individually.
+ */
+ breakout = false;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (!TD_ON_RUNQ(td) &&
+ !TD_IS_RUNNING(td) &&
+ !TD_IS_SLEEPING(td) &&
+ !TD_IS_SUSPENDED(td) &&
+ !TD_IS_SWAPPED(td)) {
+ thread_unlock(td);
+ breakout = true;
+ break;
+ }
thread_unlock(td);
- breakout = true;
- break;
}
- thread_unlock(td);
- }
- if (breakout) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * get the process size
- */
- vm = vmspace_acquire_ref(p);
- if (vm == NULL) {
+ if (breakout) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * get the process size
+ */
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ _PHOLD_LITE(p);
PROC_UNLOCK(p);
- continue;
- }
- _PHOLD_LITE(p);
- PROC_UNLOCK(p);
- sx_sunlock(&allproc_lock);
- if (!vm_map_trylock_read(&vm->vm_map)) {
+ sx_sunlock(&V_allproc_lock);
+ if (!vm_map_trylock_read(&vm->vm_map)) {
+ vmspace_free(vm);
+ sx_slock(&V_allproc_lock);
+ PRELE(p);
+ continue;
+ }
+ size = vmspace_swap_count(vm);
+ if (shortage == VM_OOM_MEM)
+ size += vm_pageout_oom_pagecount(vm);
+ vm_map_unlock_read(&vm->vm_map);
vmspace_free(vm);
- sx_slock(&allproc_lock);
- PRELE(p);
- continue;
- }
- size = vmspace_swap_count(vm);
- if (shortage == VM_OOM_MEM)
- size += vm_pageout_oom_pagecount(vm);
- vm_map_unlock_read(&vm->vm_map);
- vmspace_free(vm);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
- /*
- * If this process is bigger than the biggest one,
- * remember it.
- */
- if (size > bigsize) {
- if (bigproc != NULL)
- PRELE(bigproc);
- bigproc = p;
- bigsize = size;
- } else {
- PRELE(p);
+ /*
+ * If this process is bigger than the biggest one,
+ * remember it.
+ */
+ if (size > bigsize) {
+ if (bigproc != NULL)
+ PRELE(bigproc);
+ bigproc = p;
+ bigsize = size;
+ } else {
+ PRELE(p);
+ }
}
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
if (bigproc != NULL) {
if (vm_panic_on_oom != 0)
panic("out of swap space");
Index: sys/vm/vm_swapout.c
===================================================================
--- sys/vm/vm_swapout.c
+++ sys/vm/vm_swapout.c
@@ -378,6 +378,7 @@
static void
vm_daemon(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct rlimit rsslim;
struct proc *p;
struct thread *td;
@@ -417,114 +418,129 @@
attempts = 0;
again:
attempts++;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- vm_pindex_t limit, size;
- /*
- * if this is a system process or if we have already
- * looked at this process, skip it.
- */
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL ||
- p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * if the process is in a non-running type state,
- * don't touch it.
- */
- breakout = 0;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (!TD_ON_RUNQ(td) &&
- !TD_IS_RUNNING(td) &&
- !TD_IS_SLEEPING(td) &&
- !TD_IS_SUSPENDED(td)) {
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ vm_pindex_t limit, size;
+
+ /*
+ * If this is a system process or if we have
+ * already looked at this process, skip it.
+ */
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL || p->p_flag &
+ (P_INEXEC | P_SYSTEM | P_WEXIT)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * If the process is in a non-running type
+ * state, don't touch it.
+ */
+ breakout = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (!TD_ON_RUNQ(td) &&
+ !TD_IS_RUNNING(td) &&
+ !TD_IS_SLEEPING(td) &&
+ !TD_IS_SUSPENDED(td)) {
+ thread_unlock(td);
+ breakout = 1;
+ break;
+ }
thread_unlock(td);
- breakout = 1;
- break;
}
- thread_unlock(td);
- }
- if (breakout) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * get a limit
- */
- lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
- limit = OFF_TO_IDX(
- qmin(rsslim.rlim_cur, rsslim.rlim_max));
+ if (breakout) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * get a limit
+ */
+ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
+ limit = OFF_TO_IDX(
+ qmin(rsslim.rlim_cur, rsslim.rlim_max));
- /*
- * let processes that are swapped out really be
- * swapped out set the limit to nothing (will force a
- * swap-out.)
- */
- if ((p->p_flag & P_INMEM) == 0)
- limit = 0; /* XXX */
- vm = vmspace_acquire_ref(p);
- _PHOLD_LITE(p);
- PROC_UNLOCK(p);
- if (vm == NULL) {
- PRELE(p);
- continue;
- }
- sx_sunlock(&allproc_lock);
+ /*
+ * let processes that are swapped out really be
+ * swapped out set the limit to nothing
+ * (will force a swap-out.)
+ */
+ if ((p->p_flag & P_INMEM) == 0)
+ limit = 0; /* XXX */
+ vm = vmspace_acquire_ref(p);
+ _PHOLD_LITE(p);
+ PROC_UNLOCK(p);
+ if (vm == NULL) {
+ PRELE(p);
+ continue;
+ }
+ sx_sunlock(&V_allproc_lock);
- size = vmspace_resident_count(vm);
- if (size >= limit) {
- vm_swapout_map_deactivate_pages(
- &vm->vm_map, limit);
size = vmspace_resident_count(vm);
- }
-#ifdef RACCT
- if (racct_enable) {
- rsize = IDX_TO_OFF(size);
- PROC_LOCK(p);
- if (p->p_state == PRS_NORMAL)
- racct_set(p, RACCT_RSS, rsize);
- ravailable = racct_get_available(p, RACCT_RSS);
- PROC_UNLOCK(p);
- if (rsize > ravailable) {
- /*
- * Don't be overly aggressive; this
- * might be an innocent process,
- * and the limit could've been exceeded
- * by some memory hog. Don't try
- * to deactivate more than 1/4th
- * of process' resident set size.
- */
- if (attempts <= 8) {
- if (ravailable < rsize -
- (rsize / 4)) {
- ravailable = rsize -
- (rsize / 4);
- }
- }
+ if (size >= limit) {
vm_swapout_map_deactivate_pages(
- &vm->vm_map,
- OFF_TO_IDX(ravailable));
- /* Update RSS usage after paging out. */
+ &vm->vm_map, limit);
size = vmspace_resident_count(vm);
+ }
+#ifdef RACCT
+ if (racct_enable) {
rsize = IDX_TO_OFF(size);
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL)
racct_set(p, RACCT_RSS, rsize);
+ ravailable = racct_get_available(p,
+ RACCT_RSS);
PROC_UNLOCK(p);
- if (rsize > ravailable)
- tryagain = 1;
+ if (rsize > ravailable) {
+ /*
+ * Don't be overly aggressive;
+ * this might be an innocent
+ * process, and the limit
+ * could've been exceeded by
+ * some memory hog. Don't try to
+ * deactivate more than 1/4th of
+ * process' resident set size.
+ */
+ if (attempts <= 8) {
+ if (ravailable < rsize -
+ (rsize / 4)) {
+ ravailable =
+ rsize -
+ (rsize / 4);
+ }
+ }
+ vm_swapout_map_deactivate_pages(
+ &vm->vm_map,
+ OFF_TO_IDX(ravailable));
+ /*
+ * Update RSS usage after
+ * paging out.
+ */
+ size = vmspace_resident_count(
+ vm);
+ rsize = IDX_TO_OFF(size);
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL)
+ racct_set(p, RACCT_RSS,
+ rsize);
+ PROC_UNLOCK(p);
+ if (rsize > ravailable)
+ tryagain = 1;
+ }
}
- }
#endif
- vmspace_free(vm);
- sx_slock(&allproc_lock);
- PRELE(p);
+ vmspace_free(vm);
+ sx_slock(&V_allproc_lock);
+ PRELE(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
if (tryagain != 0 && attempts <= 10) {
maybe_yield();
goto again;
@@ -656,6 +672,13 @@
int ppri, pri, slptime, swtime;
loop:
+#ifdef VIMAGE
+ if (!IS_DEFAULT_VPS(curvps) && V_vpsdying > 0) {
+ V_vproc0 = NULL;
+ return;
+ }
+#endif
+
if (vm_page_count_min()) {
vm_wait_min();
goto loop;
@@ -663,7 +686,7 @@
pp = NULL;
ppri = INT_MIN;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW ||
@@ -698,13 +721,13 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
/*
* Nothing to do, back to sleep.
*/
if ((p = pp) == NULL) {
- tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
+ tsleep(V_vproc0, PVM, "swapin", MAXSLP * hz / 2);
goto loop;
}
PROC_LOCK(p);
@@ -738,6 +761,7 @@
static void
swapout_procs(int action)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct thread *td;
int slptime;
@@ -746,74 +770,81 @@
MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0);
didswap = false;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- /*
- * Filter out not yet fully constructed processes. Do
- * not swap out held processes. Avoid processes which
- * are system, exiting, execing, traced, already swapped
- * out or are in the process of being swapped in or out.
- */
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag &
- (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE |
- P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) !=
- P_INMEM) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * Further consideration of this process for swap out
- * requires iterating over its threads. We release
- * allproc_lock here so that process creation and
- * destruction are not blocked while we iterate.
- *
- * To later reacquire allproc_lock and resume
- * iteration over the allproc list, we will first have
- * to release the lock on the process. We place a
- * hold on the process so that it remains in the
- * allproc list while it is unlocked.
- */
- _PHOLD_LITE(p);
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ /*
+ * Filter out not yet fully constructed processes. Do
+ * not swap out held processes. Avoid processes which
+ * are system, exiting, execing, traced, already swapped
+ * out or are in the process of being swapped in or out.
+ */
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL || p->p_lock != 0 ||
+ (p->p_flag & (P_SYSTEM | P_WEXIT | P_INEXEC |
+ P_STOPPED_SINGLE | P_TRACED | P_SWAPPINGOUT |
+ P_SWAPPINGIN | P_INMEM)) != P_INMEM) {
+ PROC_UNLOCK(p);
+ continue;
+ }
- /*
- * Do not swapout a realtime process.
- * Guarantee swap_idle_threshold1 time in memory.
- * If the system is under memory stress, or if we are
- * swapping idle processes >= swap_idle_threshold2,
- * then swap the process out.
- */
- doswap = true;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- slptime = (ticks - td->td_slptick) / hz;
- if (PRI_IS_REALTIME(td->td_pri_class) ||
- slptime < swap_idle_threshold1 ||
- !thread_safetoswapout(td) ||
- ((action & VM_SWAP_NORMAL) == 0 &&
- slptime < swap_idle_threshold2))
- doswap = false;
- thread_unlock(td);
- if (!doswap)
- break;
- }
- if (doswap && swapout(p) == 0)
- didswap = true;
+ /*
+ * Further consideration of this process for swap out
+ * requires iterating over its threads. We release
+ * allproc_lock here so that process creation and
+ * destruction are not blocked while we iterate.
+ *
+ * To later reacquire allproc_lock and resume
+ * iteration over the allproc list, we will first have
+ * to release the lock on the process. We place a
+ * hold on the process so that it remains in the
+ * allproc list while it is unlocked.
+ */
+ _PHOLD_LITE(p);
+ sx_sunlock(&V_allproc_lock);
- PROC_UNLOCK(p);
- sx_slock(&allproc_lock);
- PRELE(p);
+ /*
+ * Do not swapout a realtime process.
+ * Guarantee swap_idle_threshold1 time in memory.
+ * If the system is under memory stress, or if we are
+ * swapping idle processes >= swap_idle_threshold2,
+ * then swap the process out.
+ */
+ doswap = true;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ slptime = (ticks - td->td_slptick) / hz;
+ if (PRI_IS_REALTIME(td->td_pri_class) ||
+ slptime < swap_idle_threshold1 ||
+ !thread_safetoswapout(td) ||
+ ((action & VM_SWAP_NORMAL) == 0 &&
+ slptime < swap_idle_threshold2))
+ doswap = false;
+ thread_unlock(td);
+ if (!doswap)
+ break;
+ }
+ if (doswap && swapout(p) == 0)
+ didswap = true;
+
+ PROC_UNLOCK(p);
+ sx_slock(&V_allproc_lock);
+ PRELE(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
/*
* If we swapped something out, and another process needed memory,
* then wakeup the sched process.
*/
if (didswap)
- wakeup(&proc0);
+ wakeup(V_vproc0);
}
static void

File Metadata

Mime Type
text/plain
Expires
Thu, Mar 12, 12:36 PM (9 h, 49 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
29582528
Default Alt Text
D15865.diff (197 KB)

Event Timeline