Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F147507078
D15865.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
197 KB
Referenced Files
None
Subscribers
None
D15865.diff
View Options
Index: sys/arm/arm/pmap-v6.c
===================================================================
--- sys/arm/arm/pmap-v6.c
+++ sys/arm/arm/pmap-v6.c
@@ -6577,7 +6577,7 @@
int npte2 = 0;
int i, j, index;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid != pid || p->p_vmspace == NULL)
continue;
@@ -6605,7 +6605,7 @@
index = 0;
printf("\n");
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (npte2);
}
pte2p = pmap_pte2(pmap, va);
@@ -6632,7 +6632,7 @@
}
}
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (npte2);
}
Index: sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
+++ sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
@@ -1022,11 +1022,11 @@
mutex_enter(pid_mtx);
#else
pp = p;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
while (pp->p_vmspace == pp->p_pptr->p_vmspace)
pp = pp->p_pptr;
pid = pp->p_pid;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
pp = NULL;
rm_rlock(&fasttrap_tp_lock, &tracker);
Index: sys/compat/linprocfs/linprocfs.c
===================================================================
--- sys/compat/linprocfs/linprocfs.c
+++ sys/compat/linprocfs/linprocfs.c
@@ -689,8 +689,8 @@
(int)(averunnable.ldavg[2] / averunnable.fscale),
(int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100),
1, /* number of running tasks */
- nprocs, /* number of tasks */
- lastpid /* the last pid */
+ V_nprocs, /* number of tasks */
+ V_lastpid /* the last pid */
);
return (0);
}
@@ -708,10 +708,10 @@
vm_offset_t startcode, startdata;
getboottime(&boottime);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
if (p->p_vmspace) {
startcode = (vm_offset_t)p->p_vmspace->vm_taddr;
startdata = (vm_offset_t)p->p_vmspace->vm_daddr;
@@ -787,11 +787,11 @@
struct kinfo_proc kp;
segsz_t lsize;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/*
* See comments in linprocfs_doprocstatus() regarding the
@@ -825,7 +825,7 @@
l_sigset_t siglist, sigignore, sigcatch;
int i;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */
@@ -864,7 +864,7 @@
}
fill_kinfo_proc(p, &kp);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */
sbuf_printf(sb, "State:\t%s\n", state);
Index: sys/compat/linux/linux_file.c
===================================================================
--- sys/compat/linux/linux_file.c
+++ sys/compat/linux/linux_file.c
@@ -149,17 +149,17 @@
fdrop(fp, td);
goto done;
}
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
if (SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) {
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/* XXXPJD: Verify if TIOCSCTTY is allowed. */
(void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0,
td->td_ucred, td);
} else {
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
fdrop(fp, td);
}
Index: sys/compat/linux/linux_fork.c
===================================================================
--- sys/compat/linux/linux_fork.c
+++ sys/compat/linux/linux_fork.c
@@ -233,11 +233,11 @@
* the same as that of the calling process.
*/
if (args->flags & LINUX_CLONE_PARENT) {
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PROC_LOCK(p2);
proc_reparent(p2, td->td_proc->p_pptr);
PROC_UNLOCK(p2);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
}
#ifdef DEBUG
Index: sys/compat/linux/linux_misc.c
===================================================================
--- sys/compat/linux/linux_misc.c
+++ sys/compat/linux/linux_misc.c
@@ -181,7 +181,7 @@
sysinfo.totalswap = i * PAGE_SIZE;
sysinfo.freeswap = (i - j) * PAGE_SIZE;
- sysinfo.procs = nprocs;
+ sysinfo.procs = V_nprocs;
/* The following are only present in newer Linux kernels. */
sysinfo.totalbig = 0;
Index: sys/compat/linuxkpi/common/src/linux_current.c
===================================================================
--- sys/compat/linuxkpi/common/src/linux_current.c
+++ sys/compat/linuxkpi/common/src/linux_current.c
@@ -226,22 +226,29 @@
static void
linux_current_uninit(void *arg __unused)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct task_struct *ts;
struct thread *td;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- FOREACH_THREAD_IN_PROC(p, td) {
- if ((ts = td->td_lkpi_task) != NULL) {
- td->td_lkpi_task = NULL;
- put_task_struct(ts);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if ((ts = td->td_lkpi_task) != NULL) {
+ td->td_lkpi_task = NULL;
+ put_task_struct(ts);
+ }
}
+ PROC_UNLOCK(p);
}
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
EVENTHANDLER_DEREGISTER(thread_dtor, linuxkpi_thread_dtor_tag);
}
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3852,6 +3852,7 @@
kern/kern_tslog.c optional tslog
kern/kern_umtx.c standard
kern/kern_uuid.c standard
+kern/kern_vps.c optional vimage
kern/kern_xxx.c standard
kern/link_elf.c standard
kern/linker_if.m standard
Index: sys/ddb/db_command.c
===================================================================
--- sys/ddb/db_command.c
+++ sys/ddb/db_command.c
@@ -693,11 +693,12 @@
* Find the process in question. allproc_lock is not needed
* since we're in DDB.
*/
- /* sx_slock(&allproc_lock); */
+ /* Operate on current vps instance only. */
+ /* sx_slock(&V_allproc_lock); */
FOREACH_PROC_IN_SYSTEM(p)
if (p->p_pid == pid)
break;
- /* sx_sunlock(&allproc_lock); */
+ /* sx_sunlock(&V_allproc_lock); */
if (p == NULL)
DB_ERROR(("Can't find process with pid %ld\n", (long) pid));
@@ -875,12 +876,26 @@
}
}
+static void
+_db_stack_trace_all_v(bool active_only)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ _db_stack_trace_all(active_only);
+ CURVPS_RESTORE();
+ }
+ /* VPS_LIST_RUNLOCK(); */
+}
+
static void
db_stack_trace_active(db_expr_t dummy, bool dummy2, db_expr_t dummy3,
char *dummy4)
{
- _db_stack_trace_all(true);
+ _db_stack_trace_all_v(true);
}
static void
@@ -888,7 +903,7 @@
char *dummy4)
{
- _db_stack_trace_all(false);
+ _db_stack_trace_all_v(false);
}
/*
Index: sys/ddb/db_expr.c
===================================================================
--- sys/ddb/db_expr.c
+++ sys/ddb/db_expr.c
@@ -58,7 +58,8 @@
if (t == tIDENT) {
if (!db_value_of_name(db_tok_string, valuep) &&
!db_value_of_name_pcpu(db_tok_string, valuep) &&
- !db_value_of_name_vnet(db_tok_string, valuep)) {
+ !db_value_of_name_vnet(db_tok_string, valuep) &&
+ !db_value_of_name_vps(db_tok_string, valuep)) {
db_printf("Symbol '%s' not found\n", db_tok_string);
db_error(NULL);
/*NOTREACHED*/
Index: sys/ddb/db_ps.c
===================================================================
--- sys/ddb/db_ps.c
+++ sys/ddb/db_ps.c
@@ -90,10 +90,11 @@
char state[9];
int np, rflag, sflag, dflag, lflag, wflag;
- np = nprocs;
+ np = V_nprocs;
- if (!LIST_EMPTY(&allproc))
- p = LIST_FIRST(&allproc);
+ /* Operate on current vps instance only. */
+ if (!LIST_EMPTY(&V_allproc))
+ p = LIST_FIRST(&V_allproc);
else
p = &proc0;
@@ -217,8 +218,9 @@
p = LIST_NEXT(p, p_list);
if (p == NULL && np > 0)
- p = LIST_FIRST(&zombproc);
+ p = LIST_FIRST(&V_zombproc);
}
+ db_printf("nprocs = %d, np = %d\n", V_nprocs, np);
}
static void
@@ -397,6 +399,9 @@
db_printf(" last involuntary switch: %d ms ago\n",
1000 * delta / hz);
}
+#ifdef VIMAGE
+ db_printf(" vnet: %p vps: %p\n", td->td_vnet, td->td_vps);
+#endif
}
DB_SHOW_COMMAND(proc, db_show_proc)
@@ -475,6 +480,7 @@
db_findstack_cmd(db_expr_t addr, bool have_addr, db_expr_t dummy3 __unused,
char *dummy4 __unused)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct thread *td;
struct kstack_cache_entry *ks_ce;
@@ -487,15 +493,22 @@
return;
}
- FOREACH_PROC_IN_SYSTEM(p) {
- FOREACH_THREAD_IN_PROC(p, td) {
- if (td->td_kstack <= saddr && saddr < td->td_kstack +
- PAGE_SIZE * td->td_kstack_pages) {
- db_printf("Thread %p\n", td);
- return;
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_kstack <= saddr &&
+ saddr < td->td_kstack +
+ PAGE_SIZE * td->td_kstack_pages) {
+ db_printf("Thread %p\n", td);
+ return;
+ }
}
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
for (ks_ce = kstack_cache; ks_ce != NULL;
ks_ce = ks_ce->next_ks_entry) {
Index: sys/ddb/db_sym.c
===================================================================
--- sys/ddb/db_sym.c
+++ sys/ddb/db_sym.c
@@ -37,8 +37,10 @@
#include <sys/param.h>
#include <sys/pcpu.h>
+#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/systm.h>
+#include <sys/vps.h>
#include <net/vnet.h>
@@ -69,6 +71,7 @@
#ifdef VIMAGE
static void *db_vnet = NULL;
+static void *db_vps = NULL;
#endif
/*
@@ -168,6 +171,53 @@
return (0);
}
}
+
+/*
+ * Validate the virtual process space pointer used to interpret per-vps global
+ * variable expansion. Right now we don't do much here, really we should
+ * walk the global vps list to check it's an OK pointer.
+ */
+int
+db_var_db_vps(struct db_variable *vp, db_expr_t *valuep, int op)
+{
+
+ switch (op) {
+ case DB_VAR_GET:
+ *valuep = (db_expr_t)db_vps;
+ return (1);
+
+ case DB_VAR_SET:
+ db_vps = *(void **)valuep;
+ return (1);
+
+ default:
+ db_printf("db_var_db_vps: unknown operation\n");
+ return (0);
+ }
+}
+
+/*
+ * Read-only variable reporting the current vps, which is what we use when
+ * db_vps is set to NULL.
+ */
+int
+db_var_curvps(struct db_variable *vp, db_expr_t *valuep, int op)
+{
+
+ switch (op) {
+ case DB_VAR_GET:
+ *valuep = (db_expr_t)curvps;
+ return (1);
+
+ case DB_VAR_SET:
+ db_printf("Read-only variable.\n");
+ return (0);
+
+ default:
+ db_printf("db_var_curvps: unknown operation\n");
+ return (0);
+ }
+}
#endif
/*
@@ -278,6 +328,33 @@
#endif
}
+bool
+db_value_of_name_vps(const char *name, db_expr_t *valuep)
+{
+#ifdef VIMAGE
+ static char tmp[256];
+ db_expr_t value;
+ c_db_sym_t sym;
+ struct vps *vps;
+
+ if (db_vps != NULL)
+ vps = db_vps;
+ else
+ vps = curvps;
+ snprintf(tmp, sizeof(tmp), "vps_entry_%s", name);
+ sym = db_lookup(tmp);
+ if (sym == C_DB_SYM_NULL)
+ return (false);
+ db_symbol_values(sym, &name, &value);
+ if (value < VPS_START || value >= VPS_STOP)
+ return (false);
+ *valuep = (db_expr_t)((uintptr_t)value + vps->vps_data_base);
+ return (true);
+#else
+ return (false);
+#endif
+}
+
/*
* Lookup a symbol.
* If the symbol has a qualifier (e.g., ux:vm_map),
Index: sys/ddb/db_thread.c
===================================================================
--- sys/ddb/db_thread.c
+++ sys/ddb/db_thread.c
@@ -135,11 +135,12 @@
if (td != NULL)
return (td);
if (check_pid) {
+ /* Operate on current vps instance only. */
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid == decaddr)
return (FIRST_THREAD_IN_PROC(p));
}
- LIST_FOREACH(p, &zombproc, p_list) {
+ LIST_FOREACH(p, &V_zombproc, p_list) {
if (p->p_pid == decaddr)
return (FIRST_THREAD_IN_PROC(p));
}
@@ -161,11 +162,12 @@
decaddr = db_hex2dec(addr);
if (decaddr != -1) {
+ /* Operate on current vps instance only. */
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid == decaddr)
return (p);
}
- LIST_FOREACH(p, &zombproc, p_list) {
+ LIST_FOREACH(p, &V_zombproc, p_list) {
if (p->p_pid == decaddr)
return (p);
}
Index: sys/ddb/db_variables.h
===================================================================
--- sys/ddb/db_variables.h
+++ sys/ddb/db_variables.h
@@ -56,8 +56,10 @@
extern db_varfcn_t db_var_curcpu; /* DPCPU default CPU */
extern db_varfcn_t db_var_curvnet; /* Default vnet */
+extern db_varfcn_t db_var_curvps; /* Default vps */
extern db_varfcn_t db_var_db_cpu; /* DPCPU active CPU */
extern db_varfcn_t db_var_db_vnet; /* Active vnet */
+extern db_varfcn_t db_var_db_vps; /* Active vps */
int db_read_variable(struct db_variable *, db_expr_t *);
int db_write_variable(struct db_variable *, db_expr_t);
Index: sys/ddb/db_variables.c
===================================================================
--- sys/ddb/db_variables.c
+++ sys/ddb/db_variables.c
@@ -53,6 +53,8 @@
#ifdef VIMAGE
{ "curvnet", NULL, db_var_curvnet },
{ "db_vnet", NULL, db_var_db_vnet },
+ { "curvps", NULL, db_var_curvps },
+ { "db_vps", NULL, db_var_db_vps },
#endif
};
static struct db_variable *db_evars = db_vars + nitems(db_vars);
Index: sys/ddb/ddb.h
===================================================================
--- sys/ddb/ddb.h
+++ sys/ddb/ddb.h
@@ -229,6 +229,7 @@
bool db_value_of_name(const char *name, db_expr_t *valuep);
bool db_value_of_name_pcpu(const char *name, db_expr_t *valuep);
bool db_value_of_name_vnet(const char *name, db_expr_t *valuep);
+bool db_value_of_name_vps(const char *name, db_expr_t *valuep);
int db_write_bytes(vm_offset_t addr, size_t size, char *data);
void db_command_register(struct command_table *, struct command *);
void db_command_unregister(struct command_table *, struct command *);
Index: sys/dev/filemon/filemon.c
===================================================================
--- sys/dev/filemon/filemon.c
+++ sys/dev/filemon/filemon.c
@@ -210,6 +210,7 @@
static void
filemon_untrack_processes(struct filemon *filemon)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
sx_assert(&filemon->lock, SA_XLOCKED);
@@ -223,18 +224,24 @@
* filemon_event_process_exit() will lock on filemon->lock
* which we hold.
*/
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- /*
- * No PROC_LOCK is needed to compare here since it is
- * guaranteed to not change since we have its filemon
- * locked. Everything that changes this p_filemon will
- * be locked on it.
- */
- if (p->p_filemon == filemon)
- filemon_proc_drop(p);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ /*
+ * No PROC_LOCK is needed to compare here since it is
+ * guaranteed to not change since we have its filemon
+ * locked. Everything that changes this p_filemon will
+ * be locked on it.
+ */
+ if (p->p_filemon == filemon)
+ filemon_proc_drop(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
/*
* It's possible some references were acquired but will be
Index: sys/dev/hwpmc/hwpmc_mod.c
===================================================================
--- sys/dev/hwpmc/hwpmc_mod.c
+++ sys/dev/hwpmc/hwpmc_mod.c
@@ -1203,7 +1203,7 @@
* this PMC.
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
top = p;
@@ -1227,7 +1227,7 @@
(void) pmc_detach_process(top, pm);
done:
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return error;
}
@@ -1312,7 +1312,7 @@
* partially attached proc tree.
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
top = p;
@@ -1333,7 +1333,7 @@
}
done:
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
if (LIST_EMPTY(&pm->pm_targets))
pm->pm_flags &= ~PMC_F_ATTACH_DONE;
@@ -2025,7 +2025,7 @@
PROC_UNLOCK(p);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
top = p;
@@ -2044,7 +2044,7 @@
}
}
done:
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
/*
@@ -5364,6 +5364,7 @@
static void
pmc_process_allproc(struct pmc *pm)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct pmc_owner *po;
struct thread *td;
struct proc *p;
@@ -5371,15 +5372,22 @@
po = pm->pm_owner;
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
return;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- pmclog_process_proccreate(po, p, 0 /* sync */);
- PROC_LOCK(p);
- FOREACH_THREAD_IN_PROC(p, td)
- pmclog_process_threadcreate(po, td, 0 /* sync */);
- PROC_UNLOCK(p);
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ pmclog_process_proccreate(po, p, 0 /* sync */);
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td)
+ pmclog_process_threadcreate(po, td, 0 /* sync */);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
pmclog_flush(po, 0);
}
Index: sys/fs/devfs/devfs_vnops.c
===================================================================
--- sys/fs/devfs/devfs_vnops.c
+++ sys/fs/devfs/devfs_vnops.c
@@ -596,7 +596,7 @@
if (vp == p->p_session->s_ttyvp) {
PROC_UNLOCK(p);
oldvp = NULL;
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
if (vp == p->p_session->s_ttyvp) {
SESS_LOCK(p->p_session);
VI_LOCK(vp);
@@ -609,7 +609,7 @@
VI_UNLOCK(vp);
SESS_UNLOCK(p->p_session);
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (oldvp != NULL)
vrele(oldvp);
} else
@@ -813,9 +813,9 @@
if (error == 0 && com == TIOCSCTTY) {
/* Do nothing if reassigning same control tty */
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (td->td_proc->p_session->s_ttyvp == vp) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (0);
}
@@ -826,7 +826,7 @@
td->td_proc->p_session->s_ttydp = cdev2priv(dev);
SESS_UNLOCK(td->td_proc->p_session);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/* Get rid of reference to old control tty */
if (vpold)
Index: sys/fs/nfs/nfsport.h
===================================================================
--- sys/fs/nfs/nfsport.h
+++ sys/fs/nfs/nfsport.h
@@ -692,8 +692,8 @@
#define NFSUNLOCKMNT(m) mtx_unlock(&((m)->nm_mtx))
#define NFSLOCKREQUEST(r) mtx_lock(&((r)->r_mtx))
#define NFSUNLOCKREQUEST(r) mtx_unlock(&((r)->r_mtx))
-#define NFSPROCLISTLOCK() sx_slock(&allproc_lock)
-#define NFSPROCLISTUNLOCK() sx_sunlock(&allproc_lock)
+#define NFSPROCLISTLOCK() sx_slock(&V_allproc_lock)
+#define NFSPROCLISTUNLOCK() sx_sunlock(&V_allproc_lock)
#define NFSLOCKSOCKREQ(r) mtx_lock(&((r)->nr_mtx))
#define NFSUNLOCKSOCKREQ(r) mtx_unlock(&((r)->nr_mtx))
#define NFSLOCKDS(d) mtx_lock(&((d)->nfsclds_mtx))
Index: sys/fs/pseudofs/pseudofs_vnops.c
===================================================================
--- sys/fs/pseudofs/pseudofs_vnops.c
+++ sys/fs/pseudofs/pseudofs_vnops.c
@@ -705,7 +705,7 @@
{
int visible;
- sx_assert(&allproc_lock, SX_SLOCKED);
+ sx_assert(&V_allproc_lock, SX_SLOCKED);
pfs_assert_owned(pd);
again:
if (*pn == NULL) {
@@ -716,9 +716,14 @@
*pn = (*pn)->pn_next;
}
if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) {
+ /*
+ * Operate on current vps instance only.
+ * We must not iterate over all vps as duplicate process space
+ * would not work at all and leak a lot of information.
+ */
/* next process */
if (*p == NULL)
- *p = LIST_FIRST(&allproc);
+ *p = LIST_FIRST(&V_allproc);
else
*p = LIST_NEXT(*p, p_list);
/* out of processes: next node */
@@ -791,12 +796,12 @@
if (resid == 0)
PFS_RETURN (0);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
pfs_lock(pd);
/* check if the directory is visible to the caller */
if (!pfs_visible(curthread, pd, pid, true, &proc)) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
pfs_unlock(pd);
PFS_RETURN (ENOENT);
}
@@ -810,7 +815,7 @@
if (proc != NULL)
PROC_UNLOCK(proc);
pfs_unlock(pd);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
PFS_RETURN (0);
}
}
@@ -860,7 +865,7 @@
if (proc != NULL)
PROC_UNLOCK(proc);
pfs_unlock(pd);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
i = 0;
STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) {
if (error == 0)
Index: sys/i386/i386/pmap.c
===================================================================
--- sys/i386/i386/pmap.c
+++ sys/i386/i386/pmap.c
@@ -5799,7 +5799,7 @@
int npte = 0;
int index;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid != pid)
continue;
@@ -5822,7 +5822,7 @@
index = 0;
printf("\n");
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (npte);
}
pte = pmap_pte(pmap, va);
@@ -5847,7 +5847,7 @@
}
}
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (npte);
}
#endif
Index: sys/i386/ibcs2/ibcs2_sysvec.c
===================================================================
--- sys/i386/ibcs2/ibcs2_sysvec.c
+++ sys/i386/ibcs2/ibcs2_sysvec.c
@@ -109,6 +109,7 @@
static int
ibcs2_modevent(module_t mod, int type, void *unused)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p = NULL;
int rval = 0;
@@ -117,14 +118,20 @@
break;
case MOD_UNLOAD:
/* if this was an ELF module we'd use elf_brand_inuse()... */
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_sysent == &ibcs2_svr3_sysvec) {
- rval = EBUSY;
- break;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_sysent == &ibcs2_svr3_sysvec) {
+ rval = EBUSY;
+ break;
+ }
}
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
break;
default:
rval = EOPNOTSUPP;
Index: sys/kern/imgact_elf.c
===================================================================
--- sys/kern/imgact_elf.c
+++ sys/kern/imgact_elf.c
@@ -238,17 +238,24 @@
int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
int rval = FALSE;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_sysent == entry->sysvec) {
- rval = TRUE;
- break;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_sysent == entry->sysvec) {
+ rval = TRUE;
+ break;
+ }
}
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
return (rval);
}
@@ -2106,10 +2113,10 @@
KASSERT(*sizep == size, ("invalid size"));
structsize = sizeof(elf_kinfo_proc_t);
sbuf_bcat(sb, &structsize, sizeof(structsize));
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
kern_proc_out(p, sb, ELF_KERN_PROC_MASK);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
*sizep = size;
}
Index: sys/kern/init_main.c
===================================================================
--- sys/kern/init_main.c
+++ sys/kern/init_main.c
@@ -56,6 +56,7 @@
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
+#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
@@ -79,6 +80,7 @@
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/cpuset.h>
+#include <sys/vps.h>
#include <machine/cpu.h>
@@ -103,7 +105,18 @@
struct proc proc0;
struct thread0_storage thread0_st __aligned(32);
struct vmspace vmspace0;
-struct proc *initproc;
+VPS_DEFINE(struct proc *, initproc);
+
+VPS_DEFINE(struct proc *, vproc0);
+#ifdef VIMAGE
+/*
+ * Initialize to -2; after kproc_create() our thread will still be
+ * forked from thread0 and in the wrong vps. Once that is fixed it will
+ * see the local copy and not the DEFAULT_VPS one. Make sure we have
+ * a value that we can spin on until this happens.
+ */
+VPS_DEFINE(int, vpsdying) = -2;
+#endif
#ifndef BOOTHOWTO
#define BOOTHOWTO 0
@@ -461,9 +474,8 @@
p->p_osrel = osreldate;
/*
- * Initialize thread and process structures.
+ * Initialize thread structures.
*/
- procinit(); /* set up proc zone */
threadinit(); /* set up UMA zones */
/*
@@ -475,7 +487,8 @@
/*
* Create process 0 (the swapper).
*/
- LIST_INSERT_HEAD(&allproc, p, p_list);
+ V_vproc0 = p;
+ LIST_INSERT_HEAD(&V_allproc, p, p_list);
LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
p->p_pgrp = &pgrp0;
@@ -511,6 +524,9 @@
td->td_cpuset = cpuset_thread0();
td->td_domain.dr_policy = td->td_cpuset->cs_domain;
prison0_init();
+#ifdef VIMAGE
+ td->td_vps = vps0;
+#endif
p->p_peers = 0;
p->p_leader = p;
p->p_reaper = p;
@@ -549,7 +565,7 @@
p->p_sigacts = sigacts_alloc();
/* Initialize signal state for process 0. */
- siginit(&proc0);
+ siginit(V_vproc0);
/* Create the file descriptor table. */
p->p_fd = fdinit(NULL, false);
@@ -614,7 +630,184 @@
racct_add_force(p, RACCT_NPROC, 1);
PROC_UNLOCK(p);
}
-SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
+SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_init, NULL);
+
+#ifdef VIMAGE
+static void
+vps_swapper(void *dummy __unused)
+{
+
+ /*
+ * Make sure the surgical changes to V_vproc0 are done before
+ * entering the long-lasting loop. Otherwise we may start
+ * acquiring locks and accessing variables based on the wrong
+ * credential leading to, e.g., panics when trying to unlock a
+ * lock from a different context which may not be locked.
+ * When entering the function our credentials might still point
+ * to the DEFAULT_VPS; see comment for V_vpsdying declaration above.
+ */
+ while (V_vpsdying < 0)
+ pause("wswvps", hz/2);
+
+ /*
+ * Now hand over this thread to swapper.
+ */
+ swapper();
+
+ if (V_vpsdying < 1)
+ panic("%s: swapper curtd %p ended but V_vpsdying %d\n",
+ __func__, curthread, V_vpsdying);
+
+ kproc_exit(0);
+}
+
+static void
+proc0_init_vps(void *dummy __unused)
+{
+ struct ucred *newcred, *savecred;
+ struct thread *td;
+ struct prison *pr;
+ struct uidinfo tmpuinfo;
+ struct loginclass tmplc = {
+ .lc_name = "",
+ };
+ int error;
+
+ /* vps0 is handled normally in p0init. */
+ if (IS_DEFAULT_VPS(curvps))
+ return;
+
+ KASSERT((curvps->vps_pr != NULL && curvps != vps0),
+ ("%s: curvps %p has vps_pr %p or is vps0 %p\n",
+ __func__, curvps, curvps->vps_pr, vps0));
+ KASSERT((curvps == curvps->vps_pr->pr_vps),
+ ("%s: curvps %p != curvps->vps_pr %p ->pr_vps %p\n",
+ __func__, curvps, curvps->vps_pr, curvps->vps_pr->pr_vps));
+
+ /*
+	 * Initialize the non-default VPS copy to < 0 so vps_swapper()
+ * will spin once the credential is changed before all other surgery
+ * has happened.
+ */
+ V_vpsdying = -1;
+
+ /*
+ * Default is nprocs = 1 for vps0; need to set it to 0 here as our
+ * "proc0" and with that initproc are forked and not manually constructed.
+ */
+ V_nprocs = 0;
+
+ /*
+ * Set lastpid to -1 so that our swapper gets 0.
+ */
+ V_lastpid = -1;
+
+ error = kproc_create(vps_swapper, NULL, &V_vproc0, 0, 0, "vps%u",
+ curvps->vps_pr->pr_id);
+ if (error)
+ panic("%s: cannot create vps %p swapper: %d\n",
+ __func__, curvps, error);
+
+ /* Create credentials. Copied from proc0. Just using vps_pr. */
+ newcred = crget();
+ newcred->cr_ngroups = 1; /* group 0 */
+ /* A hack to prevent uifind from tripping over NULL pointers. */
+ savecred = curthread->td_ucred;
+ curthread->td_ucred = newcred;
+ tmpuinfo.ui_uid = 1;
+ newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo;
+ newcred->cr_uidinfo = uifind(0);
+ newcred->cr_ruidinfo = uifind(0);
+ newcred->cr_loginclass = &tmplc;
+ newcred->cr_loginclass = loginclass_find("default");
+ /* End hack. creds get properly set later with thread_cow_get_proc */
+ curthread->td_ucred = savecred;
+ PROC_LOCK(V_vproc0);
+ newcred->cr_prison = curvps->vps_pr;
+ prison_hold(newcred->cr_prison);
+ /* The kernel process was accounted to thread0's prison. */
+ prison_proc_hold(newcred->cr_prison);
+ prison_proc_free(savecred->cr_prison);
+ V_vproc0->p_treeflag |= P_TREE_REAPER;
+ savecred = proc_set_cred(V_vproc0, newcred);
+ PROC_UNLOCK(V_vproc0);
+#ifdef AUDIT
+ audit_cred_kproc0(newcred);
+#endif
+#ifdef MAC
+ mac_cred_create_swapper(newcred);
+#endif
+ crfree(savecred);
+
+ PROC_LOCK(V_vproc0);
+ td = FIRST_THREAD_IN_PROC(V_vproc0);
+ thread_cow_get_proc(td, V_vproc0);
+ PROC_UNLOCK(V_vproc0);
+ KASSERT(curvps->vps_pr ==
+ FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred->cr_prison,
+ ("%s:%d: curvps %p vps_pr %p != FTIP(V_vproc0 %p)->td_ucred %p "
+ "cr_prison %p\n", __func__, __LINE__,
+ curvps, curvps->vps_pr, V_vproc0,
+ FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred,
+ FIRST_THREAD_IN_PROC(V_vproc0)->td_ucred->cr_prison));
+ KASSERT(curvps == TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)),
+ ("%s:%d: curvps %p != TD_TO_VPS(..(V_vproc0 %p)) %p\n",
+ __func__, __LINE__,
+ curvps, V_vproc0, TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0))));
+
+ /* Chroot it. */
+ td = FIRST_THREAD_IN_PROC(V_vproc0);
+ pr = curvps->vps_pr;
+ vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = change_dir(pr->pr_root, td)) != 0) {
+ printf("%s: td %p change_dir %p failed: %d\n",
+ __func__, td, pr->pr_root, error);
+ goto err;
+ }
+#ifdef MAC
+ if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) {
+ printf("%s: td %p mac_vnode_check_chroot %p failed: %d\n",
+ __func__, td, pr->pr_root, error);
+ goto err;
+ }
+#endif
+ VOP_UNLOCK(pr->pr_root, 0);
+ if ((error = pwd_chroot(td, pr->pr_root))) {
+ printf("%s: td %p pwd_chroot %p failed: %d\n",
+ __func__, td, pr->pr_root, error);
+ goto err;
+ }
+
+ V_vpsdying = 0;
+ return;
+
+err:
+ /* XXX could panic or singal the jail to abort; cannot really stop. */
+ return;
+}
+VPS_SYSINIT(p0init_vps, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_init_vps, NULL);
+
+static void
+proc0_uninit_vps(void *dummy __unused)
+{
+
+ if (IS_DEFAULT_VPS(curvps))
+ return;
+
+ /*
+ * XXX ideally we want to get that state from elsewhere;
+ * neither prison, not vps state, .. lends itself though.
+ */
+ V_vpsdying = 1;
+ wakeup(V_vproc0);
+
+ /* Operate on current vps instance only. */
+ while (V_vproc0 != NULL ||
+ !LIST_EMPTY(&V_zombproc) || !LIST_EMPTY(&V_allproc))
+ pause("p0uvps", hz/2);
+}
+VPS_SYSUNINIT(p0uninit_vps, SI_SUB_INTRINSIC, SI_ORDER_THIRD, proc0_uninit_vps, NULL);
+#endif
/* ARGSUSED*/
static void
@@ -628,8 +821,9 @@
/*
* Now we can look at the time, having had a chance to verify the
* time from the filesystem. Pretend that proc0 started now.
+ * Operate on vps0 instance only.
*/
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
@@ -649,7 +843,7 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
PCPU_SET(switchtime, cpu_ticks());
PCPU_SET(switchticks, ticks);
@@ -729,7 +923,8 @@
td = curthread;
p = td->td_proc;
- vfs_mountroot();
+ if (IS_DEFAULT_VPS(curvps))
+ vfs_mountroot();
/* Wipe GELI passphrase from the environment. */
kern_unsetenv("kern.geom.eli.passphrase");
@@ -753,8 +948,8 @@
while ((path = strsep(&tmp_init_path, ":")) != NULL) {
pathlen = strlen(path) + 1;
if (bootverbose)
- printf("start_init: trying %s\n", path);
-
+ printf("%s: trying %s\n", __func__, path);
+
/*
* Move out the boot flag argument.
*/
@@ -839,38 +1034,60 @@
struct thread *td;
int error;
+ KASSERT(curvps == FIRST_THREAD_IN_PROC(V_vproc0)->td_vps,
+ ("%s: curvps %p != V_vproc0 %p first td %p td_vps %p\n",
+ __func__, curvps, V_vproc0, FIRST_THREAD_IN_PROC(V_vproc0),
+ FIRST_THREAD_IN_PROC(V_vproc0)->td_vps));
+ KASSERT(curvps == TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0)),
+ ("%s: curvps %p != TD_TO_VPS(..(V_vproc0 %p)) %p\n",
+ __func__, curvps, V_vproc0,
+ TD_TO_VPS(FIRST_THREAD_IN_PROC(V_vproc0))));
+
bzero(&fr, sizeof(fr));
fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
- fr.fr_procp = &initproc;
- error = fork1(&thread0, &fr);
+ fr.fr_procp = &V_initproc;
+ td = FIRST_THREAD_IN_PROC(V_vproc0);
+ error = fork1(td, &fr);
if (error)
panic("cannot fork init: %d\n", error);
- KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
+ KASSERT(V_initproc->p_pid == 1, ("%s: initproc->p_pid(%d) != 1",
+ __func__, V_initproc->p_pid));
+ KASSERT(curvps == FIRST_THREAD_IN_PROC(V_initproc)->td_vps,
+ ("%s: curvps %p != V_initproc %p first td %p td_vps %p\n",
+ __func__, curvps, V_initproc, FIRST_THREAD_IN_PROC(V_initproc),
+ FIRST_THREAD_IN_PROC(V_initproc)->td_vps));
+
/* divorce init's credentials from the kernel's */
newcred = crget();
- sx_xlock(&proctree_lock);
- PROC_LOCK(initproc);
- initproc->p_flag |= P_SYSTEM | P_INMEM;
- initproc->p_treeflag |= P_TREE_REAPER;
- oldcred = initproc->p_ucred;
+ sx_xlock(&V_proctree_lock);
+ PROC_LOCK(V_initproc);
+ V_initproc->p_flag |= P_SYSTEM | P_INMEM;
+ V_initproc->p_treeflag |= P_TREE_REAPER;
+ oldcred = V_initproc->p_ucred;
crcopy(newcred, oldcred);
+#ifdef VIMAGE
+ /* Swap to the correct prison. */
+ /* XXX is this really needed or was this related to a V_vproc0 bug? */
+ prison_free(newcred->cr_prison);
+ newcred->cr_prison = curvps->vps_pr;
+ prison_hold(newcred->cr_prison);
+#endif
#ifdef MAC
mac_cred_create_init(newcred);
#endif
#ifdef AUDIT
audit_cred_proc1(newcred);
#endif
- proc_set_cred(initproc, newcred);
- td = FIRST_THREAD_IN_PROC(initproc);
- crfree(td->td_ucred);
- td->td_ucred = crhold(initproc->p_ucred);
- PROC_UNLOCK(initproc);
- sx_xunlock(&proctree_lock);
+ /* This will also update cowgen. */
+ proc_set_cred(V_initproc, newcred);
+ PROC_UNLOCK(V_initproc);
+ sx_xunlock(&V_proctree_lock);
crfree(oldcred);
- cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc),
+
+ cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(V_initproc),
start_init, NULL);
}
-SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
+VPS_SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
/*
* Make it runnable now.
@@ -880,10 +1097,49 @@
{
struct thread *td;
- td = FIRST_THREAD_IN_PROC(initproc);
+ td = FIRST_THREAD_IN_PROC(V_initproc);
thread_lock(td);
TD_SET_CAN_RUN(td);
sched_add(td, SRQ_BORING);
thread_unlock(td);
}
-SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);
+VPS_SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);
+
+#ifdef VIMAGE
+static void
+reapinit(void *ident __unused)
+{
+ struct proc *p, *p2;
+
+ while (V_nprocs > 2) {
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_pid <= 1)
+ continue;
+ PROC_LOCK(p);
+ kern_psignal(p, SIGKILL);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ pause("reapin1t", hz/2);
+ }
+
+ /* Operate on current vps instance only. */
+ sx_xlock(&V_proctree_lock);
+ LIST_FOREACH_SAFE(p, &V_zombproc, p_list, p2) {
+ PROC_LOCK(p);
+ proc_reap(FIRST_THREAD_IN_PROC(V_vproc0), p, NULL, 0);
+ sx_xlock(&V_proctree_lock);
+ }
+ sx_xunlock(&V_proctree_lock);
+
+ while (V_nprocs > 1)
+ pause("reapinit", hz/2);
+
+ /* Only our "swapper" left. */
+ KASSERT(V_nprocs == 1, ("%s: vps %p V_nprocs %d != 1",
+ __func__, curvps, V_nprocs));
+}
+/* Run very first. */
+VPS_SYSUNINIT(reapinit, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, reapinit, NULL);
+#endif
Index: sys/kern/kern_acct.c
===================================================================
--- sys/kern/kern_acct.c
+++ sys/kern/kern_acct.c
@@ -378,7 +378,7 @@
* Get process accounting information.
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
/* (1) The terminal from which the process was started */
@@ -386,7 +386,7 @@
acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp);
else
acct.ac_tty = NODEV;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/* (2) The name of the command that ran */
bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
Index: sys/kern/kern_clock.c
===================================================================
--- sys/kern/kern_clock.c
+++ sys/kern/kern_clock.c
@@ -184,12 +184,78 @@
static int blktime_threshold = 900;
static int sleepfreq = 3;
+static __inline void
+_deadlres_td_on_lock(struct proc *p, struct thread *td, int blkticks)
+{
+ int tticks;
+
+ /*
+ * The thread should be blocked on a turnstile, simply check
+ * if the turnstile channel is in good state.
+ */
+ MPASS(td->td_blocked != NULL);
+
+ tticks = ticks - td->td_blktick;
+ thread_unlock(td);
+ if (tticks > blkticks) {
+
+ /*
+ * Accordingly with provided thresholds, this thread is stuck
+ * for too long on a turnstile.
+ */
+ PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ panic("%s: possible deadlock detected for %p, "
+ "blocked for %d ticks\n", __func__, td, tticks);
+ }
+}
+
+static __inline void
+_deadlres_td_sleep_q(struct proc *p, struct thread *td, int slpticks)
+{
+ void *wchan;
+ int i, slptype, tryl, tticks;
+
+ /*
+ * Check if the thread is sleeping on a lock, otherwise skip the check.
+ * Drop the thread lock in order to avoid a LOR with the sleepqueue
+ * spinlock.
+ */
+ wchan = td->td_wchan;
+ tticks = ticks - td->td_slptick;
+ thread_unlock(td);
+ slptype = sleepq_type(wchan);
+ if ((slptype == SLEEPQ_SX || slptype == SLEEPQ_LK) &&
+ tticks > slpticks) {
+
+ /*
+ * Accordingly with provided thresholds, this thread is stuck
+ * for too long on a sleepqueue.
+ * However, being on a sleepqueue, we might still check for the
+ * blessed list.
+ */
+ tryl = 0;
+ for (i = 0; blessed[i] != NULL; i++) {
+ if (!strcmp(blessed[i], td->td_wmesg)) {
+ tryl = 1;
+ break;
+ }
+ }
+ if (tryl != 0)
+ return;
+ PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ panic("%s: possible deadlock detected for %p, "
+ "blocked for %d ticks\n", __func__, td, tticks);
+ }
+}
+
static void
deadlres_td_on_lock(struct proc *p, struct thread *td, int blkticks)
{
int tticks;
- sx_assert(&allproc_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
@@ -214,7 +280,7 @@
void *wchan;
int i, slptype, tticks;
- sx_assert(&allproc_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
@@ -246,6 +312,7 @@
static void
deadlkres(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct thread *td;
int blkticks, slpticks, tryl;
@@ -255,41 +322,49 @@
blkticks = blktime_threshold * hz;
slpticks = slptime_threshold * hz;
- /*
- * Avoid to sleep on the sx_lock in order to avoid a
- * possible priority inversion problem leading to
- * starvation.
- * If the lock can't be held after 100 tries, panic.
- */
- if (!sx_try_slock(&allproc_lock)) {
- if (tryl > 100)
- panic("%s: possible deadlock detected "
- "on allproc_lock\n", __func__);
- tryl++;
- pause("allproc", sleepfreq * hz);
- continue;
- }
- tryl = 0;
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW) {
- PROC_UNLOCK(p);
- continue;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+again:
+ CURVPS_SET_QUIET(vps_iter);
+ /*
+ * Avoid to sleep on the sx_lock in order to avoid a
+ * possible priority inversion problem leading to
+ * starvation.
+ * If the lock can't be held after 100 tries, panic.
+ */
+ if (!sx_try_slock(&V_allproc_lock)) {
+ if (tryl > 100)
+ panic("%s: possible deadlock detected "
+ "on allproc_lock\n", __func__);
+ tryl++;
+ CURVPS_RESTORE();
+ pause("allproc", sleepfreq * hz);
+ goto again;
}
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (TD_ON_LOCK(td))
- deadlres_td_on_lock(p, td,
- blkticks);
- else if (TD_IS_SLEEPING(td) &&
- TD_ON_SLEEPQ(td))
- deadlres_td_sleep_q(p, td,
- slpticks);
- thread_unlock(td);
+ tryl = 0;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_ON_LOCK(td))
+ deadlres_td_on_lock(p, td,
+ blkticks);
+ else if (TD_IS_SLEEPING(td) &&
+ TD_ON_SLEEPQ(td))
+ deadlres_td_sleep_q(p, td,
+ slpticks);
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
}
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
/* Sleep for sleepfreq seconds. */
pause("-", sleepfreq * hz);
Index: sys/kern/kern_cpuset.c
===================================================================
--- sys/kern/kern_cpuset.c
+++ sys/kern/kern_cpuset.c
@@ -510,24 +510,32 @@
static void
domainset_notify(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct thread *td;
struct proc *p;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW) {
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ td->td_domain.dr_policy =
+ td->td_cpuset->cs_domain;
+ thread_unlock(td);
+ }
PROC_UNLOCK(p);
- continue;
}
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- td->td_domain.dr_policy = td->td_cpuset->cs_domain;
- thread_unlock(td);
- }
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
kernel_object->domain.dr_policy = cpuset_kernel->cs_domain;
}
Index: sys/kern/kern_descrip.c
===================================================================
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -1063,7 +1063,7 @@
sigio->sio_ucred = crhold(curthread->td_ucred);
sigio->sio_myref = sigiop;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pgid > 0) {
proc = pfind(pgid);
if (proc == NULL) {
@@ -1131,14 +1131,14 @@
sigio->sio_pgrp = pgrp;
PGRP_UNLOCK(pgrp);
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
SIGIO_LOCK();
*sigiop = sigio;
SIGIO_UNLOCK();
return (0);
fail:
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO);
return (ret);
@@ -3190,6 +3190,7 @@
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct filedesc *fdp;
struct prison *pr;
struct proc *p;
@@ -3198,33 +3199,40 @@
if (vrefcnt(olddp) == 1)
return;
nrele = 0;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- fdp = fdhold(p);
- PROC_UNLOCK(p);
- if (fdp == NULL)
- continue;
- FILEDESC_XLOCK(fdp);
- if (fdp->fd_cdir == olddp) {
- vrefact(newdp);
- fdp->fd_cdir = newdp;
- nrele++;
- }
- if (fdp->fd_rdir == olddp) {
- vrefact(newdp);
- fdp->fd_rdir = newdp;
- nrele++;
- }
- if (fdp->fd_jdir == olddp) {
- vrefact(newdp);
- fdp->fd_jdir = newdp;
- nrele++;
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ continue;
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_cdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_cdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_rdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_rdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_jdir == olddp) {
+ vrefact(newdp);
+ fdp->fd_jdir = newdp;
+ nrele++;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ fddrop(fdp);
}
- FILEDESC_XUNLOCK(fdp);
- fddrop(fdp);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
if (rootvnode == olddp) {
vrefact(newdp);
rootvnode = newdp;
@@ -3307,6 +3315,7 @@
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct xfile xf;
struct filedesc *fdp;
struct file *fp;
@@ -3318,68 +3327,82 @@
return (error);
if (req->oldptr == NULL) {
n = 0;
- sx_slock(&allproc_lock);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ continue;
+ /* overestimates sparse tables. */
+ if (fdp->fd_lastfile > 0)
+ n += fdp->fd_lastfile;
+ fddrop(fdp);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
+ }
+ VPS_LIST_RUNLOCK();
+ return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
+ }
+ error = 0;
+ bzero(&xf, sizeof(xf));
+ xf.xf_size = sizeof(xf);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
+ if (p_cansee(req->td, p) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ xf.xf_pid = p->p_pid;
+ xf.xf_uid = p->p_ucred->cr_uid;
fdp = fdhold(p);
PROC_UNLOCK(p);
if (fdp == NULL)
continue;
- /* overestimates sparse tables. */
- if (fdp->fd_lastfile > 0)
- n += fdp->fd_lastfile;
+ FILEDESC_SLOCK(fdp);
+ for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ xf.xf_fd = n;
+ xf.xf_file = (kvaddr_t)(uintptr_t)fp;
+ xf.xf_data = (kvaddr_t)(uintptr_t)fp->f_data;
+ xf.xf_vnode = (kvaddr_t)(uintptr_t)fp->f_vnode;
+ xf.xf_type = (kvaddr_t)(uintptr_t)fp->f_type;
+ xf.xf_count = fp->f_count;
+ xf.xf_msgcount = 0;
+ xf.xf_offset = foffset_get(fp);
+ xf.xf_flag = fp->f_flag;
+ error = SYSCTL_OUT(req, &xf, sizeof(xf));
+ if (error)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
fddrop(fdp);
- }
- sx_sunlock(&allproc_lock);
- return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
- }
- error = 0;
- bzero(&xf, sizeof(xf));
- xf.xf_size = sizeof(xf);
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW) {
- PROC_UNLOCK(p);
- continue;
- }
- if (p_cansee(req->td, p) != 0) {
- PROC_UNLOCK(p);
- continue;
- }
- xf.xf_pid = p->p_pid;
- xf.xf_uid = p->p_ucred->cr_uid;
- fdp = fdhold(p);
- PROC_UNLOCK(p);
- if (fdp == NULL)
- continue;
- FILEDESC_SLOCK(fdp);
- for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
- if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
- continue;
- xf.xf_fd = n;
- xf.xf_file = (kvaddr_t)(uintptr_t)fp;
- xf.xf_data = (kvaddr_t)(uintptr_t)fp->f_data;
- xf.xf_vnode = (kvaddr_t)(uintptr_t)fp->f_vnode;
- xf.xf_type = (kvaddr_t)(uintptr_t)fp->f_type;
- xf.xf_count = fp->f_count;
- xf.xf_msgcount = 0;
- xf.xf_offset = foffset_get(fp);
- xf.xf_flag = fp->f_flag;
- error = SYSCTL_OUT(req, &xf, sizeof(xf));
if (error)
break;
}
- FILEDESC_SUNLOCK(fdp);
- fddrop(fdp);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
if (error)
break;
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
return (error);
}
@@ -3930,21 +3953,28 @@
static struct proc *
file_to_first_proc(struct file *fp)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct filedesc *fdp;
struct proc *p;
int n;
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_state == PRS_NEW)
- continue;
- fdp = p->p_fd;
- if (fdp == NULL)
- continue;
- for (n = 0; n <= fdp->fd_lastfile; n++) {
- if (fp == fdp->fd_ofiles[n].fde_file)
- return (p);
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ fdp = p->p_fd;
+ if (fdp == NULL)
+ continue;
+ for (n = 0; n <= fdp->fd_lastfile; n++) {
+ if (fp == fdp->fd_ofiles[n].fde_file)
+ return (p);
+ }
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
return (NULL);
}
@@ -3982,6 +4012,7 @@
DB_SHOW_COMMAND(files, db_show_files)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct filedesc *fdp;
struct file *fp;
struct proc *p;
@@ -3989,18 +4020,24 @@
int n;
header = 1;
- FOREACH_PROC_IN_SYSTEM(p) {
- if (p->p_state == PRS_NEW)
- continue;
- if ((fdp = p->p_fd) == NULL)
- continue;
- for (n = 0; n <= fdp->fd_lastfile; ++n) {
- if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ if ((fdp = p->p_fd) == NULL)
continue;
- db_print_file(fp, header);
- header = 0;
+ for (n = 0; n <= fdp->fd_lastfile; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ db_print_file(fp, header);
+ header = 0;
+ }
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
}
#endif
Index: sys/kern/kern_exit.c
===================================================================
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -96,6 +96,11 @@
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE1(proc, , , exit, "int");
+#ifdef VIMAGE
+VPS_DECLARE(int, vrebooting); /* kern_reboot() has been called. */
+#define V_vrebooting VPS(vrebooting)
+#endif
+
/* Hook for NFS teardown procedure. */
void (*nlminfo_release_p)(struct proc *p);
@@ -106,13 +111,13 @@
{
struct proc *p, *parent;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
if ((child->p_treeflag & P_TREE_ORPHANED) == 0) {
if (child->p_oppid == 0 ||
child->p_pptr->p_pid == child->p_oppid)
parent = child->p_pptr;
else
- parent = initproc;
+ parent = V_initproc;
return (parent);
}
for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) {
@@ -132,10 +137,16 @@
{
struct proc *p1, *p2, *ptmp;
- sx_assert(&proctree_lock, SX_LOCKED);
- KASSERT(p != initproc, ("reaper_abandon_children for initproc"));
- if ((p->p_treeflag & P_TREE_REAPER) == 0)
+ sx_assert(&V_proctree_lock, SX_LOCKED);
+ /* init inside a vps may die on prison_remove. */
+ KASSERT(!IS_DEFAULT_VPS(curvps) || p != V_initproc,
+ ("%s: for initproc %p", __func__, p));
+ if ((p->p_treeflag & P_TREE_REAPER) == 0) {
+ KASSERT((p != V_initproc && p->p_pid != 1 && p->p_pid != 0),
+ ("%s:%d curvps %p p %p pid %d p_treeflag %#x",
+ __func__, __LINE__, curvps, p, p->p_pid, p->p_treeflag));
return;
+ }
p1 = p->p_reaper;
LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) {
LIST_REMOVE(p2, p_reapsibling);
@@ -148,7 +159,8 @@
PROC_UNLOCK(p2);
}
}
- KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty"));
+ KASSERT(LIST_EMPTY(&p->p_reaplist),
+ ("%s: p %p p_reaplist not empty", __func__, p));
p->p_treeflag &= ~P_TREE_REAPER;
}
@@ -157,7 +169,7 @@
{
struct proc *p1;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
if ((p->p_treeflag & P_TREE_ORPHANED) == 0)
return;
if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) {
@@ -203,9 +215,19 @@
* work around an unsolved stack overflow seen very late during
* shutdown on sparc64 when the gmirror worker process exists.
*/
- if (p == initproc && rebooting == 0) {
+ if (p == V_initproc && (rebooting == 0
+#ifdef VIMAGE
+ || V_vrebooting
+#endif
+ )) {
printf("init died (signal %d, exit %d)\n", signo, rval);
- panic("Going nowhere without my init!");
+#ifdef VIMAGE
+ if (!IS_DEFAULT_VPS(TD_TO_VPS(td)))
+ /* XXX-BZ make this jail go away. */ ;
+ else
+#endif
+ panic("%s: Going nowhere without my init! td %p",
+ __func__, td);
}
/*
@@ -313,7 +335,7 @@
/* Are we a task leader with peers? */
if (p->p_peers != NULL && p == p->p_leader) {
- mtx_lock(&ppeers_lock);
+ mtx_lock(&V_ppeers_lock);
q = p->p_peers;
while (q != NULL) {
PROC_LOCK(q);
@@ -322,8 +344,8 @@
q = q->p_peers;
}
while (p->p_peers != NULL)
- msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
- mtx_unlock(&ppeers_lock);
+ msleep(p, &V_ppeers_lock, PWAIT, "exit1", 0);
+ mtx_unlock(&V_ppeers_lock);
}
/*
@@ -388,7 +410,7 @@
* Remove ourself from our leader's peer list and wake our leader.
*/
if (p->p_leader->p_peers != NULL) {
- mtx_lock(&ppeers_lock);
+ mtx_lock(&V_ppeers_lock);
if (p->p_leader->p_peers != NULL) {
q = p->p_leader;
while (q->p_peers != p)
@@ -396,7 +418,7 @@
q->p_peers = p->p_peers;
wakeup(p->p_leader);
}
- mtx_unlock(&ppeers_lock);
+ mtx_unlock(&V_ppeers_lock);
}
vmspace_exit(td);
@@ -432,16 +454,17 @@
WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
/*
* Remove proc from allproc queue and pidhash chain.
* Place onto zombproc. Unlink from parent's child list.
*/
- sx_xlock(&allproc_lock);
+ /* Operate on current vps instance only. */
+ sx_xlock(&V_allproc_lock);
LIST_REMOVE(p, p_list);
- LIST_INSERT_HEAD(&zombproc, p, p_list);
+ LIST_INSERT_HEAD(&V_zombproc, p, p_list);
LIST_REMOVE(p, p_hash);
- sx_xunlock(&allproc_lock);
+ sx_xunlock(&V_allproc_lock);
/*
* Reparent all children processes:
@@ -602,7 +625,7 @@
} else
mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
- if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) {
+ if (p->p_pptr == p->p_reaper || p->p_pptr == V_initproc) {
signal_parent = 1;
} else if (p->p_sigparent != 0) {
if (p->p_sigparent == SIGCHLD) {
@@ -613,7 +636,7 @@
}
} else
PROC_LOCK(p->p_pptr);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (signal_parent == 1) {
childproc_exited(p);
@@ -827,9 +850,9 @@
{
struct proc *q, *t;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
- KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE"));
+ KASSERT(p->p_state == PRS_ZOMBIE, ("%s: !PRS_ZOMBIE", __func__));
mtx_spin_wait_unlocked(&p->p_slock);
@@ -843,7 +866,7 @@
* release the proc struct just yet.
*/
PROC_UNLOCK(p);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return;
}
@@ -870,7 +893,7 @@
wakeup(t);
cv_broadcast(&p->p_pwait);
PROC_UNLOCK(t);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return;
}
p->p_oppid = 0;
@@ -880,9 +903,9 @@
* Remove other references to this process to ensure we have an
* exclusive reference.
*/
- sx_xlock(&allproc_lock);
+ sx_xlock(&V_allproc_lock);
LIST_REMOVE(p, p_list); /* off zombproc */
- sx_xunlock(&allproc_lock);
+ sx_xunlock(&V_allproc_lock);
LIST_REMOVE(p, p_sibling);
reaper_abandon_children(p, true);
LIST_REMOVE(p, p_reapsibling);
@@ -892,7 +915,7 @@
leavepgrp(p);
if (p->p_procdesc != NULL)
procdesc_reap(p);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
PROC_LOCK(p);
knlist_detach(p->p_klist);
@@ -953,9 +976,9 @@
#endif
KASSERT(FIRST_THREAD_IN_PROC(p),
- ("proc_reap: no residual thread!"));
+ ("%s: no residual thread!", __func__));
uma_zfree(proc_zone, p);
- atomic_add_int(&nprocs, -1);
+ atomic_add_int(&V_nprocs, -1);
}
static int
@@ -965,7 +988,7 @@
{
struct rusage *rup;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
PROC_LOCK(p);
@@ -1156,7 +1179,7 @@
bool cont;
PROC_LOCK_ASSERT(p, MA_OWNED);
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
MPASS(si_code == CLD_TRAPPED || si_code == CLD_STOPPED ||
si_code == CLD_CONTINUED);
@@ -1170,7 +1193,7 @@
sigqueue_take(p->p_ksi);
PROC_UNLOCK(td->td_proc);
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (siginfo != NULL) {
siginfo->si_code = si_code;
siginfo->si_status = cont ? SIGCONT : p->p_xsig;
@@ -1223,7 +1246,7 @@
q->p_flag &= ~P_STATCHILD;
PROC_UNLOCK(q);
}
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
loop_locked:
nfound = 0;
LIST_FOREACH(p, &q->p_children, p_sibling) {
@@ -1307,11 +1330,11 @@
}
}
if (nfound == 0) {
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (ECHILD);
}
if (options & WNOHANG) {
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
td->td_retval[0] = 0;
return (0);
}
@@ -1321,7 +1344,7 @@
PROC_UNLOCK(q);
goto loop_locked;
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
error = msleep(q, &q->p_mtx, PWAIT | PCATCH | PDROP, "wait", 0);
if (error)
return (error);
@@ -1336,7 +1359,7 @@
proc_reparent(struct proc *child, struct proc *parent)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(child, MA_OWNED);
if (child->p_pptr == parent)
return;
Index: sys/kern/kern_fork.c
===================================================================
--- sys/kern/kern_fork.c
+++ sys/kern/kern_fork.c
@@ -74,6 +74,7 @@
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/signalvar.h>
+#include <sys/vps.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
@@ -184,10 +185,10 @@
return (error);
}
-int nprocs = 1; /* process 0 */
-int lastpid = 0;
-SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
- "Last used PID");
+VPS_DEFINE(int, nprocs) = 1; /* process 0 */
+VPS_DEFINE(int, lastpid) = 0;
+SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD|CTLFLAG_VPS,
+ &VPS_NAME(lastpid), 0, "Last used PID");
/*
* Random component to lastpid generation. We mix in a random factor to make
@@ -197,7 +198,8 @@
* modulus that is too big causes a LOT more process table scans and slows
* down fork processing as the pidchecked caching is defeated.
*/
-static int randompid = 0;
+static VPS_DEFINE(int, randompid) = 0;
+#define V_randompid VPS(randompid)
static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
@@ -207,44 +209,46 @@
error = sysctl_wire_old_buffer(req, sizeof(int));
if (error != 0)
return(error);
- sx_xlock(&allproc_lock);
- pid = randompid;
+ sx_xlock(&V_allproc_lock);
+ pid = V_randompid;
error = sysctl_handle_int(oidp, &pid, 0, req);
if (error == 0 && req->newptr != NULL) {
if (pid == 0)
- randompid = 0;
+ V_randompid = 0;
else if (pid == 1)
/* generate a random PID modulus between 100 and 1123 */
- randompid = 100 + arc4random() % 1024;
+ V_randompid = 100 + arc4random() % 1024;
else if (pid < 0 || pid > pid_max - 100)
/* out of range */
- randompid = pid_max - 100;
+ V_randompid = pid_max - 100;
else if (pid < 100)
/* Make it reasonable */
- randompid = 100;
+ V_randompid = 100;
else
- randompid = pid;
+ V_randompid = pid;
}
- sx_xunlock(&allproc_lock);
+ sx_xunlock(&V_allproc_lock);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_kern_randompid, "I", "Random PID modulus. Special values: 0: disable, 1: choose random value");
-static int
+static VPS_DEFINE(int, pidchecked) = 0;
+#define V_pidchecked VPS(pidchecked)
+
+int
fork_findpid(int flags)
{
struct proc *p;
int trypid;
- static int pidchecked = 0;
/*
* Requires allproc_lock in order to iterate over the list
* of processes, and proctree_lock to access p_pgrp.
*/
- sx_assert(&allproc_lock, SX_LOCKED);
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
/*
* Find an unused process ID. We remember a range of unused IDs
@@ -253,13 +257,13 @@
* If RFHIGHPID is set (used during system boot), do not allocate
* low-numbered pids.
*/
- trypid = lastpid + 1;
+ trypid = V_lastpid + 1;
if (flags & RFHIGHPID) {
if (trypid < 10)
trypid = 10;
} else {
- if (randompid)
- trypid += arc4random() % randompid;
+ if (V_randompid)
+ trypid += arc4random() % V_randompid;
}
retry:
/*
@@ -271,12 +275,12 @@
trypid = trypid % pid_max;
if (trypid < 100)
trypid += 100;
- pidchecked = 0;
+ V_pidchecked = 0;
}
- if (trypid >= pidchecked) {
+ if (trypid >= V_pidchecked) {
int doingzomb = 0;
- pidchecked = PID_MAX;
+ V_pidchecked = PID_MAX;
/*
* Scan the active and zombie procs to check whether this pid
* is in use. Remember the lowest pid that's greater
@@ -291,7 +295,8 @@
* reserved pids is limited by process limit times
* two.
*/
- p = LIST_FIRST(&allproc);
+ /* Operate on current vps instance only. */
+ p = LIST_FIRST(&V_allproc);
again:
for (; p != NULL; p = LIST_NEXT(p, p_list)) {
while (p->p_pid == trypid ||
@@ -301,24 +306,25 @@
(p->p_session != NULL &&
p->p_session->s_sid == trypid)))) {
trypid++;
- if (trypid >= pidchecked)
+ if (trypid >= V_pidchecked)
goto retry;
}
- if (p->p_pid > trypid && pidchecked > p->p_pid)
- pidchecked = p->p_pid;
+ if (p->p_pid > trypid && V_pidchecked > p->p_pid)
+ V_pidchecked = p->p_pid;
if (p->p_pgrp != NULL) {
if (p->p_pgrp->pg_id > trypid &&
- pidchecked > p->p_pgrp->pg_id)
- pidchecked = p->p_pgrp->pg_id;
+ V_pidchecked > p->p_pgrp->pg_id)
+ V_pidchecked = p->p_pgrp->pg_id;
if (p->p_session != NULL &&
p->p_session->s_sid > trypid &&
- pidchecked > p->p_session->s_sid)
- pidchecked = p->p_session->s_sid;
+ V_pidchecked > p->p_session->s_sid)
+ V_pidchecked = p->p_session->s_sid;
}
}
if (!doingzomb) {
+ /* Operate on current vps instance only. */
doingzomb = 1;
- p = LIST_FIRST(&zombproc);
+ p = LIST_FIRST(&V_zombproc);
goto again;
}
}
@@ -327,9 +333,9 @@
* RFHIGHPID does not mess with the lastpid counter during boot.
*/
if (flags & RFHIGHPID)
- pidchecked = 0;
+ V_pidchecked = 0;
else
- lastpid = trypid;
+ V_lastpid = trypid;
return (trypid);
}
@@ -394,8 +400,8 @@
struct filedesc_to_leader *fdtol;
struct sigacts *newsigacts;
- sx_assert(&proctree_lock, SX_LOCKED);
- sx_assert(&allproc_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_XLOCKED);
p1 = td->td_proc;
@@ -404,14 +410,14 @@
p2->p_state = PRS_NEW; /* protect against others */
p2->p_pid = trypid;
AUDIT_ARG_PID(p2->p_pid);
- LIST_INSERT_HEAD(&allproc, p2, p_list);
+ LIST_INSERT_HEAD(&V_allproc, p2, p_list);
allproc_gen++;
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
PROC_LOCK(p2);
PROC_LOCK(p1);
- sx_xunlock(&allproc_lock);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_allproc_lock);
+ sx_xunlock(&V_proctree_lock);
bcopy(&p1->p_startcopy, &p2->p_startcopy,
__rangeof(struct proc, p_startcopy, p_endcopy));
@@ -490,6 +496,7 @@
td2->td_lend_user_pri = PRI_MAX;
#ifdef VIMAGE
+ td2->td_vps = TD_TO_VPS(td);
td2->td_vnet = NULL;
td2->td_vnet_lpush = NULL;
#endif
@@ -554,11 +561,11 @@
* Set up linkage for kernel based threading.
*/
if ((fr->fr_flags & RFTHREAD) != 0) {
- mtx_lock(&ppeers_lock);
+ mtx_lock(&V_ppeers_lock);
p2->p_peers = p1->p_peers;
p1->p_peers = p2;
p2->p_leader = p1->p_leader;
- mtx_unlock(&ppeers_lock);
+ mtx_unlock(&V_ppeers_lock);
PROC_LOCK(p1->p_leader);
if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
PROC_UNLOCK(p1->p_leader);
@@ -585,7 +592,7 @@
p2->p_leader = p2;
}
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PGRP_LOCK(p1->p_pgrp);
PROC_LOCK(p2);
PROC_LOCK(p1);
@@ -648,7 +655,7 @@
LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling);
if (p2->p_reaper == p1)
p2->p_reapsubtree = p2->p_pid;
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
/* Inform accounting that we have forked. */
p2->p_acflag = AFORK;
@@ -751,7 +758,7 @@
* if being set atm.
*/
if ((p1->p_ptevents & PTRACE_FORK) != 0) {
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PROC_LOCK(p2);
/*
@@ -777,7 +784,7 @@
proc_reparent(p2, p1->p_pptr);
}
PROC_UNLOCK(p2);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
}
if ((fr->fr_flags & RFSTOPPED) == 0) {
@@ -801,6 +808,11 @@
PROC_UNLOCK(p2);
}
+static VPS_DEFINE(int, curfail);
+#define V_curfail VPS(curfail)
+static VPS_DEFINE(struct timeval, lastfail);
+#define V_lastfail VPS(lastfail)
+
int
fork1(struct thread *td, struct fork_req *fr)
{
@@ -810,8 +822,6 @@
struct file *fp_procdesc;
vm_ooffset_t mem_charged;
int error, nprocs_new, ok;
- static int curfail;
- static struct timeval lastfail;
int flags, pages;
flags = fr->fr_flags;
@@ -881,17 +891,17 @@
* Don't allow a nonprivileged user to use the last ten
* processes; don't let root exceed the limit.
*/
- nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
+ nprocs_new = atomic_fetchadd_int(&V_nprocs, 1) + 1;
if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred,
PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) {
error = EAGAIN;
- sx_xlock(&allproc_lock);
- if (ppsratecheck(&lastfail, &curfail, 1)) {
+ sx_xlock(&V_allproc_lock);
+ if (ppsratecheck(&V_lastfail, &V_curfail, 1)) {
printf("maxproc limit exceeded by uid %u (pid %d); "
"see tuning(7) and login.conf(5)\n",
td->td_ucred->cr_ruid, p1->p_pid);
}
- sx_xunlock(&allproc_lock);
+ sx_xunlock(&V_allproc_lock);
goto fail2;
}
@@ -973,8 +983,8 @@
STAILQ_INIT(&newproc->p_ktr);
/* We have to lock the process tree while we look for a pid. */
- sx_xlock(&proctree_lock);
- sx_xlock(&allproc_lock);
+ sx_xlock(&V_proctree_lock);
+ sx_xlock(&V_allproc_lock);
/*
* Increment the count of procs running with this uid. Don't allow
@@ -995,8 +1005,8 @@
}
error = EAGAIN;
- sx_xunlock(&allproc_lock);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_allproc_lock);
+ sx_xunlock(&V_proctree_lock);
#ifdef MAC
mac_proc_destroy(newproc);
#endif
@@ -1012,7 +1022,7 @@
fdclose(td, fp_procdesc, *fr->fr_pd_fd);
fdrop(fp_procdesc, td);
}
- atomic_add_int(&nprocs, -1);
+ atomic_add_int(&V_nprocs, -1);
pause("fork", hz / 2);
return (error);
}
Index: sys/kern/kern_jail.c
===================================================================
--- sys/kern/kern_jail.c
+++ sys/kern/kern_jail.c
@@ -62,6 +62,10 @@
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
+#include <sys/vps.h>
+#ifdef VIMAGE
+#include <sys/reboot.h>
+#endif
#include <net/if.h>
#include <net/vnet.h>
@@ -107,7 +111,7 @@
.pr_hostuuid = DEFAULT_HOSTUUID,
.pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
#ifdef VIMAGE
- .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
+ .pr_flags = PR_HOST|PR_VNET|PR_VPS|_PR_IP_SADDRSEL,
#else
.pr_flags = PR_HOST|_PR_IP_SADDRSEL,
#endif
@@ -171,6 +175,7 @@
{"host", 0, PR_HOST},
#ifdef VIMAGE
{"vnet", 0, PR_VNET},
+ {"vps", 0, PR_VPS},
#endif
#ifdef INET
{"ip4", PR_IP4_USER, PR_IP4_USER},
@@ -627,6 +632,11 @@
vfs_opterror(opts, "vnet cannot be changed after creation");
goto done_errmsg;
}
+ if ((flags & JAIL_UPDATE) && (ch_flags & PR_VPS)) {
+ error = EINVAL;
+ vfs_opterror(opts, "vps cannot be changed after creation");
+ goto done_errmsg;
+ }
#endif
#ifdef INET
if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
@@ -1801,6 +1811,39 @@
goto done_errmsg;
}
+#ifdef VIMAGE
+ /* Allocate a new vps if specified. */
+ if (pr_flags & PR_VPS) {
+ vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = change_dir(pr->pr_root, td)) != 0)
+ goto c_unlock;
+#ifdef MAC
+ if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
+ goto c_unlock;
+#endif
+c_unlock:
+ VOP_UNLOCK(pr->pr_root, 0);
+ if (error || (error = pwd_chroot(td, pr->pr_root))) {
+ vfs_opterror(opts, "vps chroot failed");
+ if (!created)
+ prison_deref(pr, PD_DEREF);
+ goto done_errmsg;
+ }
+
+ /* We temporarily need a ref as otherwise a prhold will panic. */
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_ref++;
+ pr->pr_uref++;
+ mtx_unlock(&pr->pr_mtx);
+ pr->pr_vps = vps_alloc(pr);
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_ref--;
+ pr->pr_uref--;
+ mtx_unlock(&pr->pr_mtx);
+ } else {
+ pr->pr_vps = ppr->pr_vps;
+ }
+#endif
/* Attach this process to the prison if requested. */
if (flags & JAIL_ATTACH) {
mtx_lock(&pr->pr_mtx);
@@ -2285,7 +2328,28 @@
/*
* Kill all processes unfortunate enough to be attached to this prison.
*/
- sx_slock(&allproc_lock);
+#ifdef VIMAGE
+ if (pr->pr_vps) {
+ /*
+ * Send signal to init and let init do its job.
+ * This should run rc.shutdown and processes should go away.
+ * All but init? We need to catch the tail-end of reboot(2)
+ * and handle appropriately for the non-default vps instances.
+ * vps_destroy() will ensure init and swapper will also go
+ * away and might sleep. If they do not go away, something will
+ * hold refs on cred and prisons.
+ * XXX There are other places which might do that for a long
+ * time as well.
+ */
+ CURVPS_SET(pr->pr_vps);
+ shutdown_nice(RB_HALT|RB_POWEROFF);
+ vps_destroy(pr->pr_vps);
+ CURVPS_RESTORE();
+ } else
+#endif
+ {
+ /* Operate on current vps instance only. */
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state != PRS_NEW && p->p_ucred &&
@@ -2293,7 +2357,8 @@
kern_psignal(p, SIGKILL);
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
+ }
/* Remove the temporary reference added by jail_remove. */
prison_deref(pr, deuref | PD_DEREF);
}
@@ -2348,6 +2413,24 @@
struct ucred *newcred, *oldcred;
int error;
+#ifdef VIMAGE
+ /*
+ * Do not allow to migrate a process between virtual process spaces.
+ * Use the console to attach to it. Getting all process spaces things
+ * right, including a new pid, process group, session, terminal,
+ * tracing is one thing (with a lot of work) and may break apps if the
+ * pid changes, the pgrp no longer has the same (p)id; getting things
+ * restored to original state and properly re-parented is virtually
+ * impossible. So do what we do on a normal machine, present a terminal
+ * to login to.
+ */
+ if (pr->pr_flags & PR_VPS) {
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ return (EPERM);
+ }
+#endif
+
/*
* XXX: Note that there is a slight race here if two threads
* in the same privileged process attempt to attach to two
@@ -2628,6 +2711,9 @@
#ifdef VIMAGE
if (pr->pr_vnet != ppr->pr_vnet)
vnet_destroy(pr->pr_vnet);
+ KASSERT((pr->pr_vps == ppr->pr_vps || pr->pr_vps == NULL),
+ ("%s: pr %p pr_vps %p != NULL\n",
+ __func__, pr, pr->pr_vps));
#endif
if (pr->pr_root != NULL)
vrele(pr->pr_root);
@@ -2912,9 +2998,9 @@
#ifdef VIMAGE
/*
* Determine whether the prison represented by cred owns
- * its vnet rather than having it inherited.
+ * its vnet/vps rather than having it inherited.
*
- * Returns 1 in case the prison owns the vnet, 0 otherwise.
+ * Returns 1 in case the prison owns the vnet/vps, 0 otherwise.
*/
int
prison_owns_vnet(struct ucred *cred)
@@ -2926,6 +3012,17 @@
*/
return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
}
+
+int
+prison_owns_vps(struct ucred *cred)
+{
+
+ /*
+ * vps cannot be added/removed after jail creation,
+ * so no need to lock here.
+ */
+ return (cred->cr_prison->pr_flags & PR_VPS ? 1 : 0);
+}
#endif
/*
@@ -3542,6 +3639,26 @@
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
sysctl_jail_vnet, "I", "Jail owns vnet?");
+static int
+sysctl_jail_vps(SYSCTL_HANDLER_ARGS)
+{
+ int error, havevps;
+#ifdef VIMAGE
+ struct ucred *cred = req->td->td_ucred;
+
+ havevps = jailed(cred) && prison_owns_vps(cred);
+#else
+ havevps = 0;
+#endif
+ error = SYSCTL_OUT(req, &havevps, sizeof(havevps));
+
+ return (error);
+}
+
+SYSCTL_PROC(_security_jail, OID_AUTO, vps,
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_jail_vps, "I", "Jail owns vps?");
+
#if defined(INET) || defined(INET6)
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
&jail_max_af_ips, 0,
@@ -3697,6 +3814,8 @@
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
"E,jailsys", "Virtual network stack");
+SYSCTL_JAIL_PARAM(, vps, CTLTYPE_INT | CTLFLAG_RDTUN,
+ "E,jailsys", "Virtual process space");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
"B", "Jail is in the process of shutting down");
@@ -4023,12 +4142,12 @@
ASSERT_RACCT_ENABLED();
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
sx_xlock(&allprison_lock);
if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
sx_xunlock(&allprison_lock);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return;
}
@@ -4046,6 +4165,7 @@
/*
* Force rctl to reattach rules to processes.
*/
+ /* XXX do we need to do this over all vps instances as well? */
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
cred = crhold(p->p_ucred);
@@ -4055,7 +4175,7 @@
}
#endif
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
prison_racct_free_locked(oldprr);
sx_xunlock(&allprison_lock);
}
@@ -4103,6 +4223,7 @@
? pr->pr_cpuset->cs_id : -1);
#ifdef VIMAGE
db_printf(" vnet = %p\n", pr->pr_vnet);
+ db_printf(" vps = %p\n", pr->pr_vps);
#endif
db_printf(" root = %p\n", pr->pr_root);
db_printf(" securelevel = %d\n", pr->pr_securelevel);
Index: sys/kern/kern_kthread.c
===================================================================
--- sys/kern/kern_kthread.c
+++ sys/kern/kern_kthread.c
@@ -32,6 +32,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
+#include <sys/jail.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -45,6 +46,8 @@
#include <sys/wait.h>
#include <sys/sched.h>
#include <sys/tslog.h>
+#include <sys/vps.h>
+
#include <vm/vm.h>
#include <vm/vm_extern.h>
@@ -164,11 +167,30 @@
* Reparent curthread from proc0 to init so that the zombie
* is harvested.
*/
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PROC_LOCK(p);
- proc_reparent(p, initproc);
+#ifdef VIMAGE
+ /*
+ * In the VIMAGE case if the kproc is our virtual "swapper"
+ * do not reparent it to our init as otherwise it would create
+ * a circle and never go away. Let the parent vps reap it
+ * as it was setup. And it needs to be the init there and
+ * not the swapper(kernel).
+ */
+ if (!IS_DEFAULT_VPS(TD_TO_VPS(FIRST_THREAD_IN_PROC(p))) &&
+ p->p_pid == 0) {
+ struct proc *init0;
+
+ CURVPS_SET_QUIET(vps0)
+ init0 = V_initproc;
+ CURVPS_RESTORE();
+
+ proc_reparent(p, init0);
+ } else
+#endif
+ proc_reparent(p, V_initproc);
PROC_UNLOCK(p);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
/*
* Wakeup anyone waiting for us to exit.
@@ -271,7 +293,7 @@
/* If no process supplied, put it on proc0 */
if (p == NULL)
- p = &proc0;
+ p = V_vproc0;
/* Initialize our new td */
newtd = thread_alloc(pages);
@@ -294,6 +316,9 @@
TSTHREAD(newtd, newtd->td_name);
newtd->td_proc = p; /* needed for cpu_copy_thread */
+#ifdef VIMAGE
+ newtd->td_vps = TD_TO_VPS(oldtd);
+#endif
/* might be further optimized for kthread */
cpu_copy_thread(newtd, oldtd);
/* put the designated function(arg) as the resume context */
Index: sys/kern/kern_ktrace.c
===================================================================
--- sys/kern/kern_ktrace.c
+++ sys/kern/kern_ktrace.c
@@ -952,25 +952,33 @@
* Clear all uses of the tracefile.
*/
if (ops == KTROP_CLEARFILE) {
+ VPS_ITERATOR_DECL(vps_iter);
int vrele_count;
vrele_count = 0;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_tracevp == vp) {
- if (ktrcanset(td, p)) {
- mtx_lock(&ktrace_mtx);
- ktr_freeproc(p, &cred, NULL);
- mtx_unlock(&ktrace_mtx);
- vrele_count++;
- crfree(cred);
- } else
- error = EPERM;
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_tracevp == vp) {
+ if (ktrcanset(td, p)) {
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, NULL);
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ crfree(cred);
+ } else
+ error = EPERM;
+ }
+ PROC_UNLOCK(p);
}
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
if (vrele_count > 0) {
while (vrele_count-- > 0)
vrele(vp);
@@ -980,14 +988,14 @@
/*
* do it
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (uap->pid < 0) {
/*
* by process group
*/
pg = pgfind(-uap->pid);
if (pg == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
error = ESRCH;
goto done;
}
@@ -1011,7 +1019,7 @@
ret |= ktrops(td, p, ops, facs, vp);
}
if (nfound == 0) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
error = ESRCH;
goto done;
}
@@ -1027,7 +1035,7 @@
if (error) {
if (p != NULL)
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
goto done;
}
if (descend)
@@ -1035,7 +1043,7 @@
else
ret |= ktrops(td, p, ops, facs, vp);
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
if (!ret)
error = EPERM;
done:
@@ -1143,7 +1151,7 @@
p = top;
PROC_LOCK_ASSERT(p, MA_OWNED);
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
for (;;) {
ret |= ktrops(td, p, ops, facs, vp);
/*
@@ -1170,6 +1178,7 @@
static void
ktr_writerequest(struct thread *td, struct ktr_request *req)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct ktr_header *kth;
struct vnode *vp;
struct proc *p;
@@ -1270,22 +1279,28 @@
* credentials for the operation.
*/
cred = NULL;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_tracevp == vp) {
- mtx_lock(&ktrace_mtx);
- ktr_freeproc(p, &cred, NULL);
- mtx_unlock(&ktrace_mtx);
- vrele_count++;
- }
- PROC_UNLOCK(p);
- if (cred != NULL) {
- crfree(cred);
- cred = NULL;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_tracevp == vp) {
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, NULL);
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ }
+ PROC_UNLOCK(p);
+ if (cred != NULL) {
+ crfree(cred);
+ cred = NULL;
+ }
}
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
while (vrele_count-- > 0)
vrele(vp);
Index: sys/kern/kern_mib.c
===================================================================
--- sys/kern/kern_mib.c
+++ sys/kern/kern_mib.c
@@ -556,8 +556,8 @@
error = sysctl_handle_int(oidp, &pm, 0, req);
if (error || !req->newptr)
return (error);
- sx_xlock(&proctree_lock);
- sx_xlock(&allproc_lock);
+ sx_xlock(&V_proctree_lock);
+ sx_xlock(&V_allproc_lock);
/*
* Only permit the values less then PID_MAX.
@@ -567,8 +567,8 @@
error = EINVAL;
else
pid_max = pm;
- sx_xunlock(&allproc_lock);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_allproc_lock);
+ sx_xunlock(&V_proctree_lock);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT |
Index: sys/kern/kern_proc.c
===================================================================
--- sys/kern/kern_proc.c
+++ sys/kern/kern_proc.c
@@ -126,15 +126,21 @@
/*
* Other process lists
*/
-struct pidhashhead *pidhashtbl;
-u_long pidhash;
-struct pgrphashhead *pgrphashtbl;
-u_long pgrphash;
-struct proclist allproc;
-struct proclist zombproc;
+VPS_DEFINE(struct pidhashhead *, pidhashtbl);
+VPS_DEFINE(u_long, pidhash);
+VPS_DEFINE(struct pgrphashhead *, pgrphashtbl);
+VPS_DEFINE(u_long, pgrphash);
+VPS_DEFINE(struct proclist, allproc);
+VPS_DEFINE(struct proclist, zombproc);
+#ifndef VIMAGE
struct sx __exclusive_cache_line allproc_lock;
struct sx __exclusive_cache_line proctree_lock;
struct mtx __exclusive_cache_line ppeers_lock;
+#else
+VPS_DEFINE(struct sx, allproc_lock);
+VPS_DEFINE(struct sx, proctree_lock);
+VPS_DEFINE(struct mtx, ppeers_lock);
+#endif
uma_zone_t proc_zone;
/*
@@ -179,22 +185,46 @@
/*
* Initialize global process hashing structures.
*/
-void
+static void
procinit(void)
{
- sx_init(&allproc_lock, "allproc");
- sx_init(&proctree_lock, "proctree");
- mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
- LIST_INIT(&allproc);
- LIST_INIT(&zombproc);
- pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
- pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
- proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
- proc_ctor, proc_dtor, proc_init, proc_fini,
- UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- uihashinit();
+ sx_init(&V_allproc_lock, "allproc");
+ sx_init(&V_proctree_lock, "proctree");
+ mtx_init(&V_ppeers_lock, "p_peers", NULL, MTX_DEF);
+ LIST_INIT(&V_allproc);
+ LIST_INIT(&V_zombproc);
+ V_pidhashtbl = hashinit(maxproc / 4, M_PROC, &V_pidhash);
+ V_pgrphashtbl = hashinit(maxproc / 4, M_PROC, &V_pgrphash);
+ if (IS_DEFAULT_VPS(curvps)) {
+ proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
+ proc_ctor, proc_dtor, proc_init, proc_fini,
+ UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ }
}
+VPS_SYSINIT(procinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procinit, NULL);
+
+#ifdef VIMAGE
+static void
+procdestroy(void *ident __unused)
+{
+
+ KASSERT((LIST_EMPTY(&V_allproc)), ("%s: list allproc %p not empty\n",
+ __func__, &V_allproc));
+ KASSERT((LIST_EMPTY(&V_zombproc)), ("%s: list zombproc %p not empty\n",
+ __func__, &V_zombproc));
+
+ /* proc_zone */
+ hashdestroy(V_pgrphashtbl, M_PROC, V_pgrphash);
+ hashdestroy(V_pidhashtbl, M_PROC, V_pidhash);
+
+ mtx_destroy(&V_ppeers_lock);
+ sx_destroy(&V_proctree_lock);
+ sx_destroy(&V_allproc_lock);
+}
+VPS_SYSUNINIT(procdestroy, SI_SUB_INTRINSIC, SI_ORDER_SECOND, procdestroy,
+ NULL);
+#endif
/*
* Prepare a proc for use.
@@ -303,7 +333,7 @@
inferior(struct proc *p)
{
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
for (; p != curproc; p = proc_realparent(p)) {
if (p->p_pid == 0)
@@ -317,7 +347,7 @@
{
struct proc *p;
- sx_assert(&allproc_lock, SX_LOCKED);
+ sx_assert(&V_allproc_lock, SX_LOCKED);
LIST_FOREACH(p, PIDHASH(pid), p_hash) {
if (p->p_pid == pid) {
PROC_LOCK(p);
@@ -347,9 +377,9 @@
PROC_LOCK(p);
return (p);
}
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
p = pfind_locked(pid);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (p);
}
@@ -361,11 +391,11 @@
{
struct proc *p;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
p = pfind_locked(pid);
if (p == NULL)
p = zpfind_locked(pid);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (p);
}
@@ -376,7 +406,8 @@
struct proc *p;
struct thread *td;
- sx_assert(&allproc_lock, SX_LOCKED);
+ /* Operate on current vps instance only. */
+ sx_assert(&V_allproc_lock, SX_LOCKED);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
@@ -402,7 +433,7 @@
{
struct pgrp *pgrp;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
if (pgrp->pg_id == pgid) {
@@ -426,7 +457,7 @@
if (p->p_pid == pid) {
PROC_LOCK(p);
} else {
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
if (pid <= PID_MAX) {
p = pfind_locked(pid);
if (p == NULL && (flags & PGET_NOTWEXIT) == 0)
@@ -436,7 +467,7 @@
} else {
p = NULL;
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
if (p == NULL)
return (ESRCH);
if ((flags & PGET_CANSEE) != 0) {
@@ -486,7 +517,7 @@
enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
KASSERT(p->p_pid == pgid,
@@ -547,7 +578,7 @@
enterthispgrp(struct proc *p, struct pgrp *pgrp)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
@@ -573,7 +604,7 @@
{
struct pgrp *savepgrp;
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
@@ -610,7 +641,7 @@
{
struct pgrp *savepgrp;
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
savepgrp = p->p_pgrp;
PGRP_LOCK(savepgrp);
PROC_LOCK(p);
@@ -632,7 +663,7 @@
struct session *savesess;
struct tty *tp;
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
@@ -691,7 +722,7 @@
struct session *mysession;
struct proc *q;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
@@ -744,7 +775,7 @@
}
PROC_UNLOCK(p);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
if (SESS_LEADER(p)) {
sp = p->p_session;
@@ -781,17 +812,17 @@
}
if (ttyvp != NULL) {
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) {
VOP_REVOKE(ttyvp, REVOKEALL);
VOP_UNLOCK(ttyvp, 0);
}
vrele(ttyvp);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
}
}
fixjobc(p, p->p_pgrp, 0);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
}
/*
@@ -851,10 +882,10 @@
struct proc *p;
int i;
- for (i = 0; i <= pgrphash; i++) {
- if (!LIST_EMPTY(&pgrphashtbl[i])) {
+ for (i = 0; i <= V_pgrphash; i++) {
+ if (!LIST_EMPTY(&V_pgrphashtbl[i])) {
printf("\tindx %d\n", i);
- LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
+ LIST_FOREACH(pgrp, &V_pgrphashtbl[i], pg_hash) {
printf(
"\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
(void *)pgrp, (long)pgrp->pg_id,
@@ -910,7 +941,7 @@
struct timeval boottime;
/* For proc_realparent. */
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
bzero(kp, sizeof(*kp));
@@ -1019,7 +1050,7 @@
kp->ki_kiflag |= KI_CTTY;
if (SESS_LEADER(p))
kp->ki_kiflag |= KI_SLEADER;
- /* XXX proctree_lock */
+ /* XXX V_proctree_lock */
tp = sp->s_ttyp;
SESS_UNLOCK(sp);
}
@@ -1209,8 +1240,9 @@
{
struct proc *p;
- sx_assert(&allproc_lock, SX_LOCKED);
- LIST_FOREACH(p, &zombproc, p_list) {
+ /* Operate on current vps instance only. */
+ sx_assert(&V_allproc_lock, SX_LOCKED);
+ LIST_FOREACH(p, &V_zombproc, p_list) {
if (p->p_pid == pid) {
PROC_LOCK(p);
break;
@@ -1227,9 +1259,9 @@
{
struct proc *p;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
p = zpfind_locked(pid);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (p);
}
@@ -1465,11 +1497,11 @@
error = sysctl_wire_old_buffer(req, 0);
if (error)
return (error);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
error = pget((pid_t)name[0], PGET_CANSEE, &p);
if (error == 0)
error = sysctl_out_proc(p, req, flags);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (error);
}
@@ -1502,14 +1534,15 @@
* traced process. Only grab it if we are producing any
* data to begin with.
*/
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
}
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
+ /* Operate on current vps instance only. */
if (!doingzomb)
- p = LIST_FIRST(&allproc);
+ p = LIST_FIRST(&V_allproc);
else
- p = LIST_FIRST(&zombproc);
+ p = LIST_FIRST(&V_zombproc);
for (; p != NULL; p = LIST_NEXT(p, p_list)) {
/*
* Skip embryonic processes.
@@ -1569,7 +1602,7 @@
PROC_UNLOCK(p);
continue;
}
- /* XXX proctree_lock */
+ /* XXX V_proctree_lock */
SESS_LOCK(p->p_session);
if (p->p_session->s_ttyp == NULL ||
tty_udev(p->p_session->s_ttyp) !=
@@ -1609,9 +1642,9 @@
}
}
out:
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
if (req->oldptr != NULL)
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (error);
}
@@ -3095,101 +3128,133 @@
void
stop_all_proc(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *cp, *p;
int r, gen;
bool restart, seen_stopped, seen_exiting, stopped_some;
- cp = curproc;
+ KASSERT(IS_DEFAULT_VPS(curvps),
+ ("%s: called from non vps0 %p: vps %p\n", __func__, vps0, curvps));
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ cp = NULL;
+ else
+#endif
+ cp = curproc;
allproc_loop:
- sx_xlock(&allproc_lock);
- gen = allproc_gen;
- seen_exiting = seen_stopped = stopped_some = restart = false;
- LIST_REMOVE(cp, p_list);
- LIST_INSERT_HEAD(&allproc, cp, p_list);
- for (;;) {
- p = LIST_NEXT(cp, p_list);
- if (p == NULL)
- break;
+ sx_xlock(&V_allproc_lock);
+ if (cp == NULL)
+ cp = LIST_FIRST(&V_allproc);
+ gen = allproc_gen;
+ seen_exiting = seen_stopped = stopped_some = restart = false;
LIST_REMOVE(cp, p_list);
- LIST_INSERT_AFTER(p, cp, p_list);
- PROC_LOCK(p);
- if ((p->p_flag & (P_KPROC | P_SYSTEM | P_TOTAL_STOP)) != 0) {
- PROC_UNLOCK(p);
- continue;
- }
- if ((p->p_flag & P_WEXIT) != 0) {
- seen_exiting = true;
- PROC_UNLOCK(p);
- continue;
- }
- if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
- /*
- * Stopped processes are tolerated when there
- * are no other processes which might continue
- * them. P_STOPPED_SINGLE but not
- * P_TOTAL_STOP process still has at least one
- * thread running.
- */
- seen_stopped = true;
+ LIST_INSERT_HEAD(&V_allproc, cp, p_list);
+ for (;;) {
+ p = LIST_NEXT(cp, p_list);
+ if (p == NULL)
+ break;
+ LIST_REMOVE(cp, p_list);
+ LIST_INSERT_AFTER(p, cp, p_list);
+ PROC_LOCK(p);
+ if ((p->p_flag & (P_KPROC | P_SYSTEM | P_TOTAL_STOP)) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if ((p->p_flag & P_WEXIT) != 0) {
+ seen_exiting = true;
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ /*
+ * Stopped processes are tolerated when there
+ * are no other processes which might continue
+ * them. P_STOPPED_SINGLE but not
+ * P_TOTAL_STOP process still has at least one
+ * thread running.
+ */
+ seen_stopped = true;
+ PROC_UNLOCK(p);
+ continue;
+ }
+ _PHOLD(p);
+ sx_xunlock(&V_allproc_lock);
+ r = thread_single(p, SINGLE_ALLPROC);
+ if (r != 0)
+ restart = true;
+ else
+ stopped_some = true;
+ _PRELE(p);
PROC_UNLOCK(p);
- continue;
+ sx_xlock(&V_allproc_lock);
}
- _PHOLD(p);
- sx_xunlock(&allproc_lock);
- r = thread_single(p, SINGLE_ALLPROC);
- if (r != 0)
+ /* Catch forked children we did not see in iteration. */
+ if (gen != allproc_gen)
restart = true;
- else
- stopped_some = true;
- _PRELE(p);
- PROC_UNLOCK(p);
- sx_xlock(&allproc_lock);
- }
- /* Catch forked children we did not see in iteration. */
- if (gen != allproc_gen)
- restart = true;
- sx_xunlock(&allproc_lock);
- if (restart || stopped_some || seen_exiting || seen_stopped) {
- kern_yield(PRI_USER);
- goto allproc_loop;
+ sx_xunlock(&V_allproc_lock);
+ if (restart || stopped_some || seen_exiting || seen_stopped) {
+ kern_yield(PRI_USER);
+ goto allproc_loop;
+ }
+ CURVPS_RESTORE();
}
+ VPS_LIST_RUNLOCK();
}
void
resume_all_proc(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *cp, *p;
- cp = curproc;
- sx_xlock(&allproc_lock);
+ KASSERT(IS_DEFAULT_VPS(curvps),
+ ("%s: called from non vps0 %p: vps %p\n", __func__, vps0, curvps));
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ cp = NULL;
+ else
+#endif
+ cp = curproc;
+ sx_xlock(&V_allproc_lock);
again:
- LIST_REMOVE(cp, p_list);
- LIST_INSERT_HEAD(&allproc, cp, p_list);
- for (;;) {
- p = LIST_NEXT(cp, p_list);
- if (p == NULL)
- break;
LIST_REMOVE(cp, p_list);
- LIST_INSERT_AFTER(p, cp, p_list);
- PROC_LOCK(p);
- if ((p->p_flag & P_TOTAL_STOP) != 0) {
- sx_xunlock(&allproc_lock);
- _PHOLD(p);
- thread_single_end(p, SINGLE_ALLPROC);
- _PRELE(p);
- PROC_UNLOCK(p);
- sx_xlock(&allproc_lock);
- } else {
- PROC_UNLOCK(p);
+ LIST_INSERT_HEAD(&V_allproc, cp, p_list);
+ for (;;) {
+ p = LIST_NEXT(cp, p_list);
+ if (p == NULL)
+ break;
+ LIST_REMOVE(cp, p_list);
+ LIST_INSERT_AFTER(p, cp, p_list);
+ PROC_LOCK(p);
+ if ((p->p_flag & P_TOTAL_STOP) != 0) {
+ sx_xunlock(&V_allproc_lock);
+ _PHOLD(p);
+ thread_single_end(p, SINGLE_ALLPROC);
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ sx_xlock(&V_allproc_lock);
+ } else {
+ PROC_UNLOCK(p);
+ }
}
+ /* Did the loop above miss any stopped process ? */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ /* No need for proc lock. */
+ if ((p->p_flag & P_TOTAL_STOP) != 0)
+ goto again;
+ }
+ sx_xunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- /* Did the loop above missed any stopped process ? */
- FOREACH_PROC_IN_SYSTEM(p) {
- /* No need for proc lock. */
- if ((p->p_flag & P_TOTAL_STOP) != 0)
- goto again;
- }
- sx_xunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
}
/* #define TOTAL_STOP_DEBUG 1 */
Index: sys/kern/kern_procctl.c
===================================================================
--- sys/kern/kern_procctl.c
+++ sys/kern/kern_procctl.c
@@ -69,7 +69,7 @@
p = top;
ret = 0;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
for (;;) {
ret |= protect_setchild(td, p, flags);
PROC_UNLOCK(p);
@@ -128,7 +128,7 @@
reap_acquire(struct thread *td, struct proc *p)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
if (p != curproc)
return (EPERM);
if ((p->p_treeflag & P_TREE_REAPER) != 0)
@@ -145,10 +145,10 @@
reap_release(struct thread *td, struct proc *p)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
if (p != curproc)
return (EPERM);
- if (p == initproc)
+ if (p == V_initproc)
return (EINVAL);
if ((p->p_treeflag & P_TREE_REAPER) == 0)
return (EINVAL);
@@ -162,7 +162,7 @@
{
struct proc *reap, *p2, *first_p;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
bzero(rs, sizeof(*rs));
if ((p->p_treeflag & P_TREE_REAPER) == 0) {
reap = p->p_reaper;
@@ -170,7 +170,7 @@
reap = p;
rs->rs_flags |= REAPER_STATUS_OWNED;
}
- if (reap == initproc)
+ if (reap == V_initproc)
rs->rs_flags |= REAPER_STATUS_REALINIT;
rs->rs_reaper = reap->p_pid;
rs->rs_descendants = 0;
@@ -199,18 +199,18 @@
u_int i, n;
int error;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
PROC_UNLOCK(p);
reap = (p->p_treeflag & P_TREE_REAPER) == 0 ? p->p_reaper : p;
n = i = 0;
error = 0;
LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling)
n++;
- sx_unlock(&proctree_lock);
+ sx_unlock(&V_proctree_lock);
if (rp->rp_count < n)
n = rp->rp_count;
pi = malloc(n * sizeof(*pi), M_TEMP, M_WAITOK);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling) {
if (i == n)
break;
@@ -225,10 +225,10 @@
pip->pi_flags |= REAPER_PIDINFO_REAPER;
i++;
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
error = copyout(pi, rp->rp_pids, i * sizeof(*pi));
free(pi, M_TEMP);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
PROC_LOCK(p);
return (error);
}
@@ -278,7 +278,7 @@
struct reap_kill_tracker *t;
int error;
- sx_assert(&proctree_lock, SX_LOCKED);
+ sx_assert(&V_proctree_lock, SX_LOCKED);
if (IN_CAPABILITY_MODE(td))
return (ECAPMODE);
if (rk->rk_sig <= 0 || rk->rk_sig > _SIG_MAXSIG ||
@@ -585,12 +585,12 @@
case PROC_REAP_KILL:
case PROC_TRACE_CTL:
case PROC_TRAPCAP_CTL:
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
tree_locked = true;
break;
case PROC_REAP_ACQUIRE:
case PROC_REAP_RELEASE:
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
tree_locked = true;
break;
case PROC_TRACE_STATUS:
@@ -657,6 +657,6 @@
break;
}
if (tree_locked)
- sx_unlock(&proctree_lock);
+ sx_unlock(&V_proctree_lock);
return (error);
}
Index: sys/kern/kern_prot.c
===================================================================
--- sys/kern/kern_prot.c
+++ sys/kern/kern_prot.c
@@ -52,6 +52,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/acct.h>
+#include <sys/filedesc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
@@ -133,10 +134,10 @@
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
pp = proc_realparent(p);
ppid = pp->p_pid;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
return (ppid);
@@ -340,7 +341,7 @@
newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
if (pgrp != NULL)
@@ -353,7 +354,7 @@
newsess = NULL;
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
if (newpgrp != NULL)
free(newpgrp, M_PGRP);
@@ -399,7 +400,7 @@
newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
if (uap->pid != 0 && uap->pid != curp->p_pid) {
if ((targp = pfind(uap->pid)) == NULL) {
error = ESRCH;
@@ -457,7 +458,7 @@
error = enterthispgrp(targp, pgrp);
}
done:
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
KASSERT((error == 0) || (newpgrp != NULL),
("setpgid failed and newpgrp is NULL"));
if (newpgrp != NULL)
@@ -1738,7 +1739,7 @@
}
/* Can't trace init when securelevel > 0. */
- if (p == initproc) {
+ if (p == V_initproc) {
error = securelevel_gt(td->td_ucred, 0);
if (error)
return (error);
@@ -1860,8 +1861,10 @@
crfree(struct ucred *cr)
{
- KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
- KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
+ KASSERT(cr->cr_ref > 0, ("%s: bad ucred %p refcount: %d",
+ __func__, cr, cr->cr_ref));
+ KASSERT(cr->cr_ref != 0xdeadc0de,
+ ("%s: dangling reference to ucred %p", __func__, cr));
if (refcount_release(&cr->cr_ref)) {
/*
* Some callers of crget(), such as nfs_statfs(),
Index: sys/kern/kern_racct.c
===================================================================
--- sys/kern/kern_racct.c
+++ sys/kern/kern_racct.c
@@ -1214,94 +1214,107 @@
}
static void
-racctd(void)
+_racctd(void)
{
struct thread *td;
struct proc *p;
struct timeval wallclock;
uint64_t pct, pct_estimate, runtime;
- ASSERT_RACCT_ENABLED();
-
- for (;;) {
- racct_decay();
+ sx_slock(&V_allproc_lock);
- sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &V_zombproc, p_list) {
+ PROC_LOCK(p);
+ racct_set(p, RACCT_PCTCPU, 0);
+ PROC_UNLOCK(p);
+ }
- LIST_FOREACH(p, &zombproc, p_list) {
- PROC_LOCK(p);
- racct_set(p, RACCT_PCTCPU, 0);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL) {
PROC_UNLOCK(p);
+ continue;
}
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL) {
- PROC_UNLOCK(p);
- continue;
- }
-
- microuptime(&wallclock);
- timevalsub(&wallclock, &p->p_stats->p_start);
- PROC_STATLOCK(p);
- FOREACH_THREAD_IN_PROC(p, td)
- ruxagg(p, td);
- runtime = cputick2usec(p->p_rux.rux_runtime);
- PROC_STATUNLOCK(p);
+ microuptime(&wallclock);
+ timevalsub(&wallclock, &p->p_stats->p_start);
+ PROC_STATLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td)
+ ruxagg(p, td);
+ runtime = cputick2usec(p->p_rux.rux_runtime);
+ PROC_STATUNLOCK(p);
#ifdef notyet
- KASSERT(runtime >= p->p_prev_runtime,
- ("runtime < p_prev_runtime"));
+ KASSERT(runtime >= p->p_prev_runtime,
+ ("runtime < p_prev_runtime"));
#else
- if (runtime < p->p_prev_runtime)
- runtime = p->p_prev_runtime;
+ if (runtime < p->p_prev_runtime)
+ runtime = p->p_prev_runtime;
#endif
- p->p_prev_runtime = runtime;
- if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
- pct_estimate = (1000000 * runtime * 100) /
- ((uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec);
- } else
- pct_estimate = 0;
- pct = racct_getpcpu(p, pct_estimate);
- RACCT_LOCK();
+ p->p_prev_runtime = runtime;
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+ RACCT_LOCK();
#ifdef RCTL
- rctl_throttle_decay(p->p_racct, RACCT_READBPS);
- rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
- rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
- rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
+ rctl_throttle_decay(p->p_racct, RACCT_READBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
+ rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
+ rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
#endif
- racct_set_locked(p, RACCT_PCTCPU, pct, 1);
- racct_set_locked(p, RACCT_CPU, runtime, 0);
- racct_set_locked(p, RACCT_WALLCLOCK,
- (uint64_t)wallclock.tv_sec * 1000000 +
- wallclock.tv_usec, 0);
- RACCT_UNLOCK();
+ racct_set_locked(p, RACCT_PCTCPU, pct, 1);
+ racct_set_locked(p, RACCT_CPU, runtime, 0);
+ racct_set_locked(p, RACCT_WALLCLOCK,
+ (uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec, 0);
+ RACCT_UNLOCK();
+ PROC_UNLOCK(p);
+ }
+
+ /*
+ * To ensure that processes are throttled in a fair way, we need
+ * to iterate over all processes again and check the limits
+ * for %cpu resource only after ucred racct containers have been
+ * properly filled.
+ */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL) {
PROC_UNLOCK(p);
+ continue;
}
- /*
- * To ensure that processes are throttled in a fair way, we need
- * to iterate over all processes again and check the limits
- * for %cpu resource only after ucred racct containers have been
- * properly filled.
- */
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL) {
- PROC_UNLOCK(p);
- continue;
- }
+ if (racct_pcpu_available(p) <= 0) {
+ if (p->p_racct->r_resources[RACCT_PCTCPU] >
+ pcpu_threshold)
+ racct_proc_throttle(p, -1);
+ } else if (p->p_throttled == -1) {
+ racct_proc_wakeup(p);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+}
- if (racct_pcpu_available(p) <= 0) {
- if (p->p_racct->r_resources[RACCT_PCTCPU] >
- pcpu_threshold)
- racct_proc_throttle(p, -1);
- } else if (p->p_throttled == -1) {
- racct_proc_wakeup(p);
- }
- PROC_UNLOCK(p);
+static void
+racctd(void)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+
+ ASSERT_RACCT_ENABLED();
+
+ for (;;) {
+ racct_decay();
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ _racctd();
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
pause("-", hz);
}
}
Index: sys/kern/kern_rctl.c
===================================================================
--- sys/kern/kern_rctl.c
+++ sys/kern/kern_rctl.c
@@ -1175,7 +1175,7 @@
error = str2id(subject_idstr, &id);
if (error != 0)
goto out;
- sx_assert(&allproc_lock, SA_LOCKED);
+ sx_assert(&V_allproc_lock, SA_LOCKED);
rule->rr_subject.rs_proc = pfind(id);
if (rule->rr_subject.rs_proc == NULL) {
error = ESRCH;
@@ -1266,6 +1266,7 @@
int
rctl_rule_add(struct rctl_rule *rule)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct ucred *cred;
struct uidinfo *uip;
@@ -1357,37 +1358,51 @@
* Now go through all the processes and add the new rule to the ones
* it applies to.
*/
- sx_assert(&allproc_lock, SA_LOCKED);
- FOREACH_PROC_IN_SYSTEM(p) {
- cred = p->p_ucred;
- switch (rule->rr_subject_type) {
- case RCTL_SUBJECT_TYPE_USER:
- if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
- cred->cr_ruidinfo == rule->rr_subject.rs_uip)
- break;
- continue;
- case RCTL_SUBJECT_TYPE_LOGINCLASS:
- if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
- break;
- continue;
- case RCTL_SUBJECT_TYPE_JAIL:
- match = 0;
- for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
- if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
- match = 1;
+ sx_assert(&V_allproc_lock, SA_LOCKED);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_slock(&V_allproc_lock);
+#endif
+ FOREACH_PROC_IN_SYSTEM(p) {
+ cred = p->p_ucred;
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_USER:
+ if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
+ cred->cr_ruidinfo == rule->rr_subject.rs_uip)
+ break;
+ continue;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
break;
+ continue;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ match = 0;
+ for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
+ if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
+ match = 1;
+ break;
+ }
}
+ if (match)
+ break;
+ continue;
+ default:
+ panic("rctl_rule_add: unknown subject type %d",
+ rule->rr_subject_type);
}
- if (match)
- break;
- continue;
- default:
- panic("rctl_rule_add: unknown subject type %d",
- rule->rr_subject_type);
- }
- rctl_racct_add_rule(p->p_racct, rule);
+ rctl_racct_add_rule(p->p_racct, rule);
+ }
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_sunlock(&V_allproc_lock);
+#endif
+ CURVPS_RESTORE();
}
+ VPS_LIST_RUNLOCK();
return (0);
}
@@ -1426,6 +1441,7 @@
int
rctl_rule_remove(struct rctl_rule *filter)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
int found = 0;
@@ -1452,11 +1468,25 @@
rctl_rule_pre_callback, rctl_rule_post_callback,
filter, (void *)&found);
-	sx_assert(&allproc_lock, SA_LOCKED);
+	sx_assert(&V_allproc_lock, SA_LOCKED);
-	RACCT_LOCK();
-	FOREACH_PROC_IN_SYSTEM(p) {
-		found += rctl_racct_remove_rules(p->p_racct, filter);
+	VPS_LIST_RLOCK();
+	VPS_FOREACH(vps_iter) {
+		CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+		if (saved_vps != vps_iter)
+			sx_slock(&V_allproc_lock);
+#endif
+		/* Take the racct mutex only after all sleepable locks. */
+		RACCT_LOCK();
+		FOREACH_PROC_IN_SYSTEM(p) {
+			found += rctl_racct_remove_rules(p->p_racct, filter);
+		}
+		RACCT_UNLOCK();
+#ifdef VIMAGE
+		if (saved_vps != vps_iter)
+			sx_sunlock(&V_allproc_lock);
+#endif
+		CURVPS_RESTORE();
 	}
+	VPS_LIST_RUNLOCK();
-	RACCT_UNLOCK();
if (found)
@@ -1623,11 +1653,11 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
@@ -1669,7 +1699,7 @@
}
out:
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
if (error != 0)
return (error);
@@ -1699,6 +1729,7 @@
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct sbuf *sb;
struct rctl_rule *filter;
struct rctl_rule_link *link;
@@ -1718,41 +1749,56 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
bufsize = uap->outbuflen;
if (bufsize > rctl_maxbufsize) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (E2BIG);
}
buf = malloc(bufsize, M_RCTL, M_WAITOK);
sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
KASSERT(sb != NULL, ("sbuf_new failed"));
-
- FOREACH_PROC_IN_SYSTEM(p) {
- RACCT_LOCK();
- LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
- /*
- * Non-process rules will be added to the buffer later.
- * Adding them here would result in duplicated output.
- */
- if (link->rrl_rule->rr_subject_type !=
- RCTL_SUBJECT_TYPE_PROCESS)
- continue;
- if (!rctl_rule_matches(link->rrl_rule, filter))
- continue;
- rctl_rule_to_sbuf(sb, link->rrl_rule);
- sbuf_printf(sb, ",");
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_slock(&V_allproc_lock);
+#endif
+ FOREACH_PROC_IN_SYSTEM(p) {
+ RACCT_LOCK();
+ LIST_FOREACH(link, &p->p_racct->r_rule_links,
+ rrl_next) {
+ /*
+ * Non-process rules will be added to the
+ * buffer later. Adding them here would result
+ * in duplicated output.
+ */
+ if (link->rrl_rule->rr_subject_type !=
+ RCTL_SUBJECT_TYPE_PROCESS)
+ continue;
+ if (!rctl_rule_matches(link->rrl_rule, filter))
+ continue;
+ rctl_rule_to_sbuf(sb, link->rrl_rule);
+ sbuf_printf(sb, ",");
+ }
+ RACCT_UNLOCK();
}
- RACCT_UNLOCK();
+#ifdef VIMAGE
+ if (saved_vps != vps_iter)
+ sx_sunlock(&V_allproc_lock);
+#endif
+ CURVPS_RESTORE();
}
+ VPS_LIST_RUNLOCK();
loginclass_racct_foreach(rctl_get_rules_callback,
rctl_rule_pre_callback, rctl_rule_post_callback,
@@ -1777,7 +1823,7 @@
error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
out:
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
free(buf, M_RCTL);
return (error);
}
@@ -1803,34 +1849,34 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (EINVAL);
}
if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (EOPNOTSUPP);
}
if (filter->rr_subject.rs_proc == NULL) {
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (EINVAL);
}
bufsize = uap->outbuflen;
if (bufsize > rctl_maxbufsize) {
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (E2BIG);
}
@@ -1860,7 +1906,7 @@
error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
out:
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
free(buf, M_RCTL);
return (error);
}
@@ -1883,11 +1929,11 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &rule);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
/*
@@ -1906,7 +1952,7 @@
out:
rctl_rule_release(rule);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
@@ -1928,17 +1974,17 @@
if (error != 0)
return (error);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
error = rctl_rule_remove(filter);
rctl_rule_release(filter);
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
return (error);
}
Index: sys/kern/kern_resource.c
===================================================================
--- sys/kern/kern_resource.c
+++ sys/kern/kern_resource.c
@@ -69,10 +69,15 @@
static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
-#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
+
static struct rwlock uihashtbl_lock;
-static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
-static u_long uihash; /* size of hash table - 1 */
+
+LIST_HEAD(uihashhead, uidinfo);
+static VPS_DEFINE(struct uihashhead *, uihashtbl);
+#define V_uihashtbl VPS(uihashtbl)
+static VPS_DEFINE(u_long, uihash); /* size of hash table - 1 */
+#define V_uihash VPS(uihash)
+#define UIHASH(uid) (&V_uihashtbl[(uid) & V_uihash])
static void calcru1(struct proc *p, struct rusage_ext *ruxp,
struct timeval *up, struct timeval *sp);
@@ -114,18 +119,18 @@
break;
case PRIO_PGRP:
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (uap->who == 0) {
pg = td->td_proc->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
break;
}
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
@@ -141,7 +146,8 @@
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
- sx_slock(&allproc_lock);
+ /* Operate on current vps instance only. */
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
@@ -152,7 +158,7 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
break;
default:
@@ -199,18 +205,18 @@
break;
case PRIO_PGRP:
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (uap->who == 0) {
pg = curp->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
break;
}
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
@@ -226,7 +232,8 @@
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
- sx_slock(&allproc_lock);
+ /* Operate on current vps instance only. */
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
@@ -237,7 +244,7 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
break;
default:
@@ -1214,13 +1221,43 @@
p->p_sysent->sv_fixlimit(rlp, which);
}
-void
+static void
uihashinit()
{
- uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
rw_init(&uihashtbl_lock, "uidinfo hash");
}
+SYSINIT(uihashinit, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit, NULL);
+
+static void
+uihashinit_vps(void)
+{
+
+ V_uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &V_uihash);
+}
+VPS_SYSINIT(uihashinit_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND, uihashinit_vps,
+ NULL);
+
+#ifdef VIMAGE
+static void
+uihashdestroy_vps(void *ident __unused)
+{
+ struct uidinfo *uip;
+ struct uihashhead *uih;
+ int i;
+
+ i = 0;
+ for (uih = &V_uihashtbl[V_uihash]; uih >= V_uihashtbl; uih--)
+ LIST_FOREACH(uip, uih, ui_hash)
+ i++;
+ if (i == 0)
+ hashdestroy(V_uihashtbl, M_UIDINFO, V_uihash);
+ else
+ printf("%s: leaking %d uihash entries\n", __func__, i);
+}
+VPS_SYSUNINIT(uihashdestroy_vps, SI_SUB_INTRINSIC, SI_ORDER_SECOND,
+ uihashdestroy_vps, NULL);
+#endif
/*
* Look up a uidinfo struct for the parameter uid.
@@ -1368,7 +1405,7 @@
rw_rlock(&uihashtbl_lock);
if (pre != NULL)
(pre)();
- for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) {
+ for (uih = &V_uihashtbl[V_uihash]; uih >= V_uihashtbl; uih--) {
LIST_FOREACH(uip, uih, ui_hash) {
(callback)(uip->ui_racct, arg2, arg3);
}
@@ -1392,7 +1429,8 @@
return (0);
}
} else if (new < 0)
- printf("negative %s for uid = %d\n", name, uip->ui_uid);
+ printf("%s: curthread %p uip %p negative %s for uid = %d\n",
+ __func__, curthread, uip, name, uip->ui_uid);
return (1);
}
Index: sys/kern/kern_shutdown.c
===================================================================
--- sys/kern/kern_shutdown.c
+++ sys/kern/kern_shutdown.c
@@ -204,6 +204,10 @@
int dumping; /* system is dumping */
int rebooting; /* system is rebooting */
+#ifdef VIMAGE
+VPS_DEFINE(int, vrebooting); /* vps is rebooting */
+#define V_vrebooting VPS(vrebooting)
+#endif
static struct dumperinfo dumper; /* our selected dumper */
/* Context information for dump-debuggers. */
@@ -276,29 +280,42 @@
if (error == 0) {
if (uap->opt & RB_REROOT)
error = kern_reroot();
- else
+ else {
+#ifdef VIMAGE
+ /* XXX Can argue that we should never make it here. */
+ /* Init will want to _exit() in this case. */
+ if (!IS_DEFAULT_VPS(TD_TO_VPS(curthread))) {
+ V_vrebooting = 1;
+ return (error);
+ }
+#endif
kern_reboot(uap->opt);
+ }
}
return (error);
}
+static VPS_DEFINE(int, vhowto);
+#define V_vhowto VPS(vhowto)
+
static void
shutdown_nice_task_fn(void *arg, int pending __unused)
{
- int howto;
- howto = (uintptr_t)arg;
+ CURVPS_SET((struct vps *)arg);
/* Send a signal to init(8) and have it shutdown the world. */
- PROC_LOCK(initproc);
- if (howto & RB_POWEROFF)
- kern_psignal(initproc, SIGUSR2);
- else if (howto & RB_POWERCYCLE)
- kern_psignal(initproc, SIGWINCH);
- else if (howto & RB_HALT)
- kern_psignal(initproc, SIGUSR1);
+ PROC_LOCK(V_initproc);
+ if (V_vhowto & RB_POWEROFF)
+ kern_psignal(V_initproc, SIGUSR2);
+ else if (V_vhowto & RB_POWERCYCLE)
+ kern_psignal(V_initproc, SIGWINCH);
+ else if (V_vhowto & RB_HALT)
+ kern_psignal(V_initproc, SIGUSR1);
else
- kern_psignal(initproc, SIGINT);
- PROC_UNLOCK(initproc);
+ kern_psignal(V_initproc, SIGINT);
+ PROC_UNLOCK(V_initproc);
+ V_vhowto = 0;
+ CURVPS_RESTORE();
}
static struct task shutdown_nice_task = TASK_INITIALIZER(0,
@@ -311,10 +328,22 @@
shutdown_nice(int howto)
{
- if (initproc != NULL && !SCHEDULER_STOPPED()) {
- shutdown_nice_task.ta_context = (void *)(uintptr_t)howto;
+ if (V_initproc != NULL && !SCHEDULER_STOPPED()) {
+
+ KASSERT(V_vhowto == 0, ("%s: vps %p howto not 0: %d\n",
+ __func__, curvps, V_vhowto));
+ V_vhowto = howto;
+ shutdown_nice_task.ta_context = (void *)curvps;
taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task);
} else {
+#ifdef VIMAGE
+ /* XXX Can argue that we should never make it here. */
+ /* Init will want to _exit() in this case. */
+ if (!IS_DEFAULT_VPS(TD_TO_VPS(curthread))) {
+ V_vrebooting = 1;
+ return;
+ }
+#endif
/*
* No init(8) running, or scheduler would not allow it
* to run, so simply reboot.
@@ -462,7 +491,7 @@
struct mount *mp, *devmp;
int error;
- if (curproc != initproc)
+ if (curproc != V_initproc)
return (EPERM);
/*
Index: sys/kern/kern_sig.c
===================================================================
--- sys/kern/kern_sig.c
+++ sys/kern/kern_sig.c
@@ -1669,9 +1669,9 @@
ret = ESRCH;
if (all) {
/*
- * broadcast
+ * broadcast; current vps context only.
*/
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
p == td->td_proc || p->p_state == PRS_NEW) {
@@ -1688,9 +1688,9 @@
ret = err;
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
} else {
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pgid == 0) {
/*
* zero pgid means send to my process group.
@@ -1700,11 +1700,11 @@
} else {
pgrp = pgfind(pgid);
if (pgrp == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (ESRCH);
}
}
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
@@ -1891,9 +1891,9 @@
struct pgrp *pgrp;
if (pgid != 0) {
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
pgrp = pgfind(pgid);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
if (pgrp != NULL) {
pgsignal(pgrp, sig, 0, ksi);
PGRP_UNLOCK(pgrp);
@@ -3279,7 +3279,7 @@
/*
* Protect the access to corefilename[] by allproc_lock.
*/
-#define corefilename_lock allproc_lock
+#define corefilename_lock V_allproc_lock
static char corefilename[MAXPATHLEN] = {"%N.core"};
TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
Index: sys/kern/kern_sysctl.c
===================================================================
--- sys/kern/kern_sysctl.c
+++ sys/kern/kern_sysctl.c
@@ -60,6 +60,7 @@
#include <sys/sx.h>
#include <sys/sysproto.h>
#include <sys/uio.h>
+#include <sys/vps.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
@@ -498,6 +499,7 @@
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE &&
#ifdef VIMAGE
(oidp->oid_kind & CTLFLAG_VNET) == 0 &&
+ (oidp->oid_kind & CTLFLAG_VPS) == 0 &&
#endif
(oidp->oid_kind & CTLFLAG_TUN) != 0 &&
(oidp->oid_kind & CTLFLAG_NOFETCH) == 0) {
@@ -1998,6 +2000,9 @@
else if ((oid->oid_kind & CTLFLAG_VNET) &&
prison_owns_vnet(req->td->td_ucred))
priv = PRIV_SYSCTL_WRITEJAIL;
+ else if ((oid->oid_kind & CTLFLAG_VPS) &&
+ prison_owns_vps(req->td->td_ucred))
+ priv = PRIV_SYSCTL_WRITEJAIL;
#endif
else
priv = PRIV_SYSCTL_WRITE;
@@ -2025,8 +2030,13 @@
goto out;
#endif
#ifdef VIMAGE
+ KASSERT(((oid->oid_kind & (CTLFLAG_VNET|CTLFLAG_VPS)) !=
+ (CTLFLAG_VNET|CTLFLAG_VPS)),
+ ("CTLFLAG VNET and VPS set oid %p", oid));
if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL)
arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1);
+ if ((oid->oid_kind & CTLFLAG_VPS) && arg1 != NULL)
+ arg1 = (void *)(curvps->vps_data_base + (uintptr_t)arg1);
#endif
error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker);
@@ -2118,6 +2128,7 @@
memlocked = 1;
sx_xlock(&sysctlmemlock);
}
+ CURVPS_SET(TD_TO_VPS(td));
CURVNET_SET(TD_TO_VNET(td));
for (;;) {
@@ -2130,6 +2141,7 @@
}
CURVNET_RESTORE();
+ CURVPS_RESTORE();
if (req.lock == REQ_WIRED && req.validlen > 0)
vsunlock(req.oldptr, req.validlen);
Index: sys/kern/kern_thr.c
===================================================================
--- sys/kern/kern_thr.c
+++ sys/kern/kern_thr.c
@@ -32,6 +32,7 @@
#include "opt_posix.h"
#include "opt_hwpmc_hooks.h"
#include <sys/param.h>
+#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -56,6 +57,7 @@
#include <sys/rtprio.h>
#include <sys/umtx.h>
#include <sys/limits.h>
+#include <sys/vps.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
@@ -238,6 +240,9 @@
bcopy(&td->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
newtd->td_proc = td->td_proc;
+#ifdef VIMAGE
+ newtd->td_vps = TD_TO_VPS(td);
+#endif
newtd->td_rb_list = newtd->td_rbp_list = newtd->td_rb_inact = 0;
thread_cow_get(newtd, td);
Index: sys/kern/kern_thread.c
===================================================================
--- sys/kern/kern_thread.c
+++ sys/kern/kern_thread.c
@@ -58,6 +58,9 @@
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
+#ifdef VIMAGE
+#include <sys/jail.h>
+#endif
#include <security/audit/audit.h>
@@ -83,7 +86,7 @@
"struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x470,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x518,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x528,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0xb0,
"struct proc KBI p_flag");
@@ -103,7 +106,7 @@
"struct thread KBI td_pflags");
_Static_assert(offsetof(struct thread, td_frame) == 0x2e8,
"struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x334,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x33c,
"struct thread KBI td_emuldata");
_Static_assert(offsetof(struct proc, p_flag) == 0x68,
"struct proc KBI p_flag");
@@ -451,6 +454,10 @@
PROC_LOCK_ASSERT(p, MA_OWNED);
newtd->td_ucred = crhold(p->p_ucred);
+#ifdef VIMAGE
+ /* Make sure the cached vps stays correct. */
+ newtd->td_vps = p->p_ucred->cr_prison->pr_vps;
+#endif
newtd->td_limit = lim_hold(p->p_limit);
newtd->td_cowgen = p->p_cowgen;
}
@@ -460,6 +467,10 @@
{
newtd->td_ucred = crhold(td->td_ucred);
+#ifdef VIMAGE
+ /* Make sure to inherit the cached vps as well. */
+ newtd->td_vps = td->td_vps;
+#endif
newtd->td_limit = lim_hold(td->td_limit);
newtd->td_cowgen = td->td_cowgen;
}
@@ -489,6 +500,11 @@
oldcred = td->td_ucred;
td->td_ucred = crhold(p->p_ucred);
}
+#ifdef VIMAGE
+ /* Make sure the cached vps stays correct. */
+ if (td->td_vps != p->p_ucred->cr_prison->pr_vps)
+ td->td_vps = p->p_ucred->cr_prison->pr_vps;
+#endif
if (td->td_limit != p->p_limit) {
oldlimit = td->td_limit;
td->td_limit = lim_hold(p->p_limit);
Index: sys/kern/kern_vps.c
===================================================================
--- /dev/null
+++ sys/kern/kern_vps.c
@@ -0,0 +1,835 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2004-2009 University of Zagreb
+ * Copyright (c) 2006-2009 FreeBSD Foundation
+ * Copyright (c) 2018 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Bjoern Zeeb
+ * under sponsorship from iXsystems, Inc.
+ *
+ * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdb.h"
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/jail.h>
+#include <sys/sdt.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/vps.h>
+
+#include <machine/stdarg.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+#endif
+
+
+/*-
+ * This file implements core functions for virtual process spaces:
+ *
+ * - Virtual process space management functions.
+ *
+ * - Virtual process space memory allocator, which virtualizes global
+ * variables in the process space.
+ *
+ * - Virtualized SYSINIT's/SYSUNINIT's, which allow process spaces
+ * to register startup/shutdown events to be run for each virtual process
+ * space instance.
+ */
+
+static MALLOC_DEFINE(M_VPS, "vps", "process space control block");
+
+/*
+ * The virtual process space list has two read-write locks, one sleepable and
+ * the other not, so that the list can be stabilized and walked in a variety
+ * of process space contexts. Both must be acquired exclusively to modify
+ * the list, but a read lock of either lock is sufficient to walk the list.
+ */
+struct rwlock vps_rwlock;
+struct sx vps_sxlock;
+
+#define VPS_LIST_WLOCK() do { \
+ sx_xlock(&vps_sxlock); \
+ rw_wlock(&vps_rwlock); \
+} while (0)
+
+#define VPS_LIST_WUNLOCK() do { \
+ rw_wunlock(&vps_rwlock); \
+ sx_xunlock(&vps_sxlock); \
+} while (0)
+
+struct vps_list_head vps_head;
+struct vps *vps0;
+
+/*
+ * The virtual process space allocator provides storage for virtualized
+ * global variables. These variables are defined/declared using the
+ * VPS_DEFINE()/VPS_DECLARE() macros, which place them in the 'set_vps'
+ * linker set. The details of the implementation are somewhat subtle, but
+ * allow the majority of process subsystems to remain
+ * virtualization-agnostic.
+ *
+ * The virtual process space allocator handles variables in the base kernel
+ * vs. modules in similar but different ways. In both cases, virtualized
+ * global variables are marked as such by being declared to be part of the
+ * vps linker set. These "primary" copies of global variables serve two
+ * functions:
+ *
+ * (1) They contain static initialization or "default" values for global
+ * variables which will be propagated to each virtual process space
+ * instance when created. As with normal global variables, they default
+ * to zero-filled.
+ *
+ * (2) They act as unique global names by which the variable can be referred
+ * to, regardless of process space instance. The single global symbol
+ * will be used to calculate the location of a per-virtual instance
+ * variable at run-time.
+ *
+ * Each virtual process space instance has a complete copy of each
+ * virtualized global variable, stored in a malloc'd block of memory
+ * referred to by vps->vps_data_mem. Critical to the design is that each
+ * per-instance memory block is laid out identically to the primary block so
+ * that the offset of each global variable is the same across all blocks.
+ * To optimize run-time access, a precalculated 'base' address,
+ * vps->vps_data_base, is stored in each vps, and is the amount that can
+ * be added to the address of a 'primary' instance of a variable to get to the
+ * per-vps instance.
+ *
+ * Virtualized global variables are handled in a similar manner, but as each
+ * module has its own 'set_vps' linker set, and we want to keep all
+ * virtualized globals together, we reserve space in the kernel's linker set
+ * for potential module variables using a per-vps character array,
+ * 'modspace'. The virtual process space allocator maintains a free list to
+ * track what space in the array is free (all, initially) and as modules are
+ * linked, allocates portions of the space to specific globals. The kernel
+ * module linker queries the virtual process space allocator and will
+ * bind references of the global to the location during linking. It also
+ * calls into the virtual process space allocator, once the memory is
+ * initialized, in order to propagate the new static initializations to all
+ * existing virtual process space instances so that the soon-to-be executing
+ * module will find every process space instance with proper default values.
+ */
+
+/*
+ * Number of bytes of data in the 'set_vps' linker set, and hence the total
+ * size of all kernel virtualized global variables, and the malloc(9) type
+ * that will be used to allocate it.
+ */
+#define VPS_BYTES (VPS_STOP - VPS_START)
+
+static MALLOC_DEFINE(M_VPS_DATA, "vps_data", "VPS data");
+
+/*
+ * VPS_MODMIN is the minimum number of bytes we will reserve for the sum of
+ * global variables across all loaded modules. As this actually sizes an
+ * array declared as a virtualized global variable in the kernel itself, and
+ * we want the virtualized global variable space to be page-sized, we may
+ * have more space than that in practice.
+ */
+#define VPS_MODMIN 8192
+#define VPS_SIZE roundup2(VPS_BYTES, PAGE_SIZE)
+
+/*
+ * Space to store virtualized global variables from loadable kernel modules,
+ * and the free list to manage it.
+ */
+static VPS_DEFINE(char, modspace[VPS_MODMIN]);
+
+/*
+ * Global lists of subsystem constructor and destructors for vpss. They are
+ * registered via VPS_SYSINIT() and VPS_SYSUNINIT(). Both lists are
+ * protected by the vps_sysinit_sxlock global lock.
+ */
+static TAILQ_HEAD(vps_sysinit_head, vps_sysinit) vps_constructors =
+ TAILQ_HEAD_INITIALIZER(vps_constructors);
+static TAILQ_HEAD(vps_sysuninit_head, vps_sysinit) vps_destructors =
+ TAILQ_HEAD_INITIALIZER(vps_destructors);
+
+struct sx vps_sysinit_sxlock;
+
+#define VPS_SYSINIT_WLOCK() sx_xlock(&vps_sysinit_sxlock);
+#define VPS_SYSINIT_WUNLOCK() sx_xunlock(&vps_sysinit_sxlock);
+#define VPS_SYSINIT_RLOCK() sx_slock(&vps_sysinit_sxlock);
+#define VPS_SYSINIT_RUNLOCK() sx_sunlock(&vps_sysinit_sxlock);
+
+/* XXX-BZ should probably be vpd_* instead of vnd_* but in the hope to
+ * harmonize most of this later on keep the names the same for now. */
+struct vps_data_free {
+ uintptr_t vnd_start;
+ int vnd_len;
+ TAILQ_ENTRY(vps_data_free) vnd_link;
+};
+
+static MALLOC_DEFINE(M_VPS_DATA_FREE, "vps_data_free",
+ "VPS resource accounting");
+static TAILQ_HEAD(, vps_data_free) vps_data_free_head =
+ TAILQ_HEAD_INITIALIZER(vps_data_free_head);
+static struct sx vps_data_free_lock;
+
+SDT_PROVIDER_DEFINE(vps);
+SDT_PROBE_DEFINE1(vps, functions, vps_alloc, entry, "int");
+SDT_PROBE_DEFINE2(vps, functions, vps_alloc, alloc, "int", "struct vps *");
+SDT_PROBE_DEFINE2(vps, functions, vps_alloc, return, "int", "struct vps *");
+SDT_PROBE_DEFINE2(vps, functions, vps_destroy, entry, "int", "struct vps *");
+SDT_PROBE_DEFINE1(vps, functions, vps_destroy, return, "int");
+
+#ifdef DDB
+static void db_show_vps_print_vs(struct vps_sysinit *, int);
+#endif
+
+/*
+ * Allocate a virtual process space.
+ */
+struct vps *
+vps_alloc(struct prison *pr)
+{
+ struct vps *vps;
+
+ SDT_PROBE1(vps, functions, vps_alloc, entry, __LINE__);
+ vps = malloc(sizeof(struct vps), M_VPS, M_WAITOK | M_ZERO);
+ vps->vps_magic_n = VPS_MAGIC_N;
+ vps->vps_state = 0;
+ vps->vps_pr = pr;
+ /* Cheat for vps_sysinit() to get creds right. */
+ pr->pr_vps = vps;
+ SDT_PROBE2(vps, functions, vps_alloc, alloc, __LINE__, vps);
+
+ /*
+ * Allocate storage for virtualized global variables and copy in
+	 * initial values from our 'primary' copy.
+ */
+ vps->vps_data_mem = malloc(VPS_SIZE, M_VPS_DATA, M_WAITOK);
+ memcpy(vps->vps_data_mem, (void *)VPS_START, VPS_BYTES);
+
+ /*
+ * All use of vps-specific data will immediately subtract VPS_START
+ * from the base memory pointer, so pre-calculate that now to avoid
+ * it on each use.
+ */
+ vps->vps_data_base = (uintptr_t)vps->vps_data_mem - VPS_START;
+
+ /* Initialize / attach vps module instances. */
+ CURVPS_SET_QUIET(vps);
+ vps_sysinit();
+ CURVPS_RESTORE();
+
+ VPS_LIST_WLOCK();
+ LIST_INSERT_HEAD(&vps_head, vps, vps_le);
+ VPS_LIST_WUNLOCK();
+
+ SDT_PROBE2(vps, functions, vps_alloc, return, __LINE__, vps);
+ return (vps);
+}
+
+/*
+ * Destroy a virtual process space.
+ */
+void
+vps_destroy(struct vps *vps)
+{
+
+ SDT_PROBE2(vps, functions, vps_destroy, entry, __LINE__, vps);
+
+ VPS_LIST_WLOCK();
+ if (vps->vps_le.le_prev == NULL && vps->vps_le.le_next == NULL) {
+ VPS_LIST_WUNLOCK();
+ DELAY(10000);
+ return;
+ }
+ LIST_REMOVE(vps, vps_le);
+ vps->vps_le.le_prev = NULL;
+ vps->vps_le.le_next = NULL;
+ VPS_LIST_WUNLOCK();
+
+ CURVPS_SET_QUIET(vps);
+ vps_sysuninit();
+ CURVPS_RESTORE();
+
+ /*
+ * Release storage for the virtual process space instance.
+ */
+ free(vps->vps_data_mem, M_VPS_DATA);
+ vps->vps_data_mem = NULL;
+ vps->vps_data_base = 0;
+ vps->vps_pr->pr_vps = NULL;
+ vps->vps_pr = NULL;
+ vps->vps_magic_n = 0xdeadbeef;
+ free(vps, M_VPS);
+ SDT_PROBE1(vps, functions, vps_destroy, return, __LINE__);
+}
+
+/*
+ * Boot time initialization and allocation of virtual process space.
+ */
+static void
+vps_init_prelink(void *arg __unused)
+{
+
+ rw_init(&vps_rwlock, "vps_rwlock");
+ sx_init(&vps_sxlock, "vps_sxlock");
+ sx_init(&vps_sysinit_sxlock, "vps_sysinit_sxlock");
+ LIST_INIT(&vps_head);
+}
+SYSINIT(vps_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST,
+ vps_init_prelink, NULL);
+
+static void
+vps0_init(void *arg __unused)
+{
+
+ if (bootverbose)
+ printf("VIMAGE (virtualized process space) enabled\n");
+
+ /*
+ * We MUST clear curvps in vi_init_done() before going SMP,
+ * otherwise CURVPS_SET() macros would scream about unnecessary
+ * curvps recursions.
+ */
+ curvps = prison0.pr_vps = vps0 = vps_alloc(&prison0);
+}
+SYSINIT(vps0_init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, vps0_init, NULL);
+
+#if 0
+/* Compared to vnets, nuking the vps of the current thread does not go down well. */
+static void
+vps_init_done(void *unused __unused)
+{
+
+ curvps = NULL;
+}
+SYSINIT(vps_init_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY, vps_init_done, NULL);
+#endif
+
+/*
+ * Once on boot, initialize the modspace freelist to entirely cover modspace.
+ */
+static void
+vps_data_startup(void *dummy __unused)
+{
+ struct vps_data_free *df;
+
+ df = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO);
+ df->vnd_start = (uintptr_t)&VPS_NAME(modspace);
+ df->vnd_len = VPS_MODMIN;
+ TAILQ_INSERT_HEAD(&vps_data_free_head, df, vnd_link);
+ sx_init(&vps_data_free_lock, "vps_data alloc lock");
+}
+SYSINIT(vps_data, SI_SUB_KLD, SI_ORDER_FIRST, vps_data_startup, 0);
+
+/* Dummy VPS_SYSINIT to make sure we always reach the final end state. */
+static void
+vps_sysinit_done(void *unused __unused)
+{
+
+ return;
+}
+VPS_SYSINIT(vps_sysinit_done, SI_SUB_VIMAGE_DONE, SI_ORDER_ANY,
+ vps_sysinit_done, NULL);
+
+/*
+ * When a module is loaded and requires storage for a virtualized global
+ * variable, allocate space from the modspace free list. This interface
+ * should be used only by the kernel linker.
+ */
+void *
+vps_data_alloc(int size)
+{
+ struct vps_data_free *df;
+ void *s;
+
+ s = NULL;
+ size = roundup2(size, sizeof(void *));
+ sx_xlock(&vps_data_free_lock);
+ TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) {
+ if (df->vnd_len < size)
+ continue;
+ if (df->vnd_len == size) {
+ s = (void *)df->vnd_start;
+ TAILQ_REMOVE(&vps_data_free_head, df, vnd_link);
+ free(df, M_VPS_DATA_FREE);
+ break;
+ }
+ s = (void *)df->vnd_start;
+ df->vnd_len -= size;
+ df->vnd_start = df->vnd_start + size;
+ break;
+ }
+ sx_xunlock(&vps_data_free_lock);
+
+ return (s);
+}
+
+/*
+ * Free space for a virtualized global variable on module unload.
+ */
+void
+vps_data_free(void *start_arg, int size)
+{
+ struct vps_data_free *df;
+ struct vps_data_free *dn;
+ uintptr_t start;
+ uintptr_t end;
+
+ size = roundup2(size, sizeof(void *));
+ start = (uintptr_t)start_arg;
+ end = start + size;
+ /*
+ * Free a region of space and merge it with as many neighbors as
+ * possible. Keeping the list sorted simplifies this operation.
+ */
+ sx_xlock(&vps_data_free_lock);
+ TAILQ_FOREACH(df, &vps_data_free_head, vnd_link) {
+ if (df->vnd_start > end)
+ break;
+ /*
+ * If we expand at the end of an entry we may have to merge
+ * it with the one following it as well.
+ */
+ if (df->vnd_start + df->vnd_len == start) {
+ df->vnd_len += size;
+ dn = TAILQ_NEXT(df, vnd_link);
+		if (dn != NULL && df->vnd_start + df->vnd_len == dn->vnd_start) {
+ df->vnd_len += dn->vnd_len;
+ TAILQ_REMOVE(&vps_data_free_head, dn,
+ vnd_link);
+ free(dn, M_VPS_DATA_FREE);
+ }
+ sx_xunlock(&vps_data_free_lock);
+ return;
+ }
+ if (df->vnd_start == end) {
+ df->vnd_start = start;
+ df->vnd_len += size;
+ sx_xunlock(&vps_data_free_lock);
+ return;
+ }
+ }
+ dn = malloc(sizeof(*df), M_VPS_DATA_FREE, M_WAITOK | M_ZERO);
+ dn->vnd_start = start;
+ dn->vnd_len = size;
+ if (df)
+ TAILQ_INSERT_BEFORE(df, dn, vnd_link);
+ else
+ TAILQ_INSERT_TAIL(&vps_data_free_head, dn, vnd_link);
+ sx_xunlock(&vps_data_free_lock);
+}
+
+/*
+ * When a new virtualized global variable has been allocated, propagate its
+ * initial value to each already-allocated virtual process space instance.
+ */
+void
+vps_data_copy(void *start, int size)
+{
+ struct vps *vps;
+
+ VPS_LIST_RLOCK();
+ LIST_FOREACH(vps, &vps_head, vps_le)
+ memcpy((void *)((uintptr_t)vps->vps_data_base +
+ (uintptr_t)start), start, size);
+ VPS_LIST_RUNLOCK();
+}
+
+/*
+ * Support for special SYSINIT handlers registered via VPS_SYSINIT()
+ * and VPS_SYSUNINIT().
+ */
+void
+vps_register_sysinit(void *arg)
+{
+ struct vps_sysinit *vs, *vs2;
+ struct vps *vps;
+
+ vs = arg;
+ KASSERT(vs->subsystem >= SI_SUB_INTRINSIC, ("vps sysinit too early"));
+
+ /* Add the constructor to the global list of vps constructors. */
+ VPS_SYSINIT_WLOCK();
+ TAILQ_FOREACH(vs2, &vps_constructors, link) {
+ if (vs2->subsystem > vs->subsystem)
+ break;
+ if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
+ break;
+ }
+ if (vs2 != NULL)
+ TAILQ_INSERT_BEFORE(vs2, vs, link);
+ else
+ TAILQ_INSERT_TAIL(&vps_constructors, vs, link);
+
+ /*
+ * Invoke the constructor on all the existing vpss when it is
+ * registered.
+ */
+ VPS_FOREACH(vps) {
+ CURVPS_SET_QUIET(vps);
+ vs->func(vs->arg);
+ CURVPS_RESTORE();
+ }
+ VPS_SYSINIT_WUNLOCK();
+}
+
+void
+vps_deregister_sysinit(void *arg)
+{
+ struct vps_sysinit *vs;
+
+ vs = arg;
+
+ /* Remove the constructor from the global list of vps constructors. */
+ VPS_SYSINIT_WLOCK();
+ TAILQ_REMOVE(&vps_constructors, vs, link);
+ VPS_SYSINIT_WUNLOCK();
+}
+
+void
+vps_register_sysuninit(void *arg)
+{
+ struct vps_sysinit *vs, *vs2;
+
+ vs = arg;
+
+ /* Add the destructor to the global list of vps destructors. */
+ VPS_SYSINIT_WLOCK();
+ TAILQ_FOREACH(vs2, &vps_destructors, link) {
+ if (vs2->subsystem > vs->subsystem)
+ break;
+ if (vs2->subsystem == vs->subsystem && vs2->order > vs->order)
+ break;
+ }
+ if (vs2 != NULL)
+ TAILQ_INSERT_BEFORE(vs2, vs, link);
+ else
+ TAILQ_INSERT_TAIL(&vps_destructors, vs, link);
+ VPS_SYSINIT_WUNLOCK();
+}
+
+void
+vps_deregister_sysuninit(void *arg)
+{
+ struct vps_sysinit *vs;
+ struct vps *vps;
+
+ vs = arg;
+
+ /*
+ * Invoke the destructor on all the existing vpss when it is
+ * deregistered.
+ */
+ VPS_SYSINIT_WLOCK();
+ VPS_FOREACH(vps) {
+ CURVPS_SET_QUIET(vps);
+ vs->func(vs->arg);
+ CURVPS_RESTORE();
+ }
+
+ /* Remove the destructor from the global list of vps destructors. */
+ TAILQ_REMOVE(&vps_destructors, vs, link);
+ VPS_SYSINIT_WUNLOCK();
+}
+
+/*
+ * Invoke all registered vps constructors on the current vps. Used during
+ * vps construction. The caller is responsible for ensuring the new vps is
+ * the current vps and that the vps_sysinit_sxlock lock is locked.
+ */
+void
+vps_sysinit(void)
+{
+ struct vps_sysinit *vs;
+ struct vps *vps;
+
+ vps = curvps;
+ VPS_SYSINIT_RLOCK();
+ TAILQ_FOREACH(vs, &vps_constructors, link) {
+ curvps->vps_state = vs->subsystem;
+ vs->func(vs->arg);
+ KASSERT((curvps == vps),
+ ("%s: vs %p subsystem %u order %u func %p returned "
+ "with curvps altered: curvps %p should be %p\n",
+ __func__, vs, vs->subsystem, vs->order, vs->func,
+ curvps, vps));
+ }
+ VPS_SYSINIT_RUNLOCK();
+}
+
+/*
+ * Invoke all registered vps destructors on the current vps. Used during
+ * vps destruction.  The caller is responsible for ensuring the dying vps is
+ * the current vps and that the vps_sysinit_sxlock lock is locked.
+ */
+void
+vps_sysuninit(void)
+{
+ struct vps_sysinit *vs;
+
+ VPS_SYSINIT_RLOCK();
+ TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head,
+ link) {
+ curvps->vps_state = vs->subsystem;
+ vs->func(vs->arg);
+ }
+ VPS_SYSINIT_RUNLOCK();
+}
+
+/*
+ * EVENTHANDLER(9) extensions.
+ */
+/*
+ * Invoke the eventhandler function originally registered with the possibly
+ * registered argument for all virtual process space instances.
+ *
+ * This iterator can only be used for eventhandlers that do not take any
+ * additional arguments, as we do ignore the variadic arguments from the
+ * EVENTHANDLER_INVOKE() call.
+ */
+void
+vps_global_eventhandler_iterator_func(void *arg, ...)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+ struct eventhandler_entry_vimage *v_ee;
+
+ /*
+ * There is a bug here in that we should actually cast things to
+ * (struct eventhandler_entry_ ## name *) but that's not easily
+ * possible in here so just re-using the variadic version we
+ * defined for the generic vimage case.
+ */
+ v_ee = arg;
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ ((vimage_iterator_func_t)v_ee->func)(v_ee->ee_arg);
+ CURVPS_RESTORE();
+ }
+ VPS_LIST_RUNLOCK();
+}
+
+#ifdef VPS_DEBUG
+struct vps_recursion {
+ SLIST_ENTRY(vps_recursion) vnr_le;
+ const char *prev_fn;
+ const char *where_fn;
+ int where_line;
+ struct vps *old_vps;
+ struct vps *new_vps;
+};
+
+static SLIST_HEAD(, vps_recursion) vps_recursions =
+ SLIST_HEAD_INITIALIZER(vps_recursions);
+
+static void
+vps_print_recursion(struct vps_recursion *vnr, int brief)
+{
+
+ if (!brief)
+ printf("CURVPS_SET() recursion in ");
+ printf("%s() line %d, prev in %s()", vnr->where_fn, vnr->where_line,
+ vnr->prev_fn);
+ if (brief)
+ printf(", ");
+ else
+ printf("\n ");
+ printf("%p -> %p\n", vnr->old_vps, vnr->new_vps);
+}
+
+void
+vps_log_recursion(struct vps *old_vps, const char *old_fn, int line)
+{
+ struct vps_recursion *vnr;
+
+ /* Skip already logged recursion events. */
+ SLIST_FOREACH(vnr, &vps_recursions, vnr_le)
+ if (vnr->prev_fn == old_fn &&
+ vnr->where_fn == curthread->td_vps_lpush &&
+ vnr->where_line == line &&
+ (vnr->old_vps == vnr->new_vps) == (curvps == old_vps))
+ return;
+
+ vnr = malloc(sizeof(*vnr), M_VPS, M_NOWAIT | M_ZERO);
+ if (vnr == NULL)
+ panic("%s: malloc failed", __func__);
+ vnr->prev_fn = old_fn;
+ vnr->where_fn = curthread->td_vps_lpush;
+ vnr->where_line = line;
+ vnr->old_vps = old_vps;
+ vnr->new_vps = curvps;
+
+ SLIST_INSERT_HEAD(&vps_recursions, vnr, vnr_le);
+
+ vps_print_recursion(vnr, 0);
+#ifdef KDB
+ kdb_backtrace();
+#endif
+}
+#endif /* VPS_DEBUG */
+
+/*
+ * DDB(4).
+ */
+#ifdef DDB
+static void
+db_vps_print(struct vps *vps)
+{
+
+ db_printf("vps = %p\n", vps);
+ db_printf(" vps_magic_n = %#08x (%s, orig %#08x)\n",
+ vps->vps_magic_n,
+ (vps->vps_magic_n == VPS_MAGIC_N) ?
+ "ok" : "mismatch", VPS_MAGIC_N);
+ db_printf(" vps_data_mem = %p\n", vps->vps_data_mem);
+ db_printf(" vps_data_base = %#jx\n",
+ (uintmax_t)vps->vps_data_base);
+ db_printf(" vps_state = %#08x\n", vps->vps_state);
+ db_printf("\n");
+}
+
+DB_SHOW_ALL_COMMAND(vpss, db_show_all_vpss)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+
+ VPS_FOREACH(vps_iter) {
+ db_vps_print(vps_iter);
+ if (db_pager_quit)
+ break;
+ }
+}
+
+DB_SHOW_COMMAND(vps, db_show_vps)
+{
+
+ if (!have_addr) {
+ db_printf("usage: show vps <struct vps *>\n");
+ return;
+ }
+
+ db_vps_print((struct vps *)addr);
+}
+
+static void
+db_show_vps_print_vs(struct vps_sysinit *vs, int ddb)
+{
+ const char *vsname, *funcname;
+ c_db_sym_t sym;
+ db_expr_t offset;
+
+#define xprint(...) \
+ if (ddb) \
+ db_printf(__VA_ARGS__); \
+ else \
+ printf(__VA_ARGS__)
+
+ if (vs == NULL) {
+ xprint("%s: no vps_sysinit * given\n", __func__);
+ return;
+ }
+
+ sym = db_search_symbol((vm_offset_t)vs, DB_STGY_ANY, &offset);
+ db_symbol_values(sym, &vsname, NULL);
+ sym = db_search_symbol((vm_offset_t)vs->func, DB_STGY_PROC, &offset);
+ db_symbol_values(sym, &funcname, NULL);
+ xprint("%s(%p)\n", (vsname != NULL) ? vsname : "", vs);
+ xprint(" %#08x %#08x\n", vs->subsystem, vs->order);
+ xprint(" %p(%s)(%p)\n",
+ vs->func, (funcname != NULL) ? funcname : "", vs->arg);
+#undef xprint
+}
+
+DB_SHOW_COMMAND(vps_sysinit, db_show_vps_sysinit)
+{
+ struct vps_sysinit *vs;
+
+ db_printf("VPS_SYSINIT vs Name(Ptr)\n");
+ db_printf(" Subsystem Order\n");
+ db_printf(" Function(Name)(Arg)\n");
+ TAILQ_FOREACH(vs, &vps_constructors, link) {
+ db_show_vps_print_vs(vs, 1);
+ if (db_pager_quit)
+ break;
+ }
+}
+
+DB_SHOW_COMMAND(vps_sysuninit, db_show_vps_sysuninit)
+{
+ struct vps_sysinit *vs;
+
+ db_printf("VPS_SYSUNINIT vs Name(Ptr)\n");
+ db_printf(" Subsystem Order\n");
+ db_printf(" Function(Name)(Arg)\n");
+ TAILQ_FOREACH_REVERSE(vs, &vps_destructors, vps_sysuninit_head,
+ link) {
+ db_show_vps_print_vs(vs, 1);
+ if (db_pager_quit)
+ break;
+ }
+}
+
+DB_COMMAND(setcurvps, db_setcurvps)
+{
+ struct vps *vps;
+
+ if (!have_addr) {
+		db_printf("usage: setcurvps <struct vps *>\n");
+ return;
+ }
+
+ vps = (struct vps *)addr;
+ db_printf("curvps %p -> %p\n", curvps, vps);
+ curvps = vps;
+ db_vps_print(vps);
+}
+
+#ifdef VPS_DEBUG
+DB_SHOW_COMMAND(vpsrcrs, db_show_vpsrcrs)
+{
+ struct vps_recursion *vnr;
+
+ SLIST_FOREACH(vnr, &vps_recursions, vnr_le)
+ vps_print_recursion(vnr, 1);
+}
+#endif
+#endif /* DDB */
Index: sys/kern/sched_4bsd.c
===================================================================
--- sys/kern/sched_4bsd.c
+++ sys/kern/sched_4bsd.c
@@ -454,17 +454,15 @@
* Recompute process priorities, every hz ticks.
* MP-safe, called without the Giant mutex.
*/
-/* ARGSUSED */
-static void
-schedcpu(void)
+static __inline void
+_schedcpu(fixpt_t loadfac)
{
- fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
struct thread *td;
struct proc *p;
struct td_sched *ts;
int awake;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
@@ -550,7 +548,22 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
+}
+
+static void
+schedcpu(void)
+{
+ VPS_ITERATOR_DECL(vps_iter);
+ fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ _schedcpu(loadfac);
+ CURVPS_RESTORE();
+ }
+ VPS_LIST_RUNLOCK();
}
/*
Index: sys/kern/subr_pcpu.c
===================================================================
--- sys/kern/subr_pcpu.c
+++ sys/kern/subr_pcpu.c
@@ -378,6 +378,7 @@
#ifdef VIMAGE
db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet);
+ db_printf("curvps = %p\n", pc->pc_curthread->td_vps);
#endif
#ifdef WITNESS
Index: sys/kern/subr_prf.c
===================================================================
--- sys/kern/subr_prf.c
+++ sys/kern/subr_prf.c
@@ -165,12 +165,12 @@
if (TD_IS_IDLETHREAD(td))
return (0);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
p = td->td_proc;
PROC_LOCK(p);
if ((p->p_flag & P_CONTROLT) == 0) {
PROC_UNLOCK(p);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (0);
}
SESS_LOCK(p->p_session);
@@ -178,14 +178,14 @@
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
if (pca.tty == NULL) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (0);
}
pca.flags = TOTTY;
pca.p_bufr = NULL;
va_start(ap, fmt);
tty_lock(pca.tty);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
retval = kvprintf(fmt, putchar, &pca, 10, ap);
tty_unlock(pca.tty);
va_end(ap);
@@ -214,7 +214,7 @@
struct putchar_arg pca;
struct session *sess = NULL;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pri != -1)
flags |= TOLOG;
if (p != NULL) {
@@ -237,7 +237,7 @@
pca.p_bufr = NULL;
if (pca.tty != NULL)
tty_lock(pca.tty);
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
kvprintf(fmt, putchar, &pca, 10, ap);
if (pca.tty != NULL)
tty_unlock(pca.tty);
Index: sys/kern/subr_turnstile.c
===================================================================
--- sys/kern/subr_turnstile.c
+++ sys/kern/subr_turnstile.c
@@ -1212,22 +1212,32 @@
DB_SHOW_ALL_COMMAND(chains, db_show_allchains)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct thread *td;
struct proc *p;
int i;
i = 1;
- FOREACH_PROC_IN_SYSTEM(p) {
- FOREACH_THREAD_IN_PROC(p, td) {
- if ((TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested))
- || (TD_IS_INHIBITED(td) && TD_ON_SLEEPQ(td))) {
- db_printf("chain %d:\n", i++);
- print_lockchain(td, " ");
+
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if ((TD_ON_LOCK(td) &&
+ LIST_EMPTY(&td->td_contested))
+ || (TD_IS_INHIBITED(td) &&
+ TD_ON_SLEEPQ(td))) {
+ db_printf("chain %d:\n", i++);
+ print_lockchain(td, " ");
+ }
+ if (db_pager_quit)
+ return;
}
- if (db_pager_quit)
- return;
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
}
DB_SHOW_ALIAS(allchains, db_show_allchains)
Index: sys/kern/subr_witness.c
===================================================================
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -2534,6 +2534,7 @@
DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct thread *td;
struct proc *p;
@@ -2542,19 +2543,25 @@
* held sleep locks, but that information is currently not exported
* by WITNESS.
*/
- FOREACH_PROC_IN_SYSTEM(p) {
- if (!witness_proc_has_locks(p))
- continue;
- FOREACH_THREAD_IN_PROC(p, td) {
- if (!witness_thread_has_locks(td))
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (!witness_proc_has_locks(p))
continue;
- db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
- p->p_comm, td, td->td_tid);
- witness_ddb_list(td);
- if (db_pager_quit)
- return;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (!witness_thread_has_locks(td))
+ continue;
+ db_printf("Process %d (%s) thread %p (%d)\n",
+ p->p_pid, p->p_comm, td, td->td_tid);
+ witness_ddb_list(td);
+ if (db_pager_quit)
+ return;
+ }
}
+ CURVPS_RESTORE();
}
+ /* VPS_LIST_RUNLOCK(); */
}
DB_SHOW_ALIAS(alllocks, db_witness_list_all)
Index: sys/kern/sys_procdesc.c
===================================================================
--- sys/kern/sys_procdesc.c
+++ sys/kern/sys_procdesc.c
@@ -153,13 +153,13 @@
goto out;
}
pd = fp->f_data;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pd->pd_proc != NULL) {
*p = pd->pd_proc;
PROC_LOCK(*p);
} else
error = ESRCH;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
out:
fdrop(fp, td);
return (error);
@@ -305,14 +305,14 @@
{
struct procdesc *pd;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL"));
pd = p->p_procdesc;
PROCDESC_LOCK(pd);
- KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc,
+ KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == V_initproc,
("procdesc_exit: closed && parent not init"));
pd->pd_flags |= PDF_EXITED;
@@ -349,7 +349,7 @@
{
struct procdesc *pd;
- sx_assert(&proctree_lock, SA_XLOCKED);
+ sx_assert(&V_proctree_lock, SA_XLOCKED);
KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL"));
pd = p->p_procdesc;
@@ -375,7 +375,7 @@
fp->f_ops = &badfileops;
fp->f_data = NULL;
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
PROCDESC_LOCK(pd);
pd->pd_flags |= PDF_CLOSED;
PROCDESC_UNLOCK(pd);
@@ -385,7 +385,7 @@
* This is the case where process' exit status was already
* collected and procdesc_reap() was already called.
*/
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
} else {
PROC_LOCK(p);
AUDIT_ARG_PROCESS(p);
@@ -415,11 +415,11 @@
* prejudice.
*/
p->p_sigparent = SIGCHLD;
- proc_reparent(p, initproc);
+ proc_reparent(p, V_initproc);
if ((pd->pd_flags & PDF_DAEMON) == 0)
kern_psignal(p, SIGKILL);
PROC_UNLOCK(p);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
}
}
@@ -531,7 +531,7 @@
*/
bzero(sb, sizeof(*sb));
pd = fp->f_data;
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
if (pd->pd_proc != NULL) {
PROC_LOCK(pd->pd_proc);
AUDIT_ARG_PROCESS(pd->pd_proc);
@@ -553,7 +553,7 @@
PROC_UNLOCK(pd->pd_proc);
} else
sb->st_mode = S_IFREG;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (0);
}
Index: sys/kern/sys_process.c
===================================================================
--- sys/kern/sys_process.c
+++ sys/kern/sys_process.c
@@ -688,7 +688,7 @@
proc_set_traced(struct proc *p, bool stop)
{
- sx_assert(&proctree_lock, SX_XLOCKED);
+ sx_assert(&V_proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_flag |= P_TRACED;
if (stop)
@@ -733,7 +733,7 @@
case PT_SET_EVENT_MASK:
case PT_DETACH:
case PT_GET_SC_ARGS:
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
proctree_locked = 1;
break;
default:
@@ -747,14 +747,14 @@
if (pid <= PID_MAX) {
if ((p = pfind(pid)) == NULL) {
if (proctree_locked)
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (ESRCH);
}
} else {
td2 = tdfind(pid, -1);
if (td2 == NULL) {
if (proctree_locked)
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (ESRCH);
}
p = td2->td_proc;
@@ -816,7 +816,7 @@
error = EBUSY;
goto fail;
}
- if (p->p_pptr == initproc) {
+ if (p->p_pptr == V_initproc) {
error = EPERM;
goto fail;
}
@@ -923,7 +923,7 @@
CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid,
p->p_oppid);
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
proctree_locked = 0;
MPASS(p->p_xthread == NULL);
MPASS((p->p_flag & P_STOPPED_TRACE) == 0);
@@ -1113,7 +1113,7 @@
pp = proc_realparent(p);
proc_reparent(p, pp);
- if (pp == initproc)
+ if (pp == V_initproc)
p->p_sigparent = SIGCHLD;
CTR3(KTR_PTRACE,
"PT_DETACH: pid %d reparented to pid %d, sig %d",
@@ -1142,7 +1142,7 @@
break;
}
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
proctree_locked = 0;
sendsig:
@@ -1456,7 +1456,7 @@
fail:
PROC_UNLOCK(p);
if (proctree_locked)
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (error);
}
#undef PROC_READ
Index: sys/kern/tty.c
===================================================================
--- sys/kern/tty.c
+++ sys/kern/tty.c
@@ -1703,18 +1703,18 @@
/* XXX: This looks awful. */
tty_unlock(tp);
- sx_xlock(&proctree_lock);
+ sx_xlock(&V_proctree_lock);
tty_lock(tp);
if (!SESS_LEADER(p)) {
/* Only the session leader may do this. */
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (EPERM);
}
if (tp->t_session != NULL && tp->t_session == p->p_session) {
/* This is already our controlling TTY. */
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (0);
}
@@ -1732,7 +1732,7 @@
* TTYs of which the session leader has been
* killed or the TTY revoked.
*/
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
return (EPERM);
}
@@ -1740,7 +1740,7 @@
tp->t_session = p->p_session;
tp->t_session->s_ttyp = tp;
tp->t_sessioncnt++;
- sx_xunlock(&proctree_lock);
+ sx_xunlock(&V_proctree_lock);
/* Assign foreground process group. */
tp->t_pgrp = p->p_pgrp;
@@ -1759,12 +1759,12 @@
* decompose proctree_lock.
*/
tty_unlock(tp);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
pg = pgfind(*(int *)data);
if (pg != NULL)
PGRP_UNLOCK(pg);
if (pg == NULL || pg->pg_session != td->td_proc->p_session) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
tty_lock(tp);
return (EPERM);
}
@@ -1775,11 +1775,11 @@
* relocking the TTY.
*/
if (!tty_is_ctty(tp, td->td_proc)) {
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
return (ENOTTY);
}
tp->t_pgrp = pg;
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
/* Wake up the background process groups. */
cv_broadcast(&tp->t_bgwait);
Index: sys/kern/tty_tty.c
===================================================================
--- sys/kern/tty_tty.c
+++ sys/kern/tty_tty.c
@@ -68,7 +68,7 @@
return;
p = curproc;
sx_sunlock(&clone_drain_lock);
- sx_slock(&proctree_lock);
+ sx_slock(&V_proctree_lock);
sx_slock(&clone_drain_lock);
dev_lock();
if (!(p->p_flag & P_CONTROLT))
@@ -83,7 +83,7 @@
*dev = p->p_session->s_ttyvp->v_rdev;
dev_refl(*dev);
dev_unlock();
- sx_sunlock(&proctree_lock);
+ sx_sunlock(&V_proctree_lock);
}
static void
Index: sys/net/vnet.c
===================================================================
--- sys/net/vnet.c
+++ sys/net/vnet.c
@@ -80,8 +80,6 @@
* stack instance.
*/
-FEATURE(vimage, "VIMAGE kernel virtualization");
-
static MALLOC_DEFINE(M_VNET, "vnet", "network stack control block");
/*
@@ -307,7 +305,7 @@
sx_init(&vnet_sysinit_sxlock, "vnet_sysinit_sxlock");
LIST_INIT(&vnet_head);
}
-SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST,
+SYSINIT(vnet_init_prelink, SI_SUB_VIMAGE_PRELINK, SI_ORDER_FIRST,
vnet_init_prelink, NULL);
static void
Index: sys/sys/jail.h
===================================================================
--- sys/sys/jail.h
+++ sys/sys/jail.h
@@ -166,6 +166,7 @@
struct osd pr_osd; /* (p) additional data */
struct cpuset *pr_cpuset; /* (p) cpuset */
struct vnet *pr_vnet; /* (c) network stack */
+ struct vps *pr_vps; /* (c) process space */
struct vnode *pr_root; /* (c) vnode to rdir */
int pr_ip4s; /* (p) number of v4 IPs */
int pr_ip6s; /* (p) number of v6 IPs */
@@ -209,6 +210,7 @@
/* primary jail address. */
#define PR_IP6_SADDRSEL 0x00000100 /* Do IPv6 src addr sel. or use the */
/* primary jail address. */
+#define PR_VPS 0x00000200 /* Virtual process space */
/* Internal flag bits */
#define PR_IP4 0x02000000 /* IPv4 restricted or disabled */
@@ -370,6 +372,7 @@
int prison_allow(struct ucred *, unsigned);
int prison_check(struct ucred *cred1, struct ucred *cred2);
int prison_owns_vnet(struct ucred *);
+int prison_owns_vps(struct ucred *);
int prison_canseemount(struct ucred *cred, struct mount *mp);
void prison_enforce_statfs(struct ucred *cred, struct mount *mp,
struct statfs *sp);
Index: sys/sys/kernel.h
===================================================================
--- sys/sys/kernel.h
+++ sys/sys/kernel.h
@@ -102,7 +102,7 @@
SI_SUB_MTX_POOL_DYNAMIC = 0x1AC0000, /* dynamic mutex pool */
SI_SUB_LOCK = 0x1B00000, /* various locks */
SI_SUB_EVENTHANDLER = 0x1C00000, /* eventhandler init */
- SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */
+ SI_SUB_VIMAGE_PRELINK = 0x1E00000, /* VIMAGE init before modules */
SI_SUB_KLD = 0x2000000, /* KLD and module setup */
SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/
SI_SUB_RACCT = 0x2110000, /* resource accounting */
@@ -159,7 +159,7 @@
SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */
SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/
SI_SUB_SYSCALLS = 0xd800000, /* register system calls */
- SI_SUB_VNET_DONE = 0xdc00000, /* vnet registration complete */
+ SI_SUB_VNET_DONE = 0xdc00000, /* VNET registration complete */
SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/
SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/
SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/
@@ -170,6 +170,7 @@
SI_SUB_SMP = 0xf000000, /* start the APs*/
#endif
SI_SUB_RACCTD = 0xf100000, /* start racctd*/
+ SI_SUB_VIMAGE_DONE = 0xf800000, /* VIMAGE initialization done */
SI_SUB_LAST = 0xfffffff /* final initialization */
};
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -68,6 +68,9 @@
#include <sys/ucred.h>
#include <sys/types.h>
#include <sys/_domainset.h>
+#ifdef _KERNEL
+#include <sys/vps.h>
+#endif
#include <machine/proc.h> /* Machine-dependent proc substruct. */
#ifdef _KERNEL
@@ -351,6 +354,8 @@
/* LP64 hole */
struct vnet *td_vnet; /* (k) Effective vnet. */
const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */
+ struct vps *td_vps; /* (k) Effective vps. */
+ const char *td_vps_lpush; /* (k) Debugging vps push / pop. */
struct trapframe *td_intr_frame;/* (k) Frame of the current irq */
struct proc *td_rfppwait_p; /* (k) The vforked child */
struct vm_page **td_ma; /* (k) uio pages held */
@@ -809,7 +814,7 @@
#endif
#define FOREACH_PROC_IN_SYSTEM(p) \
- LIST_FOREACH((p), &allproc, p_list)
+ LIST_FOREACH((p), &V_allproc, p_list)
#define FOREACH_THREAD_IN_PROC(p, td) \
TAILQ_FOREACH((td), &(p)->p_threads, td_plist)
@@ -939,38 +944,61 @@
#define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0)
-#define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash])
-extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
-extern u_long pidhash;
-#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash])
+LIST_HEAD(pidhashhead, proc);
+VPS_DECLARE(struct pidhashhead *, pidhashtbl);
+#define V_pidhashtbl VPS(pidhashtbl)
+VPS_DECLARE(u_long, pidhash);
+#define V_pidhash VPS(pidhash)
+#define PIDHASH(pid) (&V_pidhashtbl[(pid) & V_pidhash])
+
extern LIST_HEAD(tidhashhead, thread) *tidhashtbl;
extern u_long tidhash;
+#define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash])
extern struct rwlock tidhash_lock;
-#define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash])
-extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
-extern u_long pgrphash;
+LIST_HEAD(pgrphashhead, pgrp);
+VPS_DECLARE(struct pgrphashhead *, pgrphashtbl);
+#define V_pgrphashtbl VPS(pgrphashtbl)
+VPS_DECLARE(u_long, pgrphash);
+#define V_pgrphash VPS(pgrphash)
+#define PGRPHASH(pgid) (&V_pgrphashtbl[(pgid) & V_pgrphash])
-extern struct sx allproc_lock;
+VPS_DECLARE(struct sx, allproc_lock);
+#define V_allproc_lock VPS(allproc_lock)
extern int allproc_gen;
-extern struct sx proctree_lock;
-extern struct mtx ppeers_lock;
+VPS_DECLARE(struct sx, proctree_lock);
+#define V_proctree_lock VPS(proctree_lock)
+VPS_DECLARE(struct mtx, ppeers_lock);
+#define V_ppeers_lock VPS(ppeers_lock)
extern struct proc proc0; /* Process slot for swapper. */
extern struct thread0_storage thread0_st; /* Primary thread in proc0. */
#define thread0 (thread0_st.t0st_thread)
extern struct vmspace vmspace0; /* VM space for proc0. */
+VPS_DECLARE(struct proc *, vproc0);
+#define V_vproc0 VPS(vproc0)
+#ifdef VIMAGE
+VPS_DECLARE(int, vpsdying);
+#define V_vpsdying VPS(vpsdying)
+#endif
extern int hogticks; /* Limit on kernel cpu hogs. */
-extern int lastpid;
-extern int nprocs, maxproc; /* Current and max number of procs. */
+VPS_DECLARE(int, lastpid);
+#define V_lastpid VPS(lastpid)
+VPS_DECLARE(int, nprocs); /* Current number of procs. */
+#define V_nprocs VPS(nprocs)
+extern int maxproc; /* Max number of procs. */
extern int maxprocperuid; /* Max procs per uid. */
extern u_long ps_arg_cache_limit;
LIST_HEAD(proclist, proc);
TAILQ_HEAD(procqueue, proc);
TAILQ_HEAD(threadqueue, thread);
-extern struct proclist allproc; /* List of all processes. */
-extern struct proclist zombproc; /* List of zombie processes. */
-extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
+VPS_DECLARE(struct proclist, allproc); /* List of all processes. */
+#define V_allproc VPS(allproc)
+VPS_DECLARE(struct proclist, zombproc); /* List of zombie processes. */
+#define V_zombproc VPS(zombproc)
+VPS_DECLARE(struct proc *, initproc);	/* Process slot for init. */
+#define V_initproc VPS(initproc)
+extern struct proc *pageproc; /* Process slot for pager. */
extern struct uma_zone *proc_zone;
@@ -1021,6 +1049,7 @@
int fork1(struct thread *, struct fork_req *);
void fork_exit(void (*)(void *, struct trapframe *), void *,
struct trapframe *);
+int fork_findpid(int);
void fork_return(struct thread *, struct trapframe *);
int inferior(struct proc *p);
void kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry,
@@ -1043,7 +1072,6 @@
int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb);
int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb);
int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb);
-void procinit(void);
void proc_linkup0(struct proc *p, struct thread *td);
void proc_linkup(struct proc *p, struct thread *td);
struct proc *proc_realparent(struct proc *child);
Index: sys/sys/resourcevar.h
===================================================================
--- sys/sys/resourcevar.h
+++ sys/sys/resourcevar.h
@@ -154,7 +154,6 @@
struct uidinfo
*uifind(uid_t uid);
void uifree(struct uidinfo *uip);
-void uihashinit(void);
void uihold(struct uidinfo *uip);
#ifdef RACCT
void ui_racct_foreach(void (*callback)(struct racct *racct,
Index: sys/sys/sysctl.h
===================================================================
--- sys/sys/sysctl.h
+++ sys/sys/sysctl.h
@@ -104,6 +104,7 @@
#define CTLFLAG_CAPWR 0x00004000 /* Can be written in capability mode */
#define CTLFLAG_STATS 0x00002000 /* Statistics, not a tuneable */
#define CTLFLAG_NOFETCH 0x00001000 /* Don't fetch tunable from getenv() */
+#define CTLFLAG_VPS 0x00000800 /* Prisons with vps can fiddle */
#define CTLFLAG_CAPRW (CTLFLAG_CAPRD|CTLFLAG_CAPWR)
/*
Index: sys/sys/vps.h
===================================================================
--- /dev/null
+++ sys/sys/vps.h
@@ -0,0 +1,381 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2006-2009 University of Zagreb
+ * Copyright (c) 2006-2009 FreeBSD Foundation
+ * Copyright (c) 2018 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Bjoern Zeeb
+ * under sponsorship from iXsystems, Inc.
+ *
+ * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*-
+ * This header file defines several sets of interfaces supporting virtualized
+ * process space:
+ *
+ * - Definition of 'struct vps' and functions and macros to allocate/free/
+ * manipulate it.
+ *
+ * - A virtual process space memory allocator, which provides support for
+ * virtualized global variables via a special linker set, set_vps.
+ *
+ * - Virtualized sysinits/sysuninits, which allow constructors and
+ * destructors to be run for each process space as virtual
+ * instances are created and destroyed.
+ *
+ * If VIMAGE isn't compiled into the kernel, virtualized global variables
+ * compile to normal global variables, and virtualized sysinits to regular
+ * sysinits.
+ */
+
+#ifndef _SYS_VPS_H_
+#define _SYS_VPS_H_
+
+/*
+ * struct vps describes a virtualized process space, and is primarily a
+ * pointer to storage for virtualized global variables. Expose to userspace
+ * as required for libkvm.
+ */
+#if defined(_KERNEL) || defined(_WANT_VPS)
+#include <sys/queue.h>
+
+struct vps {
+ LIST_ENTRY(vps) vps_le; /* all vps list */
+ u_int vps_magic_n;
+ u_int vps_state; /* SI_SUB_* */
+ void *vps_data_mem;
+ uintptr_t vps_data_base;
+ struct prison *vps_pr; /* Put init on this if set. */
+};
+#define VPS_MAGIC_N 0x0f0307e2
+
+/*
+ * These two virtual process space allocator definitions are also required
+ * for libkvm so that it can evaluate virtualized global variables.
+ */
+#define VPS_SETNAME "set_vps"
+#define VPS_SYMPREFIX "vps_entry_"
+#endif
+
+#ifdef _KERNEL
+#ifdef VIMAGE
+#include <sys/lock.h>
+#include <sys/proc.h> /* for struct thread */
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+
+/*
+ * Location of the kernel's 'set_vps' linker set.
+ */
+extern uintptr_t *__start_set_vps;
+__GLOBL(__start_set_vps);
+extern uintptr_t *__stop_set_vps;
+__GLOBL(__stop_set_vps);
+
+#define VPS_START (uintptr_t)&__start_set_vps
+#define VPS_STOP (uintptr_t)&__stop_set_vps
+
+/*
+ * Functions to allocate and destroy virtual process spaces.
+ */
+struct vps *vps_alloc(struct prison *);
+void vps_destroy(struct vps *);
+
+/*
+ * The current virtual process space -- we may wish to move this to struct
+ * pcpu in the future.
+ */
+#define curvps curthread->td_vps
+
+/*
+ * Various macros -- get and set the current process space, but also
+ * assertions.
+ */
+#if defined(INVARIANTS) || defined(VPS_DEBUG)
+#define VPS_ASSERT(exp, msg) do { \
+ if (!(exp)) \
+ panic msg; \
+} while (0)
+#else
+#define VPS_ASSERT(exp, msg) do { \
+} while (0)
+#endif
+
+#ifdef VPS_DEBUG
+void vps_log_recursion(struct vps *, const char *, int);
+
+#define CURVPS_SET_QUIET(arg) \
+ VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \
+ ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \
+ __FILE__, __LINE__, __func__, curvps, (arg))); \
+ struct vps *saved_vps = curvps; \
+ const char *saved_vps_lpush = curthread->td_vps_lpush; \
+ curvps = arg; \
+ curthread->td_vps_lpush = __func__;
+
+#define CURVPS_SET_VERBOSE(arg) \
+ CURVPS_SET_QUIET(arg) \
+ if (saved_vps) \
+ vps_log_recursion(saved_vps, saved_vps_lpush, __LINE__);
+
+#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg)
+
+#define CURVPS_RESTORE() \
+ VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \
+ saved_vps->vps_magic_n == VPS_MAGIC_N), \
+ ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \
+ __FILE__, __LINE__, __func__, curvps, saved_vps)); \
+ curvps = saved_vps; \
+ curthread->td_vps_lpush = saved_vps_lpush;
+#else /* !VPS_DEBUG */
+
+#define CURVPS_SET_QUIET(arg) \
+ VPS_ASSERT((arg) != NULL && (arg)->vps_magic_n == VPS_MAGIC_N, \
+ ("CURVPS_SET at %s:%d %s() curvps=%p vps=%p", \
+ __FILE__, __LINE__, __func__, curvps, (arg))); \
+ struct vps *saved_vps = curvps; \
+ curvps = arg;
+
+#define CURVPS_SET_VERBOSE(arg) \
+ CURVPS_SET_QUIET(arg)
+
+#define CURVPS_SET(arg) CURVPS_SET_VERBOSE(arg)
+
+#define CURVPS_RESTORE() \
+ VPS_ASSERT(curvps != NULL && (saved_vps == NULL || \
+ saved_vps->vps_magic_n == VPS_MAGIC_N), \
+ ("CURVPS_RESTORE at %s:%d %s() curvps=%p saved_vps=%p", \
+ __FILE__, __LINE__, __func__, curvps, saved_vps)); \
+ curvps = saved_vps;
+#endif /* VPS_DEBUG */
+
+extern struct vps *vps0;
+#define IS_DEFAULT_VPS(arg) ((arg) == vps0)
+
+#define CRED_TO_VPS(cr) (cr)->cr_prison->pr_vps
+#define TD_TO_VPS(td) CRED_TO_VPS((td)->td_ucred)
+#define P_TO_VPS(p) CRED_TO_VPS((p)->p_ucred)
+
+/*
+ * Global linked list of all virtual process spaces, along with read locks to
+ * access it. If a caller may sleep while accessing the list, it must use
+ * the sleepable lock macros.
+ */
+LIST_HEAD(vps_list_head, vps);
+extern struct vps_list_head vps_head;
+extern struct rwlock vps_rwlock;
+extern struct sx vps_sxlock;
+
+#define VPS_LIST_RLOCK() sx_slock(&vps_sxlock)
+#define VPS_LIST_RLOCK_NOSLEEP() rw_rlock(&vps_rwlock)
+#define VPS_LIST_RUNLOCK() sx_sunlock(&vps_sxlock)
+#define VPS_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vps_rwlock)
+
+/*
+ * Iteration macros to walk the global list of virtual process spaces.
+ */
+#define VPS_ITERATOR_DECL(arg) struct vps *arg
+#define VPS_FOREACH(arg) LIST_FOREACH((arg), &vps_head, vps_le)
+
+/*
+ * Virtual process space memory allocator, which allows global variables to
+ * be automatically instantiated for each process space instance.
+ */
+#define VPS_NAME(n) vps_entry_##n
+#define VPS_DECLARE(t, n) extern t VPS_NAME(n)
+#define VPS_DEFINE(t, n) t VPS_NAME(n) __section(VPS_SETNAME) __used
+#define _VPS_PTR(b, n) (__typeof(VPS_NAME(n))*) \
+ ((b) + (uintptr_t)&VPS_NAME(n))
+
+#define _VPS(b, n) (*_VPS_PTR(b, n))
+
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define VPS_VPS_PTR(vps, n) _VPS_PTR((vps)->vps_data_base, n)
+#define VPS_VPS(vps, n) (*VPS_VPS_PTR((vps), n))
+
+#define VPS_PTR(n) VPS_VPS_PTR(curvps, n)
+#define VPS(n) VPS_VPS(curvps, n)
+
+/*
+ * Virtual process space allocator interfaces from the kernel linker.
+ */
+void *vps_data_alloc(int size);
+void vps_data_copy(void *start, int size);
+void vps_data_free(void *start_arg, int size);
+
+/*
+ * Virtual sysinit mechanism, allowing process space components to declare
+ * startup and shutdown methods to be run when virtual process space
+ * instances are created and destroyed.
+ */
+#include <sys/kernel.h>
+
+/*
+ * SYSINIT/SYSUNINIT variants that provide per-vps constructors and
+ * destructors.
+ */
+struct vps_sysinit {
+ enum sysinit_sub_id subsystem;
+ enum sysinit_elem_order order;
+ sysinit_cfunc_t func;
+ const void *arg;
+ TAILQ_ENTRY(vps_sysinit) link;
+};
+
+#define VPS_SYSINIT(ident, subsystem, order, func, arg) \
+ static struct vps_sysinit ident ## _vps_init = { \
+ subsystem, \
+ order, \
+ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \
+ (arg) \
+ }; \
+ SYSINIT(vps_init_ ## ident, subsystem, order, \
+ vps_register_sysinit, &ident ## _vps_init); \
+ SYSUNINIT(vps_init_ ## ident, subsystem, order, \
+ vps_deregister_sysinit, &ident ## _vps_init)
+
+#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \
+ static struct vps_sysinit ident ## _vps_uninit = { \
+ subsystem, \
+ order, \
+ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \
+ (arg) \
+ }; \
+ SYSINIT(vps_uninit_ ## ident, subsystem, order, \
+ vps_register_sysuninit, &ident ## _vps_uninit); \
+ SYSUNINIT(vps_uninit_ ## ident, subsystem, order, \
+ vps_deregister_sysuninit, &ident ## _vps_uninit)
+
+/*
+ * Run per-vps sysinits or sysuninits during vps creation/destruction.
+ */
+void vps_sysinit(void);
+void vps_sysuninit(void);
+
+/*
+ * Interfaces for managing per-vps constructors and destructors.
+ */
+void vps_register_sysinit(void *arg);
+void vps_register_sysuninit(void *arg);
+void vps_deregister_sysinit(void *arg);
+void vps_deregister_sysuninit(void *arg);
+
+/*
+ * EVENTHANDLER(9) extensions.
+ */
+#include <sys/eventhandler.h>
+
+void vps_global_eventhandler_iterator_func(void *, ...);
+#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \
+do { \
+ if (IS_DEFAULT_VPS(curvps)) { \
+ (tag) = vimage_eventhandler_register(NULL, #name, func, \
+ arg, priority, \
+ vps_global_eventhandler_iterator_func); \
+ } \
+} while(0)
+#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \
+do { \
+ if (IS_DEFAULT_VPS(curvps)) { \
+ vimage_eventhandler_register(NULL, #name, func, \
+ arg, priority, \
+ vps_global_eventhandler_iterator_func); \
+ } \
+} while(0)
+
+#else /* !VIMAGE */
+
+/*
+ * Various virtual process space macros compile to no-ops without VIMAGE.
+ */
+#define curvps NULL
+
+#define VPS_ASSERT(exp, msg)
+#define CURVPS_SET(arg)
+#define CURVPS_SET_QUIET(arg)
+#define CURVPS_RESTORE()
+
+#define VPS_LIST_RLOCK()
+#define VPS_LIST_RLOCK_NOSLEEP()
+#define VPS_LIST_RUNLOCK()
+#define VPS_LIST_RUNLOCK_NOSLEEP()
+#define VPS_ITERATOR_DECL(arg)
+#define VPS_FOREACH(arg)
+
+#define IS_DEFAULT_VPS(arg) 1
+#define CRED_TO_VPS(cr) NULL
+#define TD_TO_VPS(td) NULL
+#define P_TO_VPS(p) NULL
+
+/*
+ * Versions of the vps macros that compile to normal global variables and
+ * standard sysctl definitions.
+ */
+#define VPS_NAME(n) n
+#define VPS_DECLARE(t, n) extern t n
+#define VPS_DEFINE(t, n) t n
+#define _VPS_PTR(b, n) &VPS_NAME(n)
+
+/*
+ * Virtualized global variable accessor macros.
+ */
+#define VPS_VPS_PTR(vps, n) (&(n))
+#define VPS_VPS(vps, n) (n)
+
+#define VPS_PTR(n) (&(n))
+#define VPS(n) (n)
+
+/*
+ * When VIMAGE isn't compiled into the kernel, VPS_SYSINIT/VPS_SYSUNINIT
+ * map into normal sysinits, which have the same ordering properties.
+ */
+#define VPS_SYSINIT(ident, subsystem, order, func, arg) \
+ SYSINIT(ident, subsystem, order, func, arg)
+#define VPS_SYSUNINIT(ident, subsystem, order, func, arg) \
+ SYSUNINIT(ident, subsystem, order, func, arg)
+
+/*
+ * Without VIMAGE revert to the default implementation.
+ */
+#define VPS_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \
+ (tag) = eventhandler_register(NULL, #name, func, arg, priority)
+#define VPS_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \
+ eventhandler_register(NULL, #name, func, arg, priority)
+#endif /* VIMAGE */
+#endif /* _KERNEL */
+
+#endif /* !_SYS_VPS_H_ */
Index: sys/vm/vm_meter.c
===================================================================
--- sys/vm/vm_meter.c
+++ sys/vm/vm_meter.c
@@ -177,6 +177,7 @@
static int
vmtotal(SYSCTL_HANDLER_ARGS)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct vmtotal total;
#if defined(COMPAT_FREEBSD11)
struct vmtotal11 total11;
@@ -197,41 +198,48 @@
/*
* Calculate process statistics.
*/
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- if ((p->p_flag & P_SYSTEM) != 0)
- continue;
- PROC_LOCK(p);
- if (p->p_state != PRS_NEW) {
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- switch (td->td_state) {
- case TDS_INHIBITED:
- if (TD_IS_SWAPPED(td))
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if ((p->p_flag & P_SYSTEM) != 0)
+ continue;
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NEW) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ switch (td->td_state) {
+ case TDS_INHIBITED:
+ if (TD_IS_SWAPPED(td))
+ total.t_sw++;
+ else if (TD_IS_SLEEPING(td)) {
+ if (td->td_priority <=
+ PZERO)
+ total.t_dw++;
+ else
+ total.t_sl++;
+ }
+ break;
+ case TDS_CAN_RUN:
total.t_sw++;
- else if (TD_IS_SLEEPING(td)) {
- if (td->td_priority <= PZERO)
- total.t_dw++;
- else
- total.t_sl++;
+ break;
+ case TDS_RUNQ:
+ case TDS_RUNNING:
+ total.t_rq++;
+ break;
+ default:
+ break;
}
- break;
- case TDS_CAN_RUN:
- total.t_sw++;
- break;
- case TDS_RUNQ:
- case TDS_RUNNING:
- total.t_rq++;
- break;
- default:
- break;
+ thread_unlock(td);
}
- thread_unlock(td);
}
+ PROC_UNLOCK(p);
}
- PROC_UNLOCK(p);
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
/*
* Calculate object memory usage statistics.
*/
Index: sys/vm/vm_object.c
===================================================================
--- sys/vm/vm_object.c
+++ sys/vm/vm_object.c
@@ -2507,18 +2507,27 @@
static int
vm_object_in_map(vm_object_t object)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
- /* sx_slock(&allproc_lock); */
- FOREACH_PROC_IN_SYSTEM(p) {
- if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
- continue;
- if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) {
- /* sx_sunlock(&allproc_lock); */
- return 1;
+ /* VPS_LIST_RLOCK(); */
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ /* sx_slock(&V_allproc_lock); */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (!p->p_vmspace
+ /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */)
+ continue;
+ if (_vm_object_in_map(&p->p_vmspace->vm_map, object,
+ 0)) {
+ /* sx_sunlock(&V_allproc_lock); */
+ return 1;
+ }
}
+ /* sx_sunlock(&V_allproc_lock); */
+ CURVPS_RESTORE();
}
- /* sx_sunlock(&allproc_lock); */
+ /* VPS_LIST_RUNLOCK(); */
if (_vm_object_in_map(kernel_map, object, 0))
return 1;
return 0;
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -1744,6 +1744,7 @@
void
vm_pageout_oom(int shortage)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
struct thread *td;
@@ -1760,80 +1761,88 @@
*/
bigproc = NULL;
bigsize = 0;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- /*
- * If this is a system, protected or killed process, skip it.
- */
- if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
- P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
- p->p_pid == 1 || P_KILLED(p) ||
- (p->p_pid < 48 && swap_pager_avail != 0)) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * If the process is in a non-running type state,
- * don't touch it. Check all the threads individually.
- */
- breakout = false;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (!TD_ON_RUNQ(td) &&
- !TD_IS_RUNNING(td) &&
- !TD_IS_SLEEPING(td) &&
- !TD_IS_SUSPENDED(td) &&
- !TD_IS_SWAPPED(td)) {
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+
+ /*
+ * If this is a system, protected or killed process,
+ * skip it.
+ */
+ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
+ P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
+ p->p_pid == 1 || P_KILLED(p) ||
+ (p->p_pid < 48 && swap_pager_avail != 0)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * If the process is in a non-running type state,
+ * don't touch it. Check all the threads individually.
+ */
+ breakout = false;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (!TD_ON_RUNQ(td) &&
+ !TD_IS_RUNNING(td) &&
+ !TD_IS_SLEEPING(td) &&
+ !TD_IS_SUSPENDED(td) &&
+ !TD_IS_SWAPPED(td)) {
+ thread_unlock(td);
+ breakout = true;
+ break;
+ }
thread_unlock(td);
- breakout = true;
- break;
}
- thread_unlock(td);
- }
- if (breakout) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * get the process size
- */
- vm = vmspace_acquire_ref(p);
- if (vm == NULL) {
+ if (breakout) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * get the process size
+ */
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ _PHOLD_LITE(p);
PROC_UNLOCK(p);
- continue;
- }
- _PHOLD_LITE(p);
- PROC_UNLOCK(p);
- sx_sunlock(&allproc_lock);
- if (!vm_map_trylock_read(&vm->vm_map)) {
+ sx_sunlock(&V_allproc_lock);
+ if (!vm_map_trylock_read(&vm->vm_map)) {
+ vmspace_free(vm);
+ sx_slock(&V_allproc_lock);
+ PRELE(p);
+ continue;
+ }
+ size = vmspace_swap_count(vm);
+ if (shortage == VM_OOM_MEM)
+ size += vm_pageout_oom_pagecount(vm);
+ vm_map_unlock_read(&vm->vm_map);
vmspace_free(vm);
- sx_slock(&allproc_lock);
- PRELE(p);
- continue;
- }
- size = vmspace_swap_count(vm);
- if (shortage == VM_OOM_MEM)
- size += vm_pageout_oom_pagecount(vm);
- vm_map_unlock_read(&vm->vm_map);
- vmspace_free(vm);
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
- /*
- * If this process is bigger than the biggest one,
- * remember it.
- */
- if (size > bigsize) {
- if (bigproc != NULL)
- PRELE(bigproc);
- bigproc = p;
- bigsize = size;
- } else {
- PRELE(p);
+ /*
+ * If this process is bigger than the biggest one,
+ * remember it.
+ */
+ if (size > bigsize) {
+ if (bigproc != NULL)
+ PRELE(bigproc);
+ bigproc = p;
+ bigsize = size;
+ } else {
+ PRELE(p);
+ }
}
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
if (bigproc != NULL) {
if (vm_panic_on_oom != 0)
panic("out of swap space");
Index: sys/vm/vm_swapout.c
===================================================================
--- sys/vm/vm_swapout.c
+++ sys/vm/vm_swapout.c
@@ -378,6 +378,7 @@
static void
vm_daemon(void)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct rlimit rsslim;
struct proc *p;
struct thread *td;
@@ -417,114 +418,129 @@
attempts = 0;
again:
attempts++;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- vm_pindex_t limit, size;
- /*
- * if this is a system process or if we have already
- * looked at this process, skip it.
- */
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL ||
- p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * if the process is in a non-running type state,
- * don't touch it.
- */
- breakout = 0;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- if (!TD_ON_RUNQ(td) &&
- !TD_IS_RUNNING(td) &&
- !TD_IS_SLEEPING(td) &&
- !TD_IS_SUSPENDED(td)) {
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ vm_pindex_t limit, size;
+
+ /*
+ * If this is a system process or if we have
+ * already looked at this process, skip it.
+ */
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL || p->p_flag &
+ (P_INEXEC | P_SYSTEM | P_WEXIT)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * If the process is in a non-running type
+ * state, don't touch it.
+ */
+ breakout = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (!TD_ON_RUNQ(td) &&
+ !TD_IS_RUNNING(td) &&
+ !TD_IS_SLEEPING(td) &&
+ !TD_IS_SUSPENDED(td)) {
+ thread_unlock(td);
+ breakout = 1;
+ break;
+ }
thread_unlock(td);
- breakout = 1;
- break;
}
- thread_unlock(td);
- }
- if (breakout) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * get a limit
- */
- lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
- limit = OFF_TO_IDX(
- qmin(rsslim.rlim_cur, rsslim.rlim_max));
+ if (breakout) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * get a limit
+ */
+ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
+ limit = OFF_TO_IDX(
+ qmin(rsslim.rlim_cur, rsslim.rlim_max));
- /*
- * let processes that are swapped out really be
- * swapped out set the limit to nothing (will force a
- * swap-out.)
- */
- if ((p->p_flag & P_INMEM) == 0)
- limit = 0; /* XXX */
- vm = vmspace_acquire_ref(p);
- _PHOLD_LITE(p);
- PROC_UNLOCK(p);
- if (vm == NULL) {
- PRELE(p);
- continue;
- }
- sx_sunlock(&allproc_lock);
+ /*
+ * let processes that are swapped out really be
+ * swapped out set the limit to nothing
+ * (will force a swap-out.)
+ */
+ if ((p->p_flag & P_INMEM) == 0)
+ limit = 0; /* XXX */
+ vm = vmspace_acquire_ref(p);
+ _PHOLD_LITE(p);
+ PROC_UNLOCK(p);
+ if (vm == NULL) {
+ PRELE(p);
+ continue;
+ }
+ sx_sunlock(&V_allproc_lock);
- size = vmspace_resident_count(vm);
- if (size >= limit) {
- vm_swapout_map_deactivate_pages(
- &vm->vm_map, limit);
size = vmspace_resident_count(vm);
- }
-#ifdef RACCT
- if (racct_enable) {
- rsize = IDX_TO_OFF(size);
- PROC_LOCK(p);
- if (p->p_state == PRS_NORMAL)
- racct_set(p, RACCT_RSS, rsize);
- ravailable = racct_get_available(p, RACCT_RSS);
- PROC_UNLOCK(p);
- if (rsize > ravailable) {
- /*
- * Don't be overly aggressive; this
- * might be an innocent process,
- * and the limit could've been exceeded
- * by some memory hog. Don't try
- * to deactivate more than 1/4th
- * of process' resident set size.
- */
- if (attempts <= 8) {
- if (ravailable < rsize -
- (rsize / 4)) {
- ravailable = rsize -
- (rsize / 4);
- }
- }
+ if (size >= limit) {
vm_swapout_map_deactivate_pages(
- &vm->vm_map,
- OFF_TO_IDX(ravailable));
- /* Update RSS usage after paging out. */
+ &vm->vm_map, limit);
size = vmspace_resident_count(vm);
+ }
+#ifdef RACCT
+ if (racct_enable) {
rsize = IDX_TO_OFF(size);
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL)
racct_set(p, RACCT_RSS, rsize);
+ ravailable = racct_get_available(p,
+ RACCT_RSS);
PROC_UNLOCK(p);
- if (rsize > ravailable)
- tryagain = 1;
+ if (rsize > ravailable) {
+ /*
+ * Don't be overly aggressive;
+ * this might be an innocent
+ * process, and the limit
+ * could've been exceeded by
+ * some memory hog. Don't try to
+ * deactivate more than 1/4th of
+ * process' resident set size.
+ */
+ if (attempts <= 8) {
+ if (ravailable < rsize -
+ (rsize / 4)) {
+ ravailable =
+ rsize -
+ (rsize / 4);
+ }
+ }
+ vm_swapout_map_deactivate_pages(
+ &vm->vm_map,
+ OFF_TO_IDX(ravailable));
+ /*
+ * Update RSS usage after
+ * paging out.
+ */
+ size = vmspace_resident_count(
+ vm);
+ rsize = IDX_TO_OFF(size);
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL)
+ racct_set(p, RACCT_RSS,
+ rsize);
+ PROC_UNLOCK(p);
+ if (rsize > ravailable)
+ tryagain = 1;
+ }
}
- }
#endif
- vmspace_free(vm);
- sx_slock(&allproc_lock);
- PRELE(p);
+ vmspace_free(vm);
+ sx_slock(&V_allproc_lock);
+ PRELE(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
if (tryagain != 0 && attempts <= 10) {
maybe_yield();
goto again;
@@ -656,6 +672,13 @@
int ppri, pri, slptime, swtime;
loop:
+#ifdef VIMAGE
+ if (!IS_DEFAULT_VPS(curvps) && V_vpsdying > 0) {
+ V_vproc0 = NULL;
+ return;
+ }
+#endif
+
if (vm_page_count_min()) {
vm_wait_min();
goto loop;
@@ -663,7 +686,7 @@
pp = NULL;
ppri = INT_MIN;
- sx_slock(&allproc_lock);
+ sx_slock(&V_allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW ||
@@ -698,13 +721,13 @@
}
PROC_UNLOCK(p);
}
- sx_sunlock(&allproc_lock);
+ sx_sunlock(&V_allproc_lock);
/*
* Nothing to do, back to sleep.
*/
if ((p = pp) == NULL) {
- tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
+ tsleep(V_vproc0, PVM, "swapin", MAXSLP * hz / 2);
goto loop;
}
PROC_LOCK(p);
@@ -738,6 +761,7 @@
static void
swapout_procs(int action)
{
+ VPS_ITERATOR_DECL(vps_iter);
struct proc *p;
struct thread *td;
int slptime;
@@ -746,74 +770,81 @@
MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0);
didswap = false;
- sx_slock(&allproc_lock);
- FOREACH_PROC_IN_SYSTEM(p) {
- /*
- * Filter out not yet fully constructed processes. Do
- * not swap out held processes. Avoid processes which
- * are system, exiting, execing, traced, already swapped
- * out or are in the process of being swapped in or out.
- */
- PROC_LOCK(p);
- if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag &
- (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE |
- P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) !=
- P_INMEM) {
- PROC_UNLOCK(p);
- continue;
- }
- /*
- * Further consideration of this process for swap out
- * requires iterating over its threads. We release
- * allproc_lock here so that process creation and
- * destruction are not blocked while we iterate.
- *
- * To later reacquire allproc_lock and resume
- * iteration over the allproc list, we will first have
- * to release the lock on the process. We place a
- * hold on the process so that it remains in the
- * allproc list while it is unlocked.
- */
- _PHOLD_LITE(p);
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RLOCK();
+ VPS_FOREACH(vps_iter) {
+ CURVPS_SET(vps_iter);
+ sx_slock(&V_allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ /*
+ * Filter out not yet fully constructed processes. Do
+ * not swap out held processes. Avoid processes which
+ * are system, exiting, execing, traced, already swapped
+ * out or are in the process of being swapped in or out.
+ */
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL || p->p_lock != 0 ||
+ (p->p_flag & (P_SYSTEM | P_WEXIT | P_INEXEC |
+ P_STOPPED_SINGLE | P_TRACED | P_SWAPPINGOUT |
+ P_SWAPPINGIN | P_INMEM)) != P_INMEM) {
+ PROC_UNLOCK(p);
+ continue;
+ }
- /*
- * Do not swapout a realtime process.
- * Guarantee swap_idle_threshold1 time in memory.
- * If the system is under memory stress, or if we are
- * swapping idle processes >= swap_idle_threshold2,
- * then swap the process out.
- */
- doswap = true;
- FOREACH_THREAD_IN_PROC(p, td) {
- thread_lock(td);
- slptime = (ticks - td->td_slptick) / hz;
- if (PRI_IS_REALTIME(td->td_pri_class) ||
- slptime < swap_idle_threshold1 ||
- !thread_safetoswapout(td) ||
- ((action & VM_SWAP_NORMAL) == 0 &&
- slptime < swap_idle_threshold2))
- doswap = false;
- thread_unlock(td);
- if (!doswap)
- break;
- }
- if (doswap && swapout(p) == 0)
- didswap = true;
+ /*
+ * Further consideration of this process for swap out
+ * requires iterating over its threads. We release
+ * allproc_lock here so that process creation and
+ * destruction are not blocked while we iterate.
+ *
+ * To later reacquire allproc_lock and resume
+ * iteration over the allproc list, we will first have
+ * to release the lock on the process. We place a
+ * hold on the process so that it remains in the
+ * allproc list while it is unlocked.
+ */
+ _PHOLD_LITE(p);
+ sx_sunlock(&V_allproc_lock);
- PROC_UNLOCK(p);
- sx_slock(&allproc_lock);
- PRELE(p);
+ /*
+ * Do not swapout a realtime process.
+ * Guarantee swap_idle_threshold1 time in memory.
+ * If the system is under memory stress, or if we are
+ * swapping idle processes >= swap_idle_threshold2,
+ * then swap the process out.
+ */
+ doswap = true;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ slptime = (ticks - td->td_slptick) / hz;
+ if (PRI_IS_REALTIME(td->td_pri_class) ||
+ slptime < swap_idle_threshold1 ||
+ !thread_safetoswapout(td) ||
+ ((action & VM_SWAP_NORMAL) == 0 &&
+ slptime < swap_idle_threshold2))
+ doswap = false;
+ thread_unlock(td);
+ if (!doswap)
+ break;
+ }
+ if (doswap && swapout(p) == 0)
+ didswap = true;
+
+ PROC_UNLOCK(p);
+ sx_slock(&V_allproc_lock);
+ PRELE(p);
+ }
+ sx_sunlock(&V_allproc_lock);
+ CURVPS_RESTORE();
}
- sx_sunlock(&allproc_lock);
+ VPS_LIST_RUNLOCK();
/*
* If we swapped something out, and another process needed memory,
* then wakeup the sched process.
*/
if (didswap)
- wakeup(&proc0);
+ wakeup(V_vproc0);
}
static void
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Mar 12, 12:36 PM (9 h, 49 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
29582528
Default Alt Text
D15865.diff (197 KB)
Attached To
Mode
D15865: Provide process space virtualisation functionality for jails.
Attached
Detach File
Event Timeline
Log In to Comment