diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 3e4f19d655e6..7386a0729835 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -1,919 +1,920 @@
/*-
* SPDX-License-Identifier: BSD-4-Clause
*
* Copyright (c) 1995 Terrence R. Lambert
* All rights reserved.
*
* Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_ddb.h"
#include "opt_kdb.h"
#include "opt_init_path.h"
#include "opt_verbose_sysinit.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/boottrace.h>
#include <sys/conf.h>
#include <sys/cpuset.h>
#include <sys/dtrace_bsd.h>
#include <sys/epoch.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/imgact.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/queue.h>
#include <sys/queue_mergesort.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <machine/cpu.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/copyright.h>
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
void mi_startup(void); /* Should be elsewhere */
/* Components of the first process -- never freed. */
static struct session session0;
static struct pgrp pgrp0;
struct proc proc0;
struct thread0_storage thread0_st __aligned(32);
struct vmspace vmspace0;
struct proc *initproc;
int
linux_alloc_current_noop(struct thread *td __unused, int flags __unused)
{
return (0);
}
int (*lkpi_alloc_current)(struct thread *, int) = linux_alloc_current_noop;
#ifndef BOOTHOWTO
#define BOOTHOWTO 0
#endif
int boothowto = BOOTHOWTO; /* initialized so that it can be patched */
SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0,
"Boot control flags, passed from loader");
#ifndef BOOTVERBOSE
#define BOOTVERBOSE 0
#endif
int bootverbose = BOOTVERBOSE;
SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0,
"Control the output of verbose kernel messages");
#ifdef VERBOSE_SYSINIT
/*
* We'll use the defined value of VERBOSE_SYSINIT from the kernel config to
* dictate the default VERBOSE_SYSINIT behavior. Significant values for this
* option and associated tunable are:
* - 0, 'compiled in but silent by default'
* - 1, 'compiled in but verbose by default' (default)
*/
int verbose_sysinit = VERBOSE_SYSINIT;
TUNABLE_INT("debug.verbose_sysinit", &verbose_sysinit);
#endif
#ifdef INVARIANTS
FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance");
#endif
/*
* This ensures that there is at least one entry so that the sysinit_set
* symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never
* executed.
*/
SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
/*
* The sysinit linker set compiled into the kernel. These are placed onto the
* sysinit list by mi_startup; sysinit_add can add (e.g., from klds) additional
* sysinits to the linked list but the linker set here does not change.
*/
SET_DECLARE(sysinit_set, struct sysinit);
/*
* The sysinit lists. Items are moved to sysinit_done_list when done.
*/
static STAILQ_HEAD(sysinitlist, sysinit) sysinit_list;
static struct sysinitlist sysinit_done_list =
STAILQ_HEAD_INITIALIZER(sysinit_done_list);
/*
* Compare two sysinits; return -1, 0, or 1 if a comes before, at the same time
* as, or after b.
*/
static int
sysinit_compar(struct sysinit *a, struct sysinit *b, void *thunk __unused)
{
if (a->subsystem < b->subsystem)
return (-1);
if (a->subsystem > b->subsystem)
return (1);
if (a->order < b->order)
return (-1);
if (a->order > b->order)
return (1);
return (0);
}
static void
sysinit_mklist(struct sysinitlist *list, struct sysinit **set,
struct sysinit **set_end)
{
struct sysinit **sipp;
TSENTER();
TSENTER2("listify");
STAILQ_INIT(list);
for (sipp = set; sipp < set_end; sipp++)
STAILQ_INSERT_TAIL(list, *sipp, next);
TSEXIT2("listify");
TSENTER2("mergesort");
STAILQ_MERGESORT(list, NULL, sysinit_compar, sysinit, next);
TSEXIT2("mergesort");
TSEXIT();
}
/*
* Merge a new sysinit set into the sysinit list.
*/
void
sysinit_add(struct sysinit **set, struct sysinit **set_end)
{
struct sysinitlist new_list;
TSENTER();
/* Construct a sorted list from the new sysinits. */
sysinit_mklist(&new_list, set, set_end);
/* Merge the new list into the existing one. */
TSENTER2("STAILQ_MERGE");
STAILQ_MERGE(&sysinit_list, &new_list, NULL, sysinit_compar, sysinit, next);
TSEXIT2("STAILQ_MERGE");
TSEXIT();
}
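/*
 * Aside (a minimal user-space sketch, not kernel source; the struct,
 * subsystem values, and names are hypothetical): this illustrates the
 * ordering that sysinit_compar() and the merge above produce. A module's
 * sysinits interleave with the boot-time list by (subsystem, order), and
 * ties keep the entries already on the list first.
 */
#include <stdio.h>

struct si {
	unsigned subsystem;
	unsigned order;
	const char *name;
};

static int
si_cmp(const struct si *a, const struct si *b)
{
	if (a->subsystem != b->subsystem)
		return (a->subsystem < b->subsystem ? -1 : 1);
	if (a->order != b->order)
		return (a->order < b->order ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct si boot[] = {	/* already sorted, as after sysinit_mklist() */
		{ 0x0800000, 0, "copyright" }, { 0x2000000, 0, "proc0" },
	};
	struct si kld[] = {	/* hypothetical module sysinits, also sorted */
		{ 0x0800000, 1, "mod_banner" }, { 0x3000000, 0, "mod_init" },
	};
	size_t i = 0, j = 0;

	/* Plain two-way merge of the two sorted sequences. */
	while (i < 2 || j < 2) {
		if (j == 2 || (i < 2 && si_cmp(&boot[i], &kld[j]) <= 0))
			printf("%s\n", boot[i++].name);
		else
			printf("%s\n", kld[j++].name);
	}
	/* Prints: copyright, mod_banner, proc0, mod_init */
	return (0);
}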
#if defined (DDB) && defined(VERBOSE_SYSINIT)
static const char *
symbol_name(vm_offset_t va, db_strategy_t strategy)
{
const char *name;
c_db_sym_t sym;
db_expr_t offset;
if (va == 0)
return (NULL);
sym = db_search_symbol(va, strategy, &offset);
if (offset != 0)
return (NULL);
db_symbol_values(sym, &name, NULL);
return (name);
}
#endif
/*
* System startup; initialize the world, create process 0, mount root
* filesystem, and fork to create init and pagedaemon. Most of the
* hard work is done in the lower-level initialization routines including
* startup(), which does memory initialization and autoconfiguration.
*
* This allows simple addition of new kernel subsystems that require
* boot time initialization. It also allows substitution of subsystem
* (for instance, a scheduler, kernel profiler, or VM system) by object
* module. Finally, it allows for optional "kernel threads".
*/
void
mi_startup(void)
{
-
struct sysinit *sip;
int last;
#if defined(VERBOSE_SYSINIT)
int verbose;
#endif
TSENTER();
if (boothowto & RB_VERBOSE)
bootverbose++;
/* Construct and sort sysinit list. */
sysinit_mklist(&sysinit_list, SET_BEGIN(sysinit_set), SET_LIMIT(sysinit_set));
last = SI_SUB_COPYRIGHT;
#if defined(VERBOSE_SYSINIT)
verbose = 0;
#if !defined(DDB)
printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
#endif
#endif
/*
* Perform each system initialization task from the ordered list. Note
* that if sysinit_list is modified (e.g. by a KLD) we will nonetheless
* always perform the earliest-sorted sysinit at each step; using the
* STAILQ_FOREACH macro would result in items being skipped if inserted
* earlier than the "current item".
*/
while ((sip = STAILQ_FIRST(&sysinit_list)) != NULL) {
STAILQ_REMOVE_HEAD(&sysinit_list, next);
STAILQ_INSERT_TAIL(&sysinit_done_list, sip, next);
if (sip->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s)*/
if (sip->subsystem > last)
BOOTTRACE_INIT("sysinit 0x%7x", sip->subsystem);
#if defined(VERBOSE_SYSINIT)
if (sip->subsystem > last && verbose_sysinit != 0) {
verbose = 1;
printf("subsystem %x\n", last);
}
if (verbose) {
#if defined(DDB)
const char *func, *data;
func = symbol_name((vm_offset_t)sip->func,
DB_STGY_PROC);
data = symbol_name((vm_offset_t)sip->udata,
DB_STGY_ANY);
if (func != NULL && data != NULL)
printf(" %s(&%s)... ", func, data);
else if (func != NULL)
printf(" %s(%p)... ", func, sip->udata);
else
#endif
printf(" %p(%p)... ", sip->func,
sip->udata);
}
#endif
/* Call function */
(*(sip->func))(sip->udata);
#if defined(VERBOSE_SYSINIT)
if (verbose)
printf("done.\n");
#endif
/* Check off the one we've just done */
last = sip->subsystem;
}
TSEXIT(); /* Here so we don't overlap with start_init. */
BOOTTRACE("mi_startup done");
mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
mtx_unlock(&Giant);
/*
- * Now hand over this thread to swapper.
+ * We can't free our thread structure since it is statically allocated.
+ * Just sleep forever. This thread could be repurposed for something if
+ * the need arises.
*/
- swapper();
- /* NOTREACHED*/
+ for (;;)
+ tsleep(__builtin_frame_address(0), PNOLOCK, "parked", 0);
}
static void
print_caddr_t(void *data)
{
printf("%s", (char *)data);
}
static void
print_version(void *data __unused)
{
int len;
/* Strip a trailing newline from version. */
len = strlen(version);
while (len > 0 && version[len - 1] == '\n')
len--;
printf("%.*s %s\n", len, version, machine);
printf("%s\n", compiler_version);
}
SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t,
copyright);
SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t,
trademark);
SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL);
#ifdef WITNESS
static char wit_warn[] =
"WARNING: WITNESS option enabled, expect reduced performance.\n";
SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_FOURTH,
print_caddr_t, wit_warn);
SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_FOURTH,
print_caddr_t, wit_warn);
#endif
#ifdef DIAGNOSTIC
static char diag_warn[] =
"WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH,
print_caddr_t, diag_warn);
SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_FIFTH,
print_caddr_t, diag_warn);
#endif
#if __SIZEOF_LONG__ == 4
static char ilp32_warn[] =
"WARNING: 32-bit kernels are deprecated and may be removed in FreeBSD 15.0.\n";
SYSINIT(ilp32warn, SI_SUB_COPYRIGHT, SI_ORDER_FIFTH,
print_caddr_t, ilp32_warn);
SYSINIT(ilp32warn2, SI_SUB_LAST, SI_ORDER_FIFTH,
print_caddr_t, ilp32_warn);
#endif
static int
null_fetch_syscall_args(struct thread *td __unused)
{
panic("null_fetch_syscall_args");
}
static void
null_set_syscall_retval(struct thread *td __unused, int error __unused)
{
panic("null_set_syscall_retval");
}
static void
null_set_fork_retval(struct thread *td __unused)
{
}
struct sysentvec null_sysvec = {
.sv_size = 0,
.sv_table = NULL,
.sv_fixup = NULL,
.sv_sendsig = NULL,
.sv_sigcode = NULL,
.sv_szsigcode = NULL,
.sv_name = "null",
.sv_coredump = NULL,
.sv_minsigstksz = 0,
.sv_minuser = VM_MIN_ADDRESS,
.sv_maxuser = VM_MAXUSER_ADDRESS,
.sv_usrstack = USRSTACK,
.sv_psstrings = PS_STRINGS,
.sv_psstringssz = sizeof(struct ps_strings),
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_strings = NULL,
.sv_setregs = NULL,
.sv_fixlimit = NULL,
.sv_maxssiz = NULL,
.sv_flags = 0,
.sv_set_syscall_retval = null_set_syscall_retval,
.sv_fetch_syscall_args = null_fetch_syscall_args,
.sv_syscallnames = NULL,
.sv_schedtail = NULL,
.sv_thread_detach = NULL,
.sv_trap = NULL,
.sv_set_fork_retval = null_set_fork_retval,
.sv_regset_begin = NULL,
.sv_regset_end = NULL,
};
/*
* The two following SYSINIT's are proc0 specific glue code. I am not
* convinced that they cannot safely be combined, but their order of
* operation has been kept the same as in the original init_main.c
* for right now.
*/
/* ARGSUSED*/
static void
proc0_init(void *dummy __unused)
{
struct proc *p;
struct thread *td;
struct ucred *newcred;
struct uidinfo tmpuinfo;
struct loginclass tmplc = {
.lc_name = "",
};
vm_paddr_t pageablemem;
int i;
GIANT_REQUIRED;
p = &proc0;
td = &thread0;
/*
* Initialize magic number and osrel.
*/
p->p_magic = P_MAGIC;
p->p_osrel = osreldate;
/*
* Initialize thread and process structures.
*/
procinit(); /* set up proc zone */
threadinit(); /* set up UMA zones */
/*
* Initialise scheduler resources.
* Add scheduler specific parts to proc, thread as needed.
*/
schedinit(); /* scheduler gets its house in order */
/*
* Create process 0 (the swapper).
*/
LIST_INSERT_HEAD(&allproc, p, p_list);
LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
sx_init(&pgrp0.pg_killsx, "killpg racer");
p->p_pgrp = &pgrp0;
LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
LIST_INIT(&pgrp0.pg_members);
LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
pgrp0.pg_session = &session0;
mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
refcount_init(&session0.s_count, 1);
session0.s_leader = p;
p->p_sysent = &null_sysvec;
p->p_flag = P_SYSTEM | P_INMEM | P_KPROC;
p->p_flag2 = 0;
p->p_state = PRS_NORMAL;
p->p_klist = knlist_alloc(&p->p_mtx);
STAILQ_INIT(&p->p_ktr);
p->p_nice = NZERO;
td->td_tid = THREAD0_TID;
tidhash_add(td);
TD_SET_STATE(td, TDS_RUNNING);
td->td_pri_class = PRI_TIMESHARE;
td->td_user_pri = PUSER;
td->td_base_user_pri = PUSER;
td->td_lend_user_pri = PRI_MAX;
td->td_priority = PVM;
td->td_base_pri = PVM;
td->td_oncpu = curcpu;
td->td_flags = TDF_INMEM;
td->td_pflags = TDP_KTHREAD;
td->td_cpuset = cpuset_thread0();
td->td_domain.dr_policy = td->td_cpuset->cs_domain;
prison0_init();
p->p_peers = 0;
p->p_leader = p;
p->p_reaper = p;
p->p_treeflag |= P_TREE_REAPER;
LIST_INIT(&p->p_reaplist);
strncpy(p->p_comm, "kernel", sizeof (p->p_comm));
strncpy(td->td_name, "swapper", sizeof (td->td_name));
callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0);
callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
callout_init(&td->td_slpcallout, 1);
TAILQ_INIT(&p->p_kqtim_stop);
/* Create credentials. */
newcred = crget();
newcred->cr_ngroups = 1; /* group 0 */
/* A hack to prevent uifind from tripping over NULL pointers. */
curthread->td_ucred = newcred;
tmpuinfo.ui_uid = 1;
newcred->cr_uidinfo = newcred->cr_ruidinfo = &tmpuinfo;
newcred->cr_uidinfo = uifind(0);
newcred->cr_ruidinfo = uifind(0);
newcred->cr_loginclass = &tmplc;
newcred->cr_loginclass = loginclass_find("default");
/* End hack. creds get properly set later with thread_cow_get_proc */
curthread->td_ucred = NULL;
newcred->cr_prison = &prison0;
newcred->cr_users++; /* avoid assertion failure */
p->p_ucred = crcowget(newcred);
newcred->cr_users--;
crfree(newcred);
#ifdef AUDIT
audit_cred_kproc0(newcred);
#endif
#ifdef MAC
mac_cred_create_swapper(newcred);
#endif
/* Create sigacts. */
p->p_sigacts = sigacts_alloc();
/* Initialize signal state for process 0. */
siginit(&proc0);
/* Create the file descriptor table. */
p->p_pd = pdinit(NULL, false);
p->p_fd = fdinit();
p->p_fdtol = NULL;
/* Create the limits structures. */
p->p_limit = lim_alloc();
for (i = 0; i < RLIM_NLIMITS; i++)
p->p_limit->pl_rlimit[i].rlim_cur =
p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
/* Cast to avoid overflow on i386/PAE. */
pageablemem = ptoa((vm_paddr_t)vm_free_count());
p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem;
p->p_cpulimit = RLIM_INFINITY;
PROC_LOCK(p);
thread_cow_get_proc(td, p);
PROC_UNLOCK(p);
/* Initialize resource accounting structures. */
racct_create(&p->p_racct);
p->p_stats = pstats_alloc();
/* Allocate a prototype map so we have something to fork. */
p->p_vmspace = &vmspace0;
refcount_init(&vmspace0.vm_refcnt, 1);
pmap_pinit0(vmspace_pmap(&vmspace0));
/*
* proc0 is not expected to enter usermode, so there is no special
* handling for sv_minuser here, like is done for exec_new_vmspace().
*/
vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0),
p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);
/*
* Call the init and ctor for the new thread and proc. We wait
* to do this until all other structures are fairly sane.
*/
EVENTHANDLER_DIRECT_INVOKE(process_init, p);
EVENTHANDLER_DIRECT_INVOKE(thread_init, td);
#ifdef KDTRACE_HOOKS
kdtrace_proc_ctor(p);
kdtrace_thread_ctor(td);
#endif
EVENTHANDLER_DIRECT_INVOKE(process_ctor, p);
EVENTHANDLER_DIRECT_INVOKE(thread_ctor, td);
/*
* Charge root for one process.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
PROC_LOCK(p);
racct_add_force(p, RACCT_NPROC, 1);
PROC_UNLOCK(p);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
/* ARGSUSED*/
static void
proc0_post(void *dummy __unused)
{
struct proc *p;
struct rusage ru;
struct thread *td;
/*
* Now we can look at the time, having had a chance to verify the
* time from the filesystem. Pretend that proc0 started now.
*/
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
microuptime(&p->p_stats->p_start);
PROC_STATLOCK(p);
rufetch(p, &ru); /* Clears thread stats */
p->p_rux.rux_runtime = 0;
p->p_rux.rux_uticks = 0;
p->p_rux.rux_sticks = 0;
p->p_rux.rux_iticks = 0;
PROC_STATUNLOCK(p);
FOREACH_THREAD_IN_PROC(p, td) {
td->td_runtime = 0;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
PCPU_SET(switchtime, cpu_ticks());
PCPU_SET(switchticks, ticks);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL);
/*
***************************************************************************
****
**** The following SYSINIT's and glue code should be moved to the
**** respective files on a per subsystem basis.
****
***************************************************************************
*/
/*
* List of paths to try when searching for "init".
*/
static char init_path[MAXPATHLEN] =
#ifdef INIT_PATH
__XSTRING(INIT_PATH);
#else
"/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init";
#endif
SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
"Path used to search the init process");
/*
* Shutdown timeout of init(8).
* Unused within kernel, but used to control init(8), hence do not remove.
*/
#ifndef INIT_SHUTDOWN_TIMEOUT
#define INIT_SHUTDOWN_TIMEOUT 120
#endif
static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT;
SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout,
CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). "
"Unused within kernel, but used to control init(8)");
/*
* Start the initial user process; try exec'ing each pathname in init_path.
* The program is invoked with one argument containing the boot flags.
*/
static void
start_init(void *dummy)
{
struct image_args args;
int error;
char *var, *path;
char *free_init_path, *tmp_init_path;
struct thread *td;
struct proc *p;
struct vmspace *oldvmspace;
TSENTER(); /* Here so we don't overlap with mi_startup. */
td = curthread;
p = td->td_proc;
vfs_mountroot();
/* Wipe GELI passphrase from the environment. */
kern_unsetenv("kern.geom.eli.passphrase");
/* For Multicons, report which console is primary to both */
if (boothowto & RB_MULTIPLE) {
if (boothowto & RB_SERIAL)
printf("Dual Console: Serial Primary, Video Secondary\n");
else
printf("Dual Console: Video Primary, Serial Secondary\n");
}
if ((var = kern_getenv("init_path")) != NULL) {
strlcpy(init_path, var, sizeof(init_path));
freeenv(var);
}
free_init_path = tmp_init_path = strdup(init_path, M_TEMP);
while ((path = strsep(&tmp_init_path, ":")) != NULL) {
if (bootverbose)
printf("start_init: trying %s\n", path);
memset(&args, 0, sizeof(args));
error = exec_alloc_args(&args);
if (error != 0)
panic("%s: Can't allocate space for init arguments %d",
__func__, error);
error = exec_args_add_fname(&args, path, UIO_SYSSPACE);
if (error != 0)
panic("%s: Can't add fname %d", __func__, error);
error = exec_args_add_arg(&args, path, UIO_SYSSPACE);
if (error != 0)
panic("%s: Can't add argv[0] %d", __func__, error);
if (boothowto & RB_SINGLE)
error = exec_args_add_arg(&args, "-s", UIO_SYSSPACE);
if (error != 0)
panic("%s: Can't add argv[0] %d", __func__, error);
/*
* Now try to exec the program. If can't for any reason
* other than it doesn't exist, complain.
*
* Otherwise, return via fork_trampoline() all the way
* to user mode as init!
*/
KASSERT((td->td_pflags & TDP_EXECVMSPC) == 0,
("nested execve"));
memset(td->td_frame, 0, sizeof(*td->td_frame));
oldvmspace = p->p_vmspace;
error = kern_execve(td, &args, NULL, oldvmspace);
KASSERT(error != 0,
("kern_execve returned success, not EJUSTRETURN"));
if (error == EJUSTRETURN) {
exec_cleanup(td, oldvmspace);
free(free_init_path, M_TEMP);
TSEXIT();
return;
}
if (error != ENOENT)
printf("exec %s: error %d\n", path, error);
}
free(free_init_path, M_TEMP);
printf("init: not found in path %s\n", init_path);
panic("no init");
}
/*
* Like kproc_create(), but runs in its own address space. We do this
* early to reserve pid 1. Note the special case: do not make it
* runnable yet; init execution is started once userspace can be served.
*/
static void
create_init(const void *udata __unused)
{
struct fork_req fr;
struct ucred *newcred, *oldcred;
struct thread *td;
int error;
bzero(&fr, sizeof(fr));
fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
fr.fr_procp = &initproc;
error = fork1(&thread0, &fr);
if (error)
panic("cannot fork init: %d\n", error);
KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
/* divorce init's credentials from the kernel's */
newcred = crget();
sx_xlock(&proctree_lock);
PROC_LOCK(initproc);
initproc->p_flag |= P_SYSTEM | P_INMEM;
initproc->p_treeflag |= P_TREE_REAPER;
oldcred = initproc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
mac_cred_create_init(newcred);
#endif
#ifdef AUDIT
audit_cred_proc1(newcred);
#endif
proc_set_cred(initproc, newcred);
td = FIRST_THREAD_IN_PROC(initproc);
crcowfree(td);
td->td_realucred = crcowget(initproc->p_ucred);
td->td_ucred = td->td_realucred;
PROC_UNLOCK(initproc);
sx_xunlock(&proctree_lock);
crfree(oldcred);
cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc),
start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
/*
* Make it runnable now.
*/
static void
kick_init(const void *udata __unused)
{
struct thread *td;
td = FIRST_THREAD_IN_PROC(initproc);
thread_lock(td);
TD_SET_CAN_RUN(td);
sched_add(td, SRQ_BORING);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL);
/*
* DDB(4).
*/
#ifdef DDB
static void
db_show_print_syinit(struct sysinit *sip, bool ddb)
{
const char *sname, *funcname;
c_db_sym_t sym;
db_expr_t offset;
#define xprint(...) \
if (ddb) \
db_printf(__VA_ARGS__); \
else \
printf(__VA_ARGS__)
if (sip == NULL) {
xprint("%s: no sysinit * given\n", __func__);
return;
}
sym = db_search_symbol((vm_offset_t)sip, DB_STGY_ANY, &offset);
db_symbol_values(sym, &sname, NULL);
sym = db_search_symbol((vm_offset_t)sip->func, DB_STGY_PROC, &offset);
db_symbol_values(sym, &funcname, NULL);
xprint("%s(%p)\n", (sname != NULL) ? sname : "", sip);
xprint(" %#08x %#08x\n", sip->subsystem, sip->order);
xprint(" %p(%s)(%p)\n",
sip->func, (funcname != NULL) ? funcname : "", sip->udata);
#undef xprint
}
DB_SHOW_COMMAND_FLAGS(sysinit, db_show_sysinit, DB_CMD_MEMSAFE)
{
struct sysinit *sip;
db_printf("SYSINIT vs Name(Ptr)\n");
db_printf(" Subsystem Order\n");
db_printf(" Function(Name)(Arg)\n");
STAILQ_FOREACH(sip, &sysinit_done_list, next) {
db_show_print_syinit(sip, true);
if (db_pager_quit)
return;
}
STAILQ_FOREACH(sip, &sysinit_list, next) {
db_show_print_syinit(sip, true);
if (db_pager_quit)
break;
}
}
#endif /* DDB */
diff --git a/sys/vm/vm.h b/sys/vm/vm.h
index b7d149a2fca2..d28c84dd1c95 100644
--- a/sys/vm/vm.h
+++ b/sys/vm/vm.h
@@ -1,184 +1,183 @@
/*-
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef VM_H
#define VM_H
#include <machine/vm.h>
typedef char vm_inherit_t; /* inheritance codes */
#define VM_INHERIT_SHARE ((vm_inherit_t) 0)
#define VM_INHERIT_COPY ((vm_inherit_t) 1)
#define VM_INHERIT_NONE ((vm_inherit_t) 2)
#define VM_INHERIT_ZERO ((vm_inherit_t) 3)
#define VM_INHERIT_DEFAULT VM_INHERIT_COPY
typedef u_char vm_prot_t; /* protection codes */
#define VM_PROT_NONE ((vm_prot_t) 0x00)
#define VM_PROT_READ ((vm_prot_t) 0x01)
#define VM_PROT_WRITE ((vm_prot_t) 0x02)
#define VM_PROT_EXECUTE ((vm_prot_t) 0x04)
#define VM_PROT_COPY ((vm_prot_t) 0x08) /* copy-on-read */
#define VM_PROT_PRIV_FLAG ((vm_prot_t) 0x10)
#define VM_PROT_FAULT_LOOKUP VM_PROT_PRIV_FLAG
#define VM_PROT_NO_PROMOTE VM_PROT_PRIV_FLAG
#define VM_PROT_QUICK_NOFAULT VM_PROT_PRIV_FLAG /* same to save bits */
#define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
#define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE)
#define VM_PROT_DEFAULT VM_PROT_ALL
enum obj_type {
OBJT_RESERVED = 0, /* was OBJT_DEFAULT */
OBJT_SWAP,
OBJT_DEFAULT = OBJT_SWAP,
OBJT_VNODE,
OBJT_DEVICE,
OBJT_PHYS,
OBJT_DEAD,
OBJT_SG,
OBJT_MGTDEVICE,
OBJT_FIRST_DYN,
};
typedef u_char objtype_t;
union vm_map_object;
typedef union vm_map_object vm_map_object_t;
struct vm_map_entry;
typedef struct vm_map_entry *vm_map_entry_t;
struct vm_map;
typedef struct vm_map *vm_map_t;
struct vm_object;
typedef struct vm_object *vm_object_t;
#ifndef _KERNEL
/*
* This is defined in <sys/types.h> for the kernel so that non-vm kernel
* sources (mainly Mach-derived ones such as ddb) don't have to include
* vm stuff. Defining it there for applications might break things.
* Define it here for "applications" that include vm headers (e.g.,
* genassym).
*/
#ifndef HAVE_BOOLEAN
typedef int boolean_t;
#endif
/*
* The exact set of memory attributes is machine dependent. However,
* every machine is required to define VM_MEMATTR_DEFAULT and
* VM_MEMATTR_UNCACHEABLE.
*/
typedef char vm_memattr_t; /* memory attribute codes */
/*
* This is defined in <sys/types.h> for the kernel so that vnode_if.h
* doesn't have to include <vm/vm.h>.
*/
struct vm_page;
typedef struct vm_page *vm_page_t;
#endif /* _KERNEL */
struct vm_reserv;
typedef struct vm_reserv *vm_reserv_t;
/*
* Information passed from the machine-independent VM initialization code
* for use by machine-dependent code (mainly for MMU support)
*/
struct kva_md_info {
vm_offset_t buffer_sva;
vm_offset_t buffer_eva;
vm_offset_t clean_sva;
vm_offset_t clean_eva;
};
/* bits from overcommit */
#define SWAP_RESERVE_FORCE_ON (1 << 0)
#define SWAP_RESERVE_RLIMIT_ON (1 << 1)
#define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2)
#ifdef NUMA
#define __numa_used
#else
#define __numa_used __unused
#endif
#ifdef _KERNEL
struct ucred;
void vm_ksubmap_init(struct kva_md_info *);
bool swap_reserve(vm_ooffset_t incr);
bool swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred);
void swap_reserve_force(vm_ooffset_t incr);
void swap_release(vm_ooffset_t decr);
void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred);
-void swapper(void);
extern struct kva_md_info kmi;
#define VA_IS_CLEANMAP(va) \
((va) >= kmi.clean_sva && (va) < kmi.clean_eva)
extern int old_mlock;
extern int vm_ndomains;
extern int vm_overcommit;
#endif /* _KERNEL */
#endif /* VM_H */
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 4f8121fa1064..63417687a1a5 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -1,842 +1,843 @@
/*-
* SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <sys/cdefs.h>
#include "opt_vm.h"
#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"
#include "opt_kstack_usage_prof.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/domainset.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msan.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/shm.h>
#include <sys/smp.h>
#include <sys/vmmeter.h>
#include <sys/vmem.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>
#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_domainset.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pagequeue.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_phys.h>
#include <machine/cpu.h>
#if VM_NRESERVLEVEL > 1
#define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_1_ORDER + VM_LEVEL_0_ORDER + \
PAGE_SHIFT)
#elif VM_NRESERVLEVEL > 0
#define KVA_KSTACK_QUANTUM_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT)
#else
#define KVA_KSTACK_QUANTUM_SHIFT (8 + PAGE_SHIFT)
#endif
#define KVA_KSTACK_QUANTUM (1ul << KVA_KSTACK_QUANTUM_SHIFT)
/*
* MPSAFE
*
* WARNING! This code calls vm_map_check_protection() which only checks
* the associated vm_map_entry range. It does not determine whether the
* contents of the memory is actually readable or writable. In most cases
* just checking the vm_map_entry is sufficient within the kernel's address
* space.
*/
bool
kernacc(void *addr, int len, int rw)
{
boolean_t rv;
vm_offset_t saddr, eaddr;
vm_prot_t prot;
KASSERT((rw & ~VM_PROT_ALL) == 0,
("illegal ``rw'' argument to kernacc (%x)\n", rw));
if ((vm_offset_t)addr + len > vm_map_max(kernel_map) ||
(vm_offset_t)addr + len < (vm_offset_t)addr)
return (false);
prot = rw;
saddr = trunc_page((vm_offset_t)addr);
eaddr = round_page((vm_offset_t)addr + len);
vm_map_lock_read(kernel_map);
rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
vm_map_unlock_read(kernel_map);
return (rv == TRUE);
}
/*
* MPSAFE
*
* WARNING! This code calls vm_map_check_protection() which only checks
* the associated vm_map_entry range. It does not determine whether the
* contents of the memory is actually readable or writable. vmapbuf(),
* vm_fault_quick(), or copyin()/copyout()/su*()/fu*() functions should be
* used in conjunction with this call.
*/
bool
useracc(void *addr, int len, int rw)
{
boolean_t rv;
vm_prot_t prot;
vm_map_t map;
KASSERT((rw & ~VM_PROT_ALL) == 0,
("illegal ``rw'' argument to useracc (%x)\n", rw));
prot = rw;
map = &curproc->p_vmspace->vm_map;
if ((vm_offset_t)addr + len > vm_map_max(map) ||
(vm_offset_t)addr + len < (vm_offset_t)addr) {
return (false);
}
vm_map_lock_read(map);
rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
round_page((vm_offset_t)addr + len), prot);
vm_map_unlock_read(map);
return (rv == TRUE);
}
int
vslock(void *addr, size_t len)
{
vm_offset_t end, last, start;
vm_size_t npages;
int error;
last = (vm_offset_t)addr + len;
start = trunc_page((vm_offset_t)addr);
end = round_page(last);
if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
return (EINVAL);
npages = atop(end - start);
if (npages > vm_page_max_user_wired)
return (ENOMEM);
error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
if (error == KERN_SUCCESS) {
curthread->td_vslock_sz += len;
return (0);
}
/*
* Return EFAULT on error to match copy{in,out}() behaviour
* rather than returning ENOMEM like mlock() would.
*/
return (EFAULT);
}
void
vsunlock(void *addr, size_t len)
{
/* Rely on the parameter sanity checks performed by vslock(). */
MPASS(curthread->td_vslock_sz >= len);
curthread->td_vslock_sz -= len;
(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
}
/*
* Pin the page contained within the given object at the given offset. If the
* page is not resident, allocate and load it using the given object's pager.
* Return the pinned page if successful; otherwise, return NULL.
*/
static vm_page_t
vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
{
vm_page_t m;
vm_pindex_t pindex;
pindex = OFF_TO_IDX(offset);
(void)vm_page_grab_valid_unlocked(&m, object, pindex,
VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
return (m);
}
/*
* Return a CPU private mapping to the page at the given offset within the
* given object. The page is pinned before it is mapped.
*/
struct sf_buf *
vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
{
vm_page_t m;
m = vm_imgact_hold_page(object, offset);
if (m == NULL)
return (NULL);
sched_pin();
return (sf_buf_alloc(m, SFB_CPUPRIVATE));
}
/*
* Destroy the given CPU private mapping and unpin the page that it mapped.
*/
void
vm_imgact_unmap_page(struct sf_buf *sf)
{
vm_page_t m;
m = sf_buf_page(sf);
sf_buf_free(sf);
sched_unpin();
vm_page_unwire(m, PQ_ACTIVE);
}
void
vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz)
{
pmap_sync_icache(map->pmap, va, sz);
}
static vm_object_t kstack_object;
static vm_object_t kstack_alt_object;
static uma_zone_t kstack_cache;
static int kstack_cache_size;
static vmem_t *vmd_kstack_arena[MAXMEMDOM];
static int
sysctl_kstack_cache_size(SYSCTL_HANDLER_ARGS)
{
int error, oldsize;
oldsize = kstack_cache_size;
error = sysctl_handle_int(oidp, arg1, arg2, req);
if (error == 0 && req->newptr && oldsize != kstack_cache_size)
uma_zone_set_maxcache(kstack_cache, kstack_cache_size);
return (error);
}
SYSCTL_PROC(_vm, OID_AUTO, kstack_cache_size,
CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &kstack_cache_size, 0,
sysctl_kstack_cache_size, "IU", "Maximum number of cached kernel stacks");
/*
* Allocate a virtual address range from a domain kstack arena, following
* the specified NUMA policy.
*/
static vm_offset_t
vm_thread_alloc_kstack_kva(vm_size_t size, int domain)
{
#ifndef __ILP32__
int rv;
vmem_t *arena;
vm_offset_t addr = 0;
size = round_page(size);
/* Allocate from the kernel arena for non-standard kstack sizes. */
if (size != ptoa(kstack_pages + KSTACK_GUARD_PAGES)) {
arena = vm_dom[domain].vmd_kernel_arena;
} else {
arena = vmd_kstack_arena[domain];
}
rv = vmem_alloc(arena, size, M_BESTFIT | M_NOWAIT, &addr);
if (rv == ENOMEM)
return (0);
KASSERT(atop(addr - VM_MIN_KERNEL_ADDRESS) %
(kstack_pages + KSTACK_GUARD_PAGES) == 0,
("%s: allocated kstack KVA not aligned to multiple of kstack size",
__func__));
return (addr);
#else
return (kva_alloc(size));
#endif
}
/*
* Release a region of kernel virtual memory
* allocated from the kstack arena.
*/
static __noinline void
vm_thread_free_kstack_kva(vm_offset_t addr, vm_size_t size, int domain)
{
vmem_t *arena;
size = round_page(size);
#ifdef __ILP32__
arena = kernel_arena;
#else
arena = vmd_kstack_arena[domain];
if (size != ptoa(kstack_pages + KSTACK_GUARD_PAGES)) {
arena = vm_dom[domain].vmd_kernel_arena;
}
#endif
vmem_free(arena, addr, size);
}
static vmem_size_t
vm_thread_kstack_import_quantum(void)
{
#ifndef __ILP32__
/*
* The kstack_quantum is larger than KVA_QUANTUM to account
* for holes induced by guard pages.
*/
return (KVA_KSTACK_QUANTUM * (kstack_pages + KSTACK_GUARD_PAGES));
#else
return (KVA_KSTACK_QUANTUM);
#endif
}
/*
* Import KVA from a parent arena into the kstack arena. Imports must be
* a multiple of kernel stack pages + guard pages in size.
*
* Kstack VA allocations need to be aligned so that the linear KVA pindex
* is divisible by the total number of kstack VA pages. This is necessary to
* make vm_kstack_pindex work properly.
*
* We import a multiple of KVA_KSTACK_QUANTUM-sized region from the parent
* arena. The actual size used by the kstack arena is one kstack smaller to
* allow for the necessary alignment adjustments to be made.
*/
static int
vm_thread_kstack_arena_import(void *arena, vmem_size_t size, int flags,
vmem_addr_t *addrp)
{
int error, rem;
size_t kpages = kstack_pages + KSTACK_GUARD_PAGES;
KASSERT(atop(size) % kpages == 0,
("%s: Size %jd is not a multiple of kstack pages (%d)", __func__,
(intmax_t)size, (int)kpages));
error = vmem_xalloc(arena, vm_thread_kstack_import_quantum(),
KVA_KSTACK_QUANTUM, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags,
addrp);
if (error) {
return (error);
}
rem = atop(*addrp - VM_MIN_KERNEL_ADDRESS) % kpages;
if (rem != 0) {
/* Bump addr to next aligned address */
*addrp = *addrp + (kpages - rem) * PAGE_SIZE;
}
return (0);
}
/*
* Release KVA from a parent arena into the kstack arena. Released imports must
* be a multiple of kernel stack pages + guard pages in size.
*/
static void
vm_thread_kstack_arena_release(void *arena, vmem_addr_t addr, vmem_size_t size)
{
int rem;
size_t kpages __diagused = kstack_pages + KSTACK_GUARD_PAGES;
KASSERT(size % kpages == 0,
("%s: Size %jd is not a multiple of kstack pages (%d)", __func__,
(intmax_t)size, (int)kpages));
KASSERT((addr - VM_MIN_KERNEL_ADDRESS) % kpages == 0,
("%s: Address %p is not properly aligned (%p)", __func__,
(void *)addr, (void *)VM_MIN_KERNEL_ADDRESS));
/*
* If the address is not KVA_KSTACK_QUANTUM-aligned we have to decrement
* it to account for the shift in kva_import_kstack.
*/
rem = addr % KVA_KSTACK_QUANTUM;
if (rem) {
KASSERT(rem <= ptoa(kpages),
("%s: rem > kpages (%d), (%d)", __func__, rem,
(int)kpages));
addr -= rem;
}
vmem_xfree(arena, addr, vm_thread_kstack_import_quantum());
}
/*
* Create the kernel stack for a new thread.
*/
static vm_offset_t
vm_thread_stack_create(struct domainset *ds, int pages)
{
vm_page_t ma[KSTACK_MAX_PAGES];
struct vm_domainset_iter di;
int req = VM_ALLOC_NORMAL;
vm_object_t obj;
vm_offset_t ks;
int domain, i;
obj = vm_thread_kstack_size_to_obj(pages);
if (vm_ndomains > 1)
obj->domain.dr_policy = ds;
vm_domainset_iter_page_init(&di, obj, 0, &domain, &req);
do {
/*
* Get a kernel virtual address for this thread's kstack.
*/
ks = vm_thread_alloc_kstack_kva(ptoa(pages + KSTACK_GUARD_PAGES),
domain);
if (ks == 0)
continue;
ks += ptoa(KSTACK_GUARD_PAGES);
/*
* Allocate physical pages to back the stack.
*/
if (vm_thread_stack_back(ks, ma, pages, req, domain) != 0) {
vm_thread_free_kstack_kva(ks - ptoa(KSTACK_GUARD_PAGES),
ptoa(pages + KSTACK_GUARD_PAGES), domain);
continue;
}
if (KSTACK_GUARD_PAGES != 0) {
pmap_qremove(ks - ptoa(KSTACK_GUARD_PAGES),
KSTACK_GUARD_PAGES);
}
for (i = 0; i < pages; i++)
vm_page_valid(ma[i]);
pmap_qenter(ks, ma, pages);
return (ks);
} while (vm_domainset_iter_page(&di, obj, &domain) == 0);
return (0);
}
static __noinline void
vm_thread_stack_dispose(vm_offset_t ks, int pages)
{
vm_page_t m;
vm_pindex_t pindex;
int i, domain;
vm_object_t obj = vm_thread_kstack_size_to_obj(pages);
pindex = vm_kstack_pindex(ks, pages);
domain = vm_phys_domain(vtophys(ks));
pmap_qremove(ks, pages);
VM_OBJECT_WLOCK(obj);
for (i = 0; i < pages; i++) {
m = vm_page_lookup(obj, pindex + i);
if (m == NULL)
panic("%s: kstack already missing?", __func__);
KASSERT(vm_page_domain(m) == domain,
("%s: page %p domain mismatch, expected %d got %d",
__func__, m, domain, vm_page_domain(m)));
vm_page_xbusy_claim(m);
vm_page_unwire_noq(m);
vm_page_free(m);
}
VM_OBJECT_WUNLOCK(obj);
kasan_mark((void *)ks, ptoa(pages), ptoa(pages), 0);
vm_thread_free_kstack_kva(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
ptoa(pages + KSTACK_GUARD_PAGES), domain);
}
/*
* Allocate the kernel stack for a new thread.
*/
int
vm_thread_new(struct thread *td, int pages)
{
vm_offset_t ks;
u_short ks_domain;
/* Bounds check */
if (pages <= 1)
pages = kstack_pages;
else if (pages > KSTACK_MAX_PAGES)
pages = KSTACK_MAX_PAGES;
ks = 0;
if (pages == kstack_pages && kstack_cache != NULL)
ks = (vm_offset_t)uma_zalloc(kstack_cache, M_NOWAIT);
/*
* Ensure that kstack objects can draw pages from any memory
* domain. Otherwise a local memory shortage can block a process
* swap-in.
*/
if (ks == 0)
ks = vm_thread_stack_create(DOMAINSET_PREF(PCPU_GET(domain)),
pages);
if (ks == 0)
return (0);
ks_domain = vm_phys_domain(vtophys(ks));
KASSERT(ks_domain >= 0 && ks_domain < vm_ndomains,
("%s: invalid domain for kstack %p", __func__, (void *)ks));
td->td_kstack = ks;
td->td_kstack_pages = pages;
td->td_kstack_domain = ks_domain;
return (1);
}
/*
* Dispose of a thread's kernel stack.
*/
void
vm_thread_dispose(struct thread *td)
{
vm_offset_t ks;
int pages;
pages = td->td_kstack_pages;
ks = td->td_kstack;
td->td_kstack = 0;
td->td_kstack_pages = 0;
td->td_kstack_domain = MAXMEMDOM;
if (pages == kstack_pages) {
kasan_mark((void *)ks, 0, ptoa(pages), KASAN_KSTACK_FREED);
uma_zfree(kstack_cache, (void *)ks);
} else {
vm_thread_stack_dispose(ks, pages);
}
}
/*
* Calculate kstack pindex.
*
* Uses a non-identity mapping if guard pages are
* active to avoid pindex holes in the kstack object.
*/
vm_pindex_t
vm_kstack_pindex(vm_offset_t ks, int kpages)
{
vm_pindex_t pindex = atop(ks - VM_MIN_KERNEL_ADDRESS);
#ifdef __ILP32__
return (pindex);
#else
/*
* Return the linear pindex if guard pages aren't active or if we are
* allocating a non-standard kstack size.
*/
if (KSTACK_GUARD_PAGES == 0 || kpages != kstack_pages) {
return (pindex);
}
KASSERT(pindex % (kpages + KSTACK_GUARD_PAGES) >= KSTACK_GUARD_PAGES,
("%s: Attempting to calculate kstack guard page pindex", __func__));
return (pindex -
(pindex / (kpages + KSTACK_GUARD_PAGES) + 1) * KSTACK_GUARD_PAGES);
#endif
}
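/*
 * Worked example of the mapping above (a user-space sketch, not kernel code;
 * the page counts are hypothetical). With kstack_pages = 4 and
 * KSTACK_GUARD_PAGES = 1, every stack occupies 5 linear KVA pages with the
 * guard first, so the first backed page of stack n sits at linear pindex
 * 5n + 1. Subtracting the accumulated guard pages keeps the object pindexes
 * dense: stack n maps to pindexes 4n .. 4n + 3, with no holes for guards.
 */
#include <stdio.h>

#define EX_KSTACK_PAGES	4
#define EX_GUARD_PAGES	1

static unsigned long
ex_dense_pindex(unsigned long linear)
{
	return (linear -
	    (linear / (EX_KSTACK_PAGES + EX_GUARD_PAGES) + 1) * EX_GUARD_PAGES);
}

int
main(void)
{
	for (unsigned long n = 0; n < 4; n++)
		printf("stack %lu: linear pindex %lu -> object pindex %lu\n",
		    n, 5 * n + 1, ex_dense_pindex(5 * n + 1));
	return (0);	/* prints 0, 4, 8, 12 for the four stacks */
}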
/*
* Allocate physical pages, following the specified NUMA policy, to back a
* kernel stack.
*/
int
vm_thread_stack_back(vm_offset_t ks, vm_page_t ma[], int npages, int req_class,
int domain)
{
vm_object_t obj = vm_thread_kstack_size_to_obj(npages);
vm_pindex_t pindex;
vm_page_t m;
int n;
pindex = vm_kstack_pindex(ks, npages);
VM_OBJECT_WLOCK(obj);
for (n = 0; n < npages;) {
m = vm_page_grab(obj, pindex + n,
VM_ALLOC_NOCREAT | VM_ALLOC_WIRED);
if (m == NULL) {
m = vm_page_alloc_domain(obj, pindex + n, domain,
req_class | VM_ALLOC_WIRED);
}
if (m == NULL)
break;
ma[n++] = m;
}
if (n < npages)
goto cleanup;
VM_OBJECT_WUNLOCK(obj);
return (0);
cleanup:
for (int i = 0; i < n; i++) {
m = ma[i];
(void)vm_page_unwire_noq(m);
vm_page_free(m);
}
VM_OBJECT_WUNLOCK(obj);
return (ENOMEM);
}
vm_object_t
vm_thread_kstack_size_to_obj(int npages)
{
return (npages == kstack_pages ? kstack_object : kstack_alt_object);
}
static int
kstack_import(void *arg, void **store, int cnt, int domain, int flags)
{
struct domainset *ds;
int i;
if (domain == UMA_ANYDOMAIN)
ds = DOMAINSET_RR();
else
ds = DOMAINSET_PREF(domain);
for (i = 0; i < cnt; i++) {
store[i] = (void *)vm_thread_stack_create(ds, kstack_pages);
if (store[i] == NULL)
break;
}
return (i);
}
static void
kstack_release(void *arg, void **store, int cnt)
{
vm_offset_t ks;
int i;
for (i = 0; i < cnt; i++) {
ks = (vm_offset_t)store[i];
vm_thread_stack_dispose(ks, kstack_pages);
}
}
static void
kstack_cache_init(void *null)
{
vm_size_t kstack_quantum;
int domain;
kstack_object = vm_object_allocate(OBJT_SWAP,
atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS));
kstack_cache = uma_zcache_create("kstack_cache",
kstack_pages * PAGE_SIZE, NULL, NULL, NULL, NULL,
kstack_import, kstack_release, NULL,
UMA_ZONE_FIRSTTOUCH);
kstack_cache_size = imax(128, mp_ncpus * 4);
uma_zone_set_maxcache(kstack_cache, kstack_cache_size);
kstack_alt_object = vm_object_allocate(OBJT_SWAP,
atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS));
kstack_quantum = vm_thread_kstack_import_quantum();
/*
* Reduce size used by the kstack arena to allow for
* alignment adjustments in vm_thread_kstack_arena_import.
*/
kstack_quantum -= (kstack_pages + KSTACK_GUARD_PAGES) * PAGE_SIZE;
/*
* Create the kstack_arena for each domain and set kernel_arena as
* parent.
*/
for (domain = 0; domain < vm_ndomains; domain++) {
vmd_kstack_arena[domain] = vmem_create("kstack arena", 0, 0,
PAGE_SIZE, 0, M_WAITOK);
KASSERT(vmd_kstack_arena[domain] != NULL,
("%s: failed to create domain %d kstack_arena", __func__,
domain));
vmem_set_import(vmd_kstack_arena[domain],
vm_thread_kstack_arena_import,
vm_thread_kstack_arena_release,
vm_dom[domain].vmd_kernel_arena, kstack_quantum);
}
}
SYSINIT(vm_kstacks, SI_SUB_KMEM, SI_ORDER_ANY, kstack_cache_init, NULL);
#ifdef KSTACK_USAGE_PROF
/*
* Track maximum stack used by a thread in kernel.
*/
static int max_kstack_used;
SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD,
&max_kstack_used, 0,
"Maximum stack depth used by a thread in kernel");
void
intr_prof_stack_use(struct thread *td, struct trapframe *frame)
{
vm_offset_t stack_top;
vm_offset_t current;
int used, prev_used;
/*
* Testing for interrupted kernel mode isn't strictly
* needed. It optimizes the execution, since interrupts from
* usermode will have only the trap frame on the stack.
*/
if (TRAPF_USERMODE(frame))
return;
stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
current = (vm_offset_t)(uintptr_t)&stack_top;
/*
* Try to detect if interrupt is using kernel thread stack.
* Hardware could use a dedicated stack for interrupt handling.
*/
if (stack_top <= current || current < td->td_kstack)
return;
used = stack_top - current;
for (;;) {
prev_used = max_kstack_used;
if (prev_used >= used)
break;
if (atomic_cmpset_int(&max_kstack_used, prev_used, used))
break;
}
}
#endif /* KSTACK_USAGE_PROF */
/*
* Implement fork's actions on an address space.
* Here we arrange for the address space to be copied or referenced,
* allocate a user struct (pcb and kernel stack), then call the
* machine-dependent layer to fill those in and make the new process
* ready to run. The new process is set up so that it returns directly
* to user mode to avoid stack copying and relocation problems.
*/
int
vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
struct vmspace *vm2, int flags)
{
struct proc *p1 = td->td_proc;
struct domainset *dset;
int error;
if ((flags & RFPROC) == 0) {
/*
* Divorce the memory, if it is shared, essentially
* this changes shared memory amongst threads, into
* COW locally.
*/
if ((flags & RFMEM) == 0) {
error = vmspace_unshare(p1);
if (error)
return (error);
}
cpu_fork(td, p2, td2, flags);
return (0);
}
if (flags & RFMEM) {
p2->p_vmspace = p1->p_vmspace;
refcount_acquire(&p1->p_vmspace->vm_refcnt);
}
dset = td2->td_domain.dr_policy;
while (vm_page_count_severe_set(&dset->ds_mask)) {
vm_wait_doms(&dset->ds_mask, 0);
}
if ((flags & RFMEM) == 0) {
p2->p_vmspace = vm2;
if (p1->p_vmspace->vm_shm)
shmfork(p1, p2);
}
/*
* cpu_fork will copy and update the pcb, set up the kernel stack,
* and make the child ready to run.
*/
cpu_fork(td, p2, td2, flags);
return (0);
}
/*
* Called after process has been wait(2)'ed upon and is being reaped.
* The idea is to reclaim resources that we could not reclaim while
* the process was still executing.
*/
void
vm_waitproc(struct proc *p)
{
vmspace_exitfree(p); /* and clean-out the vmspace */
}
+/*
+ * This used to kick the thread which faults in threads.
+ */
void
kick_proc0(void)
{
-
- wakeup(&proc0);
}
diff --git a/sys/vm/vm_swapout.c b/sys/vm/vm_swapout.c
index 85708d61d849..b97f6904ab5a 100644
--- a/sys/vm/vm_swapout.c
+++ b/sys/vm/vm_swapout.c
@@ -1,715 +1,595 @@
/*-
* SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
*
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
* Copyright (c) 1994 John S. Dyson
* All rights reserved.
* Copyright (c) 1994 David Greenman
* All rights reserved.
* Copyright (c) 2005 Yahoo! Technologies Norway AS
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <sys/cdefs.h>
#include "opt_kstack_pages.h"
#include "opt_kstack_max_pages.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
/* the kernel process "vm_daemon" */
static void vm_daemon(void);
static struct proc *vmproc;
static struct kproc_desc vm_kp = {
"vmdaemon",
vm_daemon,
&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
&vm_swap_enabled, 0,
"Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
&vm_swap_idle_enabled, 0,
"Allow swapout on idle criteria");
/*
* Swap_idle_threshold1 is the guaranteed swapped in time for a process
*/
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
&swap_idle_threshold1, 0,
"Guaranteed swapped in time for a process");
/*
* Swap_idle_threshold2 is the time that a process can be idle before
* it will be swapped out, if idle swapping is enabled.
*/
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
&swap_idle_threshold2, 0,
"Time before a process will be swapped out");
static int vm_daemon_timeout = 0;
SYSCTL_INT(_vm, OID_AUTO, vmdaemon_timeout, CTLFLAG_RW,
&vm_daemon_timeout, 0,
"Time between vmdaemon runs");
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
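/* Number of processes that are currently swapped out. */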
static int swapped_cnt;
static int swap_inprogress; /* Pending swap-ins done outside swapper. */
static int last_swapin;
static void swapclear(struct proc *);
static void vm_swapout_map_deactivate_pages(vm_map_t, long);
static void vm_swapout_object_deactivate(pmap_t, vm_object_t, long);
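/*
* Attempt to unmap and deactivate a single page on behalf of the given
* pmap.  The page is left alone if it is wired, cannot be busied, is not
* mapped by the pmap, or is marked referenced.
*/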
static void
vm_swapout_object_deactivate_page(pmap_t pmap, vm_page_t m, bool unmap)
{
/*
* Ignore unreclaimable wired pages. Repeat the check after busying
* since a busy holder may wire the page.
*/
if (vm_page_wired(m) || !vm_page_tryxbusy(m))
return;
if (vm_page_wired(m) || !pmap_page_exists_quick(pmap, m)) {
vm_page_xunbusy(m);
return;
}
if (!pmap_is_referenced(m)) {
if (!vm_page_active(m))
(void)vm_page_try_remove_all(m);
else if (unmap && vm_page_try_remove_all(m))
vm_page_deactivate(m);
}
vm_page_xunbusy(m);
}
/*
* vm_swapout_object_deactivate
*
* Deactivate pages in the object chain until the pmap's resident
* page count drops to the desired value.
*
* The object and map must be locked.
*/
static void
vm_swapout_object_deactivate(pmap_t pmap, vm_object_t first_object,
long desired)
{
vm_object_t backing_object, object;
vm_page_t m;
bool unmap;
VM_OBJECT_ASSERT_LOCKED(first_object);
if ((first_object->flags & OBJ_FICTITIOUS) != 0)
return;
for (object = first_object;; object = backing_object) {
if (pmap_resident_count(pmap) <= desired)
goto unlock_return;
VM_OBJECT_ASSERT_LOCKED(object);
if ((object->flags & OBJ_UNMANAGED) != 0 ||
blockcount_read(&object->paging_in_progress) > 0)
goto unlock_return;
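/*
* Only permit unmapping of pages that are still on the active queue
* when the object is shadowed by at most one other object.
*/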
unmap = true;
if (object->shadow_count > 1)
unmap = false;
/*
* Scan the object's entire memory queue.
*/
TAILQ_FOREACH(m, &object->memq, listq) {
if (pmap_resident_count(pmap) <= desired)
goto unlock_return;
if (should_yield())
goto unlock_return;
vm_swapout_object_deactivate_page(pmap, m, unmap);
}
if ((backing_object = object->backing_object) == NULL)
goto unlock_return;
VM_OBJECT_RLOCK(backing_object);
if (object != first_object)
VM_OBJECT_RUNLOCK(object);
}
unlock_return:
if (object != first_object)
VM_OBJECT_RUNLOCK(object);
}
/*
* Deactivate some number of pages in a map; try to do it fairly,
* although that is really hard to do.
*/
static void
vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
{
vm_map_entry_t tmpe;
vm_object_t obj, bigobj;
int nothingwired;
if (!vm_map_trylock_read(map))
return;
bigobj = NULL;
nothingwired = TRUE;
/*
* first, search out the biggest object, and try to free pages from
* that.
*/
VM_MAP_ENTRY_FOREACH(tmpe, map) {
if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
obj = tmpe->object.vm_object;
if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
if (obj->shadow_count <= 1 &&
(bigobj == NULL ||
bigobj->resident_page_count <
obj->resident_page_count)) {
if (bigobj != NULL)
VM_OBJECT_RUNLOCK(bigobj);
bigobj = obj;
} else
VM_OBJECT_RUNLOCK(obj);
}
}
if (tmpe->wired_count > 0)
nothingwired = FALSE;
}
if (bigobj != NULL) {
vm_swapout_object_deactivate(map->pmap, bigobj, desired);
VM_OBJECT_RUNLOCK(bigobj);
}
/*
* Next, hunt around for other pages to deactivate. We actually
* do this search sort of wrong -- .text first is not the best idea.
*/
VM_MAP_ENTRY_FOREACH(tmpe, map) {
if (pmap_resident_count(vm_map_pmap(map)) <= desired)
break;
if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
obj = tmpe->object.vm_object;
if (obj != NULL) {
VM_OBJECT_RLOCK(obj);
vm_swapout_object_deactivate(map->pmap, obj,
desired);
VM_OBJECT_RUNLOCK(obj);
}
}
}
/*
* Remove all mappings if a process is swapped out; this frees the
* page table pages.
*/
if (desired == 0 && nothingwired) {
pmap_remove(vm_map_pmap(map), vm_map_min(map),
vm_map_max(map));
}
vm_map_unlock_read(map);
}
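/*
* The vm_daemon kernel process.  Periodically scans all processes and,
* for any process whose resident set exceeds its RSS rlimit (or, with
* RACCT, its RSS limit), deactivates pages in its address space to push
* it back toward the limit.
*/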
static void
vm_daemon(void)
{
struct rlimit rsslim;
struct proc *p;
struct thread *td;
struct vmspace *vm;
int breakout, tryagain, attempts;
#ifdef RACCT
uint64_t rsize, ravailable;
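/* With RACCT enabled, default to running the daemon once per second. */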
if (racct_enable && vm_daemon_timeout == 0)
vm_daemon_timeout = hz;
#endif
while (TRUE) {
mtx_lock(&vm_daemon_mtx);
msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
vm_daemon_timeout);
mtx_unlock(&vm_daemon_mtx);
/*
* Scan the processes: deactivate pages for any process that exceeds
* its rlimits or is swapped out.
*/
tryagain = 0;
attempts = 0;
again:
attempts++;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
vm_pindex_t limit, size;
/*
* if this is a system process or if we have already
* looked at this process, skip it.
*/
PROC_LOCK(p);
if (p->p_state != PRS_NORMAL ||
p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
PROC_UNLOCK(p);
continue;
}
/*
* If any thread of the process is in a state other than running,
* runnable, sleeping, or suspended, don't touch the process.
*/
breakout = 0;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
if (!TD_ON_RUNQ(td) &&
!TD_IS_RUNNING(td) &&
!TD_IS_SLEEPING(td) &&
!TD_IS_SUSPENDED(td)) {
thread_unlock(td);
breakout = 1;
break;
}
thread_unlock(td);
}
if (breakout) {
PROC_UNLOCK(p);
continue;
}
/*
* get a limit
*/
lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
limit = OFF_TO_IDX(
qmin(rsslim.rlim_cur, rsslim.rlim_max));
/*
* Let processes that are swapped out really be swapped out:
* set the limit to nothing (this will force a swap-out).
*/
if ((p->p_flag & P_INMEM) == 0)
limit = 0; /* XXX */
vm = vmspace_acquire_ref(p);
_PHOLD_LITE(p);
PROC_UNLOCK(p);
if (vm == NULL) {
PRELE(p);
continue;
}
sx_sunlock(&allproc_lock);
size = vmspace_resident_count(vm);
if (size >= limit) {
vm_swapout_map_deactivate_pages(
&vm->vm_map, limit);
size = vmspace_resident_count(vm);
}
#ifdef RACCT
if (racct_enable) {
rsize = IDX_TO_OFF(size);
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL)
racct_set(p, RACCT_RSS, rsize);
ravailable = racct_get_available(p, RACCT_RSS);
PROC_UNLOCK(p);
if (rsize > ravailable) {
/*
* Don't be overly aggressive; this
* might be an innocent process,
* and the limit could've been exceeded
* by some memory hog. Don't try
* to deactivate more than 1/4th
* of process' resident set size.
*/
if (attempts <= 8) {
if (ravailable < rsize -
(rsize / 4)) {
ravailable = rsize -
(rsize / 4);
}
}
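/*
* The clamp above ensures that no more than 1/4 of the
* current RSS is targeted for deactivation during the
* first 8 passes over the process.
*/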
vm_swapout_map_deactivate_pages(
&vm->vm_map,
OFF_TO_IDX(ravailable));
/* Update RSS usage after paging out. */
size = vmspace_resident_count(vm);
rsize = IDX_TO_OFF(size);
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL)
racct_set(p, RACCT_RSS, rsize);
PROC_UNLOCK(p);
if (rsize > ravailable)
tryagain = 1;
}
}
#endif
vmspace_free(vm);
sx_slock(&allproc_lock);
PRELE(p);
}
sx_sunlock(&allproc_lock);
if (tryagain != 0 && attempts <= 10) {
maybe_yield();
goto again;
}
}
}
/*
* Bring the kernel stack for a specified thread back in.
*/
static void
vm_thread_swapin(struct thread *td, int oom_alloc)
{
vm_page_t ma[KSTACK_MAX_PAGES];
vm_offset_t kaddr;
vm_object_t obj;
int a, count, i, j, pages, rv __diagused;
kaddr = td->td_kstack;
pages = td->td_kstack_pages;
obj = vm_thread_kstack_size_to_obj(pages);
while (vm_thread_stack_back(kaddr, ma, pages, oom_alloc,
td->td_kstack_domain) == ENOMEM)
;
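/*
* Page in any stack pages that are not already valid, batching runs
* of contiguous invalid pages into a single pager request.
*/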
for (i = 0; i < pages;) {
vm_page_assert_xbusied(ma[i]);
if (vm_page_all_valid(ma[i])) {
i++;
continue;
}
vm_object_pip_add(obj, 1);
for (j = i + 1; j < pages; j++)
if (vm_page_all_valid(ma[j]))
break;
VM_OBJECT_WLOCK(obj);
rv = vm_pager_has_page(obj, ma[i]->pindex, NULL, &a);
VM_OBJECT_WUNLOCK(obj);
KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
count = min(a + 1, j - i);
rv = vm_pager_get_pages(obj, ma + i, count, NULL, NULL);
KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
__func__, td->td_proc->p_pid));
vm_object_pip_wakeup(obj);
i += count;
}
pmap_qenter(kaddr, ma, pages);
cpu_thread_swapin(td);
}
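/*
* Bring a swapped-out process back into memory, swapping in the kernel
* stacks of all of its threads and marking the process resident again.
*/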
void
faultin(struct proc *p)
{
struct thread *td;
int oom_alloc;
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* If another process is swapping in this process,
* just wait until it finishes.
*/
if (p->p_flag & P_SWAPPINGIN) {
while (p->p_flag & P_SWAPPINGIN)
msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
return;
}
if ((p->p_flag & P_INMEM) == 0) {
oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM :
VM_ALLOC_NORMAL;
/*
* Don't let another thread swap process p out while we are
* busy swapping it in.
*/
++p->p_lock;
p->p_flag |= P_SWAPPINGIN;
PROC_UNLOCK(p);
sx_xlock(&allproc_lock);
MPASS(swapped_cnt > 0);
swapped_cnt--;
if (curthread != &thread0)
swap_inprogress++;
sx_xunlock(&allproc_lock);
/*
* We hold no lock here because the list of threads
* cannot change while all threads in the process are
* swapped out.
*/
FOREACH_THREAD_IN_PROC(p, td)
vm_thread_swapin(td, oom_alloc);
if (curthread != &thread0) {
sx_xlock(&allproc_lock);
MPASS(swap_inprogress > 0);
swap_inprogress--;
last_swapin = ticks;
sx_xunlock(&allproc_lock);
}
PROC_LOCK(p);
swapclear(p);
p->p_swtick = ticks;
/* Allow other threads to swap p out now. */
wakeup(&p->p_flag);
--p->p_lock;
}
}
-/*
- * This swapin algorithm attempts to swap-in processes only if there
- * is enough space for them. Of course, if a process waits for a long
- * time, it will be swapped in anyway.
- */
-
-static struct proc *
-swapper_selector(bool wkilled_only)
-{
- struct proc *p, *res;
- struct thread *td;
- int ppri, pri, slptime, swtime;
-
- sx_assert(&allproc_lock, SA_SLOCKED);
- if (swapped_cnt == 0)
- return (NULL);
- res = NULL;
- ppri = INT_MIN;
- FOREACH_PROC_IN_SYSTEM(p) {
- PROC_LOCK(p);
- if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT |
- P_SWAPPINGIN | P_INMEM)) != 0) {
- PROC_UNLOCK(p);
- continue;
- }
- if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) {
- /*
- * A swapped-out process might have mapped a
- * large portion of the system's pages as
- * anonymous memory. There is no other way to
- * release the memory other than to kill the
- * process, for which we need to swap it in.
- */
- return (p);
- }
- if (wkilled_only) {
- PROC_UNLOCK(p);
- continue;
- }
- swtime = (ticks - p->p_swtick) / hz;
- FOREACH_THREAD_IN_PROC(p, td) {
- /*
- * An otherwise runnable thread of a process
- * swapped out has only the TDI_SWAPPED bit set.
- */
- thread_lock(td);
- if (td->td_inhibitors == TDI_SWAPPED) {
- slptime = (ticks - td->td_slptick) / hz;
- pri = swtime + slptime;
- if ((td->td_flags & TDF_SWAPINREQ) == 0)
- pri -= p->p_nice * 8;
- /*
- * if this thread is higher priority
- * and there is enough space, then select
- * this process instead of the previous
- * selection.
- */
- if (pri > ppri) {
- res = p;
- ppri = pri;
- }
- }
- thread_unlock(td);
- }
- PROC_UNLOCK(p);
- }
-
- if (res != NULL)
- PROC_LOCK(res);
- return (res);
-}
-
-#define SWAPIN_INTERVAL (MAXSLP * hz / 2)
-
-/*
- * Limit swapper to swap in one non-WKILLED process in MAXSLP/2
- * interval, assuming that there is:
- * - at least one domain that is not suffering from a shortage of free memory;
- * - no parallel swap-ins;
- * - no other swap-ins in the current SWAPIN_INTERVAL.
- */
-static bool
-swapper_wkilled_only(void)
-{
-
- return (vm_page_count_min_set(&all_domains) || swap_inprogress > 0 ||
- (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL);
-}
-
-void
-swapper(void)
-{
- struct proc *p;
-
- for (;;) {
- sx_slock(&allproc_lock);
- p = swapper_selector(swapper_wkilled_only());
- sx_sunlock(&allproc_lock);
-
- if (p == NULL) {
- tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL);
- } else {
- PROC_LOCK_ASSERT(p, MA_OWNED);
-
- /*
- * Another process may be bringing or may have
- * already brought this process in while we
- * traverse all threads. Or, this process may
- * have exited or even being swapped out
- * again.
- */
- if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM |
- P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) {
- faultin(p);
- }
- PROC_UNLOCK(p);
- }
- }
-}
-
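/*
* Mark a process as resident again after swap-in: clear each thread's
* swapped-out state and make runnable threads schedulable.
*/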
static void
swapclear(struct proc *p)
{
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
td->td_flags |= TDF_INMEM;
td->td_flags &= ~TDF_SWAPINREQ;
TD_CLR_SWAPPED(td);
if (TD_CAN_RUN(td)) {
if (setrunnable(td, 0)) {
#ifdef INVARIANTS
/*
* XXX: We just cleared TDI_SWAPPED
* above and set TDF_INMEM, so this
* should never happen.
*/
panic("not waking up swapper");
#endif
}
} else
thread_unlock(td);
}
p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
p->p_flag |= P_INMEM;
}
diff --git a/sys/vm/vm_swapout_dummy.c b/sys/vm/vm_swapout_dummy.c
index 0e0a268c8c46..7697a86f9d0b 100644
--- a/sys/vm/vm_swapout_dummy.c
+++ b/sys/vm/vm_swapout_dummy.c
@@ -1,109 +1,101 @@
/*-
* SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
*
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
* Copyright (c) 1994 John S. Dyson
* All rights reserved.
* Copyright (c) 1994 David Greenman
* All rights reserved.
* Copyright (c) 2005 Yahoo! Technologies Norway AS
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_pageout.h>
static int vm_swap_enabled = 0;
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD,
&vm_swap_enabled, 0,
"Enable entire process swapout");
static int vm_swap_idle_enabled = 0;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD,
&vm_swap_idle_enabled, 0,
"Allow swapout on idle criteria");
void
faultin(struct proc *p)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((p->p_flag & P_INMEM) == 0)
panic("faultin: proc %p swapped out with NO_SWAPPING", p);
}
-
-void
-swapper(void)
-{
-
- for (;;)
- tsleep(&proc0, PVM, "swapin", MAXSLP * hz);
-}
