Index: head/sys/alpha/osf1/imgact_osf1.c
===================================================================
--- head/sys/alpha/osf1/imgact_osf1.c (revision 103766)
+++ head/sys/alpha/osf1/imgact_osf1.c (revision 103767)
@@ -1,251 +1,251 @@
/*
* Copyright (c) 1998-1999 Andrew Gallatin
* All rights reserved.
*
* Based heavily on imgact_linux.c which is
* Copyright (c) 1994-1996 Søren Schmidt.
* Which in turn is based heavily on /sys/kern/imgact_aout.c which is:
* Copyright (c) 1993, David Greenman
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>
#include <sys/exec.h>
#include <sys/mman.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <alpha/osf1/exec_ecoff.h>
extern struct sysentvec osf1_sysvec;
#ifdef DEBUG
#define DPRINTF(a) printf a;
#else
#define DPRINTF(a)
#endif
static int
exec_osf1_imgact(struct image_params *imgp)
{
int error;
int path_not_saved;
size_t bytes;
const struct ecoff_exechdr *execp;
const struct ecoff_aouthdr *eap;
struct vmspace *vmspace;
vm_offset_t baddr;
vm_offset_t bsize;
vm_offset_t bss_start;
vm_offset_t daddr;
vm_offset_t dend;
vm_offset_t dsize;
vm_offset_t raw_dend;
vm_offset_t taddr;
vm_offset_t tend;
vm_offset_t tsize;
struct nameidata *ndp;
Osf_Auxargs *osf_auxargs;
GIANT_REQUIRED;
execp = (const struct ecoff_exechdr*)imgp->image_header;
eap = &execp->a;
ndp = NULL;
/* check to make sure we have an alpha ecoff executable */
if (ECOFF_BADMAG(execp))
return -1;
/* verify it is an OSF/1 executable */
if (eap->magic != ECOFF_ZMAGIC) {
printf("unknown ecoff magic %x\n", eap->magic);
return ENOEXEC;
}
osf_auxargs = malloc(sizeof(Osf_Auxargs), M_TEMP, M_WAITOK | M_ZERO);
imgp->auxargs = osf_auxargs;
osf_auxargs->executable = osf_auxargs->exec_path;
path_not_saved = copyinstr(imgp->fname, osf_auxargs->executable,
PATH_MAX, &bytes);
if (execp->f.f_flags & DYNAMIC_FLAG) {
if (path_not_saved) {
uprintf("path to dynamic exectutable not found\n");
free(imgp->auxargs, M_TEMP);
return(path_not_saved);
}
/*
* Unmap the executable & attempt to slide in
* /sbin/loader in its place.
*/
if (imgp->firstpage)
exec_unmap_first_page(imgp);
/*
* Replicate what execve does, and map the first
* page of the loader.
*/
ndp = (struct nameidata *)malloc(sizeof(struct nameidata),
M_TEMP, M_WAITOK);
NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE,
"/compat/osf1/sbin/loader",
FIRST_THREAD_IN_PROC(imgp->proc));
error = namei(ndp);
if (error) {
uprintf("imgact_osf1: can't read /compat/osf1/sbin/loader\n");
free(imgp->auxargs, M_TEMP);
return(error);
}
if (imgp->vp) {
vput(imgp->vp);
/* leaking in the nameizone ??? XXX */
}
imgp->vp = ndp->ni_vp;
error = exec_map_first_page(imgp);
osf_auxargs->loader = "/compat/osf1/sbin/loader";
}
execp = (const struct ecoff_exechdr*)imgp->image_header;
eap = &execp->a;
taddr = ECOFF_SEGMENT_ALIGN(execp, eap->text_start);
tend = round_page(eap->text_start + eap->tsize);
tsize = tend - taddr;
daddr = ECOFF_SEGMENT_ALIGN(execp, eap->data_start);
dend = round_page(eap->data_start + eap->dsize);
dsize = dend - daddr;
bss_start = ECOFF_SEGMENT_ALIGN(execp, eap->bss_start);
bsize = eap->bsize;
imgp->entry_addr = eap->entry;
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(imgp);
if (error)
goto bail;
/*
* Destroy old process VM and create a new one (with a new stack).
*/
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &osf1_sysvec);
/*
* The vm space can now be changed.
*/
vmspace = imgp->proc->p_vmspace;
imgp->interpreted = 0;
imgp->proc->p_sysent = &osf1_sysvec;
/* set up text segment */
if ((error = vm_mmap(&vmspace->vm_map, &taddr, tsize,
VM_PROT_READ|VM_PROT_EXECUTE, VM_PROT_ALL, MAP_FIXED|MAP_COPY,
(caddr_t)imgp->vp, ECOFF_TXTOFF(execp)))) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
return error;
}
/* .. data .. */
if ((error = vm_mmap(&vmspace->vm_map, &daddr, dsize,
VM_PROT_READ|VM_PROT_EXECUTE|VM_PROT_WRITE, VM_PROT_ALL,
MAP_FIXED|MAP_COPY, (caddr_t)imgp->vp, ECOFF_DATOFF(execp)))) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
goto bail;
}
/* .. bss .. */
if (round_page(bsize)) {
baddr = bss_start;
if ((error = vm_map_find(&vmspace->vm_map, NULL,
(vm_offset_t) 0, &baddr, round_page(bsize), FALSE,
VM_PROT_ALL, VM_PROT_ALL, FALSE))) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__,
error));
goto bail;
}
}
raw_dend = (eap->data_start + eap->dsize);
if (dend > raw_dend) {
caddr_t zeros;
zeros = malloc(dend-raw_dend,M_TEMP,M_WAITOK|M_ZERO);
if ((error = copyout(zeros, (caddr_t)raw_dend,
dend-raw_dend))) {
uprintf("Can't zero start of bss, error %d\n",error);
free(zeros,M_TEMP);
goto bail;
}
free(zeros,M_TEMP);
}
vmspace->vm_tsize = btoc(round_page(tsize));
vmspace->vm_dsize = btoc((round_page(dsize) + round_page(bsize)));
vmspace->vm_taddr = (caddr_t)taddr;
vmspace->vm_daddr = (caddr_t)daddr;
return(0);
bail:
free(imgp->auxargs, M_TEMP);
if (ndp) {
VOP_CLOSE(ndp->ni_vp, FREAD, imgp->proc->p_ucred,
FIRST_THREAD_IN_PROC(imgp->proc));
vrele(ndp->ni_vp);
}
return(error);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
struct execsw osf1_execsw = { exec_osf1_imgact, "OSF/1 ECOFF" };
EXEC_SET(osf1_ecoff, osf1_execsw);
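The hunk above is the substance of this revision for imgact_osf1.c: exec_new_vmspace() now takes the ABI's struct sysentvec rather than explicit minimum/maximum user addresses and a stack address. A minimal sketch of the resulting call-site shape follows; exec_foo_imgact() and foo_sysvec are hypothetical names, and the idea that the sysentvec carries the per-ABI address-space layout is inferred from the hunks in this diff rather than shown as field names here.

/*
 * Illustrative sketch only, not part of the commit: the shape of an
 * image activator after the exec_new_vmspace() signature change.
 * exec_foo_imgact() and foo_sysvec are hypothetical.
 */
static struct sysentvec foo_sysvec;	/* filled in elsewhere (assumed) */

static int
exec_foo_imgact(struct image_params *imgp)
{
	int error;

	/* copy in arguments and environment from the old process */
	error = exec_extract_strings(imgp);
	if (error)
		return (error);

	/*
	 * Old form:
	 *	exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS,
	 *	    USRSTACK);
	 * New form: the address-space layout comes from the ABI's
	 * sysentvec, so the hard-wired constants drop out of each
	 * activator and live in one place per ABI.
	 */
	exec_new_vmspace(imgp, &foo_sysvec);
	imgp->proc->p_sysent = &foo_sysvec;

	return (0);
}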
Index: head/sys/compat/linprocfs/linprocfs.c
===================================================================
--- head/sys/compat/linprocfs/linprocfs.c (revision 103766)
+++ head/sys/compat/linprocfs/linprocfs.c (revision 103767)
@@ -1,856 +1,857 @@
/*
* Copyright (c) 2000 Dag-Erling Coïdan Smørgrav
* Copyright (c) 1999 Pierre Beyssac
* Copyright (c) 1993 Jan-Simon Pendry
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_status.c 8.4 (Berkeley) 6/15/94
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/blist.h>
#include <sys/conf.h>
#include <sys/dkstat.h>
#include <sys/exec.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <net/if.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/swap_pager.h>
#include <machine/clock.h>
#ifdef __alpha__
#include <machine/alpha_cpu.h>
#include <machine/cpuconf.h>
#include <machine/rpb.h>
extern int ncpus;
#endif /* __alpha__ */
#ifdef __i386__
#include <machine/cputypes.h>
#include <machine/md_var.h>
#endif /* __i386__ */
#include <machine/../linux/linux.h>
#include <compat/linux/linux_ioctl.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_util.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
/*
* Various conversion macros
*/
#define T2J(x) (((x) * 100UL) / (stathz ? stathz : hz)) /* ticks to jiffies */
#define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */
#define B2K(x) ((x) >> 10) /* bytes to kbytes */
#define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */
#define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */
#define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */
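The macros above rescale kernel counters into the units Linux's procfs reports: T2J and T2S convert statclock ticks to nominal 100 Hz jiffies and to seconds, and the B2*/P2* family shifts between bytes, pages and kilobytes. A small userland illustration of the shift arithmetic, assuming 4 kB pages (PAGE_SHIFT == 12, an assumption made only for this example):

#include <stdio.h>

#define PAGE_SHIFT 12				/* assumed: 4 kB pages */
#define B2K(x) ((x) >> 10)			/* bytes to kbytes */
#define P2K(x) ((x) << (PAGE_SHIFT - 10))	/* pages to kbytes */

int
main(void)
{
	unsigned long bytes = 8UL * 1024 * 1024;	/* 8 MB */
	unsigned long pages = 3;

	printf("%lu kB\n", B2K(bytes));	/* prints 8192: 8 MB in kbytes */
	printf("%lu kB\n", P2K(pages));	/* prints 12: 3 pages of 4 kB */
	return (0);
}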
/*
* Filler function for proc/meminfo
*/
static int
linprocfs_domeminfo(PFS_FILL_ARGS)
{
unsigned long memtotal; /* total memory in bytes */
unsigned long memused; /* used memory in bytes */
unsigned long memfree; /* free memory in bytes */
unsigned long memshared; /* shared memory ??? */
unsigned long buffers, cached; /* buffer / cache memory ??? */
u_quad_t swaptotal; /* total swap space in bytes */
u_quad_t swapused; /* used swap space in bytes */
u_quad_t swapfree; /* free swap space in bytes */
vm_object_t object;
memtotal = physmem * PAGE_SIZE;
/*
* The correct thing here would be:
*
memfree = cnt.v_free_count * PAGE_SIZE;
memused = memtotal - memfree;
*
* but it might mislead linux binaries into thinking there
* is very little memory left, so we cheat and tell them that
* all memory that isn't wired down is free.
*/
memused = cnt.v_wire_count * PAGE_SIZE;
memfree = memtotal - memused;
if (swapblist == NULL) {
swaptotal = 0;
swapfree = 0;
} else {
swaptotal = (u_quad_t)swapblist->bl_blocks * 1024; /* XXX why 1024? */
swapfree = (u_quad_t)swapblist->bl_root->u.bmu_avail * PAGE_SIZE;
}
swapused = swaptotal - swapfree;
memshared = 0;
TAILQ_FOREACH(object, &vm_object_list, object_list)
if (object->shadow_count > 1)
memshared += object->resident_page_count;
memshared *= PAGE_SIZE;
/*
* We'd love to be able to write:
*
buffers = bufspace;
*
* but bufspace is internal to vfs_bio.c and we don't feel
* like unstaticizing it just for linprocfs's sake.
*/
buffers = 0;
cached = cnt.v_cache_count * PAGE_SIZE;
sbuf_printf(sb,
" total: used: free: shared: buffers: cached:\n"
"Mem: %lu %lu %lu %lu %lu %lu\n"
"Swap: %llu %llu %llu\n"
"MemTotal: %9lu kB\n"
"MemFree: %9lu kB\n"
"MemShared:%9lu kB\n"
"Buffers: %9lu kB\n"
"Cached: %9lu kB\n"
"SwapTotal:%9llu kB\n"
"SwapFree: %9llu kB\n",
memtotal, memused, memfree, memshared, buffers, cached,
swaptotal, swapused, swapfree,
B2K(memtotal), B2K(memfree),
B2K(memshared), B2K(buffers), B2K(cached),
B2K(swaptotal), B2K(swapfree));
return (0);
}
#ifdef __alpha__
/*
* Filler function for proc/cpuinfo (Alpha version)
*/
static int
linprocfs_docpuinfo(PFS_FILL_ARGS)
{
u_int64_t type, major;
struct pcs *pcsp;
const char *model, *sysname;
static const char *cpuname[] = {
"EV3", "EV4", "Simulate", "LCA4", "EV5", "EV45", "EV56",
"EV6", "PCA56", "PCA57", "EV67", "EV68CB", "EV68AL"
};
pcsp = LOCATE_PCS(hwrpb, hwrpb->rpb_primary_cpu_id);
type = pcsp->pcs_proc_type;
major = (type & PCS_PROC_MAJOR) >> PCS_PROC_MAJORSHIFT;
if (major < sizeof(cpuname)/sizeof(char *)) {
model = cpuname[major - 1];
} else {
model = "unknown";
}
sysname = alpha_dsr_sysname();
sbuf_printf(sb,
"cpu\t\t\t: Alpha\n"
"cpu model\t\t: %s\n"
"cpu variation\t\t: %ld\n"
"cpu revision\t\t: %ld\n"
"cpu serial number\t: %s\n"
"system type\t\t: %s\n"
"system variation\t: %s\n"
"system revision\t\t: %ld\n"
"system serial number\t: %s\n"
"cycle frequency [Hz]\t: %lu\n"
"timer frequency [Hz]\t: %lu\n"
"page size [bytes]\t: %ld\n"
"phys. address bits\t: %ld\n"
"max. addr. space #\t: %ld\n"
"BogoMIPS\t\t: %lu.%02lu\n"
"kernel unaligned acc\t: %ld (pc=%lx,va=%lx)\n"
"user unaligned acc\t: %ld (pc=%lx,va=%lx)\n"
"platform string\t\t: %s\n"
"cpus detected\t\t: %d\n"
,
model,
pcsp->pcs_proc_var,
*(int *)hwrpb->rpb_revision,
" ",
" ",
"0",
0,
" ",
hwrpb->rpb_cc_freq,
hz,
hwrpb->rpb_page_size,
hwrpb->rpb_phys_addr_size,
hwrpb->rpb_max_asn,
0, 0,
0, 0, 0,
0, 0, 0,
sysname,
ncpus);
return (0);
}
#endif /* __alpha__ */
#ifdef __i386__
/*
* Filler function for proc/cpuinfo (i386 version)
*/
static int
linprocfs_docpuinfo(PFS_FILL_ARGS)
{
int class, i, fqmhz, fqkhz;
/*
* We default the flags to include all non-conflicting flags,
* and the Intel versions of conflicting flags.
*/
static char *flags[] = {
"fpu", "vme", "de", "pse", "tsc",
"msr", "pae", "mce", "cx8", "apic",
"sep", "sep", "mtrr", "pge", "mca",
"cmov", "pat", "pse36", "pn", "b19",
"b20", "b21", "mmxext", "mmx", "fxsr",
"xmm", "b26", "b27", "b28", "b29",
"3dnowext", "3dnow"
};
switch (cpu_class) {
case CPUCLASS_286:
class = 2;
break;
case CPUCLASS_386:
class = 3;
break;
case CPUCLASS_486:
class = 4;
break;
case CPUCLASS_586:
class = 5;
break;
case CPUCLASS_686:
class = 6;
break;
default:
class = 0;
break;
}
sbuf_printf(sb,
"processor\t: %d\n"
"vendor_id\t: %.20s\n"
"cpu family\t: %d\n"
"model\t\t: %d\n"
"stepping\t: %d\n",
0, cpu_vendor, class, cpu, cpu_id & 0xf);
sbuf_cat(sb,
"flags\t\t:");
if (!strcmp(cpu_vendor, "AuthenticAMD") && (class < 6)) {
flags[16] = "fcmov";
} else if (!strcmp(cpu_vendor, "CyrixInstead")) {
flags[24] = "cxmmx";
}
for (i = 0; i < 32; i++)
if (cpu_feature & (1 << i))
sbuf_printf(sb, " %s", flags[i]);
sbuf_cat(sb, "\n");
if (class >= 5) {
fqmhz = (tsc_freq + 4999) / 1000000;
fqkhz = ((tsc_freq + 4999) / 10000) % 100;
sbuf_printf(sb,
"cpu MHz\t\t: %d.%02d\n"
"bogomips\t: %d.%02d\n",
fqmhz, fqkhz, fqmhz, fqkhz);
}
return (0);
}
#endif /* __i386__ */
/*
* Filler function for proc/mtab
*
* This file doesn't exist in Linux' procfs, but is included here so
* users can symlink /compat/linux/etc/mtab to /proc/mtab
*/
static int
linprocfs_domtab(PFS_FILL_ARGS)
{
struct nameidata nd;
struct mount *mp;
const char *lep;
char *dlep, *flep, *mntto, *mntfrom, *fstype;
size_t lep_len;
int error;
/* resolve symlinks etc. in the emulation tree prefix */
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path, td);
flep = NULL;
if (namei(&nd) != 0 || vn_fullpath(td, nd.ni_vp, &dlep, &flep) == -1)
lep = linux_emul_path;
else
lep = dlep;
lep_len = strlen(lep);
mtx_lock(&mountlist_mtx);
error = 0;
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
error = VFS_STATFS(mp, &mp->mnt_stat, td);
if (error)
break;
/* determine device name */
mntfrom = mp->mnt_stat.f_mntfromname;
/* determine mount point */
mntto = mp->mnt_stat.f_mntonname;
if (strncmp(mntto, lep, lep_len) == 0 &&
mntto[lep_len] == '/')
mntto += lep_len;
/* determine fs type */
fstype = mp->mnt_stat.f_fstypename;
if (strcmp(fstype, pn->pn_info->pi_name) == 0)
mntfrom = fstype = "proc";
else if (strcmp(fstype, "procfs") == 0)
continue;
sbuf_printf(sb, "%s %s %s %s", mntfrom, mntto, fstype,
mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw");
#define ADD_OPTION(opt, name) \
if (mp->mnt_stat.f_flags & (opt)) sbuf_printf(sb, "," name);
ADD_OPTION(MNT_SYNCHRONOUS, "sync");
ADD_OPTION(MNT_NOEXEC, "noexec");
ADD_OPTION(MNT_NOSUID, "nosuid");
ADD_OPTION(MNT_NODEV, "nodev");
ADD_OPTION(MNT_UNION, "union");
ADD_OPTION(MNT_ASYNC, "async");
ADD_OPTION(MNT_SUIDDIR, "suiddir");
ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow");
ADD_OPTION(MNT_NOATIME, "noatime");
#undef ADD_OPTION
/* a real Linux mtab will also show NFS options */
sbuf_printf(sb, " 0 0\n");
}
mtx_unlock(&mountlist_mtx);
if (flep != NULL)
free(flep, M_TEMP);
return (error);
}
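For reference, each line this filler emits has the form "<device> <mountpoint> <fstype> <options> 0 0", with the emulation prefix stripped from the mount point and the linprocfs mount itself reported as "proc". With an illustrative (not real) UFS root and a linprocfs mount under the emulation tree, the output would look like:

/dev/ad0s1a / ufs rw 0 0
proc /proc proc rw 0 0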
/*
* Filler function for proc/stat
*/
static int
linprocfs_dostat(PFS_FILL_ARGS)
{
sbuf_printf(sb,
"cpu %ld %ld %ld %ld\n"
"disk 0 0 0 0\n"
"page %u %u\n"
"swap %u %u\n"
"intr %u\n"
"ctxt %u\n"
"btime %lld\n",
T2J(cp_time[CP_USER]),
T2J(cp_time[CP_NICE]),
T2J(cp_time[CP_SYS] /*+ cp_time[CP_INTR]*/),
T2J(cp_time[CP_IDLE]),
cnt.v_vnodepgsin,
cnt.v_vnodepgsout,
cnt.v_swappgsin,
cnt.v_swappgsout,
cnt.v_intr,
cnt.v_swtch,
(quad_t)boottime.tv_sec);
return (0);
}
/*
* Filler function for proc/uptime
*/
static int
linprocfs_douptime(PFS_FILL_ARGS)
{
struct timeval tv;
getmicrouptime(&tv);
sbuf_printf(sb, "%lld.%02ld %ld.%02ld\n",
(quad_t)tv.tv_sec, tv.tv_usec / 10000,
T2S(cp_time[CP_IDLE]), T2J(cp_time[CP_IDLE]) % 100);
return (0);
}
/*
* Filler function for proc/version
*/
static int
linprocfs_doversion(PFS_FILL_ARGS)
{
char osname[LINUX_MAX_UTSNAME];
char osrelease[LINUX_MAX_UTSNAME];
linux_get_osname(td->td_proc, osname);
linux_get_osrelease(td->td_proc, osrelease);
sbuf_printf(sb,
"%s version %s (des@freebsd.org) (gcc version " __VERSION__ ")"
" #4 Sun Dec 18 04:30:00 CET 1977\n", osname, osrelease);
return (0);
}
/*
* Filler function for proc/loadavg
*/
static int
linprocfs_doloadavg(PFS_FILL_ARGS)
{
sbuf_printf(sb,
"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
(int)(averunnable.ldavg[0] / averunnable.fscale),
(int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100),
(int)(averunnable.ldavg[1] / averunnable.fscale),
(int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100),
(int)(averunnable.ldavg[2] / averunnable.fscale),
(int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100),
1, /* number of running tasks */
nprocs, /* number of tasks */
lastpid /* the last pid */
);
return (0);
}
/*
* Filler function for proc/pid/stat
*/
static int
linprocfs_doprocstat(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
sbuf_printf(sb, "%d", p->p_pid);
#define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg)
PS_ADD("comm", "(%s)", p->p_comm);
PS_ADD("statr", "%c", '0'); /* XXX */
PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0);
PS_ADD("pgrp", "%d", p->p_pgid);
PS_ADD("session", "%d", p->p_session->s_sid);
PROC_UNLOCK(p);
PS_ADD("tty", "%d", 0); /* XXX */
PS_ADD("tpgid", "%d", 0); /* XXX */
PS_ADD("flags", "%u", 0); /* XXX */
PS_ADD("minflt", "%u", 0); /* XXX */
PS_ADD("cminflt", "%u", 0); /* XXX */
PS_ADD("majflt", "%u", 0); /* XXX */
PS_ADD("cminflt", "%u", 0); /* XXX */
PS_ADD("utime", "%d", 0); /* XXX */
PS_ADD("stime", "%d", 0); /* XXX */
PS_ADD("cutime", "%d", 0); /* XXX */
PS_ADD("cstime", "%d", 0); /* XXX */
PS_ADD("counter", "%d", 0); /* XXX */
PS_ADD("priority", "%d", 0); /* XXX */
PS_ADD("timeout", "%u", 0); /* XXX */
PS_ADD("itrealvalue", "%u", 0); /* XXX */
PS_ADD("starttime", "%d", 0); /* XXX */
PS_ADD("vsize", "%u", kp.ki_size);
PS_ADD("rss", "%u", P2K(kp.ki_rssize));
PS_ADD("rlim", "%u", 0); /* XXX */
PS_ADD("startcode", "%u", (unsigned)0);
PS_ADD("endcode", "%u", 0); /* XXX */
PS_ADD("startstack", "%u", 0); /* XXX */
PS_ADD("esp", "%u", 0); /* XXX */
PS_ADD("eip", "%u", 0); /* XXX */
PS_ADD("signal", "%d", 0); /* XXX */
PS_ADD("blocked", "%d", 0); /* XXX */
PS_ADD("sigignore", "%d", 0); /* XXX */
PS_ADD("sigcatch", "%d", 0); /* XXX */
PS_ADD("wchan", "%u", 0); /* XXX */
PS_ADD("nswap", "%lu", (long unsigned)0); /* XXX */
PS_ADD("cnswap", "%lu", (long unsigned)0); /* XXX */
PS_ADD("exitsignal", "%d", 0); /* XXX */
PS_ADD("processor", "%d", 0); /* XXX */
#undef PS_ADD
sbuf_putc(sb, '\n');
return (0);
}
/*
* Filler function for proc/pid/status
*/
static int
linprocfs_doprocstatus(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
char *state;
segsz_t lsize;
struct thread *td2;
int i;
mtx_lock_spin(&sched_lock);
td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */
if (P_SHOULDSTOP(p)) {
state = "T (stopped)";
} else {
switch(p->p_state) {
case PRS_NEW:
state = "I (idle)";
break;
case PRS_NORMAL:
if (p->p_flag & P_WEXIT) {
state = "X (exiting)";
break;
}
switch(td2->td_state) {
case TDS_INHIBITED:
state = "S (sleeping)";
break;
case TDS_RUNQ:
case TDS_RUNNING:
state = "R (running)";
break;
default:
state = "? (unknown)";
break;
}
break;
case PRS_WAIT:
state = "W (waiting)";
break;
case PRS_ZOMBIE:
state = "Z (zombie)";
break;
default:
state = "? (unknown)";
break;
}
}
mtx_unlock_spin(&sched_lock);
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */
sbuf_printf(sb, "State:\t%s\n", state);
/*
* Credentials
*/
sbuf_printf(sb, "Pid:\t%d\n", p->p_pid);
sbuf_printf(sb, "PPid:\t%d\n", p->p_pptr ?
p->p_pptr->p_pid : 0);
sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid,
p->p_ucred->cr_uid,
p->p_ucred->cr_svuid,
/* FreeBSD doesn't have fsuid */
p->p_ucred->cr_uid);
sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid,
p->p_ucred->cr_gid,
p->p_ucred->cr_svgid,
/* FreeBSD doesn't have fsgid */
p->p_ucred->cr_gid);
sbuf_cat(sb, "Groups:\t");
for (i = 0; i < p->p_ucred->cr_ngroups; i++)
sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]);
PROC_UNLOCK(p);
sbuf_putc(sb, '\n');
/*
* Memory
*
* While our approximation of VmLib may not be accurate (I
* don't know of a simple way to verify it, and I'm not sure
* it has much meaning anyway), I believe it's good enough.
*
* The same code that could (I think) accurately compute VmLib
* could also compute VmLck, but I don't really care enough to
* implement it. Submissions are welcome.
*/
sbuf_printf(sb, "VmSize:\t%8u kB\n", B2K(kp.ki_size));
sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */
sbuf_printf(sb, "VmRss:\t%8u kB\n", P2K(kp.ki_rssize));
sbuf_printf(sb, "VmData:\t%8u kB\n", P2K(kp.ki_dsize));
sbuf_printf(sb, "VmStk:\t%8u kB\n", P2K(kp.ki_ssize));
sbuf_printf(sb, "VmExe:\t%8u kB\n", P2K(kp.ki_tsize));
lsize = B2P(kp.ki_size) - kp.ki_dsize -
kp.ki_ssize - kp.ki_tsize - 1;
sbuf_printf(sb, "VmLib:\t%8u kB\n", P2K(lsize));
/*
* Signal masks
*
* We support up to 128 signals, while Linux supports 32,
* but we only define 32 (the same 32 as Linux, to boot), so
* just show the lower 32 bits of each mask. XXX hack.
*
* NB: on certain platforms (Sparc at least) Linux actually
* supports 64 signals, but this code is a long way from
* running on anything but i386, so ignore that for now.
*/
PROC_LOCK(p);
sbuf_printf(sb, "SigPnd:\t%08x\n", p->p_siglist.__bits[0]);
/*
* I can't seem to find out where the signal mask is in
* relation to struct proc, so SigBlk is left unimplemented.
*/
sbuf_printf(sb, "SigBlk:\t%08x\n", 0); /* XXX */
sbuf_printf(sb, "SigIgn:\t%08x\n", p->p_sigignore.__bits[0]);
sbuf_printf(sb, "SigCgt:\t%08x\n", p->p_sigcatch.__bits[0]);
PROC_UNLOCK(p);
/*
* Linux also prints the capability masks, but we don't have
* capabilities yet, and when we do get them they're likely to
* be meaningless to Linux programs, so we lie. XXX
*/
sbuf_printf(sb, "CapInh:\t%016x\n", 0);
sbuf_printf(sb, "CapPrm:\t%016x\n", 0);
sbuf_printf(sb, "CapEff:\t%016x\n", 0);
return (0);
}
/*
* Filler function for proc/pid/cmdline
*/
static int
linprocfs_doproccmdline(PFS_FILL_ARGS)
{
struct ps_strings pstr;
int error, i;
/*
* If we are using the ps/cmdline caching, use that. Otherwise
* revert back to the old way which only implements full cmdline
* for the current process and just p->p_comm for all other
* processes.
* Note that if the argv is no longer available, we deliberately
* don't fall back on p->p_comm or return an error: the authentic
* Linux behaviour is to return zero-length in this case.
*/
PROC_LOCK(p);
if (p->p_args && (ps_argsopen || !p_cansee(td, p))) {
sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length);
PROC_UNLOCK(p);
} else if (p != td->td_proc) {
PROC_UNLOCK(p);
sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm);
} else {
PROC_UNLOCK(p);
- error = copyin((void*)PS_STRINGS, &pstr, sizeof(pstr));
+ error = copyin((void *)p->p_sysent->sv_psstrings, &pstr,
+ sizeof(pstr));
if (error)
return (error);
for (i = 0; i < pstr.ps_nargvstr; i++) {
sbuf_copyin(sb, pstr.ps_argvstr[i], 0);
sbuf_printf(sb, "%c", '\0');
}
}
return (0);
}
/*
* Filler function for proc/net/dev
*/
static int
linprocfs_donetdev(PFS_FILL_ARGS)
{
char ifname[16]; /* XXX LINUX_IFNAMSIZ */
struct ifnet *ifp;
sbuf_printf(sb, "%6s|%58s|%s\n%6s|%58s|%58s\n",
"Inter-", " Receive", " Transmit", " face",
"bytes packets errs drop fifo frame compressed",
"bytes packets errs drop fifo frame compressed");
TAILQ_FOREACH(ifp, &ifnet, if_link) {
linux_ifname(ifp, ifname, sizeof ifname);
sbuf_printf(sb, "%6.6s:", ifname);
sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu ",
0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
}
return (0);
}
#if 0
extern struct cdevsw *cdevsw[];
/*
* Filler function for proc/devices
*/
static int
linprocfs_dodevices(PFS_FILL_ARGS)
{
int i;
sbuf_printf(sb, "Character devices:\n");
for (i = 0; i < NUMCDEVSW; i++)
if (cdevsw[i] != NULL)
sbuf_printf(sb, "%3d %s\n", i, cdevsw[i]->d_name);
sbuf_printf(sb, "\nBlock devices:\n");
return (0);
}
#endif
/*
* Filler function for proc/cmdline
*/
static int
linprocfs_docmdline(PFS_FILL_ARGS)
{
sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname);
sbuf_printf(sb, " ro root=302\n");
return (0);
}
#if 0
/*
* Filler function for proc/modules
*/
static int
linprocfs_domodules(PFS_FILL_ARGS)
{
struct linker_file *lf;
TAILQ_FOREACH(lf, &linker_files, link) {
sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename,
(unsigned long)lf->size, lf->refs);
}
return (0);
}
#endif
/*
* Constructor
*/
static int
linprocfs_init(PFS_INIT_ARGS)
{
struct pfs_node *root;
struct pfs_node *dir;
root = pi->pi_root;
#define PFS_CREATE_FILE(name) \
pfs_create_file(root, #name, &linprocfs_do##name, NULL, NULL, PFS_RD)
PFS_CREATE_FILE(cmdline);
PFS_CREATE_FILE(cpuinfo);
#if 0
PFS_CREATE_FILE(devices);
#endif
PFS_CREATE_FILE(loadavg);
PFS_CREATE_FILE(meminfo);
#if 0
PFS_CREATE_FILE(modules);
#endif
PFS_CREATE_FILE(mtab);
PFS_CREATE_FILE(stat);
PFS_CREATE_FILE(uptime);
PFS_CREATE_FILE(version);
#undef PFS_CREATE_FILE
pfs_create_link(root, "self", &procfs_docurproc,
NULL, NULL, 0);
dir = pfs_create_dir(root, "net", NULL, NULL, 0);
pfs_create_file(dir, "dev", &linprocfs_donetdev,
NULL, NULL, PFS_RD);
dir = pfs_create_dir(root, "pid", NULL, NULL, PFS_PROCDEP);
pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline,
NULL, NULL, PFS_RD);
pfs_create_link(dir, "exe", &procfs_doprocfile,
NULL, &procfs_notsystem, 0);
pfs_create_file(dir, "mem", &procfs_doprocmem,
&procfs_attr, &procfs_candebug, PFS_RDWR|PFS_RAW);
pfs_create_file(dir, "stat", &linprocfs_doprocstat,
NULL, NULL, PFS_RD);
pfs_create_file(dir, "status", &linprocfs_doprocstatus,
NULL, NULL, PFS_RD);
return (0);
}
/*
* Destructor
*/
static int
linprocfs_uninit(PFS_INIT_ARGS)
{
/* nothing to do, pseudofs will GC */
return (0);
}
PSEUDOFS(linprocfs, 1);
MODULE_DEPEND(linprocfs, linux, 1, 1, 1);
MODULE_DEPEND(linprocfs, procfs, 1, 1, 1);
Index: head/sys/compat/pecoff/imgact_pecoff.c
===================================================================
--- head/sys/compat/pecoff/imgact_pecoff.c (revision 103766)
+++ head/sys/compat/pecoff/imgact_pecoff.c (revision 103767)
@@ -1,674 +1,674 @@
/* $NetBSD$ */
/* $FreeBSD$ */
/*
* Copyright (c) 2000 Masaru OKI
* Copyright (c) 1994, 1995, 1998 Scott Bartram
* Copyright (c) 1994 Adam Glass
* Copyright (c) 1993, 1994 Christopher G. Demetriou
*
* originally from NetBSD kern/exec_ecoff.c
*
* Copyright (c) 2000 Takanori Watanabe
* Copyright (c) 2000 KUROSAWA Takahiro
* Copyright (c) 1995-1996 Søren Schmidt
* Copyright (c) 1996 Peter Wemm
* All rights reserved.
*
* originally from FreeBSD kern/imgact_elf.c
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Masaru OKI.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <machine/reg.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <sys/user.h>
#include <sys/exec.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <machine/cpu.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <machine/md_var.h>
#include <machine/pecoff_machdep.h>
#include <compat/pecoff/imgact_pecoff.h>
#include "opt_pecoff.h"
#define PECOFF_PE_SIGNATURE "PE\0\0"
static int pecoff_fixup(register_t **, struct image_params *);
static int
pecoff_coredump(register struct thread *, register struct vnode *,
off_t);
#ifndef PECOFF_DEBUG
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif
static struct sysentvec pecoff_sysvec = {
SYS_MAXSYSCALL,
sysent,
0,
0,
NULL,
0,
NULL,
NULL,
pecoff_fixup,
sendsig,
sigcode,
&szsigcode,
0,
"FreeBSD PECoff",
pecoff_coredump,
NULL,
MINSIGSTKSZ,
PAGE_SIZE,
VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS,
USRSTACK,
PS_STRINGS,
VM_PROT_ALL,
exec_copyout_strings,
exec_setregs
};
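The positional initializer above is hard to read; the entries VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK and PS_STRINGS are the per-ABI address-space layout slots that exec_new_vmspace() and the procfs hunks in this diff now consult through the sysentvec. A hedged designated-initializer sketch of just those slots; the field names other than sv_psstrings (which this diff reads elsewhere as p->p_sysent->sv_psstrings) are assumed rather than taken from the diff:

/*
 * Sketch only: sv_minuser, sv_maxuser and sv_usrstack are assumed
 * names; sv_psstrings is the field the procfs fillers read via
 * p->p_sysent->sv_psstrings.
 */
static struct sysentvec pecoff_sysvec_layout_sketch = {
	/* ... syscall table, signal glue, etc. as in the real table ... */
	.sv_minuser	= VM_MIN_ADDRESS,	/* lowest mappable user VA */
	.sv_maxuser	= VM_MAXUSER_ADDRESS,	/* top of user VM */
	.sv_usrstack	= USRSTACK,		/* initial stack top */
	.sv_psstrings	= PS_STRINGS,		/* ps_strings location */
	/* ... */
};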
static const char signature[] = PECOFF_PE_SIGNATURE;
static int
exec_pecoff_coff_prep_omagic(struct image_params *,
struct coff_filehdr *,
struct coff_aouthdr *, int peoffs);
static int
exec_pecoff_coff_prep_nmagic(struct image_params *,
struct coff_filehdr *,
struct coff_aouthdr *, int peoffs);
static int
exec_pecoff_coff_prep_zmagic(struct image_params *,
struct coff_filehdr *,
struct coff_aouthdr *, int peoffs);
static int
exec_pecoff_coff_makecmds(struct image_params *,
struct coff_filehdr *, int);
static int pecoff_signature(struct thread *, struct vnode *, const struct pecoff_dos_filehdr *);
static int pecoff_read_from(struct thread *, struct vnode *, int, caddr_t, int);
static int
pecoff_load_section(struct thread * td,
struct vmspace * vmspace, struct vnode * vp,
vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
vm_prot_t prot);
static int
pecoff_fixup(register_t ** stack_base, struct image_params * imgp)
{
int len = sizeof(struct pecoff_args);
struct pecoff_imghdr *ap;
register_t *pos;
pos = *stack_base + (imgp->argc + imgp->envc + 2);
ap = (struct pecoff_imghdr *) imgp->auxargs;
if (copyout(ap, pos, len)) {
return 0;
}
free(ap, M_TEMP);
imgp->auxargs = NULL;
(*stack_base)--;
suword(*stack_base, (long) imgp->argc);
return 0;
}
static int
pecoff_coredump(register struct thread * td, register struct vnode * vp,
off_t limit)
{
register struct ucred *cred = td->td_ucred;
struct proc *p = td->td_proc;
register struct vmspace *vm = p->p_vmspace;
char *tempuser;
int error;
#ifdef PECOFF_DEBUG
struct vm_map *map;
struct vm_map_entry *ent;
struct reg regs;
#endif
if (ctob((uarea_pages + kstack_pages) + vm->vm_dsize + vm->vm_ssize) >=
limit)
return (EFAULT);
tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
M_WAITOK | M_ZERO);
if (tempuser == NULL)
return (ENOMEM);
PROC_LOCK(p);
fill_kinfo_proc(p, &p->p_uarea->u_kproc);
PROC_UNLOCK(p);
bcopy(p->p_uarea, tempuser, sizeof(struct user));
bcopy(td->td_frame,
tempuser + ctob(uarea_pages) +
((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
sizeof(struct trapframe));
#if PECOFF_DEBUG
fill_regs(td, &regs);
printf("EIP%x\n", regs.r_eip);
printf("EAX%x EBX%x ECX%x EDI%x\n",
regs.r_eax, regs.r_ebx, regs.r_ecx, regs.r_edi);
map = &vm->vm_map;
ent = &map->header;
printf("%p %p %p\n", ent, ent->prev, ent->next);
#endif
error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
ctob(uarea_pages + kstack_pages),
(off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
(int *)NULL, td);
free(tempuser, M_TEMP);
if (error == 0)
error = vn_rdwr_inchunks(UIO_WRITE, vp, vm->vm_daddr,
(int)ctob(vm->vm_dsize),
(off_t)ctob((uarea_pages + kstack_pages)),
UIO_USERSPACE, IO_UNIT, cred, NOCRED, (int *)NULL, td);
if (error == 0)
error = vn_rdwr_inchunks(UIO_WRITE, vp,
(caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
round_page(ctob(vm->vm_ssize)),
(off_t)ctob((uarea_pages + kstack_pages)) +
ctob(vm->vm_dsize),
UIO_USERSPACE, IO_UNIT, cred, NOCRED, (int *)NULL, td);
return (error);
}
static int
pecoff_load_section(struct thread * td, struct vmspace * vmspace, struct vnode * vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
{
size_t map_len;
vm_offset_t map_addr;
int error, rv;
size_t copy_len;
size_t copy_map_len;
size_t copy_start;
vm_object_t object;
vm_offset_t copy_map_offset;
vm_offset_t file_addr;
vm_offset_t data_buf = 0;
object = vp->v_object;
error = 0;
map_addr = trunc_page((vm_offset_t) vmaddr);
file_addr = trunc_page(offset);
DPRINTF(("SECARG:%x %p %x %x\n", offset, vmaddr, memsz, filsz));
if (file_addr != offset) {
/*
* The section is not on page boundary. We can't use
* vm_map_insert(). Use copyin instead.
*/
map_len = round_page(memsz);
copy_len = filsz;
copy_map_offset = file_addr;
copy_map_len = round_page(offset + filsz) - file_addr;
copy_start = offset - file_addr;
DPRINTF(("offset=%x vmaddr=%lx filsz=%x memsz=%x\n",
offset, (long)vmaddr, filsz, memsz));
DPRINTF(("map_len=%x copy_len=%x copy_map_offset=%x"
" copy_map_len=%x copy_start=%x\n",
map_len, copy_len, copy_map_offset,
copy_map_len, copy_start));
} else {
map_len = trunc_page(filsz);
if (map_len != 0) {
vm_object_reference(object);
vm_map_lock(&vmspace->vm_map);
rv = vm_map_insert(&vmspace->vm_map,
object,
file_addr, /* file offset */
map_addr, /* virtual start */
map_addr + map_len, /* virtual end */
prot,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
vm_map_unlock(&vmspace->vm_map);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return EINVAL;
}
/* we can stop now if we've covered it all */
if (memsz == filsz)
return 0;
}
copy_map_offset = trunc_page(offset + filsz);
copy_map_len = PAGE_SIZE;
copy_start = 0;
copy_len = (offset + filsz) - trunc_page(offset + filsz);
map_addr = trunc_page((vm_offset_t) vmaddr + filsz);
map_len = round_page((vm_offset_t) vmaddr + memsz) - map_addr;
}
if (map_len != 0) {
vm_map_lock(&vmspace->vm_map);
rv = vm_map_insert(&vmspace->vm_map, NULL, 0,
map_addr, map_addr + map_len,
VM_PROT_ALL, VM_PROT_ALL, 0);
vm_map_unlock(&vmspace->vm_map);
DPRINTF(("EMP-rv:%d,%x %x\n", rv, map_addr, map_addr + map_len));
if (rv != KERN_SUCCESS) {
return EINVAL;
}
}
DPRINTF(("COPYARG %x %x\n", map_addr, copy_len));
if (copy_len != 0) {
vm_object_reference(object);
rv = vm_map_find(exec_map,
object,
copy_map_offset,
&data_buf,
copy_map_len,
TRUE,
VM_PROT_READ,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return EINVAL;
}
/* send the page fragment to user space */
error = copyout((caddr_t) data_buf + copy_start,
(caddr_t) map_addr, copy_len);
vm_map_remove(exec_map, data_buf, data_buf + copy_map_len);
DPRINTF(("%d\n", error));
if (error)
return (error);
}
/*
* set it to the specified protection
*/
vm_map_protect(&vmspace->vm_map, map_addr,
map_addr + map_len, prot,
FALSE);
return error;
}
static int
pecoff_load_file(struct thread * td, const char *file, u_long * addr, u_long * entry, u_long * ldexport)
{
struct nameidata nd;
struct pecoff_dos_filehdr dh;
struct coff_filehdr *fp = 0;
struct coff_aouthdr *ap;
struct pecoff_opthdr *wp;
struct coff_scnhdr *sh = 0;
struct vmspace *vmspace = td->td_proc->p_vmspace;
struct vattr attr;
struct image_params image_params, *imgp;
int peofs;
int error, i, scnsiz;
imgp = &image_params;
/*
* Initialize part of the common data
*/
imgp->proc = td->td_proc;
imgp->uap = NULL;
imgp->attr = &attr;
imgp->firstpage = NULL;
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, td);
if ((error = namei(&nd)) != 0) {
nd.ni_vp = NULL;
goto fail;
}
NDFREE(&nd, NDF_ONLY_PNBUF);
imgp->vp = nd.ni_vp;
/*
* Check permissions, modes, uid, etc on the file, and "open" it.
*/
error = exec_check_permissions(imgp);
if (error) {
VOP_UNLOCK(nd.ni_vp, 0, td);
goto fail;
}
VOP_UNLOCK(nd.ni_vp, 0, td);
if (error)
goto fail;
if ((error = pecoff_read_from(td, imgp->vp, 0, (caddr_t) & dh, sizeof(dh))) != 0)
goto fail;
if ((error = pecoff_signature(td, imgp->vp, &dh) != 0))
goto fail;
fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK);
peofs = dh.d_peofs + sizeof(signature) - 1;
if ((error = pecoff_read_from(td, imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE) != 0))
goto fail;
if (COFF_BADMAG(fp)) {
error = ENOEXEC;
goto fail;
}
ap = (void *) ((char *) fp + sizeof(struct coff_filehdr));
wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr));
/* read section header */
scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns;
sh = malloc(scnsiz, M_TEMP, M_WAITOK);
if ((error = pecoff_read_from(td, imgp->vp, peofs + PECOFF_HDR_SIZE,
(caddr_t) sh, scnsiz)) != 0)
goto fail;
/*
* Read section information and map sections.
*/
for (i = 0; i < fp->f_nscns; i++) {
int prot = 0;
if (sh[i].s_flags & COFF_STYP_DISCARD)
continue;
/* XXX ? */
if ((sh[i].s_flags & COFF_STYP_TEXT) &&
(sh[i].s_flags & COFF_STYP_EXEC) == 0)
continue;
if ((sh[i].s_flags & (COFF_STYP_TEXT | COFF_STYP_DATA | COFF_STYP_BSS)) == 0)
continue;
prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0;
prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0;
prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0;
sh[i].s_vaddr += wp->w_base; /* RVA --> VA */
if ((error = pecoff_load_section(td, vmspace, imgp->vp, sh[i].s_scnptr
,(caddr_t) sh[i].s_vaddr,
sh[i].s_paddr, sh[i].s_size
,prot)) != 0)
goto fail;
}
*entry = wp->w_base + ap->a_entry;
*addr = wp->w_base;
*ldexport = wp->w_imghdr[0].i_vaddr + wp->w_base;
fail:
if (fp)
free(fp, M_TEMP);
if (sh)
free(sh, M_TEMP);
if (nd.ni_vp)
vrele(nd.ni_vp);
return error;
}
static int
exec_pecoff_coff_prep_omagic(struct image_params * imgp,
struct coff_filehdr * fp,
struct coff_aouthdr * ap, int peofs)
{
return ENOEXEC;
}
static int
exec_pecoff_coff_prep_nmagic(struct image_params * imgp,
struct coff_filehdr * fp,
struct coff_aouthdr * ap, int peofs)
{
return ENOEXEC;
}
static int
exec_pecoff_coff_prep_zmagic(struct image_params * imgp,
struct coff_filehdr * fp,
struct coff_aouthdr * ap, int peofs)
{
int scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns;
int error = ENOEXEC, i;
int prot;
u_long text_size = 0, data_size = 0, dsize;
u_long text_addr = 0, data_addr = VM_MAXUSER_ADDRESS;
u_long ldexport, ldbase;
struct pecoff_opthdr *wp;
struct coff_scnhdr *sh;
struct vmspace *vmspace;
struct pecoff_args *argp = NULL;
sh = malloc(scnsiz, M_TEMP, M_WAITOK);
wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr));
error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp,
peofs + PECOFF_HDR_SIZE, (caddr_t) sh, scnsiz);
if ((error = exec_extract_strings(imgp)) != 0)
goto fail;
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &pecoff_sysvec);
vmspace = imgp->proc->p_vmspace;
for (i = 0; i < fp->f_nscns; i++) {
prot = VM_PROT_WRITE; /* XXX for relocation? */
prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0;
prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0;
prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0;
sh[i].s_vaddr += wp->w_base;
if (sh[i].s_flags & COFF_STYP_DISCARD)
continue;
if ((sh[i].s_flags & COFF_STYP_TEXT) != 0) {
error = pecoff_load_section(
FIRST_THREAD_IN_PROC(imgp->proc),
vmspace, imgp->vp, sh[i].s_scnptr,
(caddr_t) sh[i].s_vaddr, sh[i].s_paddr,
sh[i].s_size ,prot);
DPRINTF(("ERROR%d\n", error));
if (error)
goto fail;
text_addr = trunc_page(sh[i].s_vaddr);
text_size = trunc_page(sh[i].s_size + sh[i].s_vaddr - text_addr);
}
if ((sh[i].s_flags & (COFF_STYP_DATA|COFF_STYP_BSS)) != 0) {
if (pecoff_load_section(
FIRST_THREAD_IN_PROC(imgp->proc), vmspace,
imgp->vp, sh[i].s_scnptr, (caddr_t) sh[i].s_vaddr,
sh[i].s_paddr, sh[i].s_size, prot) != 0)
goto fail;
data_addr = min(trunc_page(sh[i].s_vaddr), data_addr);
dsize = round_page(sh[i].s_vaddr + sh[i].s_paddr)
- data_addr;
data_size = max(dsize, data_size);
}
}
vmspace->vm_tsize = text_size >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t) (uintptr_t) text_addr;
vmspace->vm_dsize = data_size >> PAGE_SHIFT;
vmspace->vm_daddr = (caddr_t) (uintptr_t) data_addr;
argp = malloc(sizeof(struct pecoff_args), M_TEMP, M_WAITOK);
if (argp == NULL) {
error = ENOMEM;
goto fail;
}
argp->a_base = wp->w_base;
argp->a_entry = wp->w_base + ap->a_entry;
argp->a_end = data_addr + data_size;
argp->a_subsystem = wp->w_subvers;
error = pecoff_load_file(FIRST_THREAD_IN_PROC(imgp->proc),
"/usr/libexec/ld.so.dll", &ldbase, &imgp->entry_addr, &ldexport);
if (error)
goto fail;
argp->a_ldbase = ldbase;
argp->a_ldexport = ldexport;
memcpy(argp->a_imghdr, wp->w_imghdr, sizeof(struct pecoff_imghdr) * 16);
for (i = 0; i < 16; i++) {
argp->a_imghdr[i].i_vaddr += wp->w_base;
}
imgp->proc->p_sysent = &pecoff_sysvec;
if (error)
goto fail;
imgp->auxargs = argp;
imgp->auxarg_size = sizeof(struct pecoff_args);
imgp->interpreted = 0;
if (sh != NULL)
free(sh, M_TEMP);
return 0;
fail:
error = (error) ? error : ENOEXEC;
if (sh != NULL)
free(sh, M_TEMP);
if (argp != NULL)
free(argp, M_TEMP);
return error;
}
int
exec_pecoff_coff_makecmds(struct image_params * imgp,
struct coff_filehdr * fp, int peofs)
{
struct coff_aouthdr *ap;
int error;
if (COFF_BADMAG(fp)) {
return ENOEXEC;
}
ap = (void *) ((char *) fp + sizeof(struct coff_filehdr));
switch (ap->a_magic) {
case COFF_OMAGIC:
error = exec_pecoff_coff_prep_omagic(imgp, fp, ap, peofs);
break;
case COFF_NMAGIC:
error = exec_pecoff_coff_prep_nmagic(imgp, fp, ap, peofs);
break;
case COFF_ZMAGIC:
error = exec_pecoff_coff_prep_zmagic(imgp, fp, ap, peofs);
break;
default:
return ENOEXEC;
}
return error;
}
static int
pecoff_signature(td, vp, dp)
struct thread *td;
struct vnode *vp;
const struct pecoff_dos_filehdr *dp;
{
int error;
char buf[512];
char *pesig;
if (DOS_BADMAG(dp)) {
return ENOEXEC;
}
error = pecoff_read_from(td, vp, dp->d_peofs, buf, sizeof(buf));
if (error) {
return error;
}
pesig = buf;
if (memcmp(pesig, signature, sizeof(signature) - 1) == 0) {
return 0;
}
return EFTYPE;
}
int
pecoff_read_from(td, vp, pos, buf, siz)
struct thread *td;
struct vnode *vp;
int pos;
caddr_t buf;
int siz;
{
int error;
size_t resid;
error = vn_rdwr(UIO_READ, vp, buf, siz, pos,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error)
return error;
if (resid != 0) {
return ENOEXEC;
}
return 0;
}
static int
imgact_pecoff(struct image_params * imgp)
{
const struct pecoff_dos_filehdr *dp = (const struct pecoff_dos_filehdr *)
imgp->image_header;
struct coff_filehdr *fp;
int error, peofs;
struct thread *td = curthread;
error = pecoff_signature(FIRST_THREAD_IN_PROC(imgp->proc),
imgp->vp, dp);
if (error) {
return -1;
}
VOP_UNLOCK(imgp->vp, 0, td);
peofs = dp->d_peofs + sizeof(signature) - 1;
fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK);
error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc),
imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE);
if (error)
goto fail;
error = exec_pecoff_coff_makecmds(imgp, fp, peofs);
fail:
free(fp, M_TEMP);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return error;
}
static struct execsw pecoff_execsw = {imgact_pecoff, "FreeBSD PEcoff"};
EXEC_SET(pecoff, pecoff_execsw);
Index: head/sys/compat/svr4/imgact_svr4.c
===================================================================
--- head/sys/compat/svr4/imgact_svr4.c (revision 103766)
+++ head/sys/compat/svr4/imgact_svr4.c (revision 103767)
@@ -1,243 +1,243 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994-1996 Søren Schmidt
* All rights reserved.
*
* Based heavily on /sys/kern/imgact_aout.c which is:
* Copyright (c) 1993, David Greenman
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <compat/svr4/svr4.h>
static int exec_svr4_imgact(struct image_params *iparams);
static int
exec_svr4_imgact(imgp)
struct image_params *imgp;
{
const struct exec *a_out = (const struct exec *) imgp->image_header;
struct vmspace *vmspace;
vm_offset_t vmaddr;
unsigned long virtual_offset, file_offset;
vm_offset_t buffer;
unsigned long bss_size;
int error;
struct thread *td = curthread;
if (((a_out->a_magic >> 16) & 0xff) != 0x64)
return -1;
/*
* Set file/virtual offset based on a.out variant.
*/
switch ((int)(a_out->a_magic & 0xffff)) {
case 0413:
virtual_offset = 0;
file_offset = 1024;
break;
case 0314:
virtual_offset = 4096;
file_offset = 0;
break;
default:
return (-1);
}
bss_size = round_page(a_out->a_bss);
#ifdef DEBUG
printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n", a_out->a_text, a_out->a_data, bss_size);
#endif
/*
* Check various fields in header for validity/bounds.
*/
if (a_out->a_entry < virtual_offset ||
a_out->a_entry >= virtual_offset + a_out->a_text ||
a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
return (-1);
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
return (EFAULT);
/* For p_rlimit below. */
mtx_assert(&Giant, MA_OWNED);
/*
* text/data/bss must not exceed limits
*/
if (a_out->a_text > maxtsiz ||
a_out->a_data + bss_size > imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
return (ENOMEM);
VOP_UNLOCK(imgp->vp, 0, td);
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(imgp);
if (error)
goto fail;
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &svr4_sysvec);
vmspace = imgp->proc->p_vmspace;
/*
* Check if file_offset is page aligned.
* Currently we cannot handle misaligned file offsets,
* and so we read in the entire image (what a waste).
*/
if (file_offset & PAGE_MASK) {
#ifdef DEBUG
printf("imgact: Non page aligned binary %lu\n", file_offset);
#endif
/*
* Map text+data+bss read/write/execute
*/
vmaddr = virtual_offset;
error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
a_out->a_text + a_out->a_data + bss_size, FALSE,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto fail;
error = vm_mmap(kernel_map, &buffer,
round_page(a_out->a_text + a_out->a_data + file_offset),
VM_PROT_READ, VM_PROT_READ, 0,
(caddr_t) imgp->vp, trunc_page(file_offset));
if (error)
goto fail;
error = copyout((caddr_t)(buffer + file_offset), (caddr_t)vmaddr,
a_out->a_text + a_out->a_data);
vm_map_remove(kernel_map, buffer,
buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
if (error)
goto fail;
/*
* remove write enable on the 'text' part
*/
error = vm_map_protect(&vmspace->vm_map,
vmaddr,
vmaddr + a_out->a_text,
VM_PROT_EXECUTE|VM_PROT_READ,
TRUE);
if (error)
goto fail;
}
else {
#ifdef DEBUG
printf("imgact: Page aligned binary %lu\n", file_offset);
#endif
/*
* Map text+data read/execute
*/
vmaddr = virtual_offset;
error = vm_mmap(&vmspace->vm_map, &vmaddr,
a_out->a_text + a_out->a_data,
VM_PROT_READ | VM_PROT_EXECUTE,
VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED,
(caddr_t)imgp->vp, file_offset);
if (error)
goto fail;
#ifdef DEBUG
printf("imgact: startaddr=%08lx, length=%08lx\n", (u_long)vmaddr,
a_out->a_text + a_out->a_data);
#endif
/*
* allow read/write of data
*/
error = vm_map_protect(&vmspace->vm_map,
vmaddr + a_out->a_text,
vmaddr + a_out->a_text + a_out->a_data,
VM_PROT_ALL,
FALSE);
if (error)
goto fail;
/*
* Allocate anon demand-zeroed area for uninitialized data
*/
if (bss_size != 0) {
vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto fail;
#ifdef DEBUG
printf("imgact: bssaddr=%08lx, length=%08lx\n",
(u_long)vmaddr, bss_size);
#endif
}
}
/* Fill in process VM information */
vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT;
vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t)virtual_offset;
vmspace->vm_daddr = (caddr_t)virtual_offset + a_out->a_text;
/* Fill in image_params */
imgp->interpreted = 0;
imgp->entry_addr = a_out->a_entry;
imgp->proc->p_sysent = &svr4_sysvec;
fail:
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return (error);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
struct execsw svr4_execsw = { exec_svr4_imgact, "svr4 ELF" };
EXEC_SET(execsw_set, svr4_execsw);
Index: head/sys/fs/procfs/procfs_status.c
===================================================================
--- head/sys/fs/procfs/procfs_status.c (revision 103766)
+++ head/sys/fs/procfs/procfs_status.c (revision 103767)
@@ -1,204 +1,206 @@
/*
* Copyright (c) 1993 Jan-Simon Pendry
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_status.c 8.4 (Berkeley) 6/15/94
*
* From:
* $Id: procfs_status.c,v 3.1 1993/12/15 09:40:17 jsp Exp $
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exec.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/jail.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
+#include <sys/sysent.h>
#include <sys/tty.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
int
procfs_doprocstatus(PFS_FILL_ARGS)
{
struct session *sess;
struct thread *tdfirst;
struct tty *tp;
struct ucred *cr;
char *pc;
char *sep;
int pid, ppid, pgid, sid;
int i;
pid = p->p_pid;
PROC_LOCK(p);
ppid = p->p_pptr ? p->p_pptr->p_pid : 0;
pgid = p->p_pgrp->pg_id;
sess = p->p_pgrp->pg_session;
SESS_LOCK(sess);
sid = sess->s_leader ? sess->s_leader->p_pid : 0;
/* comm pid ppid pgid sid maj,min ctty,sldr start ut st wmsg
euid ruid rgid,egid,groups[1 .. NGROUPS]
*/
pc = p->p_comm;
do {
if (*pc < 33 || *pc > 126 || *pc == '\\')
sbuf_printf(sb, "\\%03o", *pc);
else
sbuf_putc(sb, *pc);
} while (*++pc);
sbuf_printf(sb, " %d %d %d %d ", pid, ppid, pgid, sid);
if ((p->p_flag&P_CONTROLT) && (tp = sess->s_ttyp))
sbuf_printf(sb, "%d,%d ", major(tp->t_dev), minor(tp->t_dev));
else
sbuf_printf(sb, "%d,%d ", -1, -1);
sep = "";
if (sess->s_ttyvp) {
sbuf_printf(sb, "%sctty", sep);
sep = ",";
}
if (SESS_LEADER(p)) {
sbuf_printf(sb, "%ssldr", sep);
sep = ",";
}
SESS_UNLOCK(sess);
if (*sep != ',') {
sbuf_printf(sb, "noflags");
}
mtx_lock_spin(&sched_lock);
if (p->p_sflag & PS_INMEM) {
struct timeval ut, st;
calcru(p, &ut, &st, (struct timeval *) NULL);
mtx_unlock_spin(&sched_lock);
sbuf_printf(sb, " %lld,%ld %ld,%ld %ld,%ld",
(long long)p->p_stats->p_start.tv_sec,
p->p_stats->p_start.tv_usec,
ut.tv_sec, ut.tv_usec,
st.tv_sec, st.tv_usec);
} else {
mtx_unlock_spin(&sched_lock);
sbuf_printf(sb, " -1,-1 -1,-1 -1,-1");
}
if (p->p_flag & P_KSES)
sbuf_printf(sb, " %s", "-kse- ");
else {
tdfirst = FIRST_THREAD_IN_PROC(p); /* XXX diff from td? */
sbuf_printf(sb, " %s",
(tdfirst->td_wchan && tdfirst->td_wmesg) ?
tdfirst->td_wmesg : "nochan");
}
cr = p->p_ucred;
sbuf_printf(sb, " %lu %lu %lu",
(u_long)cr->cr_uid,
(u_long)cr->cr_ruid,
(u_long)cr->cr_rgid);
/* egid (cr->cr_svgid) is equal to cr_groups[0];
see also getegid(2) in /sys/kern/kern_prot.c */
for (i = 0; i < cr->cr_ngroups; i++) {
sbuf_printf(sb, ",%lu", (u_long)cr->cr_groups[i]);
}
if (jailed(p->p_ucred)) {
mtx_lock(&p->p_ucred->cr_prison->pr_mtx);
sbuf_printf(sb, " %s", p->p_ucred->cr_prison->pr_host);
mtx_unlock(&p->p_ucred->cr_prison->pr_mtx);
} else {
sbuf_printf(sb, " -");
}
PROC_UNLOCK(p);
sbuf_printf(sb, "\n");
return (0);
}
int
procfs_doproccmdline(PFS_FILL_ARGS)
{
struct ps_strings pstr;
int error, i;
/*
* If we are using the ps/cmdline caching, use that. Otherwise
* revert to the old way, which only implements the full cmdline
* for the current process and just p->p_comm for all other
* processes.
* Note that if the argv is no longer available, we deliberately
* don't fall back on p->p_comm or return an error: the authentic
* Linux behaviour is to return zero-length in this case.
*/
PROC_LOCK(p);
if (p->p_args && (ps_argsopen || !p_cansee(td, p))) {
sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length);
PROC_UNLOCK(p);
return (0);
}
PROC_UNLOCK(p);
if (p != td->td_proc) {
sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm);
} else {
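/*
* The ps_strings address now comes from the process's sysentvec
* (sv_psstrings) rather than the hard-coded native PS_STRINGS, so
* the per-ABI location is used for emulated binaries.
*/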
- error = copyin((void*)PS_STRINGS, &pstr, sizeof(pstr));
+ error = copyin((void *)p->p_sysent->sv_psstrings, &pstr,
+ sizeof(pstr));
if (error)
return (error);
for (i = 0; i < pstr.ps_nargvstr; i++) {
sbuf_copyin(sb, pstr.ps_argvstr[i], 0);
sbuf_printf(sb, "%c", '\0');
}
}
return (0);
}
Index: head/sys/i386/ibcs2/imgact_coff.c
===================================================================
--- head/sys/i386/ibcs2/imgact_coff.c (revision 103766)
+++ head/sys/i386/ibcs2/imgact_coff.c (revision 103767)
@@ -1,493 +1,493 @@
/*-
* Copyright (c) 1994 Sean Eric Fagan
* Copyright (c) 1994 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <i386/ibcs2/coff.h>
#include <i386/ibcs2/ibcs2_util.h>
MODULE_DEPEND(coff, ibcs2, 1, 1, 1);
extern struct sysentvec ibcs2_svr3_sysvec;
static int coff_load_file(struct thread *td, char *name);
static int exec_coff_imgact(struct image_params *imgp);
static int load_coff_section(struct vmspace *vmspace, struct vnode *vp,
vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
vm_prot_t prot);
static int
load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset,
caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
{
size_t map_len;
vm_offset_t map_offset;
vm_offset_t map_addr;
int error;
unsigned char *data_buf = 0;
size_t copy_len;
map_offset = trunc_page(offset);
map_addr = trunc_page((vm_offset_t)vmaddr);
if (memsz > filsz) {
/*
* We have the stupid situation that
* the section is longer than it is on file,
* which means it has zero-filled areas, and
* we have to work for it. Stupid iBCS!
*/
map_len = trunc_page(offset + filsz) - trunc_page(map_offset);
} else {
/*
* The only stuff we care about is on disk, and we
* don't care if we map in more than is really there.
*/
map_len = round_page(offset + filsz) - trunc_page(map_offset);
}
DPRINTF(("%s(%d): vm_mmap(&vmspace->vm_map, &0x%08lx, 0x%x, 0x%x, "
"VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, vp, 0x%x)\n",
__FILE__, __LINE__, map_addr, map_len, prot, map_offset));
if ((error = vm_mmap(&vmspace->vm_map,
&map_addr,
map_len,
prot,
VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED,
(caddr_t) vp,
map_offset)) != 0)
return error;
if (memsz == filsz) {
/* We're done! */
return 0;
}
/*
* Now we have screwball stuff, to accommodate stupid COFF.
* We have to map the remaining bit of the file into the kernel's
* memory map, allocate some anonymous memory, copy that last
* bit into it, and then we're done. *sigh*
* For clean-up reasons, we actually map in the file last.
*/
copy_len = (offset + filsz) - trunc_page(offset + filsz);
map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx,0x%x, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n", __FILE__, __LINE__, map_addr, map_len));
if (map_len != 0) {
error = vm_map_find(&vmspace->vm_map, NULL, 0, &map_addr,
map_len, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
return error;
}
if ((error = vm_mmap(kernel_map,
(vm_offset_t *) &data_buf,
PAGE_SIZE,
VM_PROT_READ,
VM_PROT_READ,
0,
(caddr_t) vp,
trunc_page(offset + filsz))) != 0)
return error;
error = copyout(data_buf, (caddr_t) map_addr, copy_len);
if (vm_map_remove(kernel_map,
(vm_offset_t) data_buf,
(vm_offset_t) data_buf + PAGE_SIZE))
panic("load_coff_section vm_map_remove failed");
return error;
}
static int
coff_load_file(struct thread *td, char *name)
{
struct proc *p = td->td_proc;
struct vmspace *vmspace = p->p_vmspace;
int error;
struct nameidata nd;
struct vnode *vp;
struct vattr attr;
struct filehdr *fhdr;
struct aouthdr *ahdr;
struct scnhdr *scns;
char *ptr = 0;
int nscns;
unsigned long text_offset = 0, text_address = 0, text_size = 0;
unsigned long data_offset = 0, data_address = 0, data_size = 0;
unsigned long bss_size = 0;
int i;
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, name, td);
error = namei(&nd);
if (error)
return error;
vp = nd.ni_vp;
if (vp == NULL)
return ENOEXEC;
if (vp->v_writecount) {
error = ETXTBSY;
goto fail;
}
if ((error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) != 0)
goto fail;
if ((vp->v_mount->mnt_flag & MNT_NOEXEC)
|| ((attr.va_mode & 0111) == 0)
|| (attr.va_type != VREG))
goto fail;
if (attr.va_size == 0) {
error = ENOEXEC;
goto fail;
}
if ((error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td)) != 0)
goto fail;
if ((error = VOP_OPEN(vp, FREAD, td->td_ucred, td)) != 0)
goto fail;
/*
* Lose the lock on the vnode. It's no longer needed, and must not
* exist for the pagefault paging to work below.
*/
VOP_UNLOCK(vp, 0, td);
if ((error = vm_mmap(kernel_map,
(vm_offset_t *) &ptr,
PAGE_SIZE,
VM_PROT_READ,
VM_PROT_READ,
0,
(caddr_t) vp,
0)) != 0)
goto unlocked_fail;
fhdr = (struct filehdr *)ptr;
if (fhdr->f_magic != I386_COFF) {
error = ENOEXEC;
goto dealloc_and_fail;
}
nscns = fhdr->f_nscns;
if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) {
/*
* XXX -- just fail. I'm so lazy.
*/
error = ENOEXEC;
goto dealloc_and_fail;
}
ahdr = (struct aouthdr*)(ptr + sizeof(struct filehdr));
scns = (struct scnhdr*)(ptr + sizeof(struct filehdr)
+ sizeof(struct aouthdr));
for (i = 0; i < nscns; i++) {
if (scns[i].s_flags & STYP_NOLOAD)
continue;
else if (scns[i].s_flags & STYP_TEXT) {
text_address = scns[i].s_vaddr;
text_size = scns[i].s_size;
text_offset = scns[i].s_scnptr;
}
else if (scns[i].s_flags & STYP_DATA) {
data_address = scns[i].s_vaddr;
data_size = scns[i].s_size;
data_offset = scns[i].s_scnptr;
} else if (scns[i].s_flags & STYP_BSS) {
bss_size = scns[i].s_size;
}
}
if ((error = load_coff_section(vmspace, vp, text_offset,
(caddr_t)(void *)(uintptr_t)text_address,
text_size, text_size,
VM_PROT_READ | VM_PROT_EXECUTE)) != 0) {
goto dealloc_and_fail;
}
if ((error = load_coff_section(vmspace, vp, data_offset,
(caddr_t)(void *)(uintptr_t)data_address,
data_size + bss_size, data_size,
VM_PROT_ALL)) != 0) {
goto dealloc_and_fail;
}
error = 0;
dealloc_and_fail:
if (vm_map_remove(kernel_map,
(vm_offset_t) ptr,
(vm_offset_t) ptr + PAGE_SIZE))
panic("%s vm_map_remove failed", __func__);
fail:
VOP_UNLOCK(vp, 0, td);
unlocked_fail:
NDFREE(&nd, NDF_ONLY_PNBUF);
vrele(nd.ni_vp);
return error;
}
static int
exec_coff_imgact(imgp)
struct image_params *imgp;
{
const struct filehdr *fhdr = (const struct filehdr*)imgp->image_header;
const struct aouthdr *ahdr;
const struct scnhdr *scns;
int i;
struct vmspace *vmspace;
int nscns;
int error;
unsigned long text_offset = 0, text_address = 0, text_size = 0;
unsigned long data_offset = 0, data_address = 0, data_size = 0;
unsigned long bss_size = 0;
caddr_t hole;
struct thread *td = curthread;
if (fhdr->f_magic != I386_COFF ||
!(fhdr->f_flags & F_EXEC)) {
DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__));
return -1;
}
nscns = fhdr->f_nscns;
if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) {
/*
* For now, return an error -- need to be able to
* read in all of the section structures.
*/
DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__));
return -1;
}
ahdr = (const struct aouthdr*)
((const char*)(imgp->image_header) + sizeof(struct filehdr));
imgp->entry_addr = ahdr->entry;
scns = (const struct scnhdr*)
((const char*)(imgp->image_header) + sizeof(struct filehdr) +
sizeof(struct aouthdr));
VOP_UNLOCK(imgp->vp, 0, td);
if ((error = exec_extract_strings(imgp)) != 0) {
DPRINTF(("%s(%d): return %d\n", __FILE__, __LINE__, error));
goto fail;
}
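/*
* exec_new_vmspace() now takes the ABI's sysentvec and derives the
* minimum/maximum user addresses and the stack top from it, replacing
* the explicit VM_MIN_ADDRESS/VM_MAXUSER_ADDRESS/USRSTACK arguments
* passed previously.
*/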
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &ibcs2_svr3_sysvec);
vmspace = imgp->proc->p_vmspace;
for (i = 0; i < nscns; i++) {
DPRINTF(("i = %d, scns[i].s_name = %s, scns[i].s_vaddr = %08lx, "
"scns[i].s_scnptr = %d\n", i, scns[i].s_name,
scns[i].s_vaddr, scns[i].s_scnptr));
if (scns[i].s_flags & STYP_NOLOAD) {
/*
* A section that is not loaded, for whatever
* reason. It takes precedence over other flag
* bits...
*/
continue;
} else if (scns[i].s_flags & STYP_TEXT) {
text_address = scns[i].s_vaddr;
text_size = scns[i].s_size;
text_offset = scns[i].s_scnptr;
} else if (scns[i].s_flags & STYP_DATA) {
/* .data section */
data_address = scns[i].s_vaddr;
data_size = scns[i].s_size;
data_offset = scns[i].s_scnptr;
} else if (scns[i].s_flags & STYP_BSS) {
/* .bss section */
bss_size = scns[i].s_size;
} else if (scns[i].s_flags & STYP_LIB) {
char *buf = 0;
int foff = trunc_page(scns[i].s_scnptr);
int off = scns[i].s_scnptr - foff;
int len = round_page(scns[i].s_size + PAGE_SIZE);
int j;
if ((error = vm_mmap(kernel_map,
(vm_offset_t *) &buf,
len,
VM_PROT_READ,
VM_PROT_READ,
0,
(caddr_t) imgp->vp,
foff)) != 0) {
error = ENOEXEC;
goto fail;
}
if(scns[i].s_size) {
char *libbuf;
int emul_path_len = strlen(ibcs2_emul_path);
libbuf = malloc(MAXPATHLEN + emul_path_len,
M_TEMP, M_WAITOK);
strcpy(libbuf, ibcs2_emul_path);
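/*
* Each STYP_LIB record starts with two longs: the record length and
* the offset of the library pathname, both counted in 4-byte words
* (hence the multiplication by 4 below).
*/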
for (j = off; j < scns[i].s_size + off;) {
long stroff, nextoff;
char *libname;
nextoff = 4 * *(long *)(buf + j);
stroff = 4 * *(long *)(buf + j + sizeof(long));
libname = buf + j + stroff;
j += nextoff;
DPRINTF(("%s(%d): shared library %s\n",
__FILE__, __LINE__, libname));
strcpy(&libbuf[emul_path_len], libname);
/* XXXKSE only 1:1 in coff */
error = coff_load_file(
FIRST_THREAD_IN_PROC(imgp->proc), libbuf);
if (error)
error = coff_load_file(
FIRST_THREAD_IN_PROC(imgp->proc),
libname);
if (error)
break;
}
free(libbuf, M_TEMP);
}
if (vm_map_remove(kernel_map,
(vm_offset_t) buf,
(vm_offset_t) buf + len))
panic("exec_coff_imgact vm_map_remove failed");
if (error)
goto fail;
}
}
/*
* Map in .text now
*/
DPRINTF(("%s(%d): load_coff_section(vmspace, "
"imgp->vp, %08lx, %08lx, 0x%x, 0x%x, 0x%x)\n",
__FILE__, __LINE__, text_offset, text_address,
text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE));
if ((error = load_coff_section(vmspace, imgp->vp,
text_offset,
(caddr_t)(void *)(uintptr_t)text_address,
text_size, text_size,
VM_PROT_READ | VM_PROT_EXECUTE)) != 0) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
goto fail;
}
/*
* Map in .data and .bss now
*/
DPRINTF(("%s(%d): load_coff_section(vmspace, "
"imgp->vp, 0x%08lx, 0x%08lx, 0x%x, 0x%x, 0x%x)\n",
__FILE__, __LINE__, data_offset, data_address,
data_size + bss_size, data_size, VM_PROT_ALL));
if ((error = load_coff_section(vmspace, imgp->vp,
data_offset,
(caddr_t)(void *)(uintptr_t)data_address,
data_size + bss_size, data_size,
VM_PROT_ALL)) != 0) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
goto fail;
}
imgp->interpreted = 0;
imgp->proc->p_sysent = &ibcs2_svr3_sysvec;
vmspace->vm_tsize = round_page(text_size) >> PAGE_SHIFT;
vmspace->vm_dsize = round_page(data_size + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)text_address;
vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)data_address;
hole = (caddr_t)trunc_page((vm_offset_t)vmspace->vm_daddr) + ctob(vmspace->vm_dsize);
DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n",
__FILE__, __LINE__, hole));
DPRINTF(("imgact: error = %d\n", error));
error = vm_map_find(&vmspace->vm_map, NULL, 0,
(vm_offset_t *) &hole, PAGE_SIZE, FALSE,
VM_PROT_ALL, VM_PROT_ALL, 0);
DPRINTF(("IBCS2: start vm_dsize = 0x%x, vm_daddr = 0x%x end = 0x%x\n",
ctob(vmspace->vm_dsize), vmspace->vm_daddr,
ctob(vmspace->vm_dsize) + vmspace->vm_daddr ));
DPRINTF(("%s(%d): returning successfully!\n", __FILE__, __LINE__));
fail:
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return error;
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw coff_execsw = { exec_coff_imgact, "coff" };
EXEC_SET(coff, coff_execsw);
Index: head/sys/i386/linux/imgact_linux.c
===================================================================
--- head/sys/i386/linux/imgact_linux.c (revision 103766)
+++ head/sys/i386/linux/imgact_linux.c (revision 103767)
@@ -1,246 +1,246 @@
/*-
* Copyright (c) 1994-1996 Søren Schmidt
* All rights reserved.
*
* Based heavily on /sys/kern/imgact_aout.c which is:
* Copyright (c) 1993, David Greenman
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <i386/linux/linux.h>
static int exec_linux_imgact(struct image_params *iparams);
static int
exec_linux_imgact(imgp)
struct image_params *imgp;
{
const struct exec *a_out = (const struct exec *) imgp->image_header;
struct vmspace *vmspace;
vm_offset_t vmaddr;
unsigned long virtual_offset, file_offset;
vm_offset_t buffer;
unsigned long bss_size;
struct thread *td = curthread;
int error;
if (((a_out->a_magic >> 16) & 0xff) != 0x64)
return -1;
/*
* Set file/virtual offset based on a.out variant.
*/
switch ((int)(a_out->a_magic & 0xffff)) {
case 0413:
virtual_offset = 0;
file_offset = 1024;
break;
case 0314:
virtual_offset = 4096;
file_offset = 0;
break;
default:
return (-1);
}
bss_size = round_page(a_out->a_bss);
#ifdef DEBUG
printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n",
(u_long)a_out->a_text, (u_long)a_out->a_data, bss_size);
#endif
/*
* Check various fields in header for validity/bounds.
*/
if (a_out->a_entry < virtual_offset ||
a_out->a_entry >= virtual_offset + a_out->a_text ||
a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
return (-1);
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
return (EFAULT);
/*
* text/data/bss must not exceed limits
*/
mtx_assert(&Giant, MA_OWNED);
if (a_out->a_text > maxtsiz ||
a_out->a_data + bss_size > imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
return (ENOMEM);
VOP_UNLOCK(imgp->vp, 0, td);
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(imgp);
if (error)
goto fail;
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &linux_sysvec);
vmspace = imgp->proc->p_vmspace;
/*
* Check if file_offset is page aligned.
* Currently we cannot handle misaligned file offsets,
* and so we read in the entire image (what a waste).
*/
if (file_offset & PAGE_MASK) {
#ifdef DEBUG
printf("imgact: Non page aligned binary %lu\n", file_offset);
#endif
/*
* Map text+data+bss read/write/execute
*/
vmaddr = virtual_offset;
error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
a_out->a_text + a_out->a_data + bss_size, FALSE,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto fail;
error = vm_mmap(kernel_map, &buffer,
round_page(a_out->a_text + a_out->a_data + file_offset),
VM_PROT_READ, VM_PROT_READ, 0,
(caddr_t) imgp->vp, trunc_page(file_offset));
if (error)
goto fail;
error = copyout((caddr_t)(void *)(uintptr_t)(buffer + file_offset),
(caddr_t)vmaddr, a_out->a_text + a_out->a_data);
vm_map_remove(kernel_map, buffer,
buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
if (error)
goto fail;
/*
* remove write enable on the 'text' part
*/
error = vm_map_protect(&vmspace->vm_map,
vmaddr,
vmaddr + a_out->a_text,
VM_PROT_EXECUTE|VM_PROT_READ,
TRUE);
if (error)
goto fail;
}
else {
#ifdef DEBUG
printf("imgact: Page aligned binary %lu\n", file_offset);
#endif
/*
* Map text+data read/execute
*/
vmaddr = virtual_offset;
error = vm_mmap(&vmspace->vm_map, &vmaddr,
a_out->a_text + a_out->a_data,
VM_PROT_READ | VM_PROT_EXECUTE,
VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED,
(caddr_t)imgp->vp, file_offset);
if (error)
goto fail;
#ifdef DEBUG
printf("imgact: startaddr=%08lx, length=%08lx\n",
(u_long)vmaddr, a_out->a_text + a_out->a_data);
#endif
/*
* allow read/write of data
*/
error = vm_map_protect(&vmspace->vm_map,
vmaddr + a_out->a_text,
vmaddr + a_out->a_text + a_out->a_data,
VM_PROT_ALL,
FALSE);
if (error)
goto fail;
/*
* Allocate anon demand-zeroed area for uninitialized data
*/
if (bss_size != 0) {
vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto fail;
#ifdef DEBUG
printf("imgact: bssaddr=%08lx, length=%08lx\n",
(u_long)vmaddr, bss_size);
#endif
}
/* Indicate that this file should not be modified */
mp_fixme("Unlocked v_flag access");
imgp->vp->v_vflag |= VV_TEXT;
}
/* Fill in process VM information */
vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT;
vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)virtual_offset;
vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)
(virtual_offset + a_out->a_text);
/* Fill in image_params */
imgp->interpreted = 0;
imgp->entry_addr = a_out->a_entry;
imgp->proc->p_sysent = &linux_sysvec;
fail:
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return (error);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw linux_execsw = { exec_linux_imgact, "linux a.out" };
EXEC_SET(linuxaout, linux_execsw);
Index: head/sys/kern/imgact_aout.c
===================================================================
--- head/sys/kern/imgact_aout.c (revision 103766)
+++ head/sys/kern/imgact_aout.c (revision 103767)
@@ -1,314 +1,314 @@
/*
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/user.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_param.h>
static int exec_aout_imgact(struct image_params *imgp);
static int aout_fixup(register_t **stack_base, struct image_params *imgp);
struct sysentvec aout_sysvec = {
SYS_MAXSYSCALL,
sysent,
0,
0,
NULL,
0,
NULL,
NULL,
aout_fixup,
sendsig,
sigcode,
&szsigcode,
NULL,
"FreeBSD a.out",
aout_coredump,
NULL,
MINSIGSTKSZ,
PAGE_SIZE,
VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS,
USRSTACK,
PS_STRINGS,
VM_PROT_ALL,
exec_copyout_strings,
exec_setregs
};
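/*
* The address-related members of this vector (sv_minuser, sv_maxuser,
* sv_usrstack and sv_psstrings, initialized here with VM_MIN_ADDRESS,
* VM_MAXUSER_ADDRESS, USRSTACK and PS_STRINGS) are what the reworked
* exec_new_vmspace() and the coredump code below consult instead of
* the global constants.
*/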
static int
aout_fixup(stack_base, imgp)
register_t **stack_base;
struct image_params *imgp;
{
return (suword(--(*stack_base), imgp->argc));
}
static int
exec_aout_imgact(imgp)
struct image_params *imgp;
{
const struct exec *a_out = (const struct exec *) imgp->image_header;
struct vmspace *vmspace;
struct vnode *vp;
vm_map_t map;
vm_object_t object;
vm_offset_t text_end, data_end;
unsigned long virtual_offset;
unsigned long file_offset;
unsigned long bss_size;
int error;
GIANT_REQUIRED;
/*
* Linux and *BSD binaries look very much alike,
* only the machine id is different:
* 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
* NetBSD is in network byte order... ugh.
*/
if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
((a_out->a_magic >> 16) & 0xff) != 0 &&
((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
return -1;
/*
* Set file/virtual offset based on a.out variant.
* We do two cases: host byte order and network byte order
* (for NetBSD compatibility)
*/
switch ((int)(a_out->a_magic & 0xffff)) {
case ZMAGIC:
virtual_offset = 0;
if (a_out->a_text) {
file_offset = PAGE_SIZE;
} else {
/* Bill's "screwball mode" */
file_offset = 0;
}
break;
case QMAGIC:
virtual_offset = PAGE_SIZE;
file_offset = 0;
/* Pass PS_STRINGS for BSD/OS binaries only. */
if (N_GETMID(*a_out) == MID_ZERO)
- imgp->ps_strings = PS_STRINGS;
+ imgp->ps_strings = aout_sysvec.sv_psstrings;
break;
default:
/* NetBSD compatibility */
switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
case ZMAGIC:
case QMAGIC:
virtual_offset = PAGE_SIZE;
file_offset = 0;
break;
default:
return (-1);
}
}
bss_size = roundup(a_out->a_bss, PAGE_SIZE);
/*
* Check various fields in header for validity/bounds.
*/
if (/* entry point must lie within the text region */
a_out->a_entry < virtual_offset ||
a_out->a_entry >= virtual_offset + a_out->a_text ||
/* text and data size must each be page rounded */
a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
return (-1);
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
return (EFAULT);
/*
* text/data/bss must not exceed limits
*/
mtx_assert(&Giant, MA_OWNED);
if (/* text can't exceed maximum text size */
a_out->a_text > maxtsiz ||
/* data + bss can't exceed rlimit */
a_out->a_data + bss_size >
imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
return (ENOMEM);
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(imgp);
if (error)
return (error);
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &aout_sysvec);
/*
* The vm space can be changed by exec_new_vmspace
*/
vmspace = imgp->proc->p_vmspace;
vp = imgp->vp;
object = imgp->object;
map = &vmspace->vm_map;
vm_map_lock(map);
vm_object_reference(object);
text_end = virtual_offset + a_out->a_text;
error = vm_map_insert(map, object,
file_offset,
virtual_offset, text_end,
VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (error) {
vm_map_unlock(map);
return (error);
}
data_end = text_end + a_out->a_data;
if (a_out->a_data) {
vm_object_reference(object);
error = vm_map_insert(map, object,
file_offset + a_out->a_text,
text_end, data_end,
VM_PROT_ALL, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (error) {
vm_map_unlock(map);
return (error);
}
}
if (bss_size) {
error = vm_map_insert(map, NULL, 0,
data_end, data_end + bss_size,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error) {
vm_map_unlock(map);
return (error);
}
}
vm_map_unlock(map);
/* Fill in process VM information */
vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
vmspace->vm_daddr = (caddr_t) (uintptr_t)
(virtual_offset + a_out->a_text);
/* Fill in image_params */
imgp->interpreted = 0;
imgp->entry_addr = a_out->a_entry;
imgp->proc->p_sysent = &aout_sysvec;
return (0);
}
/*
* Dump core, into a file named as described in the comments for
* expand_name(), unless the process was setuid/setgid.
*/
int
aout_coredump(td, vp, limit)
register struct thread *td;
register struct vnode *vp;
off_t limit;
{
struct proc *p = td->td_proc;
register struct ucred *cred = td->td_ucred;
register struct vmspace *vm = p->p_vmspace;
char *tempuser;
int error;
if (ctob((uarea_pages + kstack_pages)
+ vm->vm_dsize + vm->vm_ssize) >= limit)
return (EFAULT);
tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
M_WAITOK | M_ZERO);
if (tempuser == NULL)
return (ENOMEM);
PROC_LOCK(p);
fill_kinfo_proc(p, &p->p_uarea->u_kproc);
PROC_UNLOCK(p);
bcopy(p->p_uarea, tempuser, sizeof(struct user));
bcopy(td->td_frame,
tempuser + ctob(uarea_pages) +
((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
sizeof(struct trapframe));
error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
ctob(uarea_pages + kstack_pages),
(off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
(int *)NULL, td);
free(tempuser, M_TEMP);
if (error == 0)
error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
(int)ctob(vm->vm_dsize),
(off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
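/*
* The stack is dumped starting from p->p_sysent->sv_usrstack rather
* than the compile-time USRSTACK, so the dump follows the stack
* location of the process's actual ABI.
*/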
if (error == 0)
error = vn_rdwr_inchunks(UIO_WRITE, vp,
- (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
- round_page(ctob(vm->vm_ssize)),
+ (caddr_t)trunc_page(p->p_sysent->sv_usrstack -
+ ctob(vm->vm_ssize)), round_page(ctob(vm->vm_ssize)),
(off_t)ctob(uarea_pages + kstack_pages) +
ctob(vm->vm_dsize), UIO_USERSPACE,
IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
return (error);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
EXEC_SET(aout, aout_execsw);
Index: head/sys/kern/imgact_elf.c
===================================================================
--- head/sys/kern/imgact_elf.c (revision 103766)
+++ head/sys/kern/imgact_elf.c (revision 103767)
@@ -1,1241 +1,1240 @@
/*-
* Copyright (c) 2000 David O'Brien
* Copyright (c) 1995-1996 Søren Schmidt
* Copyright (c) 1996 Peter Wemm
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/namei.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
#include <sys/procfs.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <machine/elf.h>
#include <machine/md_var.h>
#define OLD_EI_BRAND 8
__ElfType(Brandinfo);
__ElfType(Auxargs);
static int __elfN(check_header)(const Elf_Ehdr *hdr);
static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr,
const char *interp);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
u_long *entry, size_t pagesize);
static int __elfN(load_section)(struct proc *p,
struct vmspace *vmspace, struct vnode *vp, vm_object_t object,
vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
vm_prot_t prot, size_t pagesize);
static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
static int elf_trace = 0;
#if __ELF_WORD_SIZE == 32
SYSCTL_INT(_debug, OID_AUTO, elf32_trace, CTLFLAG_RW, &elf_trace, 0, "");
#else
SYSCTL_INT(_debug, OID_AUTO, elf64_trace, CTLFLAG_RW, &elf_trace, 0, "");
#endif
static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
extern int fallback_elf_brand;
int
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
{
int i;
for (i = 0; i < MAX_BRANDS; i++) {
if (elf_brand_list[i] == NULL) {
elf_brand_list[i] = entry;
break;
}
}
if (i == MAX_BRANDS)
return (-1);
return (0);
}
int
__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
{
int i;
for (i = 0; i < MAX_BRANDS; i++) {
if (elf_brand_list[i] == entry) {
elf_brand_list[i] = NULL;
break;
}
}
if (i == MAX_BRANDS)
return (-1);
return (0);
}
int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
struct proc *p;
int rval = FALSE;
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
if (p->p_sysent == entry->sysvec) {
rval = TRUE;
break;
}
}
sx_sunlock(&allproc_lock);
return (rval);
}
static Elf_Brandinfo *
__elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp)
{
Elf_Brandinfo *bi;
int i;
/*
* We support three types of branding -- (1) the ELF EI_OSABI field
* that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
* branding w/in the ELF header, and (3) the path in the `interp_path'
* field. We should also look for an ".note.ABI-tag" ELF section now
* in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones.
*/
/* If the executable has a brand, search for it in the brand list. */
for (i = 0; i < MAX_BRANDS; i++) {
bi = elf_brand_list[i];
if (bi != NULL && hdr->e_machine == bi->machine &&
(hdr->e_ident[EI_OSABI] == bi->brand ||
strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0))
return (bi);
}
/* Lacking a known brand, search for a recognized interpreter. */
if (interp != NULL) {
for (i = 0; i < MAX_BRANDS; i++) {
bi = elf_brand_list[i];
if (bi != NULL && hdr->e_machine == bi->machine &&
strcmp(interp, bi->interp_path) == 0)
return (bi);
}
}
/* Lacking a recognized interpreter, try the default brand */
for (i = 0; i < MAX_BRANDS; i++) {
bi = elf_brand_list[i];
if (bi != NULL && hdr->e_machine == bi->machine &&
fallback_elf_brand == bi->brand)
return (bi);
}
return (NULL);
}
static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
Elf_Brandinfo *bi;
int i;
if (!IS_ELF(*hdr) ||
hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
hdr->e_ident[EI_VERSION] != EV_CURRENT)
return (ENOEXEC);
/*
* Make sure we have at least one brand for this machine.
*/
for (i = 0; i < MAX_BRANDS; i++) {
bi = elf_brand_list[i];
if (bi != NULL && bi->machine == hdr->e_machine)
break;
}
if (i == MAX_BRANDS)
return (ENOEXEC);
if (hdr->e_version != ELF_TARG_VER)
return (ENOEXEC);
return (0);
}
static int
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t start, vm_offset_t end, vm_prot_t prot,
vm_prot_t max)
{
int error, rv;
vm_offset_t off;
vm_offset_t data_buf = 0;
/*
* Create the page if it doesn't exist yet. Ignore errors.
*/
vm_map_lock(map);
vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), max,
max, 0);
vm_map_unlock(map);
/*
* Find the page from the underlying object.
*/
if (object) {
vm_object_reference(object);
rv = vm_map_find(exec_map,
object,
trunc_page(offset),
&data_buf,
PAGE_SIZE,
TRUE,
VM_PROT_READ,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return (rv);
}
off = offset - trunc_page(offset);
error = copyout((caddr_t)data_buf + off, (caddr_t)start,
end - start);
vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
if (error) {
return (KERN_FAILURE);
}
}
return (KERN_SUCCESS);
}
static int
__elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t start, vm_offset_t end, vm_prot_t prot,
vm_prot_t max, int cow)
{
int rv;
if (start != trunc_page(start)) {
rv = __elfN(map_partial)(map, object, offset, start,
round_page(start), prot, max);
if (rv)
return (rv);
offset += round_page(start) - start;
start = round_page(start);
}
if (end != round_page(end)) {
rv = __elfN(map_partial)(map, object, offset +
trunc_page(end) - start, trunc_page(end), end, prot, max);
if (rv)
return (rv);
end = trunc_page(end);
}
if (end > start) {
if (offset & PAGE_MASK) {
vm_offset_t data_buf, off;
vm_size_t sz;
int error;
/*
* The mapping is not page aligned. This means we have
* to copy the data. Sigh.
*/
rv = vm_map_find(map, 0, 0, &start, end - start,
FALSE, prot, max, 0);
if (rv)
return (rv);
while (start < end) {
vm_object_reference(object);
rv = vm_map_find(exec_map,
object,
trunc_page(offset),
&data_buf,
2 * PAGE_SIZE,
TRUE,
VM_PROT_READ,
VM_PROT_ALL,
(MAP_COPY_ON_WRITE
| MAP_PREFAULT_PARTIAL));
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return (rv);
}
off = offset - trunc_page(offset);
sz = end - start;
if (sz > PAGE_SIZE)
sz = PAGE_SIZE;
error = copyout((caddr_t)data_buf + off,
(caddr_t)start, sz);
vm_map_remove(exec_map, data_buf,
data_buf + 2 * PAGE_SIZE);
if (error) {
return (KERN_FAILURE);
}
start += sz;
}
rv = KERN_SUCCESS;
} else {
vm_map_lock(map);
rv = vm_map_insert(map, object, offset, start, end,
prot, max, cow);
vm_map_unlock(map);
}
return (rv);
} else {
return (KERN_SUCCESS);
}
}
static int
__elfN(load_section)(struct proc *p, struct vmspace *vmspace,
struct vnode *vp, vm_object_t object, vm_offset_t offset,
caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
size_t pagesize)
{
size_t map_len;
vm_offset_t map_addr;
int error, rv;
size_t copy_len;
vm_offset_t file_addr;
vm_offset_t data_buf = 0;
GIANT_REQUIRED;
error = 0;
/*
* It's necessary to fail if the filsz + offset taken from the
* header is greater than the actual file pager object's size.
* If we were to allow this, then the vm_map_find() below would
* walk right off the end of the file object and into the ether.
*
* While I'm here, might as well check for something else that
* is invalid: filsz cannot be greater than memsz.
*/
if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size ||
filsz > memsz) {
uprintf("elf_load_section: truncated ELF file\n");
return (ENOEXEC);
}
#define trunc_page_ps(va, ps) ((va) & ~(ps - 1))
#define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1))
map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
file_addr = trunc_page_ps(offset, pagesize);
/*
* We have two choices. We can either clear the data in the last page
* of an oversized mapping, or we can start the anon mapping a page
* early and copy the initialized data into that first page. We
* choose the second.
*/
if (memsz > filsz)
map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
else
map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
if (map_len != 0) {
vm_object_reference(object);
rv = __elfN(map_insert)(&vmspace->vm_map,
object,
file_addr, /* file offset */
map_addr, /* virtual start */
map_addr + map_len,/* virtual end */
prot,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return (EINVAL);
}
/* we can stop now if we've covered it all */
if (memsz == filsz) {
return (0);
}
}
/*
* We have to get the remaining bit of the file into the first part
* of the oversized map segment. This is normally because the .data
* segment in the file is extended to provide bss. It's a neat idea
* to try and save a page, but it's a pain in the behind to implement.
*/
copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
map_addr;
/* This had damn well better be true! */
if (map_len != 0) {
rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr,
map_addr + map_len, VM_PROT_ALL, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS) {
return (EINVAL);
}
}
if (copy_len != 0) {
vm_offset_t off;
vm_object_reference(object);
rv = vm_map_find(exec_map,
object,
trunc_page(offset + filsz),
&data_buf,
PAGE_SIZE,
TRUE,
VM_PROT_READ,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return (EINVAL);
}
/* send the page fragment to user space */
off = trunc_page_ps(offset + filsz, pagesize) -
trunc_page(offset + filsz);
error = copyout((caddr_t)data_buf + off, (caddr_t)map_addr,
copy_len);
vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
if (error) {
return (error);
}
}
/*
* set it to the specified protection.
* XXX had better undo the damage from pasting over the cracks here!
*/
vm_map_protect(&vmspace->vm_map, trunc_page(map_addr),
round_page(map_addr + map_len), prot, FALSE);
return (error);
}
/*
* Load the file "file" into memory. It may be either a shared object
* or an executable.
*
* The "addr" reference parameter is in/out. On entry, it specifies
* the address where a shared object should be loaded. If the file is
* an executable, this value is ignored. On exit, "addr" specifies
* where the file was actually loaded.
*
* The "entry" reference parameter is out only. On exit, it specifies
* the entry point for the loaded file.
*/
static int
__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
u_long *entry, size_t pagesize)
{
struct {
struct nameidata nd;
struct vattr attr;
struct image_params image_params;
} *tempdata;
const Elf_Ehdr *hdr = NULL;
const Elf_Phdr *phdr = NULL;
struct nameidata *nd;
struct vmspace *vmspace = p->p_vmspace;
struct vattr *attr;
struct image_params *imgp;
vm_prot_t prot;
u_long rbase;
u_long base_addr = 0;
int error, i, numsegs;
if (curthread->td_proc != p)
panic("elf_load_file - thread"); /* XXXKSE DIAGNOSTIC */
tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
nd = &tempdata->nd;
attr = &tempdata->attr;
imgp = &tempdata->image_params;
/*
* Initialize part of the common data
*/
imgp->proc = p;
imgp->uap = NULL;
imgp->attr = attr;
imgp->firstpage = NULL;
imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE);
imgp->object = NULL;
if (imgp->image_header == NULL) {
nd->ni_vp = NULL;
error = ENOMEM;
goto fail;
}
/* XXXKSE */
NDINIT(nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread);
if ((error = namei(nd)) != 0) {
nd->ni_vp = NULL;
goto fail;
}
NDFREE(nd, NDF_ONLY_PNBUF);
imgp->vp = nd->ni_vp;
/*
* Check permissions, modes, uid, etc on the file, and "open" it.
*/
error = exec_check_permissions(imgp);
if (error) {
VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
goto fail;
}
error = exec_map_first_page(imgp);
/*
* Also make certain that the interpreter stays the same, so set
* its VV_TEXT flag, too.
*/
if (error == 0)
nd->ni_vp->v_vflag |= VV_TEXT;
VOP_GETVOBJECT(nd->ni_vp, &imgp->object);
vm_object_reference(imgp->object);
VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
if (error)
goto fail;
hdr = (const Elf_Ehdr *)imgp->image_header;
if ((error = __elfN(check_header)(hdr)) != 0)
goto fail;
if (hdr->e_type == ET_DYN)
rbase = *addr;
else if (hdr->e_type == ET_EXEC)
rbase = 0;
else {
error = ENOEXEC;
goto fail;
}
/* Only support headers that fit within first page for now */
if ((hdr->e_phoff > PAGE_SIZE) ||
(hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
error = ENOEXEC;
goto fail;
}
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */
prot = 0;
if (phdr[i].p_flags & PF_X)
prot |= VM_PROT_EXECUTE;
if (phdr[i].p_flags & PF_W)
prot |= VM_PROT_WRITE;
if (phdr[i].p_flags & PF_R)
prot |= VM_PROT_READ;
if ((error = __elfN(load_section)(p, vmspace,
nd->ni_vp, imgp->object, phdr[i].p_offset,
(caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
phdr[i].p_memsz, phdr[i].p_filesz, prot,
pagesize)) != 0)
goto fail;
/*
* Establish the base address if this is the
* first segment.
*/
if (numsegs == 0)
base_addr = trunc_page(phdr[i].p_vaddr +
rbase);
numsegs++;
}
}
*addr = base_addr;
*entry = (unsigned long)hdr->e_entry + rbase;
fail:
if (imgp->firstpage)
exec_unmap_first_page(imgp);
if (imgp->image_header)
kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header,
PAGE_SIZE);
if (imgp->object)
vm_object_deallocate(imgp->object);
if (nd->ni_vp)
vrele(nd->ni_vp);
free(tempdata, M_TEMP);
return (error);
}
static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
const Elf_Phdr *phdr;
Elf_Auxargs *elf_auxargs = NULL;
struct vmspace *vmspace;
vm_prot_t prot;
u_long text_size = 0, data_size = 0, total_size = 0;
u_long text_addr = 0, data_addr = 0;
u_long seg_size, seg_addr;
u_long addr, entry = 0, proghdr = 0;
int error, i;
const char *interp = NULL;
Elf_Brandinfo *brand_info;
char *path;
struct thread *td = curthread;
struct sysentvec *sv;
GIANT_REQUIRED;
/*
* Do we have a valid ELF header ?
*/
if (__elfN(check_header)(hdr) != 0 || hdr->e_type != ET_EXEC)
return (-1);
/*
* From here on down, we return an errno, not -1, as we've
* detected an ELF file.
*/
if ((hdr->e_phoff > PAGE_SIZE) ||
(hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
/* Only support headers in first page for now */
return (ENOEXEC);
}
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
/*
* From this point on, we may have resources that need to be freed.
*/
VOP_UNLOCK(imgp->vp, 0, td);
for (i = 0; i < hdr->e_phnum; i++) {
switch (phdr[i].p_type) {
case PT_INTERP: /* Path to interpreter */
if (phdr[i].p_filesz > MAXPATHLEN ||
phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) {
error = ENOEXEC;
goto fail;
}
interp = imgp->image_header + phdr[i].p_offset;
break;
default:
break;
}
}
brand_info = __elfN(get_brandinfo)(hdr, interp);
if (brand_info == NULL) {
uprintf("ELF binary type \"%u\" not known.\n",
hdr->e_ident[EI_OSABI]);
error = ENOEXEC;
goto fail;
}
sv = brand_info->sysvec;
if ((error = exec_extract_strings(imgp)) != 0)
goto fail;
- exec_new_vmspace(imgp, sv->sv_minuser, sv->sv_maxuser,
- sv->sv_usrstack);
+ exec_new_vmspace(imgp, sv);
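/*
* sv is the brand's sysentvec (brand_info->sysvec), so each ELF brand
* lays out the new address space with its own sv_minuser, sv_maxuser
* and sv_usrstack values.
*/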
vmspace = imgp->proc->p_vmspace;
for (i = 0; i < hdr->e_phnum; i++) {
switch (phdr[i].p_type) {
case PT_LOAD: /* Loadable segment */
prot = 0;
if (phdr[i].p_flags & PF_X)
prot |= VM_PROT_EXECUTE;
if (phdr[i].p_flags & PF_W)
prot |= VM_PROT_WRITE;
if (phdr[i].p_flags & PF_R)
prot |= VM_PROT_READ;
#if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER)
/*
* Some x86 binaries assume read == executable,
* notably the M3 runtime and therefore cvsup
*/
if (prot & VM_PROT_READ)
prot |= VM_PROT_EXECUTE;
#endif
if ((error = __elfN(load_section)(imgp->proc, vmspace,
imgp->vp, imgp->object, phdr[i].p_offset,
(caddr_t)(uintptr_t)phdr[i].p_vaddr,
phdr[i].p_memsz, phdr[i].p_filesz, prot,
sv->sv_pagesize)) != 0)
goto fail;
seg_addr = trunc_page(phdr[i].p_vaddr);
seg_size = round_page(phdr[i].p_memsz +
phdr[i].p_vaddr - seg_addr);
/*
* Is this .text or .data? We can't use
* VM_PROT_WRITE or VM_PROT_EXEC, it breaks the
* alpha terribly and possibly does other bad
* things so we stick to the old way of figuring
* it out: If the segment contains the program
* entry point, it's a text segment, otherwise it
* is a data segment.
*
* Note that obreak() assumes that data_addr +
* data_size == end of data load area, and the ELF
* file format expects segments to be sorted by
* address. If multiple data segments exist, the
* last one will be used.
*/
if (hdr->e_entry >= phdr[i].p_vaddr &&
hdr->e_entry < (phdr[i].p_vaddr +
phdr[i].p_memsz)) {
text_size = seg_size;
text_addr = seg_addr;
entry = (u_long)hdr->e_entry;
} else {
data_size = seg_size;
data_addr = seg_addr;
}
total_size += seg_size;
/*
* Check limits. It should be safe to check the
* limits after loading the segment since we do
* not actually fault in all the segment's pages.
*/
if (data_size >
imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur ||
text_size > maxtsiz ||
total_size >
imgp->proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
error = ENOMEM;
goto fail;
}
break;
case PT_PHDR: /* Program header table info */
proghdr = phdr[i].p_vaddr;
break;
default:
break;
}
}
vmspace->vm_tsize = text_size >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
vmspace->vm_dsize = data_size >> PAGE_SHIFT;
vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
addr = ELF_RTLD_ADDR(vmspace);
imgp->entry_addr = entry;
imgp->proc->p_sysent = sv;
if (interp != NULL) {
path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path,
interp);
if ((error = __elfN(load_file)(imgp->proc, path, &addr,
&imgp->entry_addr, sv->sv_pagesize)) != 0) {
if ((error = __elfN(load_file)(imgp->proc, interp,
&addr, &imgp->entry_addr, sv->sv_pagesize)) != 0) {
uprintf("ELF interpreter %s not found\n",
path);
free(path, M_TEMP);
goto fail;
}
}
free(path, M_TEMP);
}
/*
* Construct auxargs table (used by the fixup routine)
*/
elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
elf_auxargs->execfd = -1;
elf_auxargs->phdr = proghdr;
elf_auxargs->phent = hdr->e_phentsize;
elf_auxargs->phnum = hdr->e_phnum;
elf_auxargs->pagesz = PAGE_SIZE;
elf_auxargs->base = addr;
elf_auxargs->flags = 0;
elf_auxargs->entry = entry;
elf_auxargs->trace = elf_trace;
imgp->auxargs = elf_auxargs;
imgp->interpreted = 0;
fail:
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return (error);
}
#if __ELF_WORD_SIZE == 32
#define suword suword32
#define stacktype u_int32_t
#else
#define suword suword64
#define stacktype u_int64_t
#endif
int
__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
{
Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
stacktype *base;
stacktype *pos;
base = (stacktype *)*stack_base;
pos = base + (imgp->argc + imgp->envc + 2);
if (args->trace) {
AUXARGS_ENTRY(pos, AT_DEBUG, 1);
}
if (args->execfd != -1) {
AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
}
AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
AUXARGS_ENTRY(pos, AT_BASE, args->base);
AUXARGS_ENTRY(pos, AT_NULL, 0);
free(imgp->auxargs, M_TEMP);
imgp->auxargs = NULL;
base--;
suword(base, (long)imgp->argc);
*stack_base = (register_t *)base;
return (0);
}
/*
* Code for generating ELF core dumps.
*/
typedef void (*segment_callback)(vm_map_entry_t, void *);
/* Closure for cb_put_phdr(). */
struct phdr_closure {
Elf_Phdr *phdr; /* Program header to fill in */
Elf_Off offset; /* Offset of segment in core file */
};
/* Closure for cb_size_segment(). */
struct sseg_closure {
int count; /* Count of writable segments. */
size_t size; /* Total size of all writable segments. */
};
static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static void each_writable_segment(struct proc *, segment_callback, void *);
static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
int, void *, size_t);
static void __elfN(puthdr)(struct proc *, void *, size_t *,
const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int);
static void __elfN(putnote)(void *, size_t *, const char *, int,
const void *, size_t);
extern int osreldate;
int
__elfN(coredump)(td, vp, limit)
struct thread *td;
register struct vnode *vp;
off_t limit;
{
register struct proc *p = td->td_proc;
register struct ucred *cred = td->td_ucred;
int error = 0;
struct sseg_closure seginfo;
void *hdr;
size_t hdrsize;
/* Size the program segments. */
seginfo.count = 0;
seginfo.size = 0;
each_writable_segment(p, cb_size_segment, &seginfo);
/*
* Calculate the size of the core file header area by making
* a dry run of generating it. Nothing is written, but the
* size is calculated.
*/
hdrsize = 0;
__elfN(puthdr)((struct proc *)NULL, (void *)NULL, &hdrsize,
(const prstatus_t *)NULL, (const prfpregset_t *)NULL,
(const prpsinfo_t *)NULL, seginfo.count);
if (hdrsize + seginfo.size >= limit)
return (EFAULT);
/*
* Allocate memory for building the header, fill it up,
* and write it out.
*/
hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
if (hdr == NULL) {
return (EINVAL);
}
error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize);
/* Write the contents of all of the writable segments. */
if (error == 0) {
Elf_Phdr *php;
off_t offset;
int i;
php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
offset = hdrsize;
for (i = 0; i < seginfo.count; i++) {
error = vn_rdwr_inchunks(UIO_WRITE, vp,
(caddr_t)(uintptr_t)php->p_vaddr,
php->p_filesz, offset, UIO_USERSPACE,
IO_UNIT | IO_DIRECT, cred, NOCRED, (int *)NULL,
curthread); /* XXXKSE */
if (error != 0)
break;
offset += php->p_filesz;
php++;
}
}
free(hdr, M_TEMP);
return (error);
}
/*
* A callback for each_writable_segment() to write out the segment's
* program header entry.
*/
static void
cb_put_phdr(entry, closure)
vm_map_entry_t entry;
void *closure;
{
struct phdr_closure *phc = (struct phdr_closure *)closure;
Elf_Phdr *phdr = phc->phdr;
phc->offset = round_page(phc->offset);
phdr->p_type = PT_LOAD;
phdr->p_offset = phc->offset;
phdr->p_vaddr = entry->start;
phdr->p_paddr = 0;
phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
phdr->p_align = PAGE_SIZE;
phdr->p_flags = 0;
if (entry->protection & VM_PROT_READ)
phdr->p_flags |= PF_R;
if (entry->protection & VM_PROT_WRITE)
phdr->p_flags |= PF_W;
if (entry->protection & VM_PROT_EXECUTE)
phdr->p_flags |= PF_X;
phc->offset += phdr->p_filesz;
phc->phdr++;
}
/*
* A callback for each_writable_segment() to gather information about
* the number of segments and their total size.
*/
static void
cb_size_segment(entry, closure)
vm_map_entry_t entry;
void *closure;
{
struct sseg_closure *ssc = (struct sseg_closure *)closure;
ssc->count++;
ssc->size += entry->end - entry->start;
}
/*
* For each writable segment in the process's memory map, call the given
* function with a pointer to the map entry and some arbitrary
* caller-supplied data.
*/
static void
each_writable_segment(p, func, closure)
struct proc *p;
segment_callback func;
void *closure;
{
vm_map_t map = &p->p_vmspace->vm_map;
vm_map_entry_t entry;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
vm_object_t obj;
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) ||
(entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) !=
(VM_PROT_READ|VM_PROT_WRITE))
continue;
/*
** Don't include the memory segment in the coredump if
** MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
** madvise(2).
*/
if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
continue;
if ((obj = entry->object.vm_object) == NULL)
continue;
/* Find the deepest backing object. */
while (obj->backing_object != NULL)
obj = obj->backing_object;
/* Ignore memory-mapped devices and such things. */
if (obj->type != OBJT_DEFAULT &&
obj->type != OBJT_SWAP &&
obj->type != OBJT_VNODE)
continue;
(*func)(entry, closure);
}
}
/*
* Write the core file header to the file, including padding up to
* the page boundary.
*/
static int
__elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize)
struct thread *td;
struct vnode *vp;
struct ucred *cred;
int numsegs;
size_t hdrsize;
void *hdr;
{
struct {
prstatus_t status;
prfpregset_t fpregset;
prpsinfo_t psinfo;
} *tempdata;
struct proc *p = td->td_proc;
size_t off;
prstatus_t *status;
prfpregset_t *fpregset;
prpsinfo_t *psinfo;
tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO | M_WAITOK);
status = &tempdata->status;
fpregset = &tempdata->fpregset;
psinfo = &tempdata->psinfo;
/* Gather the information for the header. */
status->pr_version = PRSTATUS_VERSION;
status->pr_statussz = sizeof(prstatus_t);
status->pr_gregsetsz = sizeof(gregset_t);
status->pr_fpregsetsz = sizeof(fpregset_t);
status->pr_osreldate = osreldate;
status->pr_cursig = p->p_sig;
status->pr_pid = p->p_pid;
fill_regs(td, &status->pr_reg);
fill_fpregs(td, fpregset);
psinfo->pr_version = PRPSINFO_VERSION;
psinfo->pr_psinfosz = sizeof(prpsinfo_t);
strncpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname) - 1);
/* XXX - We don't fill in the command line arguments properly yet. */
strncpy(psinfo->pr_psargs, p->p_comm, PRARGSZ);
/* Fill in the header. */
bzero(hdr, hdrsize);
off = 0;
__elfN(puthdr)(p, hdr, &off, status, fpregset, psinfo, numsegs);
free(tempdata, M_TEMP);
/* Write it to the core file. */
return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
td)); /* XXXKSE */
}
static void
__elfN(puthdr)(struct proc *p, void *dst, size_t *off, const prstatus_t *status,
const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs)
{
size_t ehoff;
size_t phoff;
size_t noteoff;
size_t notesz;
ehoff = *off;
*off += sizeof(Elf_Ehdr);
phoff = *off;
*off += (numsegs + 1) * sizeof(Elf_Phdr);
noteoff = *off;
__elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status,
sizeof *status);
__elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
sizeof *fpregset);
__elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
sizeof *psinfo);
notesz = *off - noteoff;
/* Align up to a page boundary for the program segments. */
*off = round_page(*off);
if (dst != NULL) {
Elf_Ehdr *ehdr;
Elf_Phdr *phdr;
struct phdr_closure phc;
/*
* Fill in the ELF header.
*/
ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
ehdr->e_ident[EI_MAG0] = ELFMAG0;
ehdr->e_ident[EI_MAG1] = ELFMAG1;
ehdr->e_ident[EI_MAG2] = ELFMAG2;
ehdr->e_ident[EI_MAG3] = ELFMAG3;
ehdr->e_ident[EI_CLASS] = ELF_CLASS;
ehdr->e_ident[EI_DATA] = ELF_DATA;
ehdr->e_ident[EI_VERSION] = EV_CURRENT;
ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
ehdr->e_ident[EI_ABIVERSION] = 0;
ehdr->e_ident[EI_PAD] = 0;
ehdr->e_type = ET_CORE;
ehdr->e_machine = ELF_ARCH;
ehdr->e_version = EV_CURRENT;
ehdr->e_entry = 0;
ehdr->e_phoff = phoff;
ehdr->e_flags = 0;
ehdr->e_ehsize = sizeof(Elf_Ehdr);
ehdr->e_phentsize = sizeof(Elf_Phdr);
ehdr->e_phnum = numsegs + 1;
ehdr->e_shentsize = sizeof(Elf_Shdr);
ehdr->e_shnum = 0;
ehdr->e_shstrndx = SHN_UNDEF;
/*
* Fill in the program header entries.
*/
phdr = (Elf_Phdr *)((char *)dst + phoff);
/* The note segment. */
phdr->p_type = PT_NOTE;
phdr->p_offset = noteoff;
phdr->p_vaddr = 0;
phdr->p_paddr = 0;
phdr->p_filesz = notesz;
phdr->p_memsz = 0;
phdr->p_flags = 0;
phdr->p_align = 0;
phdr++;
/* All the writable segments from the program. */
phc.phdr = phdr;
phc.offset = *off;
each_writable_segment(p, cb_put_phdr, &phc);
}
}
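/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * __elfN(puthdr)() is driven twice by the coredump path above -- first with
 * dst == NULL purely to advance *off and learn the header size, then again
 * with the allocated buffer to fill it in.  A hypothetical helper expressing
 * the sizing pass (mirroring the NULL-argument call made by the coredump
 * code) would be:
 */
static size_t
__elfN(example_hdrsize)(struct proc *p, int numsegs)
{
	size_t hdrsize;

	hdrsize = 0;
	__elfN(puthdr)(p, (void *)NULL, &hdrsize, (const prstatus_t *)NULL,
	    (const prfpregset_t *)NULL, (const prpsinfo_t *)NULL, numsegs);
	return (hdrsize);
}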
static void
__elfN(putnote)(void *dst, size_t *off, const char *name, int type,
const void *desc, size_t descsz)
{
Elf_Note note;
note.n_namesz = strlen(name) + 1;
note.n_descsz = descsz;
note.n_type = type;
if (dst != NULL)
bcopy(&note, (char *)dst + *off, sizeof note);
*off += sizeof note;
if (dst != NULL)
bcopy(name, (char *)dst + *off, note.n_namesz);
*off += roundup2(note.n_namesz, sizeof(Elf_Size));
if (dst != NULL)
bcopy(desc, (char *)dst + *off, note.n_descsz);
*off += roundup2(note.n_descsz, sizeof(Elf_Size));
}
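/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * Each record emitted by __elfN(putnote)() above is the Elf_Note header
 * followed by the name and the descriptor, each padded to Elf_Size
 * alignment.  A hypothetical helper computing that on-disk size:
 */
static __inline size_t
__elfN(example_notesize)(const char *name, size_t descsz)
{

	return (sizeof(Elf_Note) +
	    roundup2(strlen(name) + 1, sizeof(Elf_Size)) +
	    roundup2(descsz, sizeof(Elf_Size)));
}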
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
#if __ELF_WORD_SIZE == 32
static struct execsw elf_execsw = {exec_elf32_imgact, "ELF32"};
EXEC_SET(elf32, elf_execsw);
#else
static struct execsw elf_execsw = {exec_elf64_imgact, "ELF64"};
EXEC_SET(elf64, elf_execsw);
#endif
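/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * Every image activator follows the same contract: its ex_imgact function
 * returns -1 if the image is not its format, 0 on success, or an errno
 * value, and the activator is announced to kern_execve.c with EXEC_SET().
 * A hypothetical minimal activator (names are made up):
 */
static int
exec_example_imgact(struct image_params *imgp)
{

	/* Claim nothing: let the other activators have a look. */
	return (-1);
}
static struct execsw example_execsw = {exec_example_imgact, "example"};
EXEC_SET(example, example_execsw);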
Index: head/sys/kern/imgact_gzip.c
===================================================================
--- head/sys/kern/imgact_gzip.c (revision 103766)
+++ head/sys/kern/imgact_gzip.c (revision 103767)
@@ -1,385 +1,385 @@
/*
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ----------------------------------------------------------------------------
*
* $FreeBSD$
*
* This module handles execution of a.out files which have been run through
* "gzip". This saves diskspace, but wastes cpu-cycles and VM.
*
* TODO:
* text-segments should be made R/O after being filled
* is the vm-stuff safe ?
* should handle the entire header of gzip'ed stuff.
* inflate isn't quite reentrant yet...
* error-handling is a mess...
* so is the rest...
* tidy up unnecessary includes
*/
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/inflate.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
struct imgact_gzip {
struct image_params *ip;
struct exec a_out;
int error;
int gotheader;
int where;
u_char *inbuf;
u_long offset;
u_long output;
u_long len;
int idx;
u_long virtual_offset, file_offset, file_end, bss_size;
};
static int exec_gzip_imgact(struct image_params *imgp);
static int NextByte(void *vp);
static int do_aout_hdr(struct imgact_gzip *);
static int Flush(void *vp, u_char *, u_long siz);
static int
exec_gzip_imgact(imgp)
struct image_params *imgp;
{
int error, error2 = 0;
const u_char *p = (const u_char *) imgp->image_header;
struct imgact_gzip igz;
struct inflate infl;
struct vmspace *vmspace;
/* If these four are not OK, it isn't a gzip file */
if (p[0] != 0x1f)
return -1; /* 0 Simply magic */
if (p[1] != 0x8b)
return -1; /* 1 Simply magic */
if (p[2] != 0x08)
return -1; /* 2 Compression method */
if (p[9] != 0x03)
return -1; /* 9 OS compressed on */
/*
* If this one contains anything but a comment or a filename marker,
* we don't want to chew on it
*/
if (p[3] & ~(0x18))
return ENOEXEC; /* 3 Flags */
/* These are of no use to us */
/* 4-7 Timestamp */
/* 8 Extra flags */
bzero(&igz, sizeof igz);
bzero(&infl, sizeof infl);
infl.gz_private = (void *) &igz;
infl.gz_input = NextByte;
infl.gz_output = Flush;
igz.ip = imgp;
igz.idx = 10;
if (p[3] & 0x08) { /* skip a filename */
while (p[igz.idx++])
if (igz.idx >= PAGE_SIZE)
return ENOEXEC;
}
if (p[3] & 0x10) { /* skip a comment */
while (p[igz.idx++])
if (igz.idx >= PAGE_SIZE)
return ENOEXEC;
}
igz.len = imgp->attr->va_size;
error = inflate(&infl);
/*
* The unzipped file may not even have been long enough to contain
* a header that would give Flush() a chance to return an error. Check for this.
*/
if ( !igz.gotheader )
return ENOEXEC;
if ( !error ) {
vmspace = imgp->proc->p_vmspace;
error = vm_map_protect(&vmspace->vm_map,
(vm_offset_t) vmspace->vm_taddr,
(vm_offset_t) (vmspace->vm_taddr +
(vmspace->vm_tsize << PAGE_SHIFT)) ,
VM_PROT_READ|VM_PROT_EXECUTE,0);
}
if (igz.inbuf) {
error2 =
vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf,
(vm_offset_t) igz.inbuf + PAGE_SIZE);
}
if (igz.error || error || error2) {
printf("Output=%lu ", igz.output);
printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n",
error, igz.error, error2, igz.where);
}
if (igz.error)
return igz.error;
if (error)
return ENOEXEC;
if (error2)
return error2;
return 0;
}
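/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * The byte tests above correspond to the fixed gzip member header:
 * p[0..1] magic (0x1f 0x8b), p[2] compression method (8 = deflate),
 * p[3] flags, p[4..7] mtime, p[8] extra flags, p[9] originating OS.
 * Flag bits 0x08 (FNAME) and 0x10 (FCOMMENT) each introduce a
 * NUL-terminated string that has to be skipped before the deflate stream,
 * which is what the two while loops above do.  A stand-alone check of just
 * the fixed part (hypothetical helper):
 */
static __inline int
example_is_gzip(const u_char *p, size_t len)
{

	return (len >= 10 && p[0] == 0x1f && p[1] == 0x8b && p[2] == 0x08);
}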
static int
do_aout_hdr(struct imgact_gzip * gz)
{
int error;
struct vmspace *vmspace;
vm_offset_t vmaddr;
/*
* Set file/virtual offset based on the a.out variant. We handle two cases:
* host byte order and network byte order (for NetBSD compatibility)
*/
switch ((int) (gz->a_out.a_magic & 0xffff)) {
case ZMAGIC:
gz->virtual_offset = 0;
if (gz->a_out.a_text) {
gz->file_offset = PAGE_SIZE;
} else {
/* Bill's "screwball mode" */
gz->file_offset = 0;
}
break;
case QMAGIC:
gz->virtual_offset = PAGE_SIZE;
gz->file_offset = 0;
break;
default:
/* NetBSD compatibility */
switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
case ZMAGIC:
case QMAGIC:
gz->virtual_offset = PAGE_SIZE;
gz->file_offset = 0;
break;
default:
gz->where = __LINE__;
return (-1);
}
}
gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
/*
* Check various fields in header for validity/bounds.
*/
if ( /* entry point must lie within the text region */
gz->a_out.a_entry < gz->virtual_offset ||
gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
/* text and data size must each be page rounded */
gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
gz->where = __LINE__;
return (-1);
}
/*
* text/data/bss must not exceed limits
*/
mtx_assert(&Giant, MA_OWNED);
if ( /* text can't exceed maximum text size */
gz->a_out.a_text > maxtsiz ||
/* data + bss can't exceed rlimit */
gz->a_out.a_data + gz->bss_size >
gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) {
gz->where = __LINE__;
return (ENOMEM);
}
/* Find out how far we should go */
gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(gz->ip);
if (error) {
gz->where = __LINE__;
return (error);
}
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(gz->ip, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(gz->ip, &aout_sysvec);
vmspace = gz->ip->proc->p_vmspace;
vmaddr = gz->virtual_offset;
error = vm_mmap(&vmspace->vm_map,
&vmaddr,
gz->a_out.a_text + gz->a_out.a_data,
VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
0,
0);
if (error) {
gz->where = __LINE__;
return (error);
}
if (gz->bss_size != 0) {
/*
* Allocate demand-zeroed area for uninitialized data.
* "bss" = 'block started by symbol' - named after the
* IBM 7090 instruction of the same name.
*/
vmaddr = gz->virtual_offset + gz->a_out.a_text +
gz->a_out.a_data;
error = vm_map_find(&vmspace->vm_map,
NULL,
0,
&vmaddr,
gz->bss_size,
FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error) {
gz->where = __LINE__;
return (error);
}
}
/* Fill in process VM information */
vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset;
vmspace->vm_daddr = (caddr_t) (uintptr_t)
(gz->virtual_offset + gz->a_out.a_text);
/* Fill in image_params */
gz->ip->interpreted = 0;
gz->ip->entry_addr = gz->a_out.a_entry;
gz->ip->proc->p_sysent = &aout_sysvec;
return 0;
}
static int
NextByte(void *vp)
{
int error;
struct imgact_gzip *igz = (struct imgact_gzip *) vp;
if (igz->idx >= igz->len) {
igz->where = __LINE__;
return GZ_EOF;
}
if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
return igz->inbuf[(igz->idx++) - igz->offset];
}
if (igz->inbuf) {
error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf,
(vm_offset_t) igz->inbuf + PAGE_SIZE);
if (error) {
igz->where = __LINE__;
igz->error = error;
return GZ_EOF;
}
}
igz->offset = igz->idx & ~PAGE_MASK;
error = vm_mmap(kernel_map, /* map */
(vm_offset_t *) & igz->inbuf, /* address */
PAGE_SIZE, /* size */
VM_PROT_READ, /* protection */
VM_PROT_READ, /* max protection */
0, /* flags */
(caddr_t) igz->ip->vp, /* vnode */
igz->offset); /* offset */
if (error) {
igz->where = __LINE__;
igz->error = error;
return GZ_EOF;
}
return igz->inbuf[(igz->idx++) - igz->offset];
}
static int
Flush(void *vp, u_char * ptr, u_long siz)
{
struct imgact_gzip *gz = (struct imgact_gzip *) vp;
u_char *p = ptr, *q;
int i;
/* First, find an a.out header */
if (gz->output < sizeof gz->a_out) {
q = (u_char *) & gz->a_out;
i = min(siz, sizeof gz->a_out - gz->output);
bcopy(p, q + gz->output, i);
gz->output += i;
p += i;
siz -= i;
if (gz->output == sizeof gz->a_out) {
gz->gotheader = 1;
i = do_aout_hdr(gz);
if (i == -1) {
if (!gz->where)
gz->where = __LINE__;
gz->error = ENOEXEC;
return ENOEXEC;
} else if (i) {
gz->where = __LINE__;
gz->error = i;
return ENOEXEC;
}
if (gz->file_offset == 0) {
q = (u_char *) (uintptr_t) gz->virtual_offset;
copyout(&gz->a_out, q, sizeof gz->a_out);
}
}
}
/* Skip over zero-padded first PAGE if needed */
if (gz->output < gz->file_offset &&
gz->output + siz > gz->file_offset) {
i = min(siz, gz->file_offset - gz->output);
gz->output += i;
p += i;
siz -= i;
}
if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
i = min(siz, gz->file_end - gz->output);
q = (u_char *) (uintptr_t)
(gz->virtual_offset + gz->output - gz->file_offset);
copyout(p, q, i);
gz->output += i;
p += i;
siz -= i;
}
gz->output += siz;
return 0;
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
EXEC_SET(execgzip, gzip_execsw);
Index: head/sys/kern/init_main.c
===================================================================
--- head/sys/kern/init_main.c (revision 103766)
+++ head/sys/kern/init_main.c (revision 103767)
@@ -1,725 +1,725 @@
/*
* Copyright (c) 1995 Terrence R. Lambert
* All rights reserved.
*
* Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)init_main.c 8.9 (Berkeley) 1/21/94
* $FreeBSD$
*/
#include "opt_init_path.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/exec.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/sysent.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#include <sys/unistd.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <machine/cpu.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/user.h>
#include <sys/copyright.h>
void mi_startup(void); /* Should be elsewhere */
/* Components of the first process -- never freed. */
static struct session session0;
static struct pgrp pgrp0;
struct proc proc0;
struct thread thread0;
struct kse kse0;
struct ksegrp ksegrp0;
static struct procsig procsig0;
static struct filedesc0 filedesc0;
static struct plimit limit0;
static struct vmspace vmspace0;
struct proc *initproc;
int cmask = CMASK;
extern int fallback_elf_brand;
struct vnode *rootvp;
int boothowto = 0; /* initialized so that it can be patched */
SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "");
int bootverbose;
SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "");
/*
* This ensures that there is at least one entry so that the sysinit_set
* symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never
* executed.
*/
SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL)
/*
* The sysinit table itself. Items are checked off as they are run.
* If we want to register new sysinit types, add them to newsysinit.
*/
SET_DECLARE(sysinit_set, struct sysinit);
struct sysinit **sysinit, **sysinit_end;
struct sysinit **newsysinit, **newsysinit_end;
/*
* Merge a new sysinit set into the current set, reallocating it if
* necessary. This can only be called after malloc is running.
*/
void
sysinit_add(struct sysinit **set, struct sysinit **set_end)
{
struct sysinit **newset;
struct sysinit **sipp;
struct sysinit **xipp;
int count;
count = set_end - set;
if (newsysinit)
count += newsysinit_end - newsysinit;
else
count += sysinit_end - sysinit;
newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
if (newset == NULL)
panic("cannot malloc for sysinit");
xipp = newset;
if (newsysinit)
for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
*xipp++ = *sipp;
else
for (sipp = sysinit; sipp < sysinit_end; sipp++)
*xipp++ = *sipp;
for (sipp = set; sipp < set_end; sipp++)
*xipp++ = *sipp;
if (newsysinit)
free(newsysinit, M_TEMP);
newsysinit = newset;
newsysinit_end = newset + count;
}
/*
* System startup; initialize the world, create process 0, mount root
* filesystem, and fork to create init and pagedaemon. Most of the
* hard work is done in the lower-level initialization routines including
* startup(), which does memory initialization and autoconfiguration.
*
* This allows simple addition of new kernel subsystems that require
* boot-time initialization. It also allows the substitution of a subsystem
* (for instance, a scheduler, kernel profiler, or VM system) by an object
* module. Finally, it allows for optional "kernel threads".
*/
void
mi_startup(void)
{
register struct sysinit **sipp; /* system initialization*/
register struct sysinit **xipp; /* interior loop of sort*/
register struct sysinit *save; /* bubble*/
if (sysinit == NULL) {
sysinit = SET_BEGIN(sysinit_set);
sysinit_end = SET_LIMIT(sysinit_set);
}
restart:
/*
* Perform a bubble sort of the system initialization objects by
* their subsystem (primary key) and order (secondary key).
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) {
for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
if ((*sipp)->subsystem < (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order <= (*xipp)->order))
continue; /* skip*/
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*
* The last item on the list is expected to be the scheduler,
* which will not return.
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s)*/
if ((*sipp)->subsystem == SI_SUB_DONE)
continue;
/* Call function */
(*((*sipp)->func))((*sipp)->udata);
/* Check off the one we've just done */
(*sipp)->subsystem = SI_SUB_DONE;
/* Check if we've installed more sysinit items via KLD */
if (newsysinit != NULL) {
if (sysinit != SET_BEGIN(sysinit_set))
free(sysinit, M_TEMP);
sysinit = newsysinit;
sysinit_end = newsysinit_end;
newsysinit = NULL;
newsysinit_end = NULL;
goto restart;
}
}
panic("Shouldn't get here!");
/* NOTREACHED*/
}
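/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * The bubble sort above keeps entry a ahead of entry b exactly when the
 * following (hypothetical) predicate holds, i.e. the subsystem is the
 * primary sort key and the order is the secondary key:
 */
static __inline int
example_sysinit_leq(const struct sysinit *a, const struct sysinit *b)
{

	if (a->subsystem != b->subsystem)
		return (a->subsystem < b->subsystem);
	return (a->order <= b->order);
}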
/*
***************************************************************************
****
**** The following SYSINIT's belong elsewhere, but have not yet
**** been moved.
****
***************************************************************************
*/
static void
print_caddr_t(void *data __unused)
{
printf("%s", (char *)data);
}
SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright)
SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version)
static void
set_boot_verbose(void *data __unused)
{
if (boothowto & RB_VERBOSE)
bootverbose++;
}
SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL)
struct sysentvec null_sysvec = {
0,
NULL,
0,
0,
NULL,
0,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
"null",
NULL,
NULL,
0,
PAGE_SIZE,
VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS,
USRSTACK,
PS_STRINGS,
VM_PROT_ALL,
NULL,
NULL
};
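/*
 * Editor's note: illustration only, not part of this file or commit.  The
 * positional initializer above is hard to audit by eye.  With C99 designated
 * initializers, the entries this commit actually depends on would read
 * roughly as follows (the field-to-position mapping is the editor's
 * assumption -- see <sys/sysent.h> for the authoritative order):
 *
 *	.sv_name      = "null",
 *	.sv_minuser   = VM_MIN_ADDRESS,
 *	.sv_maxuser   = VM_MAXUSER_ADDRESS,
 *	.sv_usrstack  = USRSTACK,
 *	.sv_psstrings = PS_STRINGS,
 *	.sv_stackprot = VM_PROT_ALL,
 */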
/*
***************************************************************************
****
**** The following two SYSINIT's are proc0-specific glue code. I am not
**** convinced that they cannot be safely combined, but their order of
**** operation has been kept the same as in the original init_main.c
**** for now.
****
**** These probably belong in init_proc.c or kern_proc.c, since they
**** deal with proc0 (the fork template process).
****
***************************************************************************
*/
/* ARGSUSED*/
static void
proc0_init(void *dummy __unused)
{
register struct proc *p;
register struct filedesc0 *fdp;
register unsigned i;
struct thread *td;
struct ksegrp *kg;
struct kse *ke;
GIANT_REQUIRED;
p = &proc0;
td = &thread0;
ke = &kse0;
kg = &ksegrp0;
/*
* Initialize magic number.
*/
p->p_magic = P_MAGIC;
/*
* Initialize thread, process and pgrp structures.
*/
procinit();
threadinit();
/*
* Initialize sleep queue hash table
*/
sleepinit();
/*
* additional VM structures
*/
vm_init2();
/*
* Create process 0 (the swapper).
*/
LIST_INSERT_HEAD(&allproc, p, p_list);
LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
p->p_pgrp = &pgrp0;
LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
LIST_INIT(&pgrp0.pg_members);
LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
pgrp0.pg_session = &session0;
mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
session0.s_count = 1;
session0.s_leader = p;
p->p_sysent = &null_sysvec;
/*
* proc_linkup was already done in init_i386() or alphainit() etc.
* because the earlier code needed to follow td->td_proc. Otherwise
* I would have done it here; maybe this means this should be
* done earlier too.
*/
p->p_flag = P_SYSTEM;
p->p_sflag = PS_INMEM;
p->p_state = PRS_NORMAL;
td->td_state = TDS_RUNNING;
kg->kg_nice = NZERO;
kg->kg_pri_class = PRI_TIMESHARE;
kg->kg_user_pri = PUSER;
td->td_priority = PVM;
td->td_base_pri = PUSER;
td->td_kse = ke; /* XXXKSE */
ke->ke_oncpu = 0;
ke->ke_state = KES_THREAD;
ke->ke_thread = td;
/* proc_linkup puts it in the idle queue, that's not what we want. */
TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
kg->kg_idle_kses--;
p->p_peers = 0;
p->p_leader = p;
KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
bcopy("swapper", p->p_comm, sizeof ("swapper"));
callout_init(&p->p_itcallout, 0);
callout_init(&td->td_slpcallout, 1);
/* Create credentials. */
p->p_ucred = crget();
p->p_ucred->cr_ngroups = 1; /* group 0 */
p->p_ucred->cr_uidinfo = uifind(0);
p->p_ucred->cr_ruidinfo = uifind(0);
p->p_ucred->cr_prison = NULL; /* Don't jail it. */
#ifdef MAC
mac_create_proc0(p->p_ucred);
#endif
td->td_ucred = crhold(p->p_ucred);
/* Create procsig. */
p->p_procsig = &procsig0;
p->p_procsig->ps_refcnt = 1;
/* Initialize signal state for process 0. */
siginit(&proc0);
/* Create the file descriptor table. */
fdp = &filedesc0;
p->p_fd = &fdp->fd_fd;
mtx_init(&fdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
fdp->fd_fd.fd_refcnt = 1;
fdp->fd_fd.fd_cmask = cmask;
fdp->fd_fd.fd_ofiles = fdp->fd_dfiles;
fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags;
fdp->fd_fd.fd_nfiles = NDFILE;
/* Create the limits structures. */
p->p_limit = &limit0;
for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++)
limit0.pl_rlimit[i].rlim_cur =
limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY;
limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur =
limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur =
limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
i = ptoa(cnt.v_free_count);
limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i;
limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i;
limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3;
limit0.p_cpulimit = RLIM_INFINITY;
limit0.p_refcnt = 1;
/* Allocate a prototype map so we have something to fork. */
pmap_pinit0(vmspace_pmap(&vmspace0));
p->p_vmspace = &vmspace0;
vmspace0.vm_refcnt = 1;
- vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS),
- trunc_page(VM_MAXUSER_ADDRESS));
+ vm_map_init(&vmspace0.vm_map, p->p_sysent->sv_minuser,
+ p->p_sysent->sv_maxuser);
vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0);
/*
* We continue to place resource usage info and signal
* actions in the user struct so they're pageable.
*/
p->p_stats = &p->p_uarea->u_stats;
p->p_sigacts = &p->p_uarea->u_sigacts;
/*
* Charge root for one process.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)
/* ARGSUSED*/
static void
proc0_post(void *dummy __unused)
{
struct timespec ts;
struct proc *p;
/*
* Now we can look at the time, having had a chance to verify the
* time from the filesystem. Pretend that proc0 started now.
*/
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
microtime(&p->p_stats->p_start);
p->p_runtime.sec = 0;
p->p_runtime.frac = 0;
}
sx_sunlock(&allproc_lock);
binuptime(PCPU_PTR(switchtime));
PCPU_SET(switchticks, ticks);
/*
* Give the ``random'' number generator a thump.
*/
nanotime(&ts);
srandom(ts.tv_sec ^ ts.tv_nsec);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)
/*
***************************************************************************
****
**** The following SYSINIT's and glue code should be moved to the
**** respective files on a per subsystem basis.
****
***************************************************************************
*/
/*
***************************************************************************
****
**** The following code probably belongs in another file, like
**** kern/init_init.c.
****
***************************************************************************
*/
/*
* List of paths to try when searching for "init".
*/
static char init_path[MAXPATHLEN] =
#ifdef INIT_PATH
__XSTRING(INIT_PATH);
#else
"/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall";
#endif
SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
"Path used to search the init process");
/*
* Start the initial user process; try exec'ing each pathname in init_path.
* The program is invoked with one argument containing the boot flags.
*/
static void
start_init(void *dummy)
{
vm_offset_t addr;
struct execve_args args;
int options, error;
char *var, *path, *next, *s;
char *ucp, **uap, *arg0, *arg1;
struct thread *td;
struct proc *p;
int init_does_devfs = 0;
mtx_lock(&Giant);
GIANT_REQUIRED;
td = curthread;
p = td->td_proc;
vfs_mountroot();
/* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. */
if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode))
panic("cannot find root vnode");
FILEDESC_LOCK(p->p_fd);
p->p_fd->fd_cdir = rootvnode;
VREF(p->p_fd->fd_cdir);
p->p_fd->fd_rdir = rootvnode;
VREF(p->p_fd->fd_rdir);
FILEDESC_UNLOCK(p->p_fd);
VOP_UNLOCK(rootvnode, 0, td);
#ifdef MAC
mac_create_root_mount(td->td_ucred, TAILQ_FIRST(&mountlist));
#endif
if (devfs_present) {
/*
* For disk-based systems, we probably cannot do this yet
* since the fs will be read-only. But an NFS root
* might be OK. It is worth a shot.
*/
error = kern_mkdir(td, "/dev", UIO_SYSSPACE, 0700);
if (error == EEXIST)
error = 0;
if (error == 0)
error = kernel_vmount(0, "fstype", "devfs",
"fspath", "/dev", NULL);
if (error != 0)
init_does_devfs = 1;
}
/*
* Need just enough stack to hold the faked-up "execve()" arguments.
*/
- addr = trunc_page(USRSTACK - PAGE_SIZE);
+ addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
panic("init: couldn't allocate argument space");
p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
p->p_vmspace->vm_ssize = 1;
if ((var = getenv("init_path")) != NULL) {
strncpy(init_path, var, sizeof init_path);
init_path[sizeof init_path - 1] = 0;
freeenv(var);
}
if ((var = getenv("kern.fallback_elf_brand")) != NULL) {
fallback_elf_brand = strtol(var, NULL, 0);
freeenv(var);
}
for (path = init_path; *path != '\0'; path = next) {
while (*path == ':')
path++;
if (*path == '\0')
break;
for (next = path; *next != '\0' && *next != ':'; next++)
/* nothing */ ;
if (bootverbose)
printf("start_init: trying %.*s\n", (int)(next - path),
path);
/*
* Move out the boot flag argument.
*/
options = 0;
- ucp = (char *)USRSTACK;
+ ucp = (char *)p->p_sysent->sv_usrstack;
(void)subyte(--ucp, 0); /* trailing zero */
if (boothowto & RB_SINGLE) {
(void)subyte(--ucp, 's');
options = 1;
}
#ifdef notyet
if (boothowto & RB_FASTBOOT) {
(void)subyte(--ucp, 'f');
options = 1;
}
#endif
#ifdef BOOTCDROM
(void)subyte(--ucp, 'C');
options = 1;
#endif
if (init_does_devfs) {
(void)subyte(--ucp, 'd');
options = 1;
}
if (options == 0)
(void)subyte(--ucp, '-');
(void)subyte(--ucp, '-'); /* leading hyphen */
arg1 = ucp;
/*
* Move out the file name (also arg 0).
*/
(void)subyte(--ucp, 0);
for (s = next - 1; s >= path; s--)
(void)subyte(--ucp, *s);
arg0 = ucp;
/*
* Move out the arg pointers.
*/
uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
(void)suword((caddr_t)--uap, (long)0); /* terminator */
(void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
(void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
/*
* Point at the arguments.
*/
args.fname = arg0;
args.argv = uap;
args.envv = NULL;
/*
* Now try to exec the program. If we can't for any reason
* other than it doesn't exist, complain.
*
* Otherwise, return via fork_trampoline() all the way
* to user mode as init!
*/
if ((error = execve(td, &args)) == 0) {
mtx_unlock(&Giant);
return;
}
if (error != ENOENT)
printf("exec %.*s: error %d\n", (int)(next - path),
path, error);
}
printf("init: not found in path %s\n", init_path);
panic("no init");
}
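/*
 * Editor's note: illustration only, not part of this file or commit.  The
 * user-stack construction above hands init an argument vector equivalent
 * to, e.g. for a single-user boot that found /sbin/init:
 *
 *	argv[0] = "/sbin/init"
 *	argv[1] = "-s"		(or "--" when no boot flags apply; the
 *				 'f', 'C' and 'd' letters may also appear)
 *	argv[2] = NULL
 *	envp    = NULL
 */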
/*
* Like kthread_create(), but runs in its own address space.
* We do this early to reserve pid 1.
*
* Note special case - do not make it runnable yet. Other work
* in progress will change this more.
*/
static void
create_init(const void *udata __unused)
{
struct ucred *newcred, *oldcred;
int error;
error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, &initproc);
if (error)
panic("cannot fork init: %d\n", error);
/* divorce init's credentials from the kernel's */
newcred = crget();
PROC_LOCK(initproc);
initproc->p_flag |= P_SYSTEM;
oldcred = initproc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
mac_create_proc1(newcred);
#endif
initproc->p_ucred = newcred;
PROC_UNLOCK(initproc);
crfree(oldcred);
cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
mtx_lock_spin(&sched_lock);
initproc->p_sflag |= PS_INMEM;
mtx_unlock_spin(&sched_lock);
cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)
/*
* Make it runnable now.
*/
static void
kick_init(const void *udata __unused)
{
struct thread *td;
td = FIRST_THREAD_IN_PROC(initproc);
mtx_lock_spin(&sched_lock);
TD_SET_CAN_RUN(td);
setrunqueue(td); /* XXXKSE */
mtx_unlock_spin(&sched_lock);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c (revision 103766)
+++ head/sys/kern/kern_exec.c (revision 103767)
@@ -1,1113 +1,1139 @@
/*
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include "opt_ktrace.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/acct.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/wait.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <machine/reg.h>
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
static MALLOC_DEFINE(M_ATEXEC, "atexec", "atexec callback");
+static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
+static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
+
/*
* callout list for things to do at exec time
*/
struct execlist {
execlist_fn function;
TAILQ_ENTRY(execlist) next;
};
TAILQ_HEAD(exec_list_head, execlist);
static struct exec_list_head exec_list = TAILQ_HEAD_INITIALIZER(exec_list);
/* XXX This should be vm_size_t. */
-static u_long ps_strings = PS_STRINGS;
-SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings,
- 0, "");
+SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_ps_strings, "LU", "");
/* XXX This should be vm_size_t. */
-static u_long usrstack = USRSTACK;
-SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, "");
+SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_usrstack, "LU", "");
u_long ps_arg_cache_limit = PAGE_SIZE / 16;
SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
&ps_arg_cache_limit, 0, "");
int ps_argsopen = 1;
SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
#ifdef __ia64__
/* XXX HACK */
static int regstkpages = 256;
SYSCTL_INT(_machdep, OID_AUTO, regstkpages, CTLFLAG_RW, &regstkpages, 0, "");
#endif
+static int
+sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+
+ p = curproc;
+ return (SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
+ sizeof(p->p_sysent->sv_psstrings)));
+}
+
+static int
+sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+
+ p = curproc;
+ return (SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
+ sizeof(p->p_sysent->sv_usrstack)));
+}
+
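/*
 * Editor's note: illustrative userland sketch, not part of this kernel file.
 * With the two handlers above, kern.ps_strings and kern.usrstack now report
 * the values from the calling process's sysentvec instead of the former
 * compile-time constants, so an existing consumer keeps working unchanged:
 */
#if 0	/* userland example program, shown here for illustration only */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	u_long usrstack;
	size_t len = sizeof(usrstack);

	if (sysctlbyname("kern.usrstack", &usrstack, &len, NULL, 0) == -1)
		return (1);
	printf("user stack top: %#lx\n", usrstack);
	return (0);
}
#endif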
/*
* Each of the items is a pointer to a `const struct execsw', hence the
* double pointer here.
*/
static const struct execsw **execsw;
#ifndef _SYS_SYSPROTO_H_
struct execve_args {
char *fname;
char **argv;
char **envv;
};
#endif
/*
* execve() system call.
*
* MPSAFE
*/
int
execve(td, uap)
struct thread *td;
register struct execve_args *uap;
{
struct proc *p = td->td_proc;
struct nameidata nd, *ndp;
struct ucred *newcred = NULL, *oldcred;
struct uidinfo *euip;
register_t *stack_base;
int error, len, i;
struct image_params image_params, *imgp;
struct vattr attr;
int (*img_first)(struct image_params *);
struct pargs *oldargs = NULL, *newargs = NULL;
struct procsig *oldprocsig, *newprocsig;
#ifdef KTRACE
struct vnode *tracevp = NULL;
#endif
struct vnode *textvp = NULL;
int credential_changing;
int textset;
imgp = &image_params;
/*
* Lock the process and set the P_INEXEC flag to indicate that
* it should be left alone until we're done here. This is
* necessary to avoid race conditions - e.g. in ptrace() -
* that might allow a local user to illicitly obtain elevated
* privileges.
*/
PROC_LOCK(p);
KASSERT((p->p_flag & P_INEXEC) == 0,
("%s(): process already has P_INEXEC flag", __func__));
if (p->p_flag & P_KSES) {
if (thread_single(SINGLE_EXIT)) {
PROC_UNLOCK(p);
return (ERESTART); /* Try again later. */
}
/*
* If we get here all other threads are dead,
* so unset the associated flags and lose KSE mode.
*/
p->p_flag &= ~P_KSES;
td->td_flags &= ~TDF_UNBOUND;
thread_single_end();
}
p->p_flag |= P_INEXEC;
PROC_UNLOCK(p);
/*
* Initialize part of the common data
*/
imgp->proc = p;
imgp->uap = uap;
imgp->attr = &attr;
imgp->argc = imgp->envc = 0;
imgp->argv0 = NULL;
imgp->entry_addr = 0;
imgp->vmspace_destroyed = 0;
imgp->interpreted = 0;
imgp->interpreter_name[0] = '\0';
imgp->auxargs = NULL;
imgp->vp = NULL;
imgp->object = NULL;
imgp->firstpage = NULL;
imgp->ps_strings = 0;
imgp->auxarg_size = 0;
/*
* Allocate temporary demand zeroed space for argument and
* environment strings
*/
imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX +
PAGE_SIZE);
if (imgp->stringbase == NULL) {
error = ENOMEM;
mtx_lock(&Giant);
goto exec_fail;
}
imgp->stringp = imgp->stringbase;
imgp->stringspace = ARG_MAX;
imgp->image_header = imgp->stringbase + ARG_MAX;
/*
* Translate the file name. namei() returns a vnode pointer
* in ni_vp among other things.
*/
ndp = &nd;
NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
UIO_USERSPACE, uap->fname, td);
mtx_lock(&Giant);
interpret:
error = namei(ndp);
if (error) {
kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
ARG_MAX + PAGE_SIZE);
goto exec_fail;
}
imgp->vp = ndp->ni_vp;
imgp->fname = uap->fname;
/*
* Check file permissions (also 'opens' file)
*/
error = exec_check_permissions(imgp);
if (error)
goto exec_fail_dealloc;
if (VOP_GETVOBJECT(imgp->vp, &imgp->object) == 0)
vm_object_reference(imgp->object);
/*
* Set VV_TEXT now so no one can write to the executable while we're
* activating it.
*
* Remember if this was set before and unset it in case this is not
* actually an executable image.
*/
textset = imgp->vp->v_vflag & VV_TEXT;
imgp->vp->v_vflag |= VV_TEXT;
error = exec_map_first_page(imgp);
if (error)
goto exec_fail_dealloc;
/*
* If the current process has a special image activator it
* wants to try first, call it. For example, emulating shell
* scripts differently.
*/
error = -1;
if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
error = img_first(imgp);
/*
* Loop through the list of image activators, calling each one.
* An activator returns -1 if there is no match, 0 on success,
* and an error otherwise.
*/
for (i = 0; error == -1 && execsw[i]; ++i) {
if (execsw[i]->ex_imgact == NULL ||
execsw[i]->ex_imgact == img_first) {
continue;
}
error = (*execsw[i]->ex_imgact)(imgp);
}
if (error) {
if (error == -1) {
if (textset == 0)
imgp->vp->v_vflag &= ~VV_TEXT;
error = ENOEXEC;
}
goto exec_fail_dealloc;
}
/*
* Special interpreter operation: clean up and loop back to try to
* activate the interpreter.
*/
if (imgp->interpreted) {
exec_unmap_first_page(imgp);
/*
* VV_TEXT needs to be unset for scripts. There is a short
* period before we determine that something is a script where
* VV_TEXT will be set. The vnode lock is held over this
* entire period so nothing should illegitimately be blocked.
*/
imgp->vp->v_vflag &= ~VV_TEXT;
/* free name buffer and old vnode */
NDFREE(ndp, NDF_ONLY_PNBUF);
vput(ndp->ni_vp);
vm_object_deallocate(imgp->object);
imgp->object = NULL;
/* set new name to that of the interpreter */
NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
UIO_SYSSPACE, imgp->interpreter_name, td);
goto interpret;
}
/*
* Copy out strings (args and env) and initialize stack base
*/
if (p->p_sysent->sv_copyout_strings)
stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
else
stack_base = exec_copyout_strings(imgp);
/*
* If custom stack fixup routine present for this process
* let it do the stack setup.
* Else stuff argument count as first item on stack
*/
if (p->p_sysent->sv_fixup)
(*p->p_sysent->sv_fixup)(&stack_base, imgp);
else
suword(--stack_base, imgp->argc);
/*
* For security and other reasons, the file descriptor table cannot
* be shared after an exec.
*/
FILEDESC_LOCK(p->p_fd);
if (p->p_fd->fd_refcnt > 1) {
struct filedesc *tmp;
tmp = fdcopy(td);
FILEDESC_UNLOCK(p->p_fd);
fdfree(td);
p->p_fd = tmp;
} else
FILEDESC_UNLOCK(p->p_fd);
/*
* Malloc things before we need locks.
*/
newcred = crget();
euip = uifind(attr.va_uid);
i = imgp->endargs - imgp->stringbase;
if (ps_arg_cache_limit >= i + sizeof(struct pargs))
newargs = pargs_alloc(i);
/* close files on exec */
fdcloseexec(td);
/* Get a reference to the vnode prior to locking the proc */
VREF(ndp->ni_vp);
/*
* For security and other reasons, signal handlers cannot
* be shared after an exec. The new process gets a copy of the old
* handlers. In execsigs(), the new process will have its signals
* reset.
*/
PROC_LOCK(p);
mp_fixme("procsig needs a lock");
if (p->p_procsig->ps_refcnt > 1) {
oldprocsig = p->p_procsig;
PROC_UNLOCK(p);
MALLOC(newprocsig, struct procsig *, sizeof(struct procsig),
M_SUBPROC, M_WAITOK);
bcopy(oldprocsig, newprocsig, sizeof(*newprocsig));
newprocsig->ps_refcnt = 1;
oldprocsig->ps_refcnt--;
PROC_LOCK(p);
p->p_procsig = newprocsig;
if (p->p_sigacts == &p->p_uarea->u_sigacts)
panic("shared procsig but private sigacts?");
p->p_uarea->u_sigacts = *p->p_sigacts;
p->p_sigacts = &p->p_uarea->u_sigacts;
}
/* Stop profiling */
stopprofclock(p);
/* reset caught signals */
execsigs(p);
/* name this process - nameiexec(p, ndp) */
len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
p->p_comm[len] = 0;
/*
* mark as execed, wakeup the process that vforked (if any) and tell
* it that it now has its own resources back
*/
p->p_flag |= P_EXEC;
if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
p->p_flag &= ~P_PPWAIT;
wakeup(p->p_pptr);
}
/*
* Implement image setuid/setgid.
*
* Don't honor setuid/setgid if the filesystem prohibits it or if
* the process is being traced.
*/
oldcred = p->p_ucred;
credential_changing = 0;
credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
attr.va_uid;
credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
attr.va_gid;
if (credential_changing &&
(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
(p->p_flag & P_TRACED) == 0) {
/*
* Turn off syscall tracing for set-id programs, except for
* root. Record any set-id flags first to make sure that
* we do not regain any tracing during a possible block.
*/
setsugid(p);
#ifdef KTRACE
if (p->p_tracep && suser_cred(oldcred, PRISON_ROOT)) {
mtx_lock(&ktrace_mtx);
p->p_traceflag = 0;
tracevp = p->p_tracep;
p->p_tracep = NULL;
mtx_unlock(&ktrace_mtx);
}
#endif
/*
* Close any file descriptors 0..2 that reference procfs,
* then make sure file descriptors 0..2 are in use.
*
* setugidsafety() may call closef() and then pfind()
* which may grab the process lock.
* fdcheckstd() may call falloc() which may block to
* allocate memory, so temporarily drop the process lock.
*/
PROC_UNLOCK(p);
setugidsafety(td);
error = fdcheckstd(td);
PROC_LOCK(p);
if (error != 0)
goto done1;
/*
* Set the new credentials.
*/
crcopy(newcred, oldcred);
if (attr.va_mode & VSUID)
change_euid(newcred, euip);
if (attr.va_mode & VSGID)
change_egid(newcred, attr.va_gid);
/*
* Implement correct POSIX saved-id behavior.
*/
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
} else {
if (oldcred->cr_uid == oldcred->cr_ruid &&
oldcred->cr_gid == oldcred->cr_rgid)
p->p_flag &= ~P_SUGID;
/*
* Implement correct POSIX saved-id behavior.
*
* XXX: It's not clear that the existing behavior is
* POSIX-compliant. A number of sources indicate that the
* saved uid/gid should only be updated if the new ruid is
* not equal to the old ruid, or the new euid is not equal
* to the old euid and the new euid is not equal to the old
* ruid. The FreeBSD code always updates the saved uid/gid.
* Also, this code uses the new (replaced) euid and egid as
* the source, which may or may not be the right ones to use.
*/
if (oldcred->cr_svuid != oldcred->cr_uid ||
oldcred->cr_svgid != oldcred->cr_gid) {
crcopy(newcred, oldcred);
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
}
}
/*
* Store the vp for use in procfs. This vnode was referenced prior
* to locking the proc lock.
*/
textvp = p->p_textvp;
p->p_textvp = ndp->ni_vp;
/*
* Notify others that we exec'd, and clear the P_INEXEC flag
* as we're now a bona fide freshly-execed process.
*/
KNOTE(&p->p_klist, NOTE_EXEC);
p->p_flag &= ~P_INEXEC;
/*
* If tracing the process, trap to debugger so breakpoints
* can be set before the program executes.
*/
_STOPEVENT(p, S_EXEC, 0);
if (p->p_flag & P_TRACED)
psignal(p, SIGTRAP);
/* clear "fork but no exec" flag, as we _are_ execing */
p->p_acflag &= ~AFORK;
/* Free any previous argument cache */
oldargs = p->p_args;
p->p_args = NULL;
/* Set values passed into the program in registers. */
if (p->p_sysent->sv_setregs)
(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
(u_long)(uintptr_t)stack_base, imgp->ps_strings);
else
exec_setregs(td, imgp->entry_addr,
(u_long)(uintptr_t)stack_base, imgp->ps_strings);
/* Cache arguments if they fit inside our allowance */
if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
bcopy(imgp->stringbase, newargs->ar_args, i);
p->p_args = newargs;
newargs = NULL;
}
done1:
PROC_UNLOCK(p);
/*
* Free any resources malloc'd earlier that we didn't use.
*/
uifree(euip);
if (newcred == NULL)
crfree(oldcred);
else
crfree(newcred);
/*
* Handle deferred decrement of ref counts.
*/
if (textvp != NULL)
vrele(textvp);
if (ndp->ni_vp && error != 0)
vrele(ndp->ni_vp);
#ifdef KTRACE
if (tracevp != NULL)
vrele(tracevp);
#endif
if (oldargs != NULL)
pargs_drop(oldargs);
if (newargs != NULL)
pargs_drop(newargs);
exec_fail_dealloc:
/*
* free various allocated resources
*/
if (imgp->firstpage)
exec_unmap_first_page(imgp);
if (imgp->stringbase != NULL)
kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
ARG_MAX + PAGE_SIZE);
if (imgp->vp) {
NDFREE(ndp, NDF_ONLY_PNBUF);
vput(imgp->vp);
}
if (imgp->object)
vm_object_deallocate(imgp->object);
if (error == 0)
goto done2;
exec_fail:
/* we're done here, clear P_INEXEC */
PROC_LOCK(p);
p->p_flag &= ~P_INEXEC;
PROC_UNLOCK(p);
if (imgp->vmspace_destroyed) {
/* Sorry, no process image left to return to; exit gracefully. */
exit1(td, W_EXITCODE(0, SIGABRT));
/* NOT REACHED */
error = 0;
}
done2:
mtx_unlock(&Giant);
return (error);
}
int
exec_map_first_page(imgp)
struct image_params *imgp;
{
int rv, i;
int initial_pagein;
vm_page_t ma[VM_INITIAL_PAGEIN];
vm_object_t object;
GIANT_REQUIRED;
if (imgp->firstpage) {
exec_unmap_first_page(imgp);
}
VOP_GETVOBJECT(imgp->vp, &object);
ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
initial_pagein = VM_INITIAL_PAGEIN;
if (initial_pagein > object->size)
initial_pagein = object->size;
for (i = 1; i < initial_pagein; i++) {
if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
if ((ma[i]->flags & PG_BUSY) || ma[i]->busy)
break;
if (ma[i]->valid)
break;
vm_page_busy(ma[i]);
} else {
ma[i] = vm_page_alloc(object, i,
VM_ALLOC_NORMAL);
if (ma[i] == NULL)
break;
}
}
initial_pagein = i;
rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
ma[0] = vm_page_lookup(object, 0);
if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
(ma[0]->valid == 0)) {
if (ma[0]) {
vm_page_lock_queues();
vm_page_protect(ma[0], VM_PROT_NONE);
vm_page_free(ma[0]);
vm_page_unlock_queues();
}
return (EIO);
}
}
vm_page_lock_queues();
vm_page_wire(ma[0]);
vm_page_wakeup(ma[0]);
vm_page_unlock_queues();
pmap_qenter((vm_offset_t)imgp->image_header, ma, 1);
imgp->firstpage = ma[0];
return (0);
}
void
exec_unmap_first_page(imgp)
struct image_params *imgp;
{
GIANT_REQUIRED;
if (imgp->firstpage) {
pmap_qremove((vm_offset_t)imgp->image_header, 1);
vm_page_lock_queues();
vm_page_unwire(imgp->firstpage, 1);
vm_page_unlock_queues();
imgp->firstpage = NULL;
}
}
/*
* Destroy old address space, and allocate a new stack
* The new stack is only SGROWSIZ large because it is grown
* automatically in trap.c.
*/
int
-exec_new_vmspace(imgp, minuser, maxuser, stack_addr)
+exec_new_vmspace(imgp, sv)
struct image_params *imgp;
- vm_offset_t minuser, maxuser, stack_addr;
+ struct sysentvec *sv;
{
int error;
struct execlist *ep;
struct proc *p = imgp->proc;
struct vmspace *vmspace = p->p_vmspace;
+ vm_offset_t stack_addr;
+ vm_map_t map;
GIANT_REQUIRED;
- stack_addr = stack_addr - maxssiz;
+ stack_addr = sv->sv_usrstack - maxssiz;
imgp->vmspace_destroyed = 1;
/*
* Perform functions registered with at_exec().
*/
TAILQ_FOREACH(ep, &exec_list, next)
(*ep->function)(p);
/*
* Blow away entire process VM, if address space not shared,
* otherwise, create a new VM space so that other threads are
* not disrupted
*/
- if (vmspace->vm_refcnt == 1 &&
- vm_map_min(&vmspace->vm_map) == minuser &&
- vm_map_max(&vmspace->vm_map) == maxuser) {
+ map = &vmspace->vm_map;
+ if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
+ vm_map_max(map) == sv->sv_maxuser) {
if (vmspace->vm_shm)
shmexit(p);
- pmap_remove_pages(vmspace_pmap(vmspace), minuser, maxuser);
- vm_map_remove(&vmspace->vm_map, minuser, maxuser);
+ pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map),
+ vm_map_max(map));
+ vm_map_remove(map, vm_map_min(map), vm_map_max(map));
} else {
- vmspace_exec(p, minuser, maxuser);
+ vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
vmspace = p->p_vmspace;
+ map = &vmspace->vm_map;
}
/* Allocate a new stack */
- error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
- VM_PROT_ALL, VM_PROT_ALL, 0);
+ error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
+ sv->sv_stackprot, VM_PROT_ALL, 0);
if (error)
return (error);
#ifdef __ia64__
{
/*
* Allocate backing store. We really need something
* similar to vm_map_stack which can allow the backing
* store to grow upwards. This will do for now.
*/
vm_offset_t bsaddr;
- bsaddr = USRSTACK - 2 * maxssiz;
- error = vm_map_find(&vmspace->vm_map, 0, 0, &bsaddr,
+ bsaddr = p->p_sysent->sv_usrstack - 2 * maxssiz;
+ error = vm_map_find(map, 0, 0, &bsaddr,
regstkpages * PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0);
FIRST_THREAD_IN_PROC(p)->td_md.md_bspstore = bsaddr;
}
#endif
/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
* VM_STACK case, but they are still used to monitor the size of the
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
- vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz;
+ vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;
return (0);
}
/*
* Copy out argument and environment strings from the old process
* address space into the temporary string buffer.
*/
int
exec_extract_strings(imgp)
struct image_params *imgp;
{
char **argv, **envv;
char *argp, *envp;
int error;
size_t length;
/*
* extract arguments first
*/
argv = imgp->uap->argv;
if (argv) {
argp = (caddr_t)(intptr_t)fuword(argv);
if (argp == (caddr_t)-1)
return (EFAULT);
if (argp)
argv++;
if (imgp->argv0)
argp = imgp->argv0;
if (argp) {
do {
if (argp == (caddr_t)-1)
return (EFAULT);
if ((error = copyinstr(argp, imgp->stringp,
imgp->stringspace, &length))) {
if (error == ENAMETOOLONG)
return (E2BIG);
return (error);
}
imgp->stringspace -= length;
imgp->stringp += length;
imgp->argc++;
} while ((argp = (caddr_t)(intptr_t)fuword(argv++)));
}
}
imgp->endargs = imgp->stringp;
/*
* extract environment strings
*/
envv = imgp->uap->envv;
if (envv) {
while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
if (envp == (caddr_t)-1)
return (EFAULT);
if ((error = copyinstr(envp, imgp->stringp,
imgp->stringspace, &length))) {
if (error == ENAMETOOLONG)
return (E2BIG);
return (error);
}
imgp->stringspace -= length;
imgp->stringp += length;
imgp->envc++;
}
}
return (0);
}
/*
* Copy strings out to the new process address space, constructing
* new arg and env vector tables. Return a pointer to the base
* so that it can be used as the initial stack pointer.
*/
register_t *
exec_copyout_strings(imgp)
struct image_params *imgp;
{
int argc, envc;
char **vectp;
char *stringp, *destp;
register_t *stack_base;
struct ps_strings *arginfo;
struct proc *p;
int szsigcode;
/*
* Calculate string base and vector table pointers.
* Also deal with signal trampoline code for this exec type.
*/
p = imgp->proc;
szsigcode = 0;
- arginfo = (struct ps_strings *)PS_STRINGS;
+ arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
if (p->p_sysent->sv_szsigcode != NULL)
szsigcode = *(p->p_sysent->sv_szsigcode);
destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
/*
* install sigcode
*/
if (szsigcode)
copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
szsigcode), szsigcode);
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
*/
if (imgp->auxargs) {
/*
* 'AT_COUNT*2' is the size of the ELF Auxargs data. This is for
* backward compatibility.
*/
imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
(AT_COUNT * 2);
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets, and imgp->auxarg_size is room
* for the arguments of the runtime loader.
*/
vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 +
imgp->auxarg_size) * sizeof(char *));
} else
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
vectp = (char **)(destp - (imgp->argc + imgp->envc + 2) *
sizeof(char *));
/*
* vectp also becomes our initial stack base
*/
stack_base = (register_t *)vectp;
stringp = imgp->stringbase;
argc = imgp->argc;
envc = imgp->envc;
/*
* Copy out strings - arguments and environment.
*/
copyout(stringp, destp, ARG_MAX - imgp->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
*/
suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
suword(&arginfo->ps_nargvstr, argc);
/*
* Fill in argument portion of vector table.
*/
for (; argc > 0; --argc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* a null vector table pointer separates the argp's from the envp's */
suword(vectp++, 0);
suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
suword(&arginfo->ps_nenvstr, envc);
/*
* Fill in environment portion of vector table.
*/
for (; envc > 0; --envc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* end of vector table is a null pointer */
suword(vectp, 0);
return (stack_base);
}
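/*
 * Editor's note: illustration only, not part of this file or commit.  After
 * exec_copyout_strings() the top of the new user stack looks roughly like
 * this (highest addresses first; exact spacing is ABI-dependent):
 *
 *	sv_psstrings:	struct ps_strings (argv/envv pointers and counts)
 *			signal trampoline (sv_szsigcode bytes of sv_sigcode)
 *			SPARE_USRSPACE
 *	destp:		argument and environment strings
 *	stack_base:	argv[0..argc-1], NULL, envp[0..envc-1], NULL
 *			(plus auxarg space when imgp->auxargs is set)
 *
 * The caller then pushes imgp->argc below stack_base, either directly or
 * via the sysentvec's sv_fixup hook.
 */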
/*
* Check permissions of file to execute.
* Called with imgp->vp locked.
* Return 0 for success or error code on failure.
*/
int
exec_check_permissions(imgp)
struct image_params *imgp;
{
struct vnode *vp = imgp->vp;
struct vattr *attr = imgp->attr;
struct thread *td;
int error;
td = curthread; /* XXXKSE */
#ifdef MAC
error = mac_check_vnode_exec(td->td_ucred, imgp->vp);
if (error)
return (error);
#endif
/* Get file attributes */
error = VOP_GETATTR(vp, attr, td->td_ucred, td);
if (error)
return (error);
/*
* 1) Check if file execution is disabled for the filesystem that this
* file resides on.
* 2) Ensure that at least one execute bit is on - otherwise root
* will always succeed, and we don't want that to happen unless the
* file really is executable.
* 3) Ensure that the file is a regular file.
*/
if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
((attr->va_mode & 0111) == 0) ||
(attr->va_type != VREG))
return (EACCES);
/*
* Zero length files can't be exec'd
*/
if (attr->va_size == 0)
return (ENOEXEC);
/*
* Check for execute permission to file based on current credentials.
*/
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
if (error)
return (error);
/*
* Check number of open-for-writes on the file and deny execution
* if there are any.
*/
if (vp->v_writecount)
return (ETXTBSY);
/*
* Call filesystem specific open routine (which does nothing in the
* general case).
*/
error = VOP_OPEN(vp, FREAD, td->td_ucred, td);
return (error);
}
/*
* Exec handler registration
*/
int
exec_register(execsw_arg)
const struct execsw *execsw_arg;
{
const struct execsw **es, **xs, **newexecsw;
int count = 2; /* New slot and trailing NULL */
if (execsw)
for (es = execsw; *es; es++)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
if (newexecsw == NULL)
return (ENOMEM);
xs = newexecsw;
if (execsw)
for (es = execsw; *es; es++)
*xs++ = *es;
*xs++ = execsw_arg;
*xs = NULL;
if (execsw)
free(execsw, M_TEMP);
execsw = newexecsw;
return (0);
}
int
exec_unregister(execsw_arg)
const struct execsw *execsw_arg;
{
const struct execsw **es, **xs, **newexecsw;
int count = 1;
if (execsw == NULL)
panic("unregister with no handlers left?\n");
for (es = execsw; *es; es++) {
if (*es == execsw_arg)
break;
}
if (*es == NULL)
return (ENOENT);
for (es = execsw; *es; es++)
if (*es != execsw_arg)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
if (newexecsw == NULL)
return (ENOMEM);
xs = newexecsw;
for (es = execsw; *es; es++)
if (*es != execsw_arg)
*xs++ = *es;
*xs = NULL;
if (execsw)
free(execsw, M_TEMP);
execsw = newexecsw;
return (0);
}
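/*
 * A minimal registration sketch (hedged): image activators normally do not
 * call exec_register() by hand; they declare a struct execsw and hook it up
 * through the EXEC_SET() module glue (assumed here to come from
 * <sys/sysent.h>), roughly:
 *
 *	static struct execsw example_execsw = {
 *		exec_example_imgact,		(hypothetical activator)
 *		"example"
 *	};
 *	EXEC_SET(example, example_execsw);
 */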
int
at_exec(function)
execlist_fn function;
{
struct execlist *ep;
#ifdef INVARIANTS
/* Be noisy if the programmer has lost track of things */
if (rm_at_exec(function))
printf("WARNING: exec callout entry (%p) already present\n",
function);
#endif
ep = malloc(sizeof(*ep), M_ATEXEC, M_NOWAIT);
if (ep == NULL)
return (ENOMEM);
ep->function = function;
TAILQ_INSERT_TAIL(&exec_list, ep, next);
return (0);
}
/*
* Scan the exec callout list for the given item and remove it.
* Returns the number of items removed (0 or 1)
*/
int
rm_at_exec(function)
execlist_fn function;
{
struct execlist *ep;
TAILQ_FOREACH(ep, &exec_list, next) {
if (ep->function == function) {
TAILQ_REMOVE(&exec_list, ep, next);
free(ep, M_ATEXEC);
return (1);
}
}
return (0);
}
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c (revision 103766)
+++ head/sys/kern/kern_exit.c (revision 103767)
@@ -1,883 +1,883 @@
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
* $FreeBSD$
*/
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/tty.h>
#include <sys/wait.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sx.h>
#include <sys/ptrace.h>
#include <sys/acct.h> /* for acct_process() function prototype */
#include <sys/filedesc.h>
#include <sys/shm.h>
#include <sys/sem.h>
#include <sys/jail.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <sys/user.h>
/* Required to be non-static for SysVR4 emulator */
MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status");
static MALLOC_DEFINE(M_ATEXIT, "atexit", "atexit callback");
static int wait1(struct thread *, struct wait_args *, int);
/*
* callout list for things to do at exit time
*/
struct exitlist {
exitlist_fn function;
TAILQ_ENTRY(exitlist) next;
};
TAILQ_HEAD(exit_list_head, exitlist);
static struct exit_list_head exit_list = TAILQ_HEAD_INITIALIZER(exit_list);
/*
* exit --
* Death of process.
*
* MPSAFE
*/
void
sys_exit(td, uap)
struct thread *td;
struct sys_exit_args /* {
int rval;
} */ *uap;
{
mtx_lock(&Giant);
exit1(td, W_EXITCODE(uap->rval, 0));
/* NOTREACHED */
}
/*
* Exit: deallocate address space and other resources, change proc state
* to zombie, and unlink proc from allproc and parent's lists. Save exit
* status and rusage for wait(). Check for child processes and orphan them.
*/
void
exit1(td, rv)
register struct thread *td;
int rv;
{
struct exitlist *ep;
struct proc *p, *nq, *q;
struct tty *tp;
struct vnode *ttyvp;
register struct vmspace *vm;
struct vnode *vtmp;
#ifdef KTRACE
struct vnode *tracevp;
#endif
GIANT_REQUIRED;
p = td->td_proc;
if (p == initproc) {
printf("init died (signal %d, exit %d)\n",
WTERMSIG(rv), WEXITSTATUS(rv));
panic("Going nowhere without my init!");
}
/*
* XXXXKSE: MUST abort all other threads before proceeding past here.
*/
PROC_LOCK(p);
if (p->p_flag & P_KSES) {
/*
* First check if some other thread got here before us;
* if so, act appropriately (exit or suspend).
*/
thread_suspend_check(0);
/*
* Here is a trick:
* We need to free up our KSE to process other threads
* so that we can safely set the UNBOUND flag
* (whether or not we have a mailbox) as we are NEVER
* going to return to the user.
* The flag will not be set yet if we are exiting
* because of a signal, pagefault, or similar
* (or even an exit(2) from the UTS).
*/
td->td_flags |= TDF_UNBOUND;
/*
* Kill off the other threads. This requires
* some cooperation from other parts of the kernel,
* so it may not be instant.
* With this state set:
* Any thread entering the kernel from userspace will
* thread_exit() in trap(). Any thread attempting to
* sleep will return immediately
* with EINTR or EWOULDBLOCK, which will hopefully force them
* to back out to userland, freeing resources as they go, and
* anything attempting to return to userland will thread_exit()
* from userret(). thread_exit() will unsuspend us
* when the last other thread exits.
*/
if (thread_single(SINGLE_EXIT)) {
panic ("Exit: Single threading fouled up");
}
/*
* All other activity in this process is now stopped.
* Remove excess KSEs and KSEGRPS. XXXKSE (when we have them)
* ...
* Turn off threading support.
*/
p->p_flag &= ~P_KSES;
td->td_flags &= ~TDF_UNBOUND;
thread_single_end(); /* Don't need this any more. */
}
/*
* With this state set:
* Any thread entering the kernel from userspace will thread_exit()
* in trap(). Any thread attempting to sleep will return immediately
* with EINTR or EWOULDBLOCK, which will hopefully force them
* to back out to userland, freeing resources as they go, and
* anything attempting to return to userland will thread_exit()
* from userret(). thread_exit() will do a wakeup on p->p_numthreads
* if it transitions to 1.
*/
p->p_flag |= P_WEXIT;
PROC_UNLOCK(p);
/* Are we a task leader? */
PROC_LOCK(p);
if (p == p->p_leader) {
q = p->p_peers;
while (q != NULL) {
PROC_LOCK(q);
psignal(q, SIGKILL);
PROC_UNLOCK(q);
q = q->p_peers;
}
while (p->p_peers)
msleep(p, &p->p_mtx, PWAIT, "exit1", 0);
}
PROC_UNLOCK(p);
#ifdef PGINPROF
vmsizmon();
#endif
STOPEVENT(p, S_EXIT, rv);
wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */
/*
* Check if any loadable modules need anything done at process exit.
* e.g. SYSV IPC stuff
* XXX what if one of these generates an error?
*/
TAILQ_FOREACH(ep, &exit_list, next)
(*ep->function)(p);
stopprofclock(p);
MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
M_ZOMBIE, M_WAITOK);
/*
* If parent is waiting for us to exit or exec,
* P_PPWAIT is set; we will wakeup the parent below.
*/
PROC_LOCK(p);
p->p_flag &= ~(P_TRACED | P_PPWAIT);
SIGEMPTYSET(p->p_siglist);
PROC_UNLOCK(p);
if (timevalisset(&p->p_realtimer.it_value))
callout_stop(&p->p_itcallout);
/*
* Reset any sigio structures pointing to us as a result of
* F_SETOWN with our pid.
*/
funsetownlst(&p->p_sigiolst);
/*
* Close open files and release open-file table.
* This may block!
*/
fdfree(td); /* XXXKSE *//* may not be the one in proc */
/*
* Remove ourself from our leader's peer list and wake our leader.
*/
PROC_LOCK(p->p_leader);
if (p->p_leader->p_peers) {
q = p->p_leader;
while (q->p_peers != p)
q = q->p_peers;
q->p_peers = p->p_peers;
wakeup(p->p_leader);
}
PROC_UNLOCK(p->p_leader);
/* The next two chunks should probably be moved to vmspace_exit. */
vm = p->p_vmspace;
/*
* Release user portion of address space.
* This releases references to vnodes,
* which could cause I/O if the file has been unlinked.
* Need to do this early enough that we can still sleep.
* Can't free the entire vmspace as the kernel stack
* may be mapped within that space also.
*/
if (--vm->vm_refcnt == 0) {
if (vm->vm_shm)
shmexit(p);
- pmap_remove_pages(vmspace_pmap(vm), VM_MIN_ADDRESS,
- VM_MAXUSER_ADDRESS);
- (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS,
- VM_MAXUSER_ADDRESS);
+ pmap_remove_pages(vmspace_pmap(vm), vm_map_min(&vm->vm_map),
+ vm_map_max(&vm->vm_map));
+ (void) vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
+ vm_map_max(&vm->vm_map));
vm->vm_freer = p;
}
sx_xlock(&proctree_lock);
if (SESS_LEADER(p)) {
register struct session *sp;
sp = p->p_session;
if (sp->s_ttyvp) {
/*
* Controlling process.
* Signal foreground pgrp,
* drain controlling terminal
* and revoke access to controlling terminal.
*/
if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) {
tp = sp->s_ttyp;
if (sp->s_ttyp->t_pgrp) {
PGRP_LOCK(sp->s_ttyp->t_pgrp);
pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
PGRP_UNLOCK(sp->s_ttyp->t_pgrp);
}
/* XXX tp should be locked. */
sx_xunlock(&proctree_lock);
(void) ttywait(tp);
sx_xlock(&proctree_lock);
/*
* The tty could have been revoked
* if we blocked.
*/
if (sp->s_ttyvp) {
ttyvp = sp->s_ttyvp;
SESS_LOCK(p->p_session);
sp->s_ttyvp = NULL;
SESS_UNLOCK(p->p_session);
sx_xunlock(&proctree_lock);
VOP_REVOKE(ttyvp, REVOKEALL);
vrele(ttyvp);
sx_xlock(&proctree_lock);
}
}
if (sp->s_ttyvp) {
ttyvp = sp->s_ttyvp;
SESS_LOCK(p->p_session);
sp->s_ttyvp = NULL;
SESS_UNLOCK(p->p_session);
vrele(ttyvp);
}
/*
* s_ttyp is not zero'd; we use this to indicate
* that the session once had a controlling terminal.
* (for logging and informational purposes)
*/
}
SESS_LOCK(p->p_session);
sp->s_leader = NULL;
SESS_UNLOCK(p->p_session);
}
fixjobc(p, p->p_pgrp, 0);
sx_xunlock(&proctree_lock);
(void)acct_process(td);
#ifdef KTRACE
/*
* release trace file
*/
PROC_LOCK(p);
mtx_lock(&ktrace_mtx);
p->p_traceflag = 0; /* don't trace the vrele() */
tracevp = p->p_tracep;
p->p_tracep = NULL;
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p);
if (tracevp != NULL)
vrele(tracevp);
#endif
/*
* Release reference to text vnode
*/
if ((vtmp = p->p_textvp) != NULL) {
p->p_textvp = NULL;
vrele(vtmp);
}
/*
* Release our limits structure.
*/
mtx_assert(&Giant, MA_OWNED);
if (--p->p_limit->p_refcnt == 0) {
FREE(p->p_limit, M_SUBPROC);
p->p_limit = NULL;
}
/*
* Release this thread's reference to the ucred. The actual proc
* reference will stay around until the proc is harvested by
* wait(). At this point the ucred is immutable (no other threads
* from this proc are around that can change it) so we leave the
* per-thread ucred pointer intact in case it is needed although
* in theory nothing should be using it at this point.
*/
crfree(td->td_ucred);
/*
* Remove proc from allproc queue and pidhash chain.
* Place onto zombproc. Unlink from parent's child list.
*/
sx_xlock(&allproc_lock);
LIST_REMOVE(p, p_list);
LIST_INSERT_HEAD(&zombproc, p, p_list);
LIST_REMOVE(p, p_hash);
sx_xunlock(&allproc_lock);
sx_xlock(&proctree_lock);
q = LIST_FIRST(&p->p_children);
if (q != NULL) /* only need this if any child is S_ZOMB */
wakeup(initproc);
for (; q != NULL; q = nq) {
nq = LIST_NEXT(q, p_sibling);
PROC_LOCK(q);
proc_reparent(q, initproc);
q->p_sigparent = SIGCHLD;
/*
* Traced processes are killed
* since their existence means someone is screwing up.
*/
if (q->p_flag & P_TRACED) {
q->p_flag &= ~P_TRACED;
psignal(q, SIGKILL);
}
PROC_UNLOCK(q);
}
/*
* Save exit status and final rusage info, adding in child rusage
* info and self times.
*/
PROC_LOCK(p);
p->p_xstat = rv;
*p->p_ru = p->p_stats->p_ru;
mtx_lock_spin(&sched_lock);
calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL);
mtx_unlock_spin(&sched_lock);
ruadd(p->p_ru, &p->p_stats->p_cru);
/*
* Notify interested parties of our demise.
*/
KNOTE(&p->p_klist, NOTE_EXIT);
/*
* Notify parent that we're gone. If parent has the PS_NOCLDWAIT
* flag set, or if the handler is set to SIG_IGN, notify process
* 1 instead (and hope it will handle this situation).
*/
PROC_LOCK(p->p_pptr);
if (p->p_pptr->p_procsig->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
struct proc *pp;
pp = p->p_pptr;
PROC_UNLOCK(pp);
proc_reparent(p, initproc);
PROC_LOCK(p->p_pptr);
/*
* If this was the last child of our parent, notify
* the parent so that, if it was wait(2)ing, it will
* continue.
*/
if (LIST_EMPTY(&pp->p_children))
wakeup(pp);
}
if (p->p_sigparent && p->p_pptr != initproc)
psignal(p->p_pptr, p->p_sigparent);
else
psignal(p->p_pptr, SIGCHLD);
PROC_UNLOCK(p->p_pptr);
/*
* If this is a kthread, then wakeup anyone waiting for it to exit.
*/
if (p->p_flag & P_KTHREAD)
wakeup(p);
PROC_UNLOCK(p);
/*
* Finally, call machine-dependent code to release the remaining
* resources including address space, the kernel stack and pcb.
* The address space is released by "vmspace_exitfree(p)" in
* vm_waitproc().
*/
cpu_exit(td);
PROC_LOCK(p);
PROC_LOCK(p->p_pptr);
sx_xunlock(&proctree_lock);
mtx_lock_spin(&sched_lock);
while (mtx_owned(&Giant))
mtx_unlock(&Giant);
/*
* We have to wait until after releasing all locks before
* changing p_state. If we block on a mutex then we will be
* back at SRUN when we resume and our parent will never
* harvest us.
*/
p->p_state = PRS_ZOMBIE;
wakeup(p->p_pptr);
PROC_UNLOCK(p->p_pptr);
cnt.v_swtch++;
binuptime(PCPU_PTR(switchtime));
PCPU_SET(switchticks, ticks);
cpu_sched_exit(td); /* XXXKSE check if this should be in thread_exit */
/*
* Make sure this thread is discarded from the zombie.
* This will also release this thread's reference to the ucred.
*/
thread_exit();
panic("exit1");
}
#ifdef COMPAT_43
/*
* MPSAFE. The dirty work is handled by wait1().
*/
int
owait(td, uap)
struct thread *td;
register struct owait_args /* {
int dummy;
} */ *uap;
{
struct wait_args w;
w.options = 0;
w.rusage = NULL;
w.pid = WAIT_ANY;
w.status = NULL;
return (wait1(td, &w, 1));
}
#endif /* COMPAT_43 */
/*
* MPSAFE. The dirty work is handled by wait1().
*/
int
wait4(td, uap)
struct thread *td;
struct wait_args *uap;
{
return (wait1(td, uap, 0));
}
/*
* MPSAFE
*/
static int
wait1(td, uap, compat)
register struct thread *td;
register struct wait_args /* {
int pid;
int *status;
int options;
struct rusage *rusage;
} */ *uap;
int compat;
{
struct rusage ru;
register int nfound;
register struct proc *p, *q, *t;
int status, error;
struct kse *ke;
struct ksegrp *kg;
q = td->td_proc;
if (uap->pid == 0) {
PROC_LOCK(q);
uap->pid = -q->p_pgid;
PROC_UNLOCK(q);
}
if (uap->options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE))
return (EINVAL);
mtx_lock(&Giant);
loop:
nfound = 0;
sx_xlock(&proctree_lock);
LIST_FOREACH(p, &q->p_children, p_sibling) {
PROC_LOCK(p);
if (uap->pid != WAIT_ANY &&
p->p_pid != uap->pid && p->p_pgid != -uap->pid) {
PROC_UNLOCK(p);
continue;
}
/*
* This special case handles a kthread spawned by linux_clone
* (see linux_misc.c). The linux_wait4 and linux_waitpid
* functions need to be able to distinguish between waiting
* on a process and waiting on a thread. It is a thread if
* p_sigparent is not SIGCHLD, and the WLINUXCLONE option
* signifies we want to wait for threads and not processes.
*/
if ((p->p_sigparent != SIGCHLD) ^
((uap->options & WLINUXCLONE) != 0)) {
PROC_UNLOCK(p);
continue;
}
nfound++;
if (p->p_state == PRS_ZOMBIE) {
/*
* Charge the child's scheduling CPU usage to the parent.
* XXXKSE assume only one thread & kse & ksegrp;
* keep estcpu in each ksegrp,
* so charge it to the ksegrp that did the wait.
* Since process estcpu is the sum of all ksegrps,
* this is strictly as expected.
* Assume that the child process aggregated all
* the estcpu into the 'built-in' ksegrp.
* XXXKSE
*/
if (curthread->td_proc->p_pid != 1) {
mtx_lock_spin(&sched_lock);
curthread->td_ksegrp->kg_estcpu =
ESTCPULIM(curthread->td_ksegrp->kg_estcpu +
FIRST_KSEGRP_IN_PROC(p)->kg_estcpu);
mtx_unlock_spin(&sched_lock);
}
td->td_retval[0] = p->p_pid;
#ifdef COMPAT_43
if (compat)
td->td_retval[1] = p->p_xstat;
else
#endif
if (uap->status) {
status = p->p_xstat; /* convert to int */
PROC_UNLOCK(p);
if ((error = copyout(&status,
uap->status, sizeof(status)))) {
sx_xunlock(&proctree_lock);
mtx_unlock(&Giant);
return (error);
}
PROC_LOCK(p);
}
if (uap->rusage) {
bcopy(p->p_ru, &ru, sizeof(ru));
PROC_UNLOCK(p);
if ((error = copyout(&ru,
uap->rusage, sizeof (struct rusage)))) {
sx_xunlock(&proctree_lock);
mtx_unlock(&Giant);
return (error);
}
} else
PROC_UNLOCK(p);
/*
* If we got the child via a ptrace 'attach',
* we need to give it back to the old parent.
*/
if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
PROC_LOCK(p);
p->p_oppid = 0;
proc_reparent(p, t);
PROC_UNLOCK(p);
psignal(t, SIGCHLD);
wakeup(t);
PROC_UNLOCK(t);
sx_xunlock(&proctree_lock);
mtx_unlock(&Giant);
return (0);
}
/*
* Remove other references to this process to ensure
* we have an exclusive reference.
*/
leavepgrp(p);
sx_xlock(&allproc_lock);
LIST_REMOVE(p, p_list); /* off zombproc */
sx_xunlock(&allproc_lock);
LIST_REMOVE(p, p_sibling);
sx_xunlock(&proctree_lock);
/*
* As a side effect of this lock, we know that
* all other writes to this proc are visible now, so
* no more locking is needed for p.
*/
PROC_LOCK(p);
p->p_xstat = 0; /* XXX: why? */
PROC_UNLOCK(p);
PROC_LOCK(q);
ruadd(&q->p_stats->p_cru, p->p_ru);
PROC_UNLOCK(q);
FREE(p->p_ru, M_ZOMBIE);
p->p_ru = NULL;
/*
* Decrement the count of procs running with this uid.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
/*
* Free up credentials.
*/
crfree(p->p_ucred);
p->p_ucred = NULL; /* XXX: why? */
/*
* Remove unused arguments
*/
pargs_drop(p->p_args);
p->p_args = NULL;
if (--p->p_procsig->ps_refcnt == 0) {
if (p->p_sigacts != &p->p_uarea->u_sigacts)
FREE(p->p_sigacts, M_SUBPROC);
FREE(p->p_procsig, M_SUBPROC);
p->p_procsig = NULL;
}
/*
* There should only be one KSE/KSEGRP but
* do it right anyhow.
*/
FOREACH_KSEGRP_IN_PROC(p, kg) {
FOREACH_KSE_IN_GROUP(kg, ke) {
/* Free the KSE spare thread. */
if (ke->ke_tdspare != NULL) {
thread_free(ke->ke_tdspare);
ke->ke_tdspare = NULL;
}
}
}
thread_reap(); /* check for zombie threads */
/*
* Give vm and machine-dependent layer a chance
* to free anything that cpu_exit couldn't
* release while still running in process context.
*/
vm_waitproc(p);
mtx_destroy(&p->p_mtx);
KASSERT(FIRST_THREAD_IN_PROC(p),
("wait1: no residual thread!"));
uma_zfree(proc_zone, p);
sx_xlock(&allproc_lock);
nprocs--;
sx_xunlock(&allproc_lock);
mtx_unlock(&Giant);
return (0);
}
if (P_SHOULDSTOP(p) && ((p->p_flag & P_WAITED) == 0) &&
(p->p_flag & P_TRACED || uap->options & WUNTRACED)) {
p->p_flag |= P_WAITED;
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
#ifdef COMPAT_43
if (compat) {
td->td_retval[1] = W_STOPCODE(p->p_xstat);
PROC_UNLOCK(p);
error = 0;
} else
#endif
if (uap->status) {
status = W_STOPCODE(p->p_xstat);
PROC_UNLOCK(p);
error = copyout(&status,
uap->status, sizeof(status));
} else {
PROC_UNLOCK(p);
error = 0;
}
mtx_unlock(&Giant);
return (error);
}
if (uap->options & WCONTINUED && (p->p_flag & P_CONTINUED)) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
p->p_flag &= ~P_CONTINUED;
PROC_UNLOCK(p);
if (uap->status) {
status = SIGCONT;
error = copyout(&status,
uap->status, sizeof(status));
} else
error = 0;
mtx_unlock(&Giant);
return (error);
}
PROC_UNLOCK(p);
}
if (nfound == 0) {
sx_xunlock(&proctree_lock);
mtx_unlock(&Giant);
return (ECHILD);
}
if (uap->options & WNOHANG) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = 0;
mtx_unlock(&Giant);
return (0);
}
PROC_LOCK(q);
sx_xunlock(&proctree_lock);
error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
PROC_UNLOCK(q);
if (error) {
mtx_unlock(&Giant);
return (error);
}
goto loop;
}
/*
* Make process 'parent' the new parent of process 'child'.
* Must be called with an exclusive hold of proctree lock.
*/
void
proc_reparent(child, parent)
register struct proc *child;
register struct proc *parent;
{
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(child, MA_OWNED);
if (child->p_pptr == parent)
return;
LIST_REMOVE(child, p_sibling);
LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
}
/*
* The next two functions are to handle adding/deleting items on the
* exit callout list
*
* at_exit():
* Take the arguments given and put them onto the exit callout list;
* first, however, make sure that it's not already there.
* Returns 0 on success. (A usage sketch follows rm_at_exit() below.)
*/
int
at_exit(function)
exitlist_fn function;
{
struct exitlist *ep;
#ifdef INVARIANTS
/* Be noisy if the programmer has lost track of things */
if (rm_at_exit(function))
printf("WARNING: exit callout entry (%p) already present\n",
function);
#endif
ep = malloc(sizeof(*ep), M_ATEXIT, M_NOWAIT);
if (ep == NULL)
return (ENOMEM);
ep->function = function;
TAILQ_INSERT_TAIL(&exit_list, ep, next);
return (0);
}
/*
* Scan the exit callout list for the given item and remove it.
* Returns the number of items removed (0 or 1)
*/
int
rm_at_exit(function)
exitlist_fn function;
{
struct exitlist *ep;
TAILQ_FOREACH(ep, &exit_list, next) {
if (ep->function == function) {
TAILQ_REMOVE(&exit_list, ep, next);
free(ep, M_ATEXIT);
return (1);
}
}
return (0);
}
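/*
 * A minimal usage sketch (hypothetical module, hedged): a subsystem that
 * keeps per-process state registers a cleanup callback once and removes it
 * again before it goes away, roughly:
 *
 *	static void example_proc_exit(struct proc *p);
 *
 *	error = at_exit(example_proc_exit);	(0, or ENOMEM)
 *	...
 *	(void)rm_at_exit(example_proc_exit);	(1 if removed, 0 if absent)
 */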
Index: head/sys/kern/kern_resource.c
===================================================================
--- head/sys/kern/kern_resource.c (revision 103766)
+++ head/sys/kern/kern_resource.c (revision 103767)
@@ -1,1045 +1,1048 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
* $FreeBSD$
*/
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sx.h>
+#include <sys/sysent.h>
#include <sys/time.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
static int donice(struct thread *td, struct proc *chgp, int n);
static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
static struct mtx uihashtbl_mtx;
static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
static u_long uihash; /* size of hash table - 1 */
static struct uidinfo *uilookup(uid_t uid);
/*
* Resource controls and accounting.
*/
#ifndef _SYS_SYSPROTO_H_
struct getpriority_args {
int which;
int who;
};
#endif
/*
* MPSAFE
*/
int
getpriority(td, uap)
struct thread *td;
register struct getpriority_args *uap;
{
struct proc *p;
int low = PRIO_MAX + 1;
int error = 0;
struct ksegrp *kg;
mtx_lock(&Giant);
switch (uap->which) {
case PRIO_PROCESS:
if (uap->who == 0)
low = td->td_ksegrp->kg_nice;
else {
p = pfind(uap->who);
if (p == NULL)
break;
if (p_cansee(td, p) == 0) {
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_nice < low)
low = kg->kg_nice;
}
}
PROC_UNLOCK(p);
}
break;
case PRIO_PGRP: {
register struct pgrp *pg;
sx_slock(&proctree_lock);
if (uap->who == 0) {
pg = td->td_proc->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
break;
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (!p_cansee(td, p)) {
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_nice < low)
low = kg->kg_nice;
}
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pg);
break;
}
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
PROC_LOCK(p);
if (!p_cansee(td, p) &&
p->p_ucred->cr_uid == uap->who) {
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_nice < low)
low = kg->kg_nice;
}
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
break;
default:
error = EINVAL;
break;
}
if (low == PRIO_MAX + 1 && error == 0)
error = ESRCH;
td->td_retval[0] = low;
mtx_unlock(&Giant);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setpriority_args {
int which;
int who;
int prio;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
setpriority(td, uap)
struct thread *td;
register struct setpriority_args *uap;
{
struct proc *curp = td->td_proc;
register struct proc *p;
int found = 0, error = 0;
mtx_lock(&Giant);
switch (uap->which) {
case PRIO_PROCESS:
if (uap->who == 0) {
PROC_LOCK(curp);
error = donice(td, curp, uap->prio);
PROC_UNLOCK(curp);
} else {
p = pfind(uap->who);
if (p == 0)
break;
if (p_cansee(td, p) == 0)
error = donice(td, p, uap->prio);
PROC_UNLOCK(p);
}
found++;
break;
case PRIO_PGRP: {
register struct pgrp *pg;
sx_slock(&proctree_lock);
if (uap->who == 0) {
pg = curp->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
break;
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (!p_cansee(td, p)) {
error = donice(td, p, uap->prio);
found++;
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pg);
break;
}
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_ucred->cr_uid == uap->who &&
!p_cansee(td, p)) {
error = donice(td, p, uap->prio);
found++;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
break;
default:
error = EINVAL;
break;
}
if (found == 0 && error == 0)
error = ESRCH;
mtx_unlock(&Giant);
return (error);
}
/*
* Set "nice" for a process. Doesn't really understand threaded processes well
* but does try. Has the unfortunate side effect of making all the NICE
* values for a process's ksegrps the same. This suggests that
* NICE values should be stored as a process nice and deltas for the ksegrps
* (but not yet).
*/
static int
donice(struct thread *td, struct proc *p, int n)
{
int error;
int low = PRIO_MAX + 1;
struct ksegrp *kg;
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((error = p_cansched(td, p)))
return (error);
if (n > PRIO_MAX)
n = PRIO_MAX;
if (n < PRIO_MIN)
n = PRIO_MIN;
/*
* Without privilege, only allow renicing down to the lowest nice
* already in the process; e.g. nices of 4,3,2 allow nice to 3 but not to 1.
*/
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_nice < low)
low = kg->kg_nice;
}
if (n < low && suser(td))
return (EACCES);
FOREACH_KSEGRP_IN_PROC(p, kg) {
kg->kg_nice = n;
(void)resetpriority(kg);
}
return (0);
}
/* rtprio system call */
#ifndef _SYS_SYSPROTO_H_
struct rtprio_args {
int function;
pid_t pid;
struct rtprio *rtp;
};
#endif
/*
* Set realtime priority
*/
/*
* MPSAFE
*/
/* ARGSUSED */
int
rtprio(td, uap)
struct thread *td;
register struct rtprio_args *uap;
{
struct proc *curp = td->td_proc;
register struct proc *p;
struct rtprio rtp;
int error, cierror = 0;
/* Perform copyin before acquiring locks if needed. */
if (uap->function == RTP_SET)
cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
if (uap->pid == 0) {
p = curp;
PROC_LOCK(p);
} else {
p = pfind(uap->pid);
if (p == NULL)
return (ESRCH);
}
switch (uap->function) {
case RTP_LOOKUP:
if ((error = p_cansee(td, p)))
break;
mtx_lock_spin(&sched_lock);
pri_to_rtp(FIRST_KSEGRP_IN_PROC(p), &rtp);
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
case RTP_SET:
if ((error = p_cansched(td, p)) || (error = cierror))
break;
/* disallow setting rtprio in most cases if not superuser */
if (suser(td) != 0) {
/* can't set someone else's */
if (uap->pid) {
error = EPERM;
break;
}
/* can't set realtime priority */
/*
* Realtime priority has to be restricted for reasons which should be
* obvious. However, for idle priority, there is a potential for
* system deadlock if an idleprio process gains a lock on a resource
* that other processes need (and the idleprio process can't run
* due to a CPU-bound normal process). Fix me! XXX
*/
#if 0
if (RTP_PRIO_IS_REALTIME(rtp.type))
#endif
if (rtp.type != RTP_PRIO_NORMAL) {
error = EPERM;
break;
}
}
mtx_lock_spin(&sched_lock);
error = rtp_to_pri(&rtp, FIRST_KSEGRP_IN_PROC(p));
mtx_unlock_spin(&sched_lock);
break;
default:
error = EINVAL;
break;
}
PROC_UNLOCK(p);
return (error);
}
int
rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg)
{
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
switch (RTP_PRIO_BASE(rtp->type)) {
case RTP_PRIO_REALTIME:
kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio;
break;
case RTP_PRIO_NORMAL:
kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio;
break;
case RTP_PRIO_IDLE:
kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio;
break;
default:
return (EINVAL);
}
kg->kg_pri_class = rtp->type;
if (curthread->td_ksegrp == kg) {
curthread->td_base_pri = kg->kg_user_pri;
curthread->td_priority = kg->kg_user_pri; /* XXX dubious */
}
return (0);
}
void
pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp)
{
switch (PRI_BASE(kg->kg_pri_class)) {
case PRI_REALTIME:
rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME;
break;
case PRI_TIMESHARE:
rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE;
break;
case PRI_IDLE:
rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE;
break;
default:
break;
}
rtp->type = kg->kg_pri_class;
}
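/*
 * A minimal userland sketch (hedged): asking for realtime priority 5 for
 * the current process, which rtp_to_pri() above maps onto kernel priority
 * PRI_MIN_REALTIME + 5 (the RTP_SET path above requires superuser for
 * anything other than RTP_PRIO_NORMAL):
 *
 *	struct rtprio rtp;
 *
 *	rtp.type = RTP_PRIO_REALTIME;
 *	rtp.prio = 5;
 *	if (rtprio(RTP_SET, 0, &rtp) != 0)
 *		err(1, "rtprio");
 */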
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct osetrlimit_args {
u_int which;
struct orlimit *rlp;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
osetrlimit(td, uap)
struct thread *td;
register struct osetrlimit_args *uap;
{
struct orlimit olim;
struct rlimit lim;
int error;
if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
return (error);
lim.rlim_cur = olim.rlim_cur;
lim.rlim_max = olim.rlim_max;
mtx_lock(&Giant);
error = dosetrlimit(td, uap->which, &lim);
mtx_unlock(&Giant);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ogetrlimit_args {
u_int which;
struct orlimit *rlp;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
ogetrlimit(td, uap)
struct thread *td;
register struct ogetrlimit_args *uap;
{
struct proc *p = td->td_proc;
struct orlimit olim;
int error;
if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
mtx_lock(&Giant);
olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur;
if (olim.rlim_cur == -1)
olim.rlim_cur = 0x7fffffff;
olim.rlim_max = p->p_rlimit[uap->which].rlim_max;
if (olim.rlim_max == -1)
olim.rlim_max = 0x7fffffff;
error = copyout(&olim, uap->rlp, sizeof(olim));
mtx_unlock(&Giant);
return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
#ifndef _SYS_SYSPROTO_H_
struct __setrlimit_args {
u_int which;
struct rlimit *rlp;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
setrlimit(td, uap)
struct thread *td;
register struct __setrlimit_args *uap;
{
struct rlimit alim;
int error;
if ((error = copyin(uap->rlp, &alim, sizeof (struct rlimit))))
return (error);
mtx_lock(&Giant);
error = dosetrlimit(td, uap->which, &alim);
mtx_unlock(&Giant);
return (error);
}
int
dosetrlimit(td, which, limp)
struct thread *td;
u_int which;
struct rlimit *limp;
{
struct proc *p = td->td_proc;
register struct rlimit *alimp;
int error;
GIANT_REQUIRED;
if (which >= RLIM_NLIMITS)
return (EINVAL);
alimp = &p->p_rlimit[which];
/*
* Preserve historical bugs by treating negative limits as unsigned.
*/
if (limp->rlim_cur < 0)
limp->rlim_cur = RLIM_INFINITY;
if (limp->rlim_max < 0)
limp->rlim_max = RLIM_INFINITY;
if (limp->rlim_cur > alimp->rlim_max ||
limp->rlim_max > alimp->rlim_max)
if ((error = suser_cred(td->td_ucred, PRISON_ROOT)))
return (error);
if (limp->rlim_cur > limp->rlim_max)
limp->rlim_cur = limp->rlim_max;
if (p->p_limit->p_refcnt > 1 &&
(p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
p->p_limit->p_refcnt--;
p->p_limit = limcopy(p->p_limit);
alimp = &p->p_rlimit[which];
}
switch (which) {
case RLIMIT_CPU:
if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000)
p->p_limit->p_cpulimit = RLIM_INFINITY;
else
p->p_limit->p_cpulimit =
(rlim_t)1000000 * limp->rlim_cur;
break;
case RLIMIT_DATA:
if (limp->rlim_cur > maxdsiz)
limp->rlim_cur = maxdsiz;
if (limp->rlim_max > maxdsiz)
limp->rlim_max = maxdsiz;
break;
case RLIMIT_STACK:
if (limp->rlim_cur > maxssiz)
limp->rlim_cur = maxssiz;
if (limp->rlim_max > maxssiz)
limp->rlim_max = maxssiz;
/*
* Stack is allocated to the max at exec time with only
* "rlim_cur" bytes accessible. If stack limit is going
* up make more accessible, if going down make inaccessible.
*/
if (limp->rlim_cur != alimp->rlim_cur) {
vm_offset_t addr;
vm_size_t size;
vm_prot_t prot;
if (limp->rlim_cur > alimp->rlim_cur) {
- prot = VM_PROT_ALL;
+ prot = p->p_sysent->sv_stackprot;
size = limp->rlim_cur - alimp->rlim_cur;
- addr = USRSTACK - limp->rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ limp->rlim_cur;
} else {
prot = VM_PROT_NONE;
size = alimp->rlim_cur - limp->rlim_cur;
- addr = USRSTACK - alimp->rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ alimp->rlim_cur;
}
addr = trunc_page(addr);
size = round_page(size);
(void) vm_map_protect(&p->p_vmspace->vm_map,
addr, addr+size, prot, FALSE);
}
break;
case RLIMIT_NOFILE:
if (limp->rlim_cur > maxfilesperproc)
limp->rlim_cur = maxfilesperproc;
if (limp->rlim_max > maxfilesperproc)
limp->rlim_max = maxfilesperproc;
break;
case RLIMIT_NPROC:
if (limp->rlim_cur > maxprocperuid)
limp->rlim_cur = maxprocperuid;
if (limp->rlim_max > maxprocperuid)
limp->rlim_max = maxprocperuid;
if (limp->rlim_cur < 1)
limp->rlim_cur = 1;
if (limp->rlim_max < 1)
limp->rlim_max = 1;
break;
}
*alimp = *limp;
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct __getrlimit_args {
u_int which;
struct rlimit *rlp;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
getrlimit(td, uap)
struct thread *td;
register struct __getrlimit_args *uap;
{
int error;
struct proc *p = td->td_proc;
if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
mtx_lock(&Giant);
error = copyout(&p->p_rlimit[uap->which], uap->rlp,
sizeof (struct rlimit));
mtx_unlock(&Giant);
return(error);
}
/*
* Transform the running time and tick information in proc p into user,
* system, and interrupt time usage.
*/
void
calcru(p, up, sp, ip)
struct proc *p;
struct timeval *up;
struct timeval *sp;
struct timeval *ip;
{
/* {user, system, interrupt, total} {ticks, usec}; previous tu: */
u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
u_int64_t uut = 0, sut = 0, iut = 0;
int s;
struct timeval tv;
struct bintime bt;
struct kse *ke;
struct ksegrp *kg;
mtx_assert(&sched_lock, MA_OWNED);
/* XXX: why spl-protect ? worst case is an off-by-one report */
FOREACH_KSEGRP_IN_PROC(p, kg) {
/* we could accumulate per ksegrp and per process here */
FOREACH_KSE_IN_GROUP(kg, ke) {
s = splstatclock();
ut = ke->ke_uticks;
st = ke->ke_sticks;
it = ke->ke_iticks;
splx(s);
tt = ut + st + it;
if (tt == 0) {
st = 1;
tt = 1;
}
if (ke == curthread->td_kse) {
/*
* Adjust for the current time slice. This is actually fairly
* important since the error here is on the order of a time
* quantum, which is much greater than the sampling error.
* XXXKSE use a different test due to threads on other
* processors also being 'current'.
*/
binuptime(&bt);
bintime_sub(&bt, PCPU_PTR(switchtime));
bintime_add(&bt, &p->p_runtime);
} else {
bt = p->p_runtime;
}
bintime2timeval(&bt, &tv);
tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
ptu = ke->ke_uu + ke->ke_su + ke->ke_iu;
if (tu < ptu || (int64_t)tu < 0) {
/* XXX no %qd in kernel. Truncate. */
printf("calcru: negative time of %ld usec for pid %d (%s)\n",
(long)tu, p->p_pid, p->p_comm);
tu = ptu;
}
/* Subdivide tu. */
uu = (tu * ut) / tt;
su = (tu * st) / tt;
iu = tu - uu - su;
/* Enforce monotonicity. */
if (uu < ke->ke_uu || su < ke->ke_su || iu < ke->ke_iu) {
if (uu < ke->ke_uu)
uu = ke->ke_uu;
else if (uu + ke->ke_su + ke->ke_iu > tu)
uu = tu - ke->ke_su - ke->ke_iu;
if (st == 0)
su = ke->ke_su;
else {
su = ((tu - uu) * st) / (st + it);
if (su < ke->ke_su)
su = ke->ke_su;
else if (uu + su + ke->ke_iu > tu)
su = tu - uu - ke->ke_iu;
}
KASSERT(uu + su + ke->ke_iu <= tu,
("calcru: monotonisation botch 1"));
iu = tu - uu - su;
KASSERT(iu >= ke->ke_iu,
("calcru: monotonisation botch 2"));
}
ke->ke_uu = uu;
ke->ke_su = su;
ke->ke_iu = iu;
uut += uu;
sut += su;
iut += iu;
} /* end kse loop */
} /* end kseg loop */
up->tv_sec = uut / 1000000;
up->tv_usec = uut % 1000000;
sp->tv_sec = sut / 1000000;
sp->tv_usec = sut % 1000000;
if (ip != NULL) {
ip->tv_sec = iut / 1000000;
ip->tv_usec = iut % 1000000;
}
}
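/*
 * A worked example of the subdivision above (illustrative numbers): with
 * ut = 300, st = 100, it = 0 statclock ticks (so tt = 400) and a measured
 * runtime of tu = 4,000,000 usec, the split is
 *	uu = (tu * ut) / tt = 3,000,000 usec of user time,
 *	su = (tu * st) / tt = 1,000,000 usec of system time,
 *	iu = tu - uu - su   =         0 usec of interrupt time,
 * subject to the monotonicity clamping against the previous ke_uu, ke_su
 * and ke_iu values.
 */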
#ifndef _SYS_SYSPROTO_H_
struct getrusage_args {
int who;
struct rusage *rusage;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
getrusage(td, uap)
register struct thread *td;
register struct getrusage_args *uap;
{
struct proc *p = td->td_proc;
register struct rusage *rup;
int error = 0;
mtx_lock(&Giant);
switch (uap->who) {
case RUSAGE_SELF:
rup = &p->p_stats->p_ru;
mtx_lock_spin(&sched_lock);
calcru(p, &rup->ru_utime, &rup->ru_stime, NULL);
mtx_unlock_spin(&sched_lock);
break;
case RUSAGE_CHILDREN:
rup = &p->p_stats->p_cru;
break;
default:
rup = NULL;
error = EINVAL;
break;
}
mtx_unlock(&Giant);
if (error == 0) {
error = copyout(rup, uap->rusage, sizeof (struct rusage));
}
return(error);
}
void
ruadd(ru, ru2)
register struct rusage *ru, *ru2;
{
register long *ip, *ip2;
register int i;
timevaladd(&ru->ru_utime, &ru2->ru_utime);
timevaladd(&ru->ru_stime, &ru2->ru_stime);
if (ru->ru_maxrss < ru2->ru_maxrss)
ru->ru_maxrss = ru2->ru_maxrss;
ip = &ru->ru_first; ip2 = &ru2->ru_first;
for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
*ip++ += *ip2++;
}
/*
* Make a copy of the plimit structure.
* We share these structures copy-on-write after fork,
* and copy when a limit is changed.
*/
struct plimit *
limcopy(lim)
struct plimit *lim;
{
register struct plimit *copy;
MALLOC(copy, struct plimit *, sizeof(struct plimit),
M_SUBPROC, M_WAITOK);
bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit));
copy->p_lflags = 0;
copy->p_refcnt = 1;
return (copy);
}
/*
* Find the uidinfo structure for a uid. This structure is used to
* track the total resource consumption (process count, socket buffer
* size, etc.) for the uid and impose limits.
*/
void
uihashinit()
{
uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
}
/*
* Look up a uidinfo struct for the parameter uid.
* uihashtbl_mtx must be locked.
*/
static struct uidinfo *
uilookup(uid)
uid_t uid;
{
struct uihashhead *uipp;
struct uidinfo *uip;
mtx_assert(&uihashtbl_mtx, MA_OWNED);
uipp = UIHASH(uid);
LIST_FOREACH(uip, uipp, ui_hash)
if (uip->ui_uid == uid)
break;
return (uip);
}
/*
* Find or allocate a struct uidinfo for a particular uid.
* Increase refcount on uidinfo struct returned.
* uifree() should be called on a struct uidinfo when released.
*/
struct uidinfo *
uifind(uid)
uid_t uid;
{
struct uidinfo *uip;
mtx_lock(&uihashtbl_mtx);
uip = uilookup(uid);
if (uip == NULL) {
struct uidinfo *old_uip;
mtx_unlock(&uihashtbl_mtx);
uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
mtx_lock(&uihashtbl_mtx);
/*
* There's a chance someone created our uidinfo while we
* were in malloc and not holding the lock, so we have to
* make sure we don't insert a duplicate uidinfo
*/
if ((old_uip = uilookup(uid)) != NULL) {
/* someone else beat us to it */
free(uip, M_UIDINFO);
uip = old_uip;
} else {
uip->ui_mtxp = mtx_pool_alloc();
uip->ui_uid = uid;
LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
}
}
uihold(uip);
mtx_unlock(&uihashtbl_mtx);
return (uip);
}
/*
* Place another refcount on a uidinfo struct.
*/
void
uihold(uip)
struct uidinfo *uip;
{
UIDINFO_LOCK(uip);
uip->ui_ref++;
UIDINFO_UNLOCK(uip);
}
/*-
* Since uidinfo structs have a long lifetime, we use an
* opportunistic refcounting scheme to avoid locking the lookup hash
* for each release.
*
* If the refcount hits 0, we need to free the structure,
* which means we need to lock the hash.
* Optimal case:
* After locking the struct and lowering the refcount, if we find
* that we don't need to free, simply unlock and return.
* Suboptimal case:
* If lowering the refcount means we need to free, bump the count
* back up, drop the lock and acquire the locks in the proper
* order to try again.
*/
void
uifree(uip)
struct uidinfo *uip;
{
/* Prepare for optimal case. */
UIDINFO_LOCK(uip);
if (--uip->ui_ref != 0) {
UIDINFO_UNLOCK(uip);
return;
}
/* Prepare for suboptimal case. */
uip->ui_ref++;
UIDINFO_UNLOCK(uip);
mtx_lock(&uihashtbl_mtx);
UIDINFO_LOCK(uip);
/*
* We must subtract one from the count again because we backed out
* our initial subtraction before dropping the lock.
* Since another thread may have added a reference after we dropped the
* initial lock we have to test for zero again.
*/
if (--uip->ui_ref == 0) {
LIST_REMOVE(uip, ui_hash);
mtx_unlock(&uihashtbl_mtx);
if (uip->ui_sbsize != 0)
/* XXX no %qd in kernel. Truncate. */
printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
uip->ui_uid, (long)uip->ui_sbsize);
if (uip->ui_proccnt != 0)
printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
uip->ui_uid, uip->ui_proccnt);
UIDINFO_UNLOCK(uip);
FREE(uip, M_UIDINFO);
return;
}
mtx_unlock(&uihashtbl_mtx);
UIDINFO_UNLOCK(uip);
}
/*
* Change the count associated with the number of processes
* a given user is using. When 'max' is 0, don't enforce a limit.
*/
int
chgproccnt(uip, diff, max)
struct uidinfo *uip;
int diff;
int max;
{
UIDINFO_LOCK(uip);
/* don't allow them to exceed max, but allow subtraction */
if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
UIDINFO_UNLOCK(uip);
return (0);
}
uip->ui_proccnt += diff;
if (uip->ui_proccnt < 0)
printf("negative proccnt for uid = %d\n", uip->ui_uid);
UIDINFO_UNLOCK(uip);
return (1);
}
/*
* Change the total socket buffer size a user has used.
*/
int
chgsbsize(uip, hiwat, to, max)
struct uidinfo *uip;
u_int *hiwat;
u_int to;
rlim_t max;
{
rlim_t new;
int s;
s = splnet();
UIDINFO_LOCK(uip);
new = uip->ui_sbsize + to - *hiwat;
/* don't allow them to exceed max, but allow subtraction */
if (to > *hiwat && new > max) {
splx(s);
UIDINFO_UNLOCK(uip);
return (0);
}
uip->ui_sbsize = new;
*hiwat = to;
if (uip->ui_sbsize < 0)
printf("negative sbsize for uid = %d\n", uip->ui_uid);
splx(s);
UIDINFO_UNLOCK(uip);
return (1);
}
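/*
 * A minimal usage sketch (hedged) of the uidinfo interfaces above, as a
 * caller such as the fork path might use them: look up the uid (which
 * takes a reference), charge a resource against it, and release the
 * reference when the process goes away:
 *
 *	struct uidinfo *uip;
 *
 *	uip = uifind(uid);			(adds a reference)
 *	if (!chgproccnt(uip, 1, maxprocperuid))
 *		...				(per-uid process limit hit)
 *	...
 *	(void)chgproccnt(uip, -1, 0);		(max of 0: no limit check)
 *	uifree(uip);				(drops the reference)
 */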
Index: head/sys/sys/imgact.h
===================================================================
--- head/sys/sys/imgact.h (revision 103766)
+++ head/sys/sys/imgact.h (revision 103767)
@@ -1,78 +1,78 @@
/*-
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SYS_IMGACT_H_
#define _SYS_IMGACT_H_
#define MAXSHELLCMDLEN 128
+struct sysentvec;
struct thread;
struct vm_object;
struct image_params {
struct proc *proc; /* our process struct */
struct execve_args *uap; /* syscall arguments */
struct vnode *vp; /* pointer to vnode of file to exec */
struct vm_object *object; /* The vm object for this vp */
struct vattr *attr; /* attributes of file */
const char *image_header; /* head of file to exec */
char *stringbase; /* base address of tmp string storage */
char *stringp; /* current 'end' pointer of tmp strings */
char *endargs; /* end of argv vector */
int stringspace; /* space left in tmp string storage area */
int argc, envc; /* count of argument and environment strings */
char *argv0; /* Replacement for argv[0] when interpreting */
unsigned long entry_addr; /* entry address of target executable */
char vmspace_destroyed; /* flag - we've blown away original vm space */
char interpreted; /* flag - this executable is interpreted */
char interpreter_name[MAXSHELLCMDLEN]; /* name of the interpreter */
void *auxargs; /* ELF Auxinfo structure pointer */
struct vm_page *firstpage; /* first page that we mapped */
char *fname; /* pointer to filename of executable (user space) */
unsigned long ps_strings; /* PS_STRINGS for BSD/OS binaries */
size_t auxarg_size;
};
#ifdef _KERNEL
int exec_check_permissions(struct image_params *);
register_t *exec_copyout_strings(struct image_params *);
int exec_extract_strings(struct image_params *);
-int exec_new_vmspace(struct image_params *, vm_offset_t, vm_offset_t,
- vm_offset_t);
+int exec_new_vmspace(struct image_params *, struct sysentvec *);
void exec_setregs(struct thread *, u_long, u_long, u_long);
int exec_shell_imgact(struct image_params *);
#endif
#endif /* !_SYS_IMGACT_H_ */
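/*
 * A note on the exec_new_vmspace() change above (hedged sketch, not part
 * of this header): image activators now pass the sysentvec for the ABI
 * they implement instead of explicit address-space bounds, so the user VM
 * layout (including the sv_usrstack and sv_stackprot fields used in the
 * kern_resource.c hunk) comes from one place.  A caller, assuming a
 * hypothetical example_sysvec, would look roughly like:
 *
 *	extern struct sysentvec example_sysvec;
 *
 *	error = exec_new_vmspace(imgp, &example_sysvec);
 *	if (error)
 *		return (error);
 */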
Index: head/sys/vm/vm_glue.c
===================================================================
--- head/sys/vm/vm_glue.c (revision 103766)
+++ head/sys/vm/vm_glue.c (revision 103767)
@@ -1,873 +1,865 @@
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $FreeBSD$
*/
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/shm.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>
#include <machine/limits.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <sys/user.h>
extern int maxslp;
/*
* System initialization
*
* Note: proc0 from proc.h
*/
static void vm_init_limits(void *);
SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)
/*
* THIS MUST BE THE LAST INITIALIZATION ITEM!!!
*
* Note: run scheduling should be divorced from the vm system.
*/
static void scheduler(void *);
SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL)
#ifndef NO_SWAPPING
static void swapout(struct proc *);
static void vm_proc_swapin(struct proc *p);
static void vm_proc_swapout(struct proc *p);
#endif
/*
* MPSAFE
*/
int
kernacc(addr, len, rw)
caddr_t addr;
int len, rw;
{
boolean_t rv;
vm_offset_t saddr, eaddr;
vm_prot_t prot;
KASSERT((rw & ~VM_PROT_ALL) == 0,
("illegal ``rw'' argument to kernacc (%x)\n", rw));
prot = rw;
saddr = trunc_page((vm_offset_t)addr);
eaddr = round_page((vm_offset_t)addr + len);
rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
return (rv == TRUE);
}
/*
* MPSAFE
*/
int
useracc(addr, len, rw)
caddr_t addr;
int len, rw;
{
boolean_t rv;
vm_prot_t prot;
+ vm_map_t map;
KASSERT((rw & ~VM_PROT_ALL) == 0,
("illegal ``rw'' argument to useracc (%x)\n", rw));
prot = rw;
- /*
- * XXX - check separately to disallow access to user area and user
- * page tables - they are in the map.
- *
- * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was once
- * only used (as an end address) in trap.c. Use it as an end address
- * here too. This bogusness has spread. I just fixed where it was
- * used as a max in vm_mmap.c.
- */
- if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS
- || (vm_offset_t) addr + len < (vm_offset_t) addr) {
+ map = &curproc->p_vmspace->vm_map;
+ if ((vm_offset_t)addr + len > vm_map_max(map) ||
+ (vm_offset_t)addr + len < (vm_offset_t)addr) {
return (FALSE);
}
- rv = vm_map_check_protection(&curproc->p_vmspace->vm_map,
- trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
- prot);
+ rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
+ round_page((vm_offset_t)addr + len), prot);
return (rv == TRUE);
}
/*
* MPSAFE
*/
void
vslock(addr, len)
caddr_t addr;
u_int len;
{
vm_map_wire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr),
round_page((vm_offset_t)addr + len), FALSE);
}
/*
* MPSAFE
*/
void
vsunlock(addr, len)
caddr_t addr;
u_int len;
{
vm_map_unwire(&curproc->p_vmspace->vm_map,
trunc_page((vm_offset_t)addr),
round_page((vm_offset_t)addr + len), FALSE);
}
/*
* Create the U area for a new process.
* This routine directly affects the fork perf for a process.
*/
void
vm_proc_new(struct proc *p)
{
vm_page_t ma[UAREA_PAGES];
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
u_int i;
/*
* Allocate object for the upage.
*/
upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
p->p_upages_obj = upobj;
/*
* Get a kernel virtual address for the U area for this process.
*/
up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
if (up == 0)
panic("vm_proc_new: upage allocation failed");
p->p_uarea = (struct user *)up;
for (i = 0; i < UAREA_PAGES; i++) {
/*
* Get a uarea page.
*/
m = vm_page_grab(upobj, i,
VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
ma[i] = m;
vm_page_wakeup(m);
vm_page_flag_clear(m, PG_ZERO);
m->valid = VM_PAGE_BITS_ALL;
}
/*
* Enter the pages into the kernel address space.
*/
pmap_qenter(up, ma, UAREA_PAGES);
}
/*
* Dispose the U area for a process that has exited.
* This routine directly impacts the exit perf of a process.
* XXX proc_zone is marked UMA_ZONE_NOFREE, so this should never be called.
*/
void
vm_proc_dispose(struct proc *p)
{
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
upobj = p->p_upages_obj;
if (upobj->resident_page_count != UAREA_PAGES)
panic("vm_proc_dispose: incorrect number of pages in upobj");
vm_page_lock_queues();
while ((m = TAILQ_FIRST(&upobj->memq)) != NULL) {
vm_page_busy(m);
vm_page_unwire(m, 0);
vm_page_free(m);
}
vm_page_unlock_queues();
up = (vm_offset_t)p->p_uarea;
pmap_qremove(up, UAREA_PAGES);
kmem_free(kernel_map, up, UAREA_PAGES * PAGE_SIZE);
vm_object_deallocate(upobj);
}
#ifndef NO_SWAPPING
/*
* Allow the U area for a process to be prejudicially paged out.
*/
void
vm_proc_swapout(struct proc *p)
{
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
upobj = p->p_upages_obj;
if (upobj->resident_page_count != UAREA_PAGES)
panic("vm_proc_dispose: incorrect number of pages in upobj");
vm_page_lock_queues();
TAILQ_FOREACH(m, &upobj->memq, listq) {
vm_page_dirty(m);
vm_page_unwire(m, 0);
}
vm_page_unlock_queues();
up = (vm_offset_t)p->p_uarea;
pmap_qremove(up, UAREA_PAGES);
}
/*
* Bring the U area for a specified process back in.
*/
void
vm_proc_swapin(struct proc *p)
{
vm_page_t ma[UAREA_PAGES];
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
int rv;
int i;
upobj = p->p_upages_obj;
for (i = 0; i < UAREA_PAGES; i++) {
m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
if (m->valid != VM_PAGE_BITS_ALL) {
rv = vm_pager_get_pages(upobj, &m, 1, 0);
if (rv != VM_PAGER_OK)
panic("vm_proc_swapin: cannot get upage");
}
ma[i] = m;
}
if (upobj->resident_page_count != UAREA_PAGES)
panic("vm_proc_swapin: lost pages from upobj");
vm_page_lock_queues();
TAILQ_FOREACH(m, &upobj->memq, listq) {
m->valid = VM_PAGE_BITS_ALL;
vm_page_wire(m);
vm_page_wakeup(m);
}
vm_page_unlock_queues();
up = (vm_offset_t)p->p_uarea;
pmap_qenter(up, ma, UAREA_PAGES);
}
#endif
/*
* Implement fork's actions on an address space.
* Here we arrange for the address space to be copied or referenced,
* allocate a user struct (pcb and kernel stack), then call the
* machine-dependent layer to fill those in and make the new process
* ready to run. The new process is set up so that it returns directly
* to user mode to avoid stack copying and relocation problems.
*/
void
vm_forkproc(td, p2, td2, flags)
struct thread *td;
struct proc *p2;
struct thread *td2;
int flags;
{
struct proc *p1 = td->td_proc;
struct user *up;
GIANT_REQUIRED;
if ((flags & RFPROC) == 0) {
/*
* Divorce the memory, if it is shared, essentially
* this changes shared memory amongst threads, into
* COW locally.
*/
if ((flags & RFMEM) == 0) {
if (p1->p_vmspace->vm_refcnt > 1) {
vmspace_unshare(p1);
}
}
cpu_fork(td, p2, td2, flags);
return;
}
if (flags & RFMEM) {
p2->p_vmspace = p1->p_vmspace;
p1->p_vmspace->vm_refcnt++;
}
while (vm_page_count_severe()) {
VM_WAIT;
}
if ((flags & RFMEM) == 0) {
p2->p_vmspace = vmspace_fork(p1->p_vmspace);
pmap_pinit2(vmspace_pmap(p2->p_vmspace));
if (p1->p_vmspace->vm_shm)
shmfork(p1, p2);
}
/* XXXKSE this is unsatisfactory but should be adequate */
up = p2->p_uarea;
/*
* p_stats currently points at fields in the user struct, not at &u;
* it is instead reached through p_addr.  Copy parts of p_stats; zero
* the rest of p_stats (statistics).
*
* If procsig->ps_refcnt is 1 and p2->p_sigacts is NULL, we don't need
* to share sigacts, so we use up->u_sigacts.
*/
p2->p_stats = &up->u_stats;
if (p2->p_sigacts == NULL) {
if (p2->p_procsig->ps_refcnt != 1)
printf ("PID:%d NULL sigacts with refcnt not 1!\n",p2->p_pid);
p2->p_sigacts = &up->u_sigacts;
up->u_sigacts = *p1->p_sigacts;
}
bzero(&up->u_stats.pstat_startzero,
(unsigned) ((caddr_t) &up->u_stats.pstat_endzero -
(caddr_t) &up->u_stats.pstat_startzero));
bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy,
((caddr_t) &up->u_stats.pstat_endcopy -
(caddr_t) &up->u_stats.pstat_startcopy));
/*
* cpu_fork will copy and update the pcb, set up the kernel stack,
* and make the child ready to run.
*/
cpu_fork(td, p2, td2, flags);
}
/*
* Called after a process has been wait(2)ed upon and is being reaped.
* The idea is to reclaim resources that we could not reclaim while
* the process was still executing.
*/
void
vm_waitproc(p)
struct proc *p;
{
GIANT_REQUIRED;
cpu_wait(p);
vmspace_exitfree(p); /* and clean-out the vmspace */
}
/*
* Set default limits for VM system.
* Called for proc 0, and then inherited by all others.
*
* XXX should probably act directly on proc0.
*/
static void
vm_init_limits(udata)
void *udata;
{
struct proc *p = udata;
int rss_limit;
/*
* Set up the initial limits on process VM. Set the maximum resident
* set size to be half of (reasonably) available memory. Since this
* is a soft limit, it comes into effect only when the system is out
* of memory - half of main memory helps to favor smaller processes,
* and reduces thrashing of the object cache.
*/
p->p_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
p->p_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
p->p_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
p->p_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
/* limit the limit to no less than 2MB */
rss_limit = max(cnt.v_free_count, 512);
p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
}
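/*
 * Worked example for the 2MB floor above, assuming 4KB pages: the limit is
 * at least max(cnt.v_free_count, 512) == 512 pages, and
 * ptoa(512) == 512 << PAGE_SHIFT == 512 * 4096 == 2097152 bytes == 2MB.
 * On platforms with larger pages the floor is correspondingly larger.
 */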
void
faultin(p)
struct proc *p;
{
GIANT_REQUIRED;
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&sched_lock, MA_OWNED);
#ifdef NO_SWAPPING
if ((p->p_sflag & PS_INMEM) == 0)
panic("faultin: proc swapped out with NO_SWAPPING!");
#else
if ((p->p_sflag & PS_INMEM) == 0) {
struct thread *td;
++p->p_lock;
/*
* If another process is swapping in this process,
* just wait until it finishes.
*/
if (p->p_sflag & PS_SWAPPINGIN) {
mtx_unlock_spin(&sched_lock);
msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0);
mtx_lock_spin(&sched_lock);
--p->p_lock;
return;
}
p->p_sflag |= PS_SWAPPINGIN;
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
vm_proc_swapin(p);
FOREACH_THREAD_IN_PROC (p, td) {
pmap_swapin_thread(td);
TD_CLR_SWAPPED(td);
}
PROC_LOCK(p);
mtx_lock_spin(&sched_lock);
p->p_sflag &= ~PS_SWAPPINGIN;
p->p_sflag |= PS_INMEM;
FOREACH_THREAD_IN_PROC (p, td)
if (TD_CAN_RUN(td))
setrunnable(td);
wakeup(&p->p_sflag);
/* undo the effect of setting SLOCK above */
--p->p_lock;
}
#endif
}
/*
* This swapin algorithm attempts to swap in processes only if there
* is enough space for them. Of course, if a process waits for a long
* time, it will be swapped in anyway.
*
* XXXKSE - the process with the highest-priority thread counts.
*
* Giant is still held at this point, to be released in tsleep.
*/
/* ARGSUSED*/
static void
scheduler(dummy)
void *dummy;
{
struct proc *p;
struct thread *td;
int pri;
struct proc *pp;
int ppri;
mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
/* GIANT_REQUIRED */
loop:
if (vm_page_count_min()) {
VM_WAIT;
goto loop;
}
pp = NULL;
ppri = INT_MIN;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
struct ksegrp *kg;
if (p->p_sflag & (PS_INMEM | PS_SWAPPING | PS_SWAPPINGIN)) {
continue;
}
mtx_lock_spin(&sched_lock);
FOREACH_THREAD_IN_PROC(p, td) {
/*
* An otherwise runnable thread of a process
* swapped out has only the TDI_SWAPPED bit set.
*
*/
if (td->td_inhibitors == TDI_SWAPPED) {
kg = td->td_ksegrp;
pri = p->p_swtime + kg->kg_slptime;
if ((p->p_sflag & PS_SWAPINREQ) == 0) {
pri -= kg->kg_nice * 8;
}
/*
* if this ksegrp is higher priority
* and there is enough space, then select
* this process instead of the previous
* selection.
*/
if (pri > ppri) {
pp = p;
ppri = pri;
}
}
}
mtx_unlock_spin(&sched_lock);
}
sx_sunlock(&allproc_lock);
/*
* Nothing to do, back to sleep.
*/
if ((p = pp) == NULL) {
tsleep(&proc0, PVM, "sched", maxslp * hz / 2);
goto loop;
}
PROC_LOCK(p);
mtx_lock_spin(&sched_lock);
/*
* Another process may be bringing or may have already
* brought this process in while we traverse all threads.
* Or, this process may even be being swapped out again.
*/
if (p->p_sflag & (PS_INMEM|PS_SWAPPING|PS_SWAPPINGIN)) {
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
goto loop;
}
p->p_sflag &= ~PS_SWAPINREQ;
/*
* We would like to bring someone in (only if there is space).
* [What checks the space?]
*/
faultin(p);
PROC_UNLOCK(p);
p->p_swtime = 0;
mtx_unlock_spin(&sched_lock);
goto loop;
}
#ifndef NO_SWAPPING
/*
* swap_idle_threshold1 is the guaranteed swapped-in time for a process
*/
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1,
CTLFLAG_RW, &swap_idle_threshold1, 0, "");
/*
* Swap_idle_threshold2 is the time that a process can be idle before
* it will be swapped out, if idle swapping is enabled.
*/
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2,
CTLFLAG_RW, &swap_idle_threshold2, 0, "");
/*
* Swapout is driven by the pageout daemon.  Very simply, we find eligible
* procs and unwire their u-areas. We try to always "swap" at least one
* process in case we need the room for a swapin.
* If any procs have been sleeping/stopped for at least maxslp seconds,
* they are swapped. Else, we swap the longest-sleeping or stopped process,
* if any, otherwise the longest-resident process.
*/
void
swapout_procs(action)
int action;
{
struct proc *p;
struct thread *td;
struct ksegrp *kg;
struct proc *outp, *outp2;
int outpri, outpri2;
int didswap = 0;
GIANT_REQUIRED;
outp = outp2 = NULL;
outpri = outpri2 = INT_MIN;
retry:
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
struct vmspace *vm;
int minslptime = 100000;
/*
* Do not swap out a process that
* is waiting for VM data
* structures; there is a possible
* deadlock.  Test this first as
* this may block.
*
* Lock the map until swapout
* finishes, or a thread of this
* process may attempt to alter
* the map.
*
* Watch out for a process in
* creation. It may have no
* address space yet.
*
* An aio daemon switches its
* address space while running.
* Perform a quick check whether
* a process has P_SYSTEM.
*/
PROC_LOCK(p);
if ((p->p_flag & P_SYSTEM) != 0) {
PROC_UNLOCK(p);
continue;
}
mtx_lock_spin(&sched_lock);
if (p->p_state == PRS_NEW) {
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
continue;
}
vm = p->p_vmspace;
KASSERT(vm != NULL,
("swapout_procs: a process has no address space"));
++vm->vm_refcnt;
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
if (!vm_map_trylock(&vm->vm_map))
goto nextproc1;
PROC_LOCK(p);
if (p->p_lock != 0 ||
(p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
) != 0) {
goto nextproc2;
}
/*
* Only aiod changes the vmspace; however, it will be
* skipped because of the P_SYSTEM check in the
* if statement above.
*/
mtx_lock_spin(&sched_lock);
if ((p->p_sflag & (PS_INMEM|PS_SWAPPING|PS_SWAPPINGIN)) != PS_INMEM)
goto nextproc;
switch (p->p_state) {
default:
/* Don't swap out processes in any sort
* of 'special' state. */
goto nextproc;
case PRS_NORMAL:
/*
* Do not swap out a realtime process.
* Check all of its thread groups.
*/
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (PRI_IS_REALTIME(kg->kg_pri_class))
goto nextproc;
/*
* Guarantee swap_idle_threshold1
* time in memory.
*/
if (kg->kg_slptime < swap_idle_threshold1)
goto nextproc;
/*
* Do not swap out a process if it is
* waiting on a critical event of some
* kind or if there is a thread whose
* pageable memory may be accessed.
*
* This could be refined to support
* swapping out a thread.
*/
FOREACH_THREAD_IN_GROUP(kg, td) {
if ((td->td_priority) < PSOCK ||
!thread_safetoswapout(td))
goto nextproc;
}
/*
* If the system is under memory stress,
* or if we are swapping
* idle processes >= swap_idle_threshold2,
* then swap the process out.
*/
if (((action & VM_SWAP_NORMAL) == 0) &&
(((action & VM_SWAP_IDLE) == 0) ||
(kg->kg_slptime < swap_idle_threshold2)))
goto nextproc;
if (minslptime > kg->kg_slptime)
minslptime = kg->kg_slptime;
}
/*
* If the process has been asleep for a while and had
* most of its pages taken away already, swap it out.
*/
if ((action & VM_SWAP_NORMAL) ||
((action & VM_SWAP_IDLE) &&
(minslptime > swap_idle_threshold2))) {
swapout(p);
didswap++;
/*
* swapout() unlocks the proc lock.  This is
* ugly, but avoids a superfluous lock operation.
*/
mtx_unlock_spin(&sched_lock);
vm_map_unlock(&vm->vm_map);
vmspace_free(vm);
sx_sunlock(&allproc_lock);
goto retry;
}
}
nextproc:
mtx_unlock_spin(&sched_lock);
nextproc2:
PROC_UNLOCK(p);
vm_map_unlock(&vm->vm_map);
nextproc1:
vmspace_free(vm);
continue;
}
sx_sunlock(&allproc_lock);
/*
* If we swapped something out, and another process needed memory,
* then wake up the scheduler process.
*/
if (didswap)
wakeup(&proc0);
}
static void
swapout(p)
struct proc *p;
{
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
#if defined(SWAP_DEBUG)
printf("swapping out %d\n", p->p_pid);
#endif
/*
* The states of this process and its threads may have changed
* by now. Assuming that there is only one pageout daemon thread,
* this process should still be in memory.
*/
KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPING|PS_SWAPPINGIN)) == PS_INMEM,
("swapout: lost a swapout race?"));
#if defined(INVARIANTS)
/*
* Make sure that all threads are safe to be swapped out.
*
* Alternatively, we could swap out only safe threads.
*/
FOREACH_THREAD_IN_PROC(p, td) {
KASSERT(thread_safetoswapout(td),
("swapout: there is a thread not safe for swapout"));
}
#endif /* INVARIANTS */
++p->p_stats->p_ru.ru_nswap;
/*
* remember the process resident count
*/
p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
PROC_UNLOCK(p);
FOREACH_THREAD_IN_PROC (p, td) /* shouldn't be possible, but..... */
if (TD_ON_RUNQ(td)) { /* XXXKSE */
panic("swapping out runnable process");
remrunqueue(td); /* XXXKSE */
}
p->p_sflag &= ~PS_INMEM;
p->p_sflag |= PS_SWAPPING;
mtx_unlock_spin(&sched_lock);
vm_proc_swapout(p);
FOREACH_THREAD_IN_PROC(p, td) {
pmap_swapout_thread(td);
TD_SET_SWAPPED(td);
}
mtx_lock_spin(&sched_lock);
p->p_sflag &= ~PS_SWAPPING;
p->p_swtime = 0;
}
#endif /* !NO_SWAPPING */
Index: head/sys/vm/vm_map.c
===================================================================
--- head/sys/vm/vm_map.c (revision 103766)
+++ head/sys/vm/vm_map.c (revision 103767)
@@ -1,3159 +1,3158 @@
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $FreeBSD$
*/
/*
* Virtual memory mapping module.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
+#include <sys/sysent.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>
/*
* Virtual memory maps provide for the mapping, protection,
* and sharing of virtual memory objects. In addition,
* this module provides for an efficient virtual copy of
* memory from one map to another.
*
* Synchronization is required prior to most operations.
*
* Maps consist of an ordered doubly-linked list of simple
* entries; a single hint is used to speed up lookups.
*
* Since portions of maps are specified by start/end addresses,
* which may not align with existing map entries, all
* routines merely "clip" entries to these start/end values.
* [That is, an entry is split into two, bordering at a
* start or end value.] Note that these clippings may not
* always be necessary (as the two resulting entries are then
* not changed); however, the clipping is done for convenience.
*
* As mentioned above, virtual copy operations are performed
* by copying VM object references from one map to
* another, and then marking both regions as copy-on-write.
*/
/*
* vm_map_startup:
*
* Initialize the vm_map module. Must be called before
* any other vm_map routines.
*
* Map and entry structures are allocated from the general
* purpose memory pool with some exceptions:
*
* - The kernel map and kmem submap are allocated statically.
* - Kernel map entries are allocated out of a static pool.
*
* These restrictions are necessary since malloc() uses the
* maps and requires map entries.
*/
static uma_zone_t mapentzone;
static uma_zone_t kmapentzone;
static uma_zone_t mapzone;
static uma_zone_t vmspace_zone;
static struct vm_object kmapentobj;
static void vmspace_zinit(void *mem, int size);
static void vmspace_zfini(void *mem, int size);
static void vm_map_zinit(void *mem, int size);
static void vm_map_zfini(void *mem, int size);
static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max);
#ifdef INVARIANTS
static void vm_map_zdtor(void *mem, int size, void *arg);
static void vmspace_zdtor(void *mem, int size, void *arg);
#endif
void
vm_map_startup(void)
{
mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
#ifdef INVARIANTS
vm_map_zdtor,
#else
NULL,
#endif
vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_prealloc(mapzone, MAX_KMAP);
kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
uma_prealloc(kmapentzone, MAX_KMAPENT);
mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_prealloc(mapentzone, MAX_MAPENT);
}
static void
vmspace_zfini(void *mem, int size)
{
struct vmspace *vm;
vm = (struct vmspace *)mem;
vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
}
static void
vmspace_zinit(void *mem, int size)
{
struct vmspace *vm;
vm = (struct vmspace *)mem;
vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map));
}
static void
vm_map_zfini(void *mem, int size)
{
vm_map_t map;
map = (vm_map_t)mem;
lockdestroy(&map->lock);
}
static void
vm_map_zinit(void *mem, int size)
{
vm_map_t map;
map = (vm_map_t)mem;
map->nentries = 0;
map->size = 0;
map->infork = 0;
lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE);
}
#ifdef INVARIANTS
static void
vmspace_zdtor(void *mem, int size, void *arg)
{
struct vmspace *vm;
vm = (struct vmspace *)mem;
vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
}
static void
vm_map_zdtor(void *mem, int size, void *arg)
{
vm_map_t map;
map = (vm_map_t)mem;
KASSERT(map->nentries == 0,
("map %p nentries == %d on free.",
map, map->nentries));
KASSERT(map->size == 0,
("map %p size == %lu on free.",
map, (unsigned long)map->size));
KASSERT(map->infork == 0,
("map %p infork == %d on free.",
map, map->infork));
}
#endif /* INVARIANTS */
/*
* Allocate a vmspace structure, including a vm_map and pmap,
* and initialize those structures. The refcnt is set to 1.
* The remaining fields must be initialized by the caller.
*/
struct vmspace *
vmspace_alloc(min, max)
vm_offset_t min, max;
{
struct vmspace *vm;
GIANT_REQUIRED;
vm = uma_zalloc(vmspace_zone, M_WAITOK);
CTR1(KTR_VM, "vmspace_alloc: %p", vm);
_vm_map_init(&vm->vm_map, min, max);
pmap_pinit(vmspace_pmap(vm));
vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */
vm->vm_refcnt = 1;
vm->vm_shm = NULL;
vm->vm_freer = NULL;
return (vm);
}
void
vm_init2(void)
{
uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
(VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8);
vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
#ifdef INVARIANTS
vmspace_zdtor,
#else
NULL,
#endif
vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
pmap_init2();
vm_object_init2();
}
static __inline void
vmspace_dofree(struct vmspace *vm)
{
CTR1(KTR_VM, "vmspace_free: %p", vm);
/*
* Lock the map, to wait out all other references to it.
* Delete all of the mappings and pages they hold, then call
* the pmap module to reclaim anything left.
*/
vm_map_lock(&vm->vm_map);
(void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
vm->vm_map.max_offset);
vm_map_unlock(&vm->vm_map);
pmap_release(vmspace_pmap(vm));
uma_zfree(vmspace_zone, vm);
}
void
vmspace_free(struct vmspace *vm)
{
GIANT_REQUIRED;
if (vm->vm_refcnt == 0)
panic("vmspace_free: attempt to free already freed vmspace");
if (--vm->vm_refcnt == 0)
vmspace_dofree(vm);
}
void
vmspace_exitfree(struct proc *p)
{
struct vmspace *vm;
GIANT_REQUIRED;
if (p == p->p_vmspace->vm_freer) {
vm = p->p_vmspace;
p->p_vmspace = NULL;
vmspace_dofree(vm);
}
}
/*
* vmspace_swap_count() - count the approximate swap usage in pages for a
* vmspace.
*
* Swap usage is determined by taking the proportional swap used by
* VM objects backing the VM map.  To make up for fractional losses,
* if the VM object has any swap use at all, the associated map entries
* count for at least 1 swap page.
*/
int
vmspace_swap_count(struct vmspace *vmspace)
{
vm_map_t map = &vmspace->vm_map;
vm_map_entry_t cur;
int count = 0;
vm_map_lock_read(map);
for (cur = map->header.next; cur != &map->header; cur = cur->next) {
vm_object_t object;
if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
(object = cur->object.vm_object) != NULL &&
object->type == OBJT_SWAP
) {
int n = (cur->end - cur->start) / PAGE_SIZE;
if (object->un_pager.swp.swp_bcount) {
count += object->un_pager.swp.swp_bcount *
SWAP_META_PAGES * n / object->size + 1;
}
}
}
vm_map_unlock_read(map);
return (count);
}
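/*
 * Worked example of the proportional estimate above, with illustrative
 * numbers only: for a 1000-page OBJT_SWAP object with swp_bcount == 4 and
 * SWAP_META_PAGES == 16, a map entry covering 250 of its pages contributes
 * 4 * 16 * 250 / 1000 + 1 == 17 pages to the count.
 */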
void
_vm_map_lock(vm_map_t map, const char *file, int line)
{
int error;
if (map->system_map)
GIANT_REQUIRED;
error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread);
KASSERT(error == 0, ("%s: failed to get lock", __func__));
map->timestamp++;
}
void
_vm_map_unlock(vm_map_t map, const char *file, int line)
{
lockmgr(&map->lock, LK_RELEASE, NULL, curthread);
}
void
_vm_map_lock_read(vm_map_t map, const char *file, int line)
{
int error;
if (map->system_map)
GIANT_REQUIRED;
error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread);
KASSERT(error == 0, ("%s: failed to get lock", __func__));
}
void
_vm_map_unlock_read(vm_map_t map, const char *file, int line)
{
lockmgr(&map->lock, LK_RELEASE, NULL, curthread);
}
int
_vm_map_trylock(vm_map_t map, const char *file, int line)
{
int error;
if (map->system_map)
GIANT_REQUIRED;
error = lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, NULL, curthread);
return (error == 0);
}
int
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
{
KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE,
("%s: lock not held", __func__));
map->timestamp++;
return (0);
}
void
_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
{
KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE,
("%s: lock not held", __func__));
}
/*
* vm_map_unlock_and_wait:
*/
int
vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait)
{
int retval;
mtx_lock(&Giant);
vm_map_unlock(map);
retval = tsleep(&map->root, PVM, "vmmapw", 0);
mtx_unlock(&Giant);
return (retval);
}
/*
* vm_map_wakeup:
*/
void
vm_map_wakeup(vm_map_t map)
{
/*
* Acquire and release Giant to prevent a wakeup() from being
* performed (and lost) between the vm_map_unlock() and the
* tsleep() in vm_map_unlock_and_wait().
*/
mtx_lock(&Giant);
mtx_unlock(&Giant);
wakeup(&map->root);
}
long
vmspace_resident_count(struct vmspace *vmspace)
{
return pmap_resident_count(vmspace_pmap(vmspace));
}
/*
* vm_map_create:
*
* Creates and returns a new empty VM map with
* the given physical map structure, and having
* the given lower and upper address bounds.
*/
vm_map_t
vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
{
vm_map_t result;
result = uma_zalloc(mapzone, M_WAITOK);
CTR1(KTR_VM, "vm_map_create: %p", result);
_vm_map_init(result, min, max);
result->pmap = pmap;
return (result);
}
/*
* Initialize an existing vm_map structure
* such as that in the vmspace structure.
* The pmap is set elsewhere.
*/
static void
_vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
{
map->header.next = map->header.prev = &map->header;
map->needs_wakeup = FALSE;
map->system_map = 0;
map->min_offset = min;
map->max_offset = max;
map->first_free = &map->header;
map->root = NULL;
map->timestamp = 0;
}
void
vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
{
_vm_map_init(map, min, max);
lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE);
}
/*
* vm_map_entry_dispose: [ internal use only ]
*
* Inverse of vm_map_entry_create.
*/
static void
vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
{
uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
}
/*
* vm_map_entry_create: [ internal use only ]
*
* Allocates a VM map entry for insertion.
* No entry fields are filled in.
*/
static vm_map_entry_t
vm_map_entry_create(vm_map_t map)
{
vm_map_entry_t new_entry;
if (map->system_map)
new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
else
new_entry = uma_zalloc(mapentzone, M_WAITOK);
if (new_entry == NULL)
panic("vm_map_entry_create: kernel resources exhausted");
return (new_entry);
}
/*
* vm_map_entry_set_behavior:
*
* Set the expected access behavior, either normal, random, or
* sequential.
*/
static __inline void
vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
{
entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
(behavior & MAP_ENTRY_BEHAV_MASK);
}
/*
* vm_map_entry_splay:
*
* Implements Sleator and Tarjan's top-down splay algorithm. Returns
* the vm_map_entry containing the given address. If, however, that
* address is not found in the vm_map, returns a vm_map_entry that is
* adjacent to the address, coming before or after it.
*/
static vm_map_entry_t
vm_map_entry_splay(vm_offset_t address, vm_map_entry_t root)
{
struct vm_map_entry dummy;
vm_map_entry_t lefttreemax, righttreemin, y;
if (root == NULL)
return (root);
lefttreemax = righttreemin = &dummy;
for (;; root = y) {
if (address < root->start) {
if ((y = root->left) == NULL)
break;
if (address < y->start) {
/* Rotate right. */
root->left = y->right;
y->right = root;
root = y;
if ((y = root->left) == NULL)
break;
}
/* Link into the new root's right tree. */
righttreemin->left = root;
righttreemin = root;
} else if (address >= root->end) {
if ((y = root->right) == NULL)
break;
if (address >= y->end) {
/* Rotate left. */
root->right = y->left;
y->left = root;
root = y;
if ((y = root->right) == NULL)
break;
}
/* Link into the new root's left tree. */
lefttreemax->right = root;
lefttreemax = root;
} else
break;
}
/* Assemble the new root. */
lefttreemax->right = root->left;
righttreemin->left = root->right;
root->left = dummy.right;
root->right = dummy.left;
return (root);
}
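/*
 * Sketch of how callers use the splay: after
 *
 *	map->root = vm_map_entry_splay(addr, map->root);
 *
 * the new root either contains addr (root->start <= addr < root->end) or
 * is an entry adjacent to addr, so a lookup needs only a comparison or two
 * against the root; see vm_map_lookup_entry() below.
 */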
/*
* vm_map_entry_{un,}link:
*
* Insert/remove entries from maps.
*/
static void
vm_map_entry_link(vm_map_t map,
vm_map_entry_t after_where,
vm_map_entry_t entry)
{
CTR4(KTR_VM,
"vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
map->nentries, entry, after_where);
map->nentries++;
entry->prev = after_where;
entry->next = after_where->next;
entry->next->prev = entry;
after_where->next = entry;
if (after_where != &map->header) {
if (after_where != map->root)
vm_map_entry_splay(after_where->start, map->root);
entry->right = after_where->right;
entry->left = after_where;
after_where->right = NULL;
} else {
entry->right = map->root;
entry->left = NULL;
}
map->root = entry;
}
static void
vm_map_entry_unlink(vm_map_t map,
vm_map_entry_t entry)
{
vm_map_entry_t next, prev, root;
if (entry != map->root)
vm_map_entry_splay(entry->start, map->root);
if (entry->left == NULL)
root = entry->right;
else {
root = vm_map_entry_splay(entry->start, entry->left);
root->right = entry->right;
}
map->root = root;
prev = entry->prev;
next = entry->next;
next->prev = prev;
prev->next = next;
map->nentries--;
CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
map->nentries, entry);
}
/*
* vm_map_lookup_entry: [ internal use only ]
*
* Finds the map entry containing (or
* immediately preceding) the specified address
* in the given map; the entry is returned
* in the "entry" parameter. The boolean
* result indicates whether the address is
* actually contained in the map.
*/
boolean_t
vm_map_lookup_entry(
vm_map_t map,
vm_offset_t address,
vm_map_entry_t *entry) /* OUT */
{
vm_map_entry_t cur;
cur = vm_map_entry_splay(address, map->root);
if (cur == NULL)
*entry = &map->header;
else {
map->root = cur;
if (address >= cur->start) {
*entry = cur;
if (cur->end > address)
return (TRUE);
} else
*entry = cur->prev;
}
return (FALSE);
}
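/*
 * Typical caller sketch: with the map locked, most users of
 * vm_map_lookup_entry() in this file clip on a hit and step forward on a
 * miss, e.g.
 *
 *	if (vm_map_lookup_entry(map, start, &entry))
 *		vm_map_clip_start(map, entry, start);
 *	else
 *		entry = entry->next;
 */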
/*
* vm_map_insert:
*
* Inserts the given whole VM object into the target
* map at the specified address range. The object's
* size should match that of the address range.
*
* Requires that the map be locked, and leaves it so.
*
* If object is non-NULL, ref count must be bumped by caller
* prior to making the call to account for the new entry.
*/
int
vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
int cow)
{
vm_map_entry_t new_entry;
vm_map_entry_t prev_entry;
vm_map_entry_t temp_entry;
vm_eflags_t protoeflags;
/*
* Check that the start and end points are not bogus.
*/
if ((start < map->min_offset) || (end > map->max_offset) ||
(start >= end))
return (KERN_INVALID_ADDRESS);
/*
* Find the entry prior to the proposed starting address; if it's part
* of an existing entry, this range is bogus.
*/
if (vm_map_lookup_entry(map, start, &temp_entry))
return (KERN_NO_SPACE);
prev_entry = temp_entry;
/*
* Assert that the next entry doesn't overlap the end point.
*/
if ((prev_entry->next != &map->header) &&
(prev_entry->next->start < end))
return (KERN_NO_SPACE);
protoeflags = 0;
if (cow & MAP_COPY_ON_WRITE)
protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
if (cow & MAP_NOFAULT) {
protoeflags |= MAP_ENTRY_NOFAULT;
KASSERT(object == NULL,
("vm_map_insert: paradoxical MAP_NOFAULT request"));
}
if (cow & MAP_DISABLE_SYNCER)
protoeflags |= MAP_ENTRY_NOSYNC;
if (cow & MAP_DISABLE_COREDUMP)
protoeflags |= MAP_ENTRY_NOCOREDUMP;
if (object) {
/*
* When object is non-NULL, it could be shared with another
* process. We have to set or clear OBJ_ONEMAPPING
* appropriately.
*/
vm_object_lock(object);
if ((object->ref_count > 1) || (object->shadow_count != 0)) {
vm_object_clear_flag(object, OBJ_ONEMAPPING);
}
vm_object_unlock(object);
}
else if ((prev_entry != &map->header) &&
(prev_entry->eflags == protoeflags) &&
(prev_entry->end == start) &&
(prev_entry->wired_count == 0) &&
((prev_entry->object.vm_object == NULL) ||
vm_object_coalesce(prev_entry->object.vm_object,
OFF_TO_IDX(prev_entry->offset),
(vm_size_t)(prev_entry->end - prev_entry->start),
(vm_size_t)(end - prev_entry->end)))) {
/*
* We were able to extend the object. Determine if we
* can extend the previous map entry to include the
* new range as well.
*/
if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
(prev_entry->protection == prot) &&
(prev_entry->max_protection == max)) {
map->size += (end - prev_entry->end);
prev_entry->end = end;
vm_map_simplify_entry(map, prev_entry);
return (KERN_SUCCESS);
}
/*
* If we can extend the object but cannot extend the
* map entry, we have to create a new map entry. We
* must bump the ref count on the extended object to
* account for it. object may be NULL.
*/
object = prev_entry->object.vm_object;
offset = prev_entry->offset +
(prev_entry->end - prev_entry->start);
vm_object_reference(object);
}
/*
* NOTE: if conditionals fail, object can be NULL here. This occurs
* in things like the buffer map where we manage kva but do not manage
* backing objects.
*/
/*
* Create a new entry
*/
new_entry = vm_map_entry_create(map);
new_entry->start = start;
new_entry->end = end;
new_entry->eflags = protoeflags;
new_entry->object.vm_object = object;
new_entry->offset = offset;
new_entry->avail_ssize = 0;
new_entry->inheritance = VM_INHERIT_DEFAULT;
new_entry->protection = prot;
new_entry->max_protection = max;
new_entry->wired_count = 0;
/*
* Insert the new entry into the list
*/
vm_map_entry_link(map, prev_entry, new_entry);
map->size += new_entry->end - new_entry->start;
/*
* Update the free space hint
*/
if ((map->first_free == prev_entry) &&
(prev_entry->end >= new_entry->start)) {
map->first_free = new_entry;
}
#if 0
/*
* Temporarily removed to avoid MAP_STACK panic, due to
* MAP_STACK being a huge hack. Will be added back in
* when MAP_STACK (and the user stack mapping) is fixed.
*/
/*
* It may be possible to simplify the entry
*/
vm_map_simplify_entry(map, new_entry);
#endif
if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
mtx_lock(&Giant);
pmap_object_init_pt(map->pmap, start,
object, OFF_TO_IDX(offset), end - start,
cow & MAP_PREFAULT_PARTIAL);
mtx_unlock(&Giant);
}
return (KERN_SUCCESS);
}
/*
* Find sufficient space for `length' bytes in the given map, starting at
* `start'. The map must be locked. Returns 0 on success, 1 on no space.
*/
int
vm_map_findspace(
vm_map_t map,
vm_offset_t start,
vm_size_t length,
vm_offset_t *addr)
{
vm_map_entry_t entry, next;
vm_offset_t end;
if (start < map->min_offset)
start = map->min_offset;
if (start > map->max_offset)
return (1);
/*
* Look for the first possible address; if there's already something
* at this address, we have to start after it.
*/
if (start == map->min_offset) {
if ((entry = map->first_free) != &map->header)
start = entry->end;
} else {
vm_map_entry_t tmp;
if (vm_map_lookup_entry(map, start, &tmp))
start = tmp->end;
entry = tmp;
}
/*
* Look through the rest of the map, trying to fit a new region in the
* gap between existing regions, or after the very last region.
*/
for (;; start = (entry = next)->end) {
/*
* Find the end of the proposed new region. Be sure we didn't
* go beyond the end of the map, or wrap around the address;
* if so, we lose. Otherwise, if this is the last entry, or
* if the proposed new region fits before the next entry, we
* win.
*/
end = start + length;
if (end > map->max_offset || end < start)
return (1);
next = entry->next;
if (next == &map->header || next->start >= end)
break;
}
*addr = start;
if (map == kernel_map) {
vm_offset_t ksize;
if ((ksize = round_page(start + length)) > kernel_vm_end) {
mtx_lock(&Giant);
pmap_growkernel(ksize);
mtx_unlock(&Giant);
}
}
return (0);
}
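/*
 * Minimal usage sketch (vm_map_find() below is the real caller): with the
 * map locked,
 *
 *	if (vm_map_findspace(map, start, length, &addr) == 0)
 *		(void) vm_map_insert(map, NULL, 0, addr, addr + length,
 *		    VM_PROT_ALL, VM_PROT_ALL, 0);
 *
 * allocates an anonymous region at the first fit at or after start.
 */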
/*
* vm_map_find finds an unallocated region in the target address
* map with the given length. The search is defined to be
* first-fit from the specified address; the region found is
* returned in the same parameter.
*
* If object is non-NULL, ref count must be bumped by caller
* prior to making the call to account for the new entry.
*/
int
vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t *addr, /* IN/OUT */
vm_size_t length, boolean_t find_space, vm_prot_t prot,
vm_prot_t max, int cow)
{
vm_offset_t start;
int result, s = 0;
start = *addr;
if (map == kmem_map)
s = splvm();
vm_map_lock(map);
if (find_space) {
if (vm_map_findspace(map, start, length, addr)) {
vm_map_unlock(map);
if (map == kmem_map)
splx(s);
return (KERN_NO_SPACE);
}
start = *addr;
}
result = vm_map_insert(map, object, offset,
start, start + length, prot, max, cow);
vm_map_unlock(map);
if (map == kmem_map)
splx(s);
return (result);
}
/*
* vm_map_simplify_entry:
*
* Simplify the given map entry by merging with either neighbor. This
* routine also has the ability to merge with both neighbors.
*
* The map must be locked.
*
* This routine guarantees that the passed entry remains valid (though
* possibly extended). When merging, this routine may delete one or
* both neighbors.
*/
void
vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
{
vm_map_entry_t next, prev;
vm_size_t prevsize, esize;
if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP))
return;
prev = entry->prev;
if (prev != &map->header) {
prevsize = prev->end - prev->start;
if ( (prev->end == entry->start) &&
(prev->object.vm_object == entry->object.vm_object) &&
(!prev->object.vm_object ||
(prev->offset + prevsize == entry->offset)) &&
(prev->eflags == entry->eflags) &&
(prev->protection == entry->protection) &&
(prev->max_protection == entry->max_protection) &&
(prev->inheritance == entry->inheritance) &&
(prev->wired_count == entry->wired_count)) {
if (map->first_free == prev)
map->first_free = entry;
vm_map_entry_unlink(map, prev);
entry->start = prev->start;
entry->offset = prev->offset;
if (prev->object.vm_object)
vm_object_deallocate(prev->object.vm_object);
vm_map_entry_dispose(map, prev);
}
}
next = entry->next;
if (next != &map->header) {
esize = entry->end - entry->start;
if ((entry->end == next->start) &&
(next->object.vm_object == entry->object.vm_object) &&
(!entry->object.vm_object ||
(entry->offset + esize == next->offset)) &&
(next->eflags == entry->eflags) &&
(next->protection == entry->protection) &&
(next->max_protection == entry->max_protection) &&
(next->inheritance == entry->inheritance) &&
(next->wired_count == entry->wired_count)) {
if (map->first_free == next)
map->first_free = entry;
vm_map_entry_unlink(map, next);
entry->end = next->end;
if (next->object.vm_object)
vm_object_deallocate(next->object.vm_object);
vm_map_entry_dispose(map, next);
}
}
}
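/*
 * Illustrative example of a merge: two adjacent entries [A, B) and [B, C)
 * backed by the same object, with prev->offset + (B - A) == entry->offset
 * and identical eflags, protection, max_protection, inheritance and
 * wired_count, collapse into a single [A, C) entry; the discarded entry's
 * object reference is dropped via vm_object_deallocate().
 */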
/*
* vm_map_clip_start: [ internal use only ]
*
* Asserts that the given entry begins at or after
* the specified address; if necessary,
* it splits the entry into two.
*/
#define vm_map_clip_start(map, entry, startaddr) \
{ \
if (startaddr > entry->start) \
_vm_map_clip_start(map, entry, startaddr); \
}
/*
* This routine is called only when it is known that
* the entry must be split.
*/
static void
_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
{
vm_map_entry_t new_entry;
/*
* Split off the front portion -- note that we must insert the new
* entry BEFORE this one, so that this entry has the specified
* starting address.
*/
vm_map_simplify_entry(map, entry);
/*
* If there is no object backing this entry, we might as well create
* one now. If we defer it, an object can get created after the map
* is clipped, and individual objects will be created for the split-up
* map. This is a bit of a hack, but is also about the best place to
* put this improvement.
*/
if (entry->object.vm_object == NULL && !map->system_map) {
vm_object_t object;
object = vm_object_allocate(OBJT_DEFAULT,
atop(entry->end - entry->start));
entry->object.vm_object = object;
entry->offset = 0;
}
new_entry = vm_map_entry_create(map);
*new_entry = *entry;
new_entry->end = start;
entry->offset += (start - entry->start);
entry->start = start;
vm_map_entry_link(map, entry->prev, new_entry);
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
vm_object_reference(new_entry->object.vm_object);
}
}
/*
* vm_map_clip_end: [ internal use only ]
*
* Asserts that the given entry ends at or before
* the specified address; if necessary,
* it splits the entry into two.
*/
#define vm_map_clip_end(map, entry, endaddr) \
{ \
if (endaddr < entry->end) \
_vm_map_clip_end(map, entry, endaddr); \
}
/*
* This routine is called only when it is known that
* the entry must be split.
*/
static void
_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
{
vm_map_entry_t new_entry;
/*
* If there is no object backing this entry, we might as well create
* one now. If we defer it, an object can get created after the map
* is clipped, and individual objects will be created for the split-up
* map. This is a bit of a hack, but is also about the best place to
* put this improvement.
*/
if (entry->object.vm_object == NULL && !map->system_map) {
vm_object_t object;
object = vm_object_allocate(OBJT_DEFAULT,
atop(entry->end - entry->start));
entry->object.vm_object = object;
entry->offset = 0;
}
/*
* Create a new entry and insert it AFTER the specified entry
*/
new_entry = vm_map_entry_create(map);
*new_entry = *entry;
new_entry->start = entry->end = end;
new_entry->offset += (end - entry->start);
vm_map_entry_link(map, entry, new_entry);
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
vm_object_reference(new_entry->object.vm_object);
}
}
/*
* VM_MAP_RANGE_CHECK: [ internal use only ]
*
* Asserts that the starting and ending region
* addresses fall within the valid range of the map.
*/
#define VM_MAP_RANGE_CHECK(map, start, end) \
{ \
if (start < vm_map_min(map)) \
start = vm_map_min(map); \
if (end > vm_map_max(map)) \
end = vm_map_max(map); \
if (start > end) \
start = end; \
}
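/*
 * Worked example: with vm_map_min(map) == 0x1000 and
 * vm_map_max(map) == 0x8000, a request for [0x0, 0x9000) is clamped to
 * [0x1000, 0x8000), and a request lying entirely outside the map
 * degenerates to an empty range (start == end), so the per-entry loops
 * below simply do nothing.
 */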
/*
* vm_map_submap: [ kernel use only ]
*
* Mark the given range as handled by a subordinate map.
*
* This range must have been created with vm_map_find,
* and no other operations may have been performed on this
* range prior to calling vm_map_submap.
*
* Only a limited number of operations can be performed
* within this range after calling vm_map_submap:
* vm_fault
* [Don't try vm_map_copy!]
*
* To remove a submapping, one must first remove the
* range from the superior map, and then destroy the
* submap (if desired). [Better yet, don't try it.]
*/
int
vm_map_submap(
vm_map_t map,
vm_offset_t start,
vm_offset_t end,
vm_map_t submap)
{
vm_map_entry_t entry;
int result = KERN_INVALID_ARGUMENT;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &entry)) {
vm_map_clip_start(map, entry, start);
} else
entry = entry->next;
vm_map_clip_end(map, entry, end);
if ((entry->start == start) && (entry->end == end) &&
((entry->eflags & MAP_ENTRY_COW) == 0) &&
(entry->object.vm_object == NULL)) {
entry->object.sub_map = submap;
entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
result = KERN_SUCCESS;
}
vm_map_unlock(map);
return (result);
}
/*
* vm_map_protect:
*
* Sets the protection of the specified address
* region in the target map. If "set_max" is
* specified, the maximum protection is to be set;
* otherwise, only the current protection is affected.
*/
int
vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
vm_prot_t new_prot, boolean_t set_max)
{
vm_map_entry_t current;
vm_map_entry_t entry;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &entry)) {
vm_map_clip_start(map, entry, start);
} else {
entry = entry->next;
}
/*
* Make a first pass to check for protection violations.
*/
current = entry;
while ((current != &map->header) && (current->start < end)) {
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_unlock(map);
return (KERN_INVALID_ARGUMENT);
}
if ((new_prot & current->max_protection) != new_prot) {
vm_map_unlock(map);
return (KERN_PROTECTION_FAILURE);
}
current = current->next;
}
/*
* Go back and fix up protections. [Note that clipping is not
* necessary the second time.]
*/
current = entry;
while ((current != &map->header) && (current->start < end)) {
vm_prot_t old_prot;
vm_map_clip_end(map, current, end);
old_prot = current->protection;
if (set_max)
current->protection =
(current->max_protection = new_prot) &
old_prot;
else
current->protection = new_prot;
/*
* Update physical map if necessary. Worry about copy-on-write
* here -- CHECK THIS XXX
*/
if (current->protection != old_prot) {
mtx_lock(&Giant);
#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
VM_PROT_ALL)
pmap_protect(map->pmap, current->start,
current->end,
current->protection & MASK(current));
#undef MASK
mtx_unlock(&Giant);
}
vm_map_simplify_entry(map, current);
current = current->next;
}
vm_map_unlock(map);
return (KERN_SUCCESS);
}
/*
* vm_map_madvise:
*
* This routine traverses a process's map handling the madvise
* system call.  Advisories are classified as either those affecting
* the vm_map_entry structure or those affecting the underlying
* objects.
*/
int
vm_map_madvise(
vm_map_t map,
vm_offset_t start,
vm_offset_t end,
int behav)
{
vm_map_entry_t current, entry;
int modify_map = 0;
/*
* Some madvise calls directly modify the vm_map_entry, in which case
* we need to use an exclusive lock on the map and we need to perform
* various clipping operations. Otherwise we only need a read-lock
* on the map.
*/
switch(behav) {
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
case MADV_NOSYNC:
case MADV_AUTOSYNC:
case MADV_NOCORE:
case MADV_CORE:
modify_map = 1;
vm_map_lock(map);
break;
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
vm_map_lock_read(map);
break;
default:
return (KERN_INVALID_ARGUMENT);
}
/*
* Locate starting entry and clip if necessary.
*/
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &entry)) {
if (modify_map)
vm_map_clip_start(map, entry, start);
} else {
entry = entry->next;
}
if (modify_map) {
/*
* madvise behaviors that are implemented in the vm_map_entry.
*
* We clip the vm_map_entry so that behavioral changes are
* limited to the specified address range.
*/
for (current = entry;
(current != &map->header) && (current->start < end);
current = current->next
) {
if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
vm_map_clip_end(map, current, end);
switch (behav) {
case MADV_NORMAL:
vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
break;
case MADV_SEQUENTIAL:
vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
break;
case MADV_RANDOM:
vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
break;
case MADV_NOSYNC:
current->eflags |= MAP_ENTRY_NOSYNC;
break;
case MADV_AUTOSYNC:
current->eflags &= ~MAP_ENTRY_NOSYNC;
break;
case MADV_NOCORE:
current->eflags |= MAP_ENTRY_NOCOREDUMP;
break;
case MADV_CORE:
current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
break;
default:
break;
}
vm_map_simplify_entry(map, current);
}
vm_map_unlock(map);
} else {
vm_pindex_t pindex;
int count;
/*
* madvise behaviors that are implemented in the underlying
* vm_object.
*
* Since we don't clip the vm_map_entry, we have to clip
* the vm_object pindex and count.
*/
for (current = entry;
(current != &map->header) && (current->start < end);
current = current->next
) {
vm_offset_t useStart;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
pindex = OFF_TO_IDX(current->offset);
count = atop(current->end - current->start);
useStart = current->start;
if (current->start < start) {
pindex += atop(start - current->start);
count -= atop(start - current->start);
useStart = start;
}
if (current->end > end)
count -= atop(current->end - end);
if (count <= 0)
continue;
vm_object_madvise(current->object.vm_object,
pindex, count, behav);
if (behav == MADV_WILLNEED) {
mtx_lock(&Giant);
pmap_object_init_pt(
map->pmap,
useStart,
current->object.vm_object,
pindex,
(count << PAGE_SHIFT),
MAP_PREFAULT_MADVISE
);
mtx_unlock(&Giant);
}
}
vm_map_unlock_read(map);
}
return (0);
}
/*
* vm_map_inherit:
*
* Sets the inheritance of the specified address
* range in the target map. Inheritance
* affects how the map will be shared with
* child maps at the time of vm_map_fork.
*/
int
vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
vm_inherit_t new_inheritance)
{
vm_map_entry_t entry;
vm_map_entry_t temp_entry;
switch (new_inheritance) {
case VM_INHERIT_NONE:
case VM_INHERIT_COPY:
case VM_INHERIT_SHARE:
break;
default:
return (KERN_INVALID_ARGUMENT);
}
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &temp_entry)) {
entry = temp_entry;
vm_map_clip_start(map, entry, start);
} else
entry = temp_entry->next;
while ((entry != &map->header) && (entry->start < end)) {
vm_map_clip_end(map, entry, end);
entry->inheritance = new_inheritance;
vm_map_simplify_entry(map, entry);
entry = entry->next;
}
vm_map_unlock(map);
return (KERN_SUCCESS);
}
/*
* vm_map_unwire:
*
* Implements both kernel and user unwiring.
*/
int
vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
boolean_t user_unwire)
{
vm_map_entry_t entry, first_entry, tmp_entry;
vm_offset_t saved_start;
unsigned int last_timestamp;
int rv;
boolean_t need_wakeup, result;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!vm_map_lookup_entry(map, start, &first_entry)) {
vm_map_unlock(map);
return (KERN_INVALID_ADDRESS);
}
last_timestamp = map->timestamp;
entry = first_entry;
while (entry != &map->header && entry->start < end) {
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
/*
* We have not yet clipped the entry.
*/
saved_start = (start >= entry->start) ? start :
entry->start;
entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
if (vm_map_unlock_and_wait(map, user_unwire)) {
/*
* Allow interruption of user unwiring?
*/
}
vm_map_lock(map);
if (last_timestamp+1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked.
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
if (!vm_map_lookup_entry(map, saved_start,
&tmp_entry)) {
if (saved_start == start) {
/*
* First_entry has been deleted.
*/
vm_map_unlock(map);
return (KERN_INVALID_ADDRESS);
}
end = saved_start;
rv = KERN_INVALID_ADDRESS;
goto done;
}
if (entry == first_entry)
first_entry = tmp_entry;
else
first_entry = NULL;
entry = tmp_entry;
}
last_timestamp = map->timestamp;
continue;
}
vm_map_clip_start(map, entry, start);
vm_map_clip_end(map, entry, end);
/*
* Mark the entry in case the map lock is released. (See
* above.)
*/
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
/*
* Check the map for holes in the specified region.
*/
if (entry->end < end && (entry->next == &map->header ||
entry->next->start > entry->end)) {
end = entry->end;
rv = KERN_INVALID_ADDRESS;
goto done;
}
/*
* Require that the entry is wired.
*/
if (entry->wired_count == 0 || (user_unwire &&
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0)) {
end = entry->end;
rv = KERN_INVALID_ARGUMENT;
goto done;
}
entry = entry->next;
}
rv = KERN_SUCCESS;
done:
need_wakeup = FALSE;
if (first_entry == NULL) {
result = vm_map_lookup_entry(map, start, &first_entry);
KASSERT(result, ("vm_map_unwire: lookup failed"));
}
entry = first_entry;
while (entry != &map->header && entry->start < end) {
if (rv == KERN_SUCCESS) {
if (user_unwire)
entry->eflags &= ~MAP_ENTRY_USER_WIRED;
entry->wired_count--;
if (entry->wired_count == 0) {
/*
* Retain the map lock.
*/
vm_fault_unwire(map, entry->start, entry->end);
}
}
KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
("vm_map_unwire: in-transition flag missing"));
entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
need_wakeup = TRUE;
}
vm_map_simplify_entry(map, entry);
entry = entry->next;
}
vm_map_unlock(map);
if (need_wakeup)
vm_map_wakeup(map);
return (rv);
}
/*
* vm_map_wire:
*
* Implements both kernel and user wiring.
*/
int
vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
boolean_t user_wire)
{
vm_map_entry_t entry, first_entry, tmp_entry;
vm_offset_t saved_end, saved_start;
unsigned int last_timestamp;
int rv;
boolean_t need_wakeup, result;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!vm_map_lookup_entry(map, start, &first_entry)) {
vm_map_unlock(map);
return (KERN_INVALID_ADDRESS);
}
last_timestamp = map->timestamp;
entry = first_entry;
while (entry != &map->header && entry->start < end) {
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
/*
* We have not yet clipped the entry.
*/
saved_start = (start >= entry->start) ? start :
entry->start;
entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
if (vm_map_unlock_and_wait(map, user_wire)) {
/*
* Allow interruption of user wiring?
*/
}
vm_map_lock(map);
if (last_timestamp + 1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked.
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
if (!vm_map_lookup_entry(map, saved_start,
&tmp_entry)) {
if (saved_start == start) {
/*
* first_entry has been deleted.
*/
vm_map_unlock(map);
return (KERN_INVALID_ADDRESS);
}
end = saved_start;
rv = KERN_INVALID_ADDRESS;
goto done;
}
if (entry == first_entry)
first_entry = tmp_entry;
else
first_entry = NULL;
entry = tmp_entry;
}
last_timestamp = map->timestamp;
continue;
}
vm_map_clip_start(map, entry, start);
vm_map_clip_end(map, entry, end);
/*
* Mark the entry in case the map lock is released. (See
* above.)
*/
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
/*
*
*/
if (entry->wired_count == 0) {
entry->wired_count++;
saved_start = entry->start;
saved_end = entry->end;
/*
* Release the map lock, relying on the in-transition
* mark.
*/
vm_map_unlock(map);
rv = vm_fault_wire(map, saved_start, saved_end,
user_wire);
vm_map_lock(map);
if (last_timestamp + 1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked. The entry
* may have been clipped, but NOT merged or
* deleted.
*/
result = vm_map_lookup_entry(map, saved_start,
&tmp_entry);
KASSERT(result, ("vm_map_wire: lookup failed"));
if (entry == first_entry)
first_entry = tmp_entry;
else
first_entry = NULL;
entry = tmp_entry;
while (entry->end < saved_end) {
if (rv != KERN_SUCCESS) {
KASSERT(entry->wired_count == 1,
("vm_map_wire: bad count"));
entry->wired_count = -1;
}
entry = entry->next;
}
}
last_timestamp = map->timestamp;
if (rv != KERN_SUCCESS) {
KASSERT(entry->wired_count == 1,
("vm_map_wire: bad count"));
/*
* Assign an out-of-range value to represent
* the failure to wire this entry.
*/
entry->wired_count = -1;
end = entry->end;
goto done;
}
} else if (!user_wire ||
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
entry->wired_count++;
}
/*
* Check the map for holes in the specified region.
*/
if (entry->end < end && (entry->next == &map->header ||
entry->next->start > entry->end)) {
end = entry->end;
rv = KERN_INVALID_ADDRESS;
goto done;
}
entry = entry->next;
}
rv = KERN_SUCCESS;
done:
need_wakeup = FALSE;
if (first_entry == NULL) {
result = vm_map_lookup_entry(map, start, &first_entry);
KASSERT(result, ("vm_map_wire: lookup failed"));
}
entry = first_entry;
while (entry != &map->header && entry->start < end) {
if (rv == KERN_SUCCESS) {
if (user_wire)
entry->eflags |= MAP_ENTRY_USER_WIRED;
} else if (entry->wired_count == -1) {
/*
* Wiring failed on this entry. Thus, unwiring is
* unnecessary.
*/
entry->wired_count = 0;
} else {
if (!user_wire || (entry->wired_count == 1 &&
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0))
entry->wired_count--;
if (entry->wired_count == 0) {
/*
* Retain the map lock.
*/
vm_fault_unwire(map, entry->start, entry->end);
}
}
KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
("vm_map_wire: in-transition flag missing"));
entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
need_wakeup = TRUE;
}
vm_map_simplify_entry(map, entry);
entry = entry->next;
}
vm_map_unlock(map);
if (need_wakeup)
vm_map_wakeup(map);
return (rv);
}
/*
* vm_map_clean
*
* Push any dirty cached pages in the address range to their pager.
* If syncio is TRUE, dirty pages are written synchronously.
* If invalidate is TRUE, any cached pages are freed as well.
*
* Returns an error if any part of the specified range is not mapped.
*/
int
vm_map_clean(
vm_map_t map,
vm_offset_t start,
vm_offset_t end,
boolean_t syncio,
boolean_t invalidate)
{
vm_map_entry_t current;
vm_map_entry_t entry;
vm_size_t size;
vm_object_t object;
vm_ooffset_t offset;
GIANT_REQUIRED;
vm_map_lock_read(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!vm_map_lookup_entry(map, start, &entry)) {
vm_map_unlock_read(map);
return (KERN_INVALID_ADDRESS);
}
/*
* Make a first pass to check for holes.
*/
for (current = entry; current->start < end; current = current->next) {
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_unlock_read(map);
return (KERN_INVALID_ARGUMENT);
}
if (end > current->end &&
(current->next == &map->header ||
current->end != current->next->start)) {
vm_map_unlock_read(map);
return (KERN_INVALID_ADDRESS);
}
}
if (invalidate)
pmap_remove(vm_map_pmap(map), start, end);
/*
* Make a second pass, cleaning/uncaching pages from the indicated
* objects as we go.
*/
for (current = entry; current->start < end; current = current->next) {
offset = current->offset + (start - current->start);
size = (end <= current->end ? end : current->end) - start;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_t smap;
vm_map_entry_t tentry;
vm_size_t tsize;
smap = current->object.sub_map;
vm_map_lock_read(smap);
(void) vm_map_lookup_entry(smap, offset, &tentry);
tsize = tentry->end - offset;
if (tsize < size)
size = tsize;
object = tentry->object.vm_object;
offset = tentry->offset + (offset - tentry->start);
vm_map_unlock_read(smap);
} else {
object = current->object.vm_object;
}
/*
* Note that there is absolutely no sense in writing out
* anonymous objects, so we track down the vnode object
* to write out.
* We invalidate (remove) all pages from the address space
* anyway, for semantic correctness.
*
* note: certain anonymous maps, such as MAP_NOSYNC maps,
* may start out with a NULL object.
*/
while (object && object->backing_object) {
object = object->backing_object;
offset += object->backing_object_offset;
if (object->size < OFF_TO_IDX(offset + size))
size = IDX_TO_OFF(object->size) - offset;
}
if (object && (object->type == OBJT_VNODE) &&
(current->protection & VM_PROT_WRITE)) {
/*
* Flush pages if writing is allowed, invalidate them
* if invalidation requested. Pages undergoing I/O
* will be ignored by vm_object_page_remove().
*
* We cannot lock the vnode and then wait for paging
* to complete without deadlocking against vm_fault.
* Instead we simply call vm_object_page_remove() and
* allow it to block internally on a page-by-page
* basis when it encounters pages undergoing async
* I/O.
*/
int flags;
vm_object_reference(object);
vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY, curthread);
flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
flags |= invalidate ? OBJPC_INVAL : 0;
vm_object_page_clean(object,
OFF_TO_IDX(offset),
OFF_TO_IDX(offset + size + PAGE_MASK),
flags);
if (invalidate) {
/*vm_object_pip_wait(object, "objmcl");*/
vm_object_page_remove(object,
OFF_TO_IDX(offset),
OFF_TO_IDX(offset + size + PAGE_MASK),
FALSE);
}
VOP_UNLOCK(object->handle, 0, curthread);
vm_object_deallocate(object);
}
start += size;
}
vm_map_unlock_read(map);
return (KERN_SUCCESS);
}
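/*
 * Illustrative sketch (not part of the original file): msync(2), in
 * vm_mmap.c below, translates its MS_* flags into the syncio/invalidate
 * arguments of vm_map_clean() roughly like this.
 */
#if 0
rv = vm_map_clean(map, addr, addr + size,
    (flags & MS_ASYNC) == 0,		/* syncio: write synchronously */
    (flags & MS_INVALIDATE) != 0);	/* invalidate cached pages too */
#endif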
/*
* vm_map_entry_unwire: [ internal use only ]
*
* Make the region specified by this entry pageable.
*
* The map in question should be locked.
* [This is the reason for this routine's existence.]
*/
static void
vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
{
vm_fault_unwire(map, entry->start, entry->end);
entry->wired_count = 0;
}
/*
* vm_map_entry_delete: [ internal use only ]
*
* Deallocate the given entry from the target map.
*/
static void
vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
{
vm_map_entry_unlink(map, entry);
map->size -= entry->end - entry->start;
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
vm_object_deallocate(entry->object.vm_object);
}
vm_map_entry_dispose(map, entry);
}
/*
* vm_map_delete: [ internal use only ]
*
* Deallocates the given address range from the target
* map.
*/
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
vm_object_t object;
vm_map_entry_t entry;
vm_map_entry_t first_entry;
/*
* Find the start of the region, and clip it
*/
if (!vm_map_lookup_entry(map, start, &first_entry))
entry = first_entry->next;
else {
entry = first_entry;
vm_map_clip_start(map, entry, start);
}
/*
* Save the free space hint
*/
if (entry == &map->header) {
map->first_free = &map->header;
} else if (map->first_free->start >= start) {
map->first_free = entry->prev;
}
/*
* Step through all entries in this region
*/
while ((entry != &map->header) && (entry->start < end)) {
vm_map_entry_t next;
vm_offset_t s, e;
vm_pindex_t offidxstart, offidxend, count;
/*
* Wait for wiring or unwiring of an entry to complete.
*/
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) {
unsigned int last_timestamp;
vm_offset_t saved_start;
vm_map_entry_t tmp_entry;
saved_start = entry->start;
entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
last_timestamp = map->timestamp;
(void) vm_map_unlock_and_wait(map, FALSE);
vm_map_lock(map);
if (last_timestamp + 1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked.
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
if (!vm_map_lookup_entry(map, saved_start,
&tmp_entry))
entry = tmp_entry->next;
else {
entry = tmp_entry;
vm_map_clip_start(map, entry,
saved_start);
}
}
continue;
}
vm_map_clip_end(map, entry, end);
s = entry->start;
e = entry->end;
next = entry->next;
offidxstart = OFF_TO_IDX(entry->offset);
count = OFF_TO_IDX(e - s);
object = entry->object.vm_object;
/*
* Unwire before removing addresses from the pmap; otherwise,
* unwiring will put the entries back in the pmap.
*/
if (entry->wired_count != 0) {
vm_map_entry_unwire(map, entry);
}
offidxend = offidxstart + count;
if ((object == kernel_object) || (object == kmem_object)) {
vm_object_page_remove(object, offidxstart, offidxend, FALSE);
} else {
mtx_lock(&Giant);
pmap_remove(map->pmap, s, e);
if (object != NULL &&
object->ref_count != 1 &&
(object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING &&
(object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
vm_object_collapse(object);
vm_object_page_remove(object, offidxstart, offidxend, FALSE);
if (object->type == OBJT_SWAP) {
swap_pager_freespace(object, offidxstart, count);
}
if (offidxend >= object->size &&
offidxstart < object->size) {
object->size = offidxstart;
}
}
mtx_unlock(&Giant);
}
/*
* Delete the entry (which may delete the object) only after
* removing all pmap entries pointing to its pages.
* (Otherwise, its page frames may be reallocated, and any
* modify bits will be set in the wrong object!)
*/
vm_map_entry_delete(map, entry);
entry = next;
}
return (KERN_SUCCESS);
}
/*
* vm_map_remove:
*
* Remove the given address range from the target map.
* This is the exported form of vm_map_delete.
*/
int
vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
int result, s = 0;
if (map == kmem_map)
s = splvm();
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
result = vm_map_delete(map, start, end);
vm_map_unlock(map);
if (map == kmem_map)
splx(s);
return (result);
}
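/*
 * Illustrative sketch (not part of the original file): vm_mmap(), in
 * vm_mmap.c below, uses vm_map_remove() to discard any existing mapping
 * before a MAP_FIXED request is installed.
 */
#if 0
if ((flags & MAP_FIXED) != 0)
(void) vm_map_remove(map, *addr, *addr + size);
#endif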
/*
* vm_map_check_protection:
*
* Assert that the target map allows the specified
* privilege on the entire address region given.
* The entire region must be allocated.
*/
boolean_t
vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
vm_prot_t protection)
{
vm_map_entry_t entry;
vm_map_entry_t tmp_entry;
vm_map_lock_read(map);
if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
vm_map_unlock_read(map);
return (FALSE);
}
entry = tmp_entry;
while (start < end) {
if (entry == &map->header) {
vm_map_unlock_read(map);
return (FALSE);
}
/*
* No holes allowed!
*/
if (start < entry->start) {
vm_map_unlock_read(map);
return (FALSE);
}
/*
* Check protection associated with entry.
*/
if ((entry->protection & protection) != protection) {
vm_map_unlock_read(map);
return (FALSE);
}
/* go to next entry */
start = entry->end;
entry = entry->next;
}
vm_map_unlock_read(map);
return (TRUE);
}
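/*
 * Illustrative sketch (not part of the original file): munmap(2), in
 * vm_mmap.c below, calls vm_map_check_protection() with VM_PROT_NONE
 * simply to verify that the whole range is allocated before removing it.
 */
#if 0
if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
return (EINVAL);
(void) vm_map_remove(map, addr, addr + size);
#endif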
/*
* vm_map_copy_entry:
*
* Copies the contents of the source entry to the destination
* entry. The entries *must* be aligned properly.
*/
static void
vm_map_copy_entry(
vm_map_t src_map,
vm_map_t dst_map,
vm_map_entry_t src_entry,
vm_map_entry_t dst_entry)
{
vm_object_t src_object;
if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
return;
if (src_entry->wired_count == 0) {
/*
* If the source entry is marked needs_copy, it is already
* write-protected.
*/
if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
pmap_protect(src_map->pmap,
src_entry->start,
src_entry->end,
src_entry->protection & ~VM_PROT_WRITE);
}
/*
* Make a copy of the object.
*/
if ((src_object = src_entry->object.vm_object) != NULL) {
if ((src_object->handle == NULL) &&
(src_object->type == OBJT_DEFAULT ||
src_object->type == OBJT_SWAP)) {
vm_object_collapse(src_object);
if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
vm_object_split(src_entry);
src_object = src_entry->object.vm_object;
}
}
vm_object_reference(src_object);
vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
dst_entry->object.vm_object = src_object;
src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
dst_entry->offset = src_entry->offset;
} else {
dst_entry->object.vm_object = NULL;
dst_entry->offset = 0;
}
pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
dst_entry->end - dst_entry->start, src_entry->start);
} else {
/*
* Wired-down pages cannot be made copy-on-write, so copy them
* into the new map by simulating faults (the new pages are
* pageable).
*/
vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
}
}
/*
* vmspace_fork:
* Create a new process vmspace structure and vm_map
* based on those of an existing process. The new map
* is based on the old map, according to the inheritance
* values on the regions in that map.
*
* The source map must not be locked.
*/
struct vmspace *
vmspace_fork(struct vmspace *vm1)
{
struct vmspace *vm2;
vm_map_t old_map = &vm1->vm_map;
vm_map_t new_map;
vm_map_entry_t old_entry;
vm_map_entry_t new_entry;
vm_object_t object;
GIANT_REQUIRED;
vm_map_lock(old_map);
old_map->infork = 1;
vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
(caddr_t) &vm1->vm_endcopy - (caddr_t) &vm1->vm_startcopy);
new_map = &vm2->vm_map; /* XXX */
new_map->timestamp = 1;
old_entry = old_map->header.next;
while (old_entry != &old_map->header) {
if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
panic("vm_map_fork: encountered a submap");
switch (old_entry->inheritance) {
case VM_INHERIT_NONE:
break;
case VM_INHERIT_SHARE:
/*
* Clone the entry, creating the shared object if necessary.
*/
object = old_entry->object.vm_object;
if (object == NULL) {
object = vm_object_allocate(OBJT_DEFAULT,
atop(old_entry->end - old_entry->start));
old_entry->object.vm_object = object;
old_entry->offset = (vm_offset_t) 0;
}
/*
* Add the reference before calling vm_object_shadow
* to ensure that a shadow object is created.
*/
vm_object_reference(object);
if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
vm_object_shadow(&old_entry->object.vm_object,
&old_entry->offset,
atop(old_entry->end - old_entry->start));
old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
/* Transfer the second reference too. */
vm_object_reference(
old_entry->object.vm_object);
vm_object_deallocate(object);
object = old_entry->object.vm_object;
}
vm_object_clear_flag(object, OBJ_ONEMAPPING);
/*
* Clone the entry, referencing the shared object.
*/
new_entry = vm_map_entry_create(new_map);
*new_entry = *old_entry;
new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
new_entry->wired_count = 0;
/*
* Insert the entry into the new map -- we know we're
* inserting at the end of the new map.
*/
vm_map_entry_link(new_map, new_map->header.prev,
new_entry);
/*
* Update the physical map
*/
pmap_copy(new_map->pmap, old_map->pmap,
new_entry->start,
(old_entry->end - old_entry->start),
old_entry->start);
break;
case VM_INHERIT_COPY:
/*
* Clone the entry and link into the map.
*/
new_entry = vm_map_entry_create(new_map);
*new_entry = *old_entry;
new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
new_entry->wired_count = 0;
new_entry->object.vm_object = NULL;
vm_map_entry_link(new_map, new_map->header.prev,
new_entry);
vm_map_copy_entry(old_map, new_map, old_entry,
new_entry);
break;
}
old_entry = old_entry->next;
}
new_map->size = old_map->size;
old_map->infork = 0;
vm_map_unlock(old_map);
return (vm2);
}
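/*
 * Illustrative sketch (not part of the original file): a caller such as
 * the fork path would typically give the child either a reference to the
 * parent's vmspace or a copy produced by vmspace_fork(). The surrounding
 * names and the RFMEM test are hypothetical.
 */
#if 0
if (shared_address_space) {	/* e.g. rfork(RFMEM) */
p1->p_vmspace->vm_refcnt++;
p2->p_vmspace = p1->p_vmspace;
} else
p2->p_vmspace = vmspace_fork(p1->p_vmspace);
#endif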
int
vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
vm_prot_t prot, vm_prot_t max, int cow)
{
vm_map_entry_t prev_entry;
vm_map_entry_t new_stack_entry;
vm_size_t init_ssize;
int rv;
- if (VM_MIN_ADDRESS > 0 && addrbos < VM_MIN_ADDRESS)
+ if (addrbos < vm_map_min(map))
return (KERN_NO_SPACE);
if (max_ssize < sgrowsiz)
init_ssize = max_ssize;
else
init_ssize = sgrowsiz;
vm_map_lock(map);
/* If addr is already mapped, no go */
if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
vm_map_unlock(map);
return (KERN_NO_SPACE);
}
/* If we would blow our VMEM resource limit, no go */
if (map->size + init_ssize >
curthread->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
vm_map_unlock(map);
return (KERN_NO_SPACE);
}
/* If we can't accommodate max_ssize in the current mapping,
* no go. However, we need to be aware that subsequent user
* mappings might map into the space we have reserved for
* stack, and currently this space is not protected.
*
* Hopefully we will at least detect this condition
* when we try to grow the stack.
*/
if ((prev_entry->next != &map->header) &&
(prev_entry->next->start < addrbos + max_ssize)) {
vm_map_unlock(map);
return (KERN_NO_SPACE);
}
/* We initially map a stack of only init_ssize. We will
* grow as needed later. Since this is to be a grow
* down stack, we map at the top of the range.
*
* Note: we would normally expect prot and max to be
* VM_PROT_ALL, and cow to be 0. Possibly we should
* eliminate these as input parameters, and just
* pass these values here in the insert call.
*/
rv = vm_map_insert(map, NULL, 0, addrbos + max_ssize - init_ssize,
addrbos + max_ssize, prot, max, cow);
/* Now set the avail_ssize amount */
if (rv == KERN_SUCCESS){
if (prev_entry != &map->header)
vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize);
new_stack_entry = prev_entry->next;
if (new_stack_entry->end != addrbos + max_ssize ||
new_stack_entry->start != addrbos + max_ssize - init_ssize)
panic ("Bad entry start/end for new stack entry");
else
new_stack_entry->avail_ssize = max_ssize - init_ssize;
}
vm_map_unlock(map);
return (rv);
}
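/*
 * Illustrative sketch (not part of the original file): vm_mmap(), in
 * vm_mmap.c below, dispatches MAP_STACK requests to vm_map_stack()
 * instead of vm_map_find().
 */
#if 0
if (flags & MAP_STACK)
rv = vm_map_stack(map, *addr, size, prot, maxprot, docow);
else
rv = vm_map_find(map, object, foff, addr, size, fitit,
    prot, maxprot, docow);
#endif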
/* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the
* desired address is already mapped, or if we successfully grow
* the stack. Also returns KERN_SUCCESS if addr is outside the
* stack range (this is strange, but preserves compatibility with
* the grow function in vm_machdep.c).
*/
int
vm_map_growstack (struct proc *p, vm_offset_t addr)
{
vm_map_entry_t prev_entry;
vm_map_entry_t stack_entry;
vm_map_entry_t new_stack_entry;
struct vmspace *vm = p->p_vmspace;
vm_map_t map = &vm->vm_map;
vm_offset_t end;
int grow_amount;
int rv;
int is_procstack;
GIANT_REQUIRED;
Retry:
vm_map_lock_read(map);
/* If addr is already in the entry range, no need to grow.*/
if (vm_map_lookup_entry(map, addr, &prev_entry)) {
vm_map_unlock_read(map);
return (KERN_SUCCESS);
}
if ((stack_entry = prev_entry->next) == &map->header) {
vm_map_unlock_read(map);
return (KERN_SUCCESS);
}
if (prev_entry == &map->header)
end = stack_entry->start - stack_entry->avail_ssize;
else
end = prev_entry->end;
/* This next test mimics the old grow function in vm_machdep.c.
* It really doesn't quite make sense, but we do it anyway
* for compatibility.
*
* If the stack is not growable, return success. This signals the
* caller to proceed as it normally would with ordinary VM.
*/
if (stack_entry->avail_ssize < 1 ||
addr >= stack_entry->start ||
addr < stack_entry->start - stack_entry->avail_ssize) {
vm_map_unlock_read(map);
return (KERN_SUCCESS);
}
/* Find the minimum grow amount */
grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
if (grow_amount > stack_entry->avail_ssize) {
vm_map_unlock_read(map);
return (KERN_NO_SPACE);
}
/* If there is no longer enough space between the entries,
* fail and adjust the available space. Note: this
* should only happen if the user has mapped into the
* stack area after the stack was created, and is
* probably an error.
*
* This also effectively destroys any guard page the user
* might have intended by limiting the stack size.
*/
if (grow_amount > stack_entry->start - end) {
if (vm_map_lock_upgrade(map))
goto Retry;
stack_entry->avail_ssize = stack_entry->start - end;
vm_map_unlock(map);
return (KERN_NO_SPACE);
}
is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
/* If this is the main process stack, see if we're over the
* stack limit.
*/
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
vm_map_unlock_read(map);
return (KERN_NO_SPACE);
}
/* Round up the grow amount to a multiple of sgrowsiz */
grow_amount = roundup (grow_amount, sgrowsiz);
if (grow_amount > stack_entry->avail_ssize) {
grow_amount = stack_entry->avail_ssize;
}
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
ctob(vm->vm_ssize);
}
/* If we would blow our VMEM resource limit, no go */
if (map->size + grow_amount >
curthread->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
vm_map_unlock_read(map);
return (KERN_NO_SPACE);
}
if (vm_map_lock_upgrade(map))
goto Retry;
/* Get the preliminary new entry start value */
addr = stack_entry->start - grow_amount;
/* If this puts us into the previous entry, cut back our growth
* to the available space. Also, see the note above.
*/
if (addr < end) {
stack_entry->avail_ssize = stack_entry->start - end;
addr = end;
}
rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
- VM_PROT_ALL,
- VM_PROT_ALL,
- 0);
+ p->p_sysent->sv_stackprot, VM_PROT_ALL, 0);
/* Adjust the available stack space by the amount we grew. */
if (rv == KERN_SUCCESS) {
if (prev_entry != &map->header)
vm_map_clip_end(map, prev_entry, addr);
new_stack_entry = prev_entry->next;
if (new_stack_entry->end != stack_entry->start ||
new_stack_entry->start != addr)
panic ("Bad stack grow start/end in new stack entry");
else {
new_stack_entry->avail_ssize = stack_entry->avail_ssize -
(new_stack_entry->end -
new_stack_entry->start);
if (is_procstack)
vm->vm_ssize += btoc(new_stack_entry->end -
new_stack_entry->start);
}
}
vm_map_unlock(map);
return (rv);
}
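/*
 * Illustrative sketch (not part of the original file): a page-fault
 * handler would typically give the stack a chance to grow before
 * resolving the fault. The surrounding variables are hypothetical.
 */
#if 0
if (map != kernel_map)
(void) vm_map_growstack(p, va);
rv = vm_fault(map, trunc_page(va), ftype, VM_FAULT_NORMAL);
#endif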
/*
* Unshare the specified VM space for exec. If other processes are
* sharing it, create a new one. The new vmspace is empty (it has
* no mappings).
*/
void
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
{
struct vmspace *oldvmspace = p->p_vmspace;
struct vmspace *newvmspace;
GIANT_REQUIRED;
newvmspace = vmspace_alloc(minuser, maxuser);
bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
(caddr_t) (newvmspace + 1) - (caddr_t) &newvmspace->vm_startcopy);
/*
* This code is written like this for prototype purposes. The
* goal is to avoid running down the vmspace here, but to let the
* other processes that are still using the vmspace run it down
* eventually. Even though there is little or no chance of blocking
* here, it is a good idea to keep this form for future mods.
*/
p->p_vmspace = newvmspace;
pmap_pinit2(vmspace_pmap(newvmspace));
vmspace_free(oldvmspace);
if (p == curthread->td_proc) /* XXXKSE ? */
pmap_activate(curthread);
}
/*
* Unshare the specified VM space for forcing COW. This
* is called by rfork, for the (RFMEM|RFPROC) == 0 case.
*/
void
vmspace_unshare(struct proc *p)
{
struct vmspace *oldvmspace = p->p_vmspace;
struct vmspace *newvmspace;
GIANT_REQUIRED;
if (oldvmspace->vm_refcnt == 1)
return;
newvmspace = vmspace_fork(oldvmspace);
p->p_vmspace = newvmspace;
pmap_pinit2(vmspace_pmap(newvmspace));
vmspace_free(oldvmspace);
if (p == curthread->td_proc) /* XXXKSE ? */
pmap_activate(curthread);
}
/*
* vm_map_lookup:
*
* Finds the VM object, offset, and
* protection for a given virtual address in the
* specified map, assuming a page fault of the
* type specified.
*
* Leaves the map in question locked for read; return
* values are guaranteed until a vm_map_lookup_done
* call is performed. Note that the map argument
* is in/out; the returned map must be used in
* the call to vm_map_lookup_done.
*
* A handle (out_entry) is returned for use in
* vm_map_lookup_done, to make that fast.
*
* If a lookup is requested with "write protection"
* specified, the map may be changed to perform virtual
* copying operations, although the data referenced will
* remain the same.
*/
int
vm_map_lookup(vm_map_t *var_map, /* IN/OUT */
vm_offset_t vaddr,
vm_prot_t fault_typea,
vm_map_entry_t *out_entry, /* OUT */
vm_object_t *object, /* OUT */
vm_pindex_t *pindex, /* OUT */
vm_prot_t *out_prot, /* OUT */
boolean_t *wired) /* OUT */
{
vm_map_entry_t entry;
vm_map_t map = *var_map;
vm_prot_t prot;
vm_prot_t fault_type = fault_typea;
RetryLookup:;
/*
* Lookup the faulting address.
*/
vm_map_lock_read(map);
#define RETURN(why) \
{ \
vm_map_unlock_read(map); \
return (why); \
}
/*
* If the map has an interesting hint, try it before calling full
* blown lookup routine.
*/
entry = map->root;
*out_entry = entry;
if (entry == NULL ||
(vaddr < entry->start) || (vaddr >= entry->end)) {
/*
* Entry was either not a valid hint, or the vaddr was not
* contained in the entry, so do a full lookup.
*/
if (!vm_map_lookup_entry(map, vaddr, out_entry))
RETURN(KERN_INVALID_ADDRESS);
entry = *out_entry;
}
/*
* Handle submaps.
*/
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_t old_map = map;
*var_map = map = entry->object.sub_map;
vm_map_unlock_read(old_map);
goto RetryLookup;
}
/*
* Check whether this task is allowed to have this page.
* Note the special case for MAP_ENTRY_COW
* pages with an override. This is to implement a forced
* COW for debuggers.
*/
if (fault_type & VM_PROT_OVERRIDE_WRITE)
prot = entry->max_protection;
else
prot = entry->protection;
fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
if ((fault_type & prot) != fault_type) {
RETURN(KERN_PROTECTION_FAILURE);
}
if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
(entry->eflags & MAP_ENTRY_COW) &&
(fault_type & VM_PROT_WRITE) &&
(fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
RETURN(KERN_PROTECTION_FAILURE);
}
/*
* If this page is not pageable, we have to get it for all possible
* accesses.
*/
*wired = (entry->wired_count != 0);
if (*wired)
prot = fault_type = entry->protection;
/*
* If the entry was copy-on-write, we either ...
*/
if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
/*
* If we want to write the page, we may as well handle that
* now since we've got the map locked.
*
* If we don't need to write the page, we just demote the
* permissions allowed.
*/
if (fault_type & VM_PROT_WRITE) {
/*
* Make a new object, and place it in the object
* chain. Note that no new references have appeared
* -- one just moved from the map to the new
* object.
*/
if (vm_map_lock_upgrade(map))
goto RetryLookup;
vm_object_shadow(
&entry->object.vm_object,
&entry->offset,
atop(entry->end - entry->start));
entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
vm_map_lock_downgrade(map);
} else {
/*
* We're attempting to read a copy-on-write page --
* don't allow writes.
*/
prot &= ~VM_PROT_WRITE;
}
}
/*
* Create an object if necessary.
*/
if (entry->object.vm_object == NULL &&
!map->system_map) {
if (vm_map_lock_upgrade(map))
goto RetryLookup;
entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
atop(entry->end - entry->start));
entry->offset = 0;
vm_map_lock_downgrade(map);
}
/*
* Return the object/offset from this entry. If the entry was
* copy-on-write or empty, it has been fixed up.
*/
*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
*object = entry->object.vm_object;
/*
* Return whether this is the only map sharing this data.
*/
*out_prot = prot;
return (KERN_SUCCESS);
#undef RETURN
}
/*
* vm_map_lookup_done:
*
* Releases locks acquired by a vm_map_lookup
* (according to the handle returned by that lookup).
*/
void
vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
{
/*
* Unlock the main-level map
*/
vm_map_unlock_read(map);
}
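/*
 * Illustrative sketch (not part of the original file): the usual pairing
 * of vm_map_lookup() and vm_map_lookup_done(), as a fault handler would
 * use it. Variable declarations are omitted for brevity.
 */
#if 0
rv = vm_map_lookup(&map, vaddr, fault_type, &entry, &object,
    &pindex, &prot, &wired);
if (rv != KERN_SUCCESS)
return (rv);
/* ... use object/pindex while the read lock is held ... */
vm_map_lookup_done(map, entry);
#endif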
#ifdef ENABLE_VFS_IOOPT
/*
* Experimental support for zero-copy I/O
*
* Implement uiomove with VM operations. This code (and its collateral
* changes) supports every combination of source object modification
* and COW-type operations.
*/
int
vm_uiomove(
vm_map_t mapa,
vm_object_t srcobject,
off_t cp,
int cnta,
vm_offset_t uaddra,
int *npages)
{
vm_map_t map;
vm_object_t first_object, oldobject, object;
vm_map_entry_t entry;
vm_prot_t prot;
boolean_t wired;
int tcnt, rv;
vm_offset_t uaddr, start, end, tend;
vm_pindex_t first_pindex, oindex;
vm_size_t osize;
off_t ooffset;
int cnt;
GIANT_REQUIRED;
if (npages)
*npages = 0;
cnt = cnta;
uaddr = uaddra;
while (cnt > 0) {
map = mapa;
if ((vm_map_lookup(&map, uaddr,
VM_PROT_READ, &entry, &first_object,
&first_pindex, &prot, &wired)) != KERN_SUCCESS) {
return EFAULT;
}
vm_map_clip_start(map, entry, uaddr);
tcnt = cnt;
tend = uaddr + tcnt;
if (tend > entry->end) {
tcnt = entry->end - uaddr;
tend = entry->end;
}
vm_map_clip_end(map, entry, tend);
start = entry->start;
end = entry->end;
osize = atop(tcnt);
oindex = OFF_TO_IDX(cp);
if (npages) {
vm_size_t idx;
for (idx = 0; idx < osize; idx++) {
vm_page_t m;
if ((m = vm_page_lookup(srcobject, oindex + idx)) == NULL) {
vm_map_lookup_done(map, entry);
return 0;
}
/*
* disallow busy or invalid pages, but allow
* m->busy pages if they are entirely valid.
*/
if ((m->flags & PG_BUSY) ||
((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) {
vm_map_lookup_done(map, entry);
return 0;
}
}
}
/*
* If we are changing an existing map entry, just redirect
* the object, and change mappings.
*/
if ((first_object->type == OBJT_VNODE) &&
((oldobject = entry->object.vm_object) == first_object)) {
if ((entry->offset != cp) || (oldobject != srcobject)) {
/*
* Remove old window into the file
*/
pmap_remove (map->pmap, uaddr, tend);
/*
* Force copy on write for mmaped regions
*/
vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
/*
* Point the object appropriately
*/
if (oldobject != srcobject) {
/*
* Set the object optimization hint flag
*/
vm_object_set_flag(srcobject, OBJ_OPT);
vm_object_reference(srcobject);
entry->object.vm_object = srcobject;
if (oldobject) {
vm_object_deallocate(oldobject);
}
}
entry->offset = cp;
map->timestamp++;
} else {
pmap_remove (map->pmap, uaddr, tend);
}
} else if ((first_object->ref_count == 1) &&
(first_object->size == osize) &&
((first_object->type == OBJT_DEFAULT) ||
(first_object->type == OBJT_SWAP)) ) {
oldobject = first_object->backing_object;
if ((first_object->backing_object_offset != cp) ||
(oldobject != srcobject)) {
/*
* Remove old window into the file
*/
pmap_remove (map->pmap, uaddr, tend);
/*
* Remove unneeded old pages
*/
vm_object_page_remove(first_object, 0, 0, 0);
/*
* Invalidate swap space
*/
if (first_object->type == OBJT_SWAP) {
swap_pager_freespace(first_object,
0,
first_object->size);
}
/*
* Force copy on write for mmaped regions
*/
vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
/*
* Point the object appropriately
*/
if (oldobject != srcobject) {
/*
* Set the object optimization hint flag
*/
vm_object_set_flag(srcobject, OBJ_OPT);
vm_object_reference(srcobject);
if (oldobject) {
TAILQ_REMOVE(&oldobject->shadow_head,
first_object, shadow_list);
oldobject->shadow_count--;
/* XXX bump generation? */
vm_object_deallocate(oldobject);
}
TAILQ_INSERT_TAIL(&srcobject->shadow_head,
first_object, shadow_list);
srcobject->shadow_count++;
/* XXX bump generation? */
first_object->backing_object = srcobject;
}
first_object->backing_object_offset = cp;
map->timestamp++;
} else {
pmap_remove (map->pmap, uaddr, tend);
}
/*
* Otherwise, we have to do a logical mmap.
*/
} else {
vm_object_set_flag(srcobject, OBJ_OPT);
vm_object_reference(srcobject);
pmap_remove (map->pmap, uaddr, tend);
vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
vm_map_lock_upgrade(map);
if (entry == &map->header) {
map->first_free = &map->header;
} else if (map->first_free->start >= start) {
map->first_free = entry->prev;
}
vm_map_entry_delete(map, entry);
object = srcobject;
ooffset = cp;
rv = vm_map_insert(map, object, ooffset, start, tend,
VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE);
if (rv != KERN_SUCCESS)
panic("vm_uiomove: could not insert new entry: %d", rv);
}
/*
* Map the window directly, if it is already in memory
*/
pmap_object_init_pt(map->pmap, uaddr,
srcobject, oindex, tcnt, 0);
map->timestamp++;
vm_map_unlock(map);
cnt -= tcnt;
uaddr += tcnt;
cp += tcnt;
if (npages)
*npages += osize;
}
return 0;
}
#endif
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>
#include <ddb/ddb.h>
/*
* vm_map_print: [ debug ]
*/
DB_SHOW_COMMAND(map, vm_map_print)
{
static int nlines;
/* XXX convert args. */
vm_map_t map = (vm_map_t)addr;
boolean_t full = have_addr;
vm_map_entry_t entry;
db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
(void *)map,
(void *)map->pmap, map->nentries, map->timestamp);
nlines++;
if (!full && db_indent)
return;
db_indent += 2;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
db_iprintf("map entry %p: start=%p, end=%p\n",
(void *)entry, (void *)entry->start, (void *)entry->end);
nlines++;
{
static char *inheritance_name[4] =
{"share", "copy", "none", "donate_copy"};
db_iprintf(" prot=%x/%x/%s",
entry->protection,
entry->max_protection,
inheritance_name[(int)(unsigned char)entry->inheritance]);
if (entry->wired_count != 0)
db_printf(", wired");
}
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
/* XXX no %qd in kernel. Truncate entry->offset. */
db_printf(", share=%p, offset=0x%lx\n",
(void *)entry->object.sub_map,
(long)entry->offset);
nlines++;
if ((entry->prev == &map->header) ||
(entry->prev->object.sub_map !=
entry->object.sub_map)) {
db_indent += 2;
vm_map_print((db_expr_t)(intptr_t)
entry->object.sub_map,
full, 0, (char *)0);
db_indent -= 2;
}
} else {
/* XXX no %qd in kernel. Truncate entry->offset. */
db_printf(", object=%p, offset=0x%lx",
(void *)entry->object.vm_object,
(long)entry->offset);
if (entry->eflags & MAP_ENTRY_COW)
db_printf(", copy (%s)",
(entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
db_printf("\n");
nlines++;
if ((entry->prev == &map->header) ||
(entry->prev->object.vm_object !=
entry->object.vm_object)) {
db_indent += 2;
vm_object_print((db_expr_t)(intptr_t)
entry->object.vm_object,
full, 0, (char *)0);
nlines += 4;
db_indent -= 2;
}
}
}
db_indent -= 2;
if (db_indent == 0)
nlines = 0;
}
DB_SHOW_COMMAND(procvm, procvm)
{
struct proc *p;
if (have_addr) {
p = (struct proc *) addr;
} else {
p = curproc;
}
db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
(void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
(void *)vmspace_pmap(p->p_vmspace));
vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
}
#endif /* DDB */
Index: head/sys/vm/vm_mmap.c
===================================================================
--- head/sys/vm/vm_mmap.c (revision 103766)
+++ head/sys/vm/vm_mmap.c (revision 103767)
@@ -1,1275 +1,1262 @@
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
*
* @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94
* $FreeBSD$
*/
/*
* Mapped file (mmap) interface to VM
*/
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
int incr;
};
#endif
static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
/*
* Set the maximum number of vm_map_entry structures per process. Roughly
* speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
* of our KVM malloc space still results in generous limits. We want a
* default that is good enough to prevent the kernel from running out of
* resources if attacked from a compromised user account, but generous
* enough that multi-threaded processes are not unduly inconvenienced.
*/
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)
static void
vmmapentry_rsrc_init(dummy)
void *dummy;
{
max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
max_proc_mmap /= 100;
}
/*
* MPSAFE
*/
/* ARGSUSED */
int
sbrk(td, uap)
struct thread *td;
struct sbrk_args *uap;
{
/* Not yet implemented */
/* mtx_lock(&Giant); */
/* mtx_unlock(&Giant); */
return (EOPNOTSUPP);
}
#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
int incr;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
sstk(td, uap)
struct thread *td;
struct sstk_args *uap;
{
/* Not yet implemented */
/* mtx_lock(&Giant); */
/* mtx_unlock(&Giant); */
return (EOPNOTSUPP);
}
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
int dummy;
};
#endif
/* ARGSUSED */
int
ogetpagesize(td, uap)
struct thread *td;
struct getpagesize_args *uap;
{
/* MP SAFE */
td->td_retval[0] = PAGE_SIZE;
return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
/*
* Memory Map (mmap) system call. Note that the file offset
* and address are allowed to be NOT page aligned, though if
* the MAP_FIXED flag is set, both must have the same remainder
* modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
* page-aligned, the actual mapping starts at trunc_page(addr)
* and the return value is adjusted up by the page offset.
*
* Generally speaking, only character devices which are themselves
* memory-based, such as a video framebuffer, can be mmap'd. Otherwise
* there would be no cache coherency between a descriptor and a VM mapping
* both to the same character device.
*
* Block devices can be mmap'd no matter what they represent. Cache coherency
* is maintained as long as you do not write directly to the underlying
* character device.
*/
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
void *addr;
size_t len;
int prot;
int flags;
int fd;
long pad;
off_t pos;
};
#endif
/*
* MPSAFE
*/
int
mmap(td, uap)
struct thread *td;
struct mmap_args *uap;
{
struct file *fp = NULL;
struct vnode *vp;
vm_offset_t addr;
vm_size_t size, pageoff;
vm_prot_t prot, maxprot;
void *handle;
int flags, error;
int disablexworkaround;
off_t pos;
struct vmspace *vms = td->td_proc->p_vmspace;
vm_object_t obj;
addr = (vm_offset_t) uap->addr;
size = uap->len;
prot = uap->prot & VM_PROT_ALL;
flags = uap->flags;
pos = uap->pos;
vp = NULL;
fp = NULL;
/* make sure mapping fits into numeric range etc */
if ((ssize_t) uap->len < 0 ||
((flags & MAP_ANON) && uap->fd != -1))
return (EINVAL);
if (flags & MAP_STACK) {
if ((uap->fd != -1) ||
((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
return (EINVAL);
flags |= MAP_ANON;
pos = 0;
}
/*
* Align the file position to a page boundary,
* and save its page offset component.
*/
pageoff = (pos & PAGE_MASK);
pos -= pageoff;
/* Adjust size for rounding (on both ends). */
size += pageoff; /* low end... */
size = (vm_size_t) round_page(size); /* hi end */
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
if (flags & MAP_FIXED) {
/*
* The specified address must have the same remainder
* as the file offset taken modulo PAGE_SIZE, so it
* should be aligned after adjustment by pageoff.
*/
addr -= pageoff;
if (addr & PAGE_MASK)
return (EINVAL);
/* Address range must be all in user VM space. */
- if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
+ if (addr < vm_map_min(&vms->vm_map) ||
+ addr + size > vm_map_max(&vms->vm_map))
return (EINVAL);
-#ifndef __i386__
- if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
- return (EINVAL);
-#endif
if (addr + size < addr)
return (EINVAL);
}
/*
* XXX for non-fixed mappings where no hint is provided or
* the hint would fall in the potential heap space,
* place it after the end of the largest possible heap.
*
* There should really be a pmap call to determine a reasonable
* location.
*/
else if (addr == 0 ||
(addr >= round_page((vm_offset_t)vms->vm_taddr) &&
addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
mtx_lock(&Giant); /* syscall marked mp-safe but isn't */
if (flags & MAP_ANON) {
/*
* Mapping blank space is trivial.
*/
handle = NULL;
maxprot = VM_PROT_ALL;
pos = 0;
} else {
/*
* Mapping file, get fp for validation. Obtain vnode and make
* sure it is of appropriate type.
* Don't let the descriptor disappear on us if we block.
*/
if ((error = fget(td, uap->fd, &fp)) != 0)
goto done;
if (fp->f_type != DTYPE_VNODE) {
error = EINVAL;
goto done;
}
/*
* POSIX shared-memory objects are defined to have
* kernel persistence, and are not defined to support
* read(2)/write(2) -- or even open(2). Thus, we can
* use MAP_ASYNC to trade on-disk coherence for speed.
* The shm_open(3) library routine turns on the FPOSIXSHM
* flag to request this behavior.
*/
if (fp->f_flag & FPOSIXSHM)
flags |= MAP_NOSYNC;
vp = (struct vnode *) fp->f_data;
error = vget(vp, LK_EXCLUSIVE, td);
if (error)
goto done;
if (vp->v_type != VREG && vp->v_type != VCHR) {
error = EINVAL;
goto done;
}
if (vp->v_type == VREG) {
/*
* Get the proper underlying object
*/
if (VOP_GETVOBJECT(vp, &obj) != 0) {
error = EINVAL;
goto done;
}
if (obj->handle != vp) {
vput(vp);
vp = (struct vnode*)obj->handle;
vget(vp, LK_EXCLUSIVE, td);
}
}
/*
* XXX hack to handle use of /dev/zero to map anon memory (ala
* SunOS).
*/
if ((vp->v_type == VCHR) &&
(vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
handle = NULL;
maxprot = VM_PROT_ALL;
flags |= MAP_ANON;
pos = 0;
} else {
/*
* cdevs do not provide private mappings of any kind.
*/
/*
* However, for the XIG X server to continue to work,
* we should allow the superuser to do it anyway.
* We only allow it at securelevel < 1.
* (Because the XIG X server writes directly to video
* memory via /dev/mem, it should never work at any
* other securelevel.)
* XXX this will have to go
*/
if (securelevel_ge(td->td_ucred, 1))
disablexworkaround = 1;
else
disablexworkaround = suser(td);
if (vp->v_type == VCHR && disablexworkaround &&
(flags & (MAP_PRIVATE|MAP_COPY))) {
error = EINVAL;
goto done;
}
/*
* Ensure that file and memory protections are
* compatible. Note that we only worry about
* writability if mapping is shared; in this case,
* current and max prot are dictated by the open file.
* XXX use the vnode instead? Problem is: what
* credentials do we use for determination? What if
* proc does a setuid?
*/
maxprot = VM_PROT_EXECUTE; /* ??? */
if (fp->f_flag & FREAD) {
maxprot |= VM_PROT_READ;
} else if (prot & PROT_READ) {
error = EACCES;
goto done;
}
/*
* If we are sharing potential changes (either via
* MAP_SHARED or via the implicit sharing of character
* device mappings), and we are trying to get write
* permission although we opened it without asking
* for it, bail out. Check for superuser, only if
* we're at securelevel < 1, to allow the XIG X server
* to continue to work.
*/
if ((flags & MAP_SHARED) != 0 ||
(vp->v_type == VCHR && disablexworkaround)) {
if ((fp->f_flag & FWRITE) != 0) {
struct vattr va;
if ((error =
VOP_GETATTR(vp, &va,
td->td_ucred, td))) {
goto done;
}
if ((va.va_flags &
(SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
maxprot |= VM_PROT_WRITE;
} else if (prot & PROT_WRITE) {
error = EPERM;
goto done;
}
} else if ((prot & PROT_WRITE) != 0) {
error = EACCES;
goto done;
}
} else {
maxprot |= VM_PROT_WRITE;
}
handle = (void *)vp;
}
}
/*
* Do not allow more than a certain number of vm_map_entry structures
* per process. Scale with the number of rforks sharing the map
* to make the limit reasonable for threads.
*/
if (max_proc_mmap &&
vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
error = ENOMEM;
goto done;
}
mtx_unlock(&Giant);
error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
flags, handle, pos);
mtx_lock(&Giant);
if (error == 0)
td->td_retval[0] = (register_t) (addr + pageoff);
done:
if (vp)
vput(vp);
mtx_unlock(&Giant);
if (fp)
fdrop(fp, td);
return (error);
}
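/*
 * Illustrative userland sketch (not part of the original file): the
 * alignment rule described in the header comment above. With MAP_FIXED
 * the address and file offset must share the same remainder modulo
 * PAGE_SIZE; without MAP_FIXED an unaligned offset merely shifts the
 * returned pointer by the page offset. All names below are hypothetical.
 */
#if 0
char *p;
p = mmap(fixed_addr, len, PROT_READ, MAP_SHARED | MAP_FIXED, fd, off);
/* Fails with EINVAL unless
 * ((uintptr_t)fixed_addr & PAGE_MASK) == (off & PAGE_MASK). */
#endif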
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
caddr_t addr;
int len;
int prot;
int flags;
int fd;
long pos;
};
#endif
int
ommap(td, uap)
struct thread *td;
struct ommap_args *uap;
{
struct mmap_args nargs;
static const char cvtbsdprot[8] = {
0,
PROT_EXEC,
PROT_WRITE,
PROT_EXEC | PROT_WRITE,
PROT_READ,
PROT_EXEC | PROT_READ,
PROT_WRITE | PROT_READ,
PROT_EXEC | PROT_WRITE | PROT_READ,
};
#define OMAP_ANON 0x0002
#define OMAP_COPY 0x0020
#define OMAP_SHARED 0x0010
#define OMAP_FIXED 0x0100
nargs.addr = uap->addr;
nargs.len = uap->len;
nargs.prot = cvtbsdprot[uap->prot & 0x7];
nargs.flags = 0;
if (uap->flags & OMAP_ANON)
nargs.flags |= MAP_ANON;
if (uap->flags & OMAP_COPY)
nargs.flags |= MAP_COPY;
if (uap->flags & OMAP_SHARED)
nargs.flags |= MAP_SHARED;
else
nargs.flags |= MAP_PRIVATE;
if (uap->flags & OMAP_FIXED)
nargs.flags |= MAP_FIXED;
nargs.fd = uap->fd;
nargs.pos = uap->pos;
return (mmap(td, &nargs));
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct msync_args {
void *addr;
int len;
int flags;
};
#endif
/*
* MPSAFE
*/
int
msync(td, uap)
struct thread *td;
struct msync_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
int flags;
vm_map_t map;
int rv;
addr = (vm_offset_t) uap->addr;
size = uap->len;
flags = uap->flags;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
return (EINVAL);
mtx_lock(&Giant);
map = &td->td_proc->p_vmspace->vm_map;
/*
* XXX Gak! If size is zero we are supposed to sync "all modified
* pages within the region containing addr". Unfortunately, we don't
* really keep track of individual mmaps so we approximate by flushing
* the range of the map entry containing addr. This can be incorrect
* if the region splits or is coalesced with a neighbor.
*/
if (size == 0) {
vm_map_entry_t entry;
vm_map_lock_read(map);
rv = vm_map_lookup_entry(map, addr, &entry);
vm_map_unlock_read(map);
if (rv == FALSE) {
rv = -1;
goto done2;
}
addr = entry->start;
size = entry->end - entry->start;
}
/*
* Clean the pages and interpret the return value.
*/
rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
(flags & MS_INVALIDATE) != 0);
done2:
mtx_unlock(&Giant);
switch (rv) {
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
return (EINVAL); /* Sun returns ENOMEM? */
case KERN_FAILURE:
return (EIO);
default:
return (EINVAL);
}
}
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
munmap(td, uap)
struct thread *td;
struct munmap_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_map_t map;
addr = (vm_offset_t) uap->addr;
size = uap->len;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
if (size == 0)
return (0);
/*
- * Check for illegal addresses. Watch out for address wrap... Note
- * that VM_*_ADDRESS are not constants due to casts (argh).
+ * Check for illegal addresses. Watch out for address wrap...
*/
- if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
- return (EINVAL);
-#ifndef __i386__
- if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
- return (EINVAL);
-#endif
map = &td->td_proc->p_vmspace->vm_map;
+ if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
+ return (EINVAL);
/*
* Make sure entire range is allocated.
*/
if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
return (EINVAL);
/* returns nothing but KERN_SUCCESS anyway */
(void) vm_map_remove(map, addr, addr + size);
return (0);
}
#if 0
void
munmapfd(td, fd)
struct thread *td;
int fd;
{
/*
* XXX should unmap any regions mapped to this file
*/
FILEDESC_LOCK(p->p_fd);
td->td_proc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
FILEDESC_UNLOCK(p->p_fd);
}
#endif
#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
const void *addr;
size_t len;
int prot;
};
#endif
/*
* MPSAFE
*/
int
mprotect(td, uap)
struct thread *td;
struct mprotect_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_prot_t prot;
addr = (vm_offset_t) uap->addr;
size = uap->len;
prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
if (prot & VM_PROT_READ)
prot |= VM_PROT_EXECUTE;
#endif
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, prot, FALSE)) {
case KERN_SUCCESS:
return (0);
case KERN_PROTECTION_FAILURE:
return (EACCES);
}
return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
void *addr;
size_t len;
int inherit;
};
#endif
/*
* MPSAFE
*/
int
minherit(td, uap)
struct thread *td;
struct minherit_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_inherit_t inherit;
addr = (vm_offset_t)uap->addr;
size = uap->len;
inherit = uap->inherit;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, inherit)) {
case KERN_SUCCESS:
return (0);
case KERN_PROTECTION_FAILURE:
return (EACCES);
}
return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
void *addr;
size_t len;
int behav;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
madvise(td, uap)
struct thread *td;
struct madvise_args *uap;
{
vm_offset_t start, end;
+ vm_map_t map;
/*
* Check for illegal behavior
*/
if (uap->behav < 0 || uap->behav > MADV_CORE)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
- if (VM_MAXUSER_ADDRESS > 0 &&
- ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS)
+ map = &td->td_proc->p_vmspace->vm_map;
+ if ((vm_offset_t)uap->addr < vm_map_min(map) ||
+ (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
return (EINVAL);
-#ifndef __i386__
- if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS)
- return (EINVAL);
-#endif
if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
return (EINVAL);
/*
* Since this routine is only advisory, we default to conservative
* behavior.
*/
start = trunc_page((vm_offset_t) uap->addr);
end = round_page((vm_offset_t) uap->addr + uap->len);
- if (vm_map_madvise(&td->td_proc->p_vmspace->vm_map, start, end,
- uap->behav))
+ if (vm_map_madvise(map, start, end, uap->behav))
return (EINVAL);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
const void *addr;
size_t len;
char *vec;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
mincore(td, uap)
struct thread *td;
struct mincore_args *uap;
{
vm_offset_t addr, first_addr;
vm_offset_t end, cend;
pmap_t pmap;
vm_map_t map;
char *vec;
int error = 0;
int vecindex, lastvecindex;
vm_map_entry_t current;
vm_map_entry_t entry;
int mincoreinfo;
unsigned int timestamp;
/*
* Make sure that the addresses presented are valid for user
* mode.
*/
first_addr = addr = trunc_page((vm_offset_t) uap->addr);
end = addr + (vm_size_t)round_page(uap->len);
- if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS)
+ map = &td->td_proc->p_vmspace->vm_map;
+ if (end > vm_map_max(map) || end < addr)
return (EINVAL);
- if (end < addr)
- return (EINVAL);
/*
* Address of byte vector
*/
vec = uap->vec;
mtx_lock(&Giant);
- map = &td->td_proc->p_vmspace->vm_map;
pmap = vmspace_pmap(td->td_proc->p_vmspace);
vm_map_lock_read(map);
RestartScan:
timestamp = map->timestamp;
if (!vm_map_lookup_entry(map, addr, &entry))
entry = entry->next;
/*
* Do this on a map entry basis so that if the pages are not
* in the current process's address space, we can easily look
* up the pages elsewhere.
*/
lastvecindex = -1;
for (current = entry;
(current != &map->header) && (current->start < end);
current = current->next) {
/*
* ignore submaps (for now) or null objects
*/
if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
current->object.vm_object == NULL)
continue;
/*
* limit this scan to the current map entry and the
* limits for the mincore call
*/
if (addr < current->start)
addr = current->start;
cend = current->end;
if (cend > end)
cend = end;
/*
* scan this entry one page at a time
*/
while (addr < cend) {
/*
* Check pmap first, it is likely faster, also
* it can provide info as to whether we are the
* one referencing or modifying the page.
*/
mincoreinfo = pmap_mincore(pmap, addr);
if (!mincoreinfo) {
vm_pindex_t pindex;
vm_ooffset_t offset;
vm_page_t m;
/*
* calculate the page index into the object
*/
offset = current->offset + (addr - current->start);
pindex = OFF_TO_IDX(offset);
m = vm_page_lookup(current->object.vm_object,
pindex);
/*
* if the page is resident, then gather information about
* it.
*/
if (m) {
mincoreinfo = MINCORE_INCORE;
if (m->dirty ||
pmap_is_modified(m))
mincoreinfo |= MINCORE_MODIFIED_OTHER;
if ((m->flags & PG_REFERENCED) ||
pmap_ts_referenced(m)) {
vm_page_flag_set(m, PG_REFERENCED);
mincoreinfo |= MINCORE_REFERENCED_OTHER;
}
}
}
/*
* subyte may page fault. In case it needs to modify
* the map, we release the lock.
*/
vm_map_unlock_read(map);
/*
* calculate index into user supplied byte vector
*/
vecindex = OFF_TO_IDX(addr - first_addr);
/*
* If we have skipped map entries, we need to make sure that
* the byte vector is zeroed for those skipped entries.
*/
while ((lastvecindex + 1) < vecindex) {
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
++lastvecindex;
}
/*
* Pass the page information to the user
*/
error = subyte(vec + vecindex, mincoreinfo);
if (error) {
error = EFAULT;
goto done2;
}
/*
* If the map has changed, due to the subyte, the previous
* output may be invalid.
*/
vm_map_lock_read(map);
if (timestamp != map->timestamp)
goto RestartScan;
lastvecindex = vecindex;
addr += PAGE_SIZE;
}
}
/*
* subyte may page fault. In case it needs to modify
* the map, we release the lock.
*/
vm_map_unlock_read(map);
/*
* Zero the last entries in the byte vector.
*/
vecindex = OFF_TO_IDX(end - first_addr);
while ((lastvecindex + 1) < vecindex) {
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
++lastvecindex;
}
/*
* If the map has changed, due to the subyte, the previous
* output may be invalid.
*/
vm_map_lock_read(map);
if (timestamp != map->timestamp)
goto RestartScan;
vm_map_unlock_read(map);
done2:
mtx_unlock(&Giant);
return (error);
}
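/*
 * Illustrative userland sketch (not part of the original file):
 * mincore(2) fills one status byte per page of the queried range.
 * The names below are hypothetical.
 */
#if 0
char vec[npages];
if (mincore(base, npages * PAGE_SIZE, vec) == 0 &&
    (vec[0] & MINCORE_INCORE) != 0)
/* the first page is resident */ ;
#endif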
#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
const void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
mlock(td, uap)
struct thread *td;
struct mlock_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
int error;
addr = (vm_offset_t) uap->addr;
size = uap->len;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
/* disable wrap around */
if (addr + size < addr)
return (EINVAL);
if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
return (EAGAIN);
#ifdef pmap_wired_count
if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) >
td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
return (ENOMEM);
#else
error = suser(td);
if (error)
return (error);
#endif
error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, TRUE);
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
int how;
};
#endif
/*
* MPSAFE
*/
int
mlockall(td, uap)
struct thread *td;
struct mlockall_args *uap;
{
/* mtx_lock(&Giant); */
/* mtx_unlock(&Giant); */
return 0;
}
#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
int how;
};
#endif
/*
* MPSAFE
*/
int
munlockall(td, uap)
struct thread *td;
struct munlockall_args *uap;
{
/* mtx_lock(&Giant); */
/* mtx_unlock(&Giant); */
return 0;
}
#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
const void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
munlock(td, uap)
struct thread *td;
struct munlock_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
int error;
addr = (vm_offset_t) uap->addr;
size = uap->len;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
/* disable wrap around */
if (addr + size < addr)
return (EINVAL);
#ifndef pmap_wired_count
error = suser(td);
if (error)
return (error);
#endif
error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, TRUE);
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
/*
* vm_mmap()
*
* MPSAFE
*
* Internal version of mmap. Currently used by mmap, exec, and sys5
* shared memory. Handle is either a vnode pointer or NULL for MAP_ANON.
*/
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
vm_prot_t maxprot, int flags,
void *handle,
vm_ooffset_t foff)
{
boolean_t fitit;
vm_object_t object;
struct vnode *vp = NULL;
objtype_t type;
int rv = KERN_SUCCESS;
vm_ooffset_t objsize;
int docow;
struct thread *td = curthread;
if (size == 0)
return (0);
objsize = size = round_page(size);
if (td->td_proc->p_vmspace->vm_map.size + size >
td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
return(ENOMEM);
}
/*
* We currently can only deal with page aligned file offsets.
* The check is here rather than in the syscall because the
* kernel calls this function internally for other mmapping
* operations (such as in exec) and non-aligned offsets will
* cause pmap inconsistencies...so we want to be sure to
* disallow this in all cases.
*/
if (foff & PAGE_MASK)
return (EINVAL);
if ((flags & MAP_FIXED) == 0) {
fitit = TRUE;
*addr = round_page(*addr);
} else {
if (*addr != trunc_page(*addr))
return (EINVAL);
fitit = FALSE;
(void) vm_map_remove(map, *addr, *addr + size);
}
/*
* Lookup/allocate object.
*/
if (flags & MAP_ANON) {
type = OBJT_DEFAULT;
/*
* Unnamed anonymous regions always start at 0.
*/
if (handle == 0)
foff = 0;
} else {
vp = (struct vnode *) handle;
mtx_lock(&Giant);
ASSERT_VOP_LOCKED(vp, "vm_mmap");
if (vp->v_type == VCHR) {
type = OBJT_DEVICE;
handle = (void *)(intptr_t)vp->v_rdev;
} else {
struct vattr vat;
int error;
error = VOP_GETATTR(vp, &vat, td->td_ucred, td);
if (error) {
mtx_unlock(&Giant);
return (error);
}
objsize = round_page(vat.va_size);
type = OBJT_VNODE;
/*
* if it is a regular file without any references
* we do not need to sync it.
*/
if (vp->v_type == VREG && vat.va_nlink == 0) {
flags |= MAP_NOSYNC;
}
}
mtx_unlock(&Giant);
}
if (handle == NULL) {
object = NULL;
docow = 0;
} else {
object = vm_pager_allocate(type,
handle, objsize, prot, foff);
if (object == NULL) {
return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
}
docow = MAP_PREFAULT_PARTIAL;
}
/*
* Force device mappings to be shared.
*/
if (type == OBJT_DEVICE || type == OBJT_PHYS) {
flags &= ~(MAP_PRIVATE|MAP_COPY);
flags |= MAP_SHARED;
}
if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
docow |= MAP_COPY_ON_WRITE;
if (flags & MAP_NOSYNC)
docow |= MAP_DISABLE_SYNCER;
if (flags & MAP_NOCORE)
docow |= MAP_DISABLE_COREDUMP;
#if defined(VM_PROT_READ_IS_EXEC)
if (prot & VM_PROT_READ)
prot |= VM_PROT_EXECUTE;
if (maxprot & VM_PROT_READ)
maxprot |= VM_PROT_EXECUTE;
#endif
if (fitit)
*addr = pmap_addr_hint(object, *addr, size);
if (flags & MAP_STACK)
rv = vm_map_stack (map, *addr, size, prot,
maxprot, docow);
else
rv = vm_map_find(map, object, foff, addr, size, fitit,
prot, maxprot, docow);
if (rv != KERN_SUCCESS) {
/*
* Lose the object reference. Will destroy the
* object if it's an unnamed anonymous mapping
* or named anonymous without other references.
*/
vm_object_deallocate(object);
} else if (flags & MAP_SHARED) {
/*
* Shared memory is also shared with children.
*/
rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
if (rv != KERN_SUCCESS)
(void) vm_map_remove(map, *addr, *addr + size);
}
switch (rv) {
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
case KERN_NO_SPACE:
return (ENOMEM);
case KERN_PROTECTION_FAILURE:
return (EACCES);
default:
return (EINVAL);
}
}
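/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): a userland
 * caller showing the constraint vm_mmap() enforces above -- file offsets
 * handed to mmap(2) must be page aligned, otherwise the call fails with
 * EINVAL.  The file used here ("/etc/services") is an arbitrary readable
 * file chosen for the example.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	int fd = open("/etc/services", O_RDONLY);

	if (fd == -1 || pagesz <= 0)
		return (1);

	/* Page-aligned offset (0): expected to succeed. */
	void *ok = mmap(NULL, (size_t)pagesz, PROT_READ, MAP_SHARED, fd, 0);
	printf("aligned offset:   %s\n",
	    ok == MAP_FAILED ? strerror(errno) : "mapped");

	/* Unaligned offset: rejected before any pager object is looked up. */
	void *bad = mmap(NULL, (size_t)pagesz, PROT_READ, MAP_SHARED, fd,
	    (off_t)(pagesz / 2));
	printf("unaligned offset: %s\n",
	    bad == MAP_FAILED ? strerror(errno) : "mapped");

	if (ok != MAP_FAILED)
		munmap(ok, (size_t)pagesz);
	close(fd);
	return (0);
}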
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c (revision 103766)
+++ head/sys/vm/vm_pageout.c (revision 103767)
@@ -1,1537 +1,1537 @@
/*
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
* Copyright (c) 1994 John S. Dyson
* All rights reserved.
* Copyright (c) 1994 David Greenman
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $FreeBSD$
*/
/*
* The proverbial page-out daemon.
*/
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <machine/mutex.h>
/*
* System initialization
*/
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static int vm_pageout_clean(vm_page_t);
static void vm_pageout_scan(int pass);
static int vm_pageout_free_page_calc(vm_size_t count);
struct proc *pageproc;
static struct kproc_desc page_kp = {
"pagedaemon",
vm_pageout,
&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon(void);
static struct proc *vmproc;
static struct kproc_desc vm_kp = {
"vmdaemon",
vm_daemon,
&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
#endif
int vm_pages_needed=0; /* Event on which pageout daemon sleeps */
int vm_pageout_deficit=0; /* Estimated number of pages deficit */
int vm_pageout_pages_needed=0; /* flag saying that the pageout daemon needs pages */
#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
#endif
extern int vm_swap_size;
static int vm_max_launder = 32;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
#else
static int vm_swap_enabled=1;
static int vm_swap_idle_enabled=0;
#endif
SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
SYSCTL_INT(_vm, OID_AUTO, max_launder,
CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif
SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
int vm_page_max_wired; /* XXX max # of wired pages system-wide */
#if !defined(NO_SWAPPING)
typedef void freeer_fcn_t(vm_map_t, vm_object_t, vm_pindex_t, int);
static void vm_pageout_map_deactivate_pages(vm_map_t, vm_pindex_t);
static freeer_fcn_t vm_pageout_object_deactivate_pages;
static void vm_req_vmdaemon(void);
#endif
static void vm_pageout_page_stats(void);
/*
* vm_pageout_clean:
*
* Clean the page and remove it from the laundry.
*
* We set the busy bit to cause potential page faults on this page to
* block. Note the careful timing, however: the busy bit isn't set until
* late, and we cannot do anything that would mess with the page.
*/
static int
vm_pageout_clean(m)
vm_page_t m;
{
vm_object_t object;
vm_page_t mc[2*vm_pageout_page_count];
int pageout_count;
int ib, is, page_base;
vm_pindex_t pindex = m->pindex;
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
object = m->object;
/*
* It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
* with the new swapper, but we could have serious problems paging
* out other object types if there is insufficient memory.
*
* Unfortunately, checking free memory here is far too late, so the
* check has been moved up a procedural level.
*/
/*
* Don't mess with the page if it's busy, held, or special
*/
if ((m->hold_count != 0) ||
((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
return 0;
}
mc[vm_pageout_page_count] = m;
pageout_count = 1;
page_base = vm_pageout_page_count;
ib = 1;
is = 1;
/*
* Scan object for clusterable pages.
*
* We can cluster ONLY if: ->> the page is NOT
* clean, wired, busy, held, or mapped into a
* buffer, and one of the following:
* 1) The page is inactive, or a seldom used
* active page.
* -or-
* 2) we force the issue.
*
* During heavy mmap/modification loads the pageout
* daemon can really fragment the underlying file
* due to flushing pages out of order and not trying to
* align the clusters (which leaves sporadic out-of-order
* holes). To solve this problem we do the reverse scan
* first and attempt to align our cluster, then do a
* forward scan if room remains.
*/
more:
while (ib && pageout_count < vm_pageout_page_count) {
vm_page_t p;
if (ib > pindex) {
ib = 0;
break;
}
if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
ib = 0;
break;
}
if (((p->queue - p->pc) == PQ_CACHE) ||
(p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
ib = 0;
break;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) == 0 ||
p->queue != PQ_INACTIVE ||
p->wire_count != 0 || /* may be held by buf cache */
p->hold_count != 0) { /* may be undergoing I/O */
ib = 0;
break;
}
mc[--page_base] = p;
++pageout_count;
++ib;
/*
* alignment boundary, stop here and switch directions. Do
* not clear ib.
*/
if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
break;
}
while (pageout_count < vm_pageout_page_count &&
pindex + is < object->size) {
vm_page_t p;
if ((p = vm_page_lookup(object, pindex + is)) == NULL)
break;
if (((p->queue - p->pc) == PQ_CACHE) ||
(p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
break;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) == 0 ||
p->queue != PQ_INACTIVE ||
p->wire_count != 0 || /* may be held by buf cache */
p->hold_count != 0) { /* may be undergoing I/O */
break;
}
mc[page_base + pageout_count] = p;
++pageout_count;
++is;
}
/*
* If we exhausted our forward scan, continue with the reverse scan
* when possible, even past a page boundary. This catches boundary
* conditions.
*/
if (ib && pageout_count < vm_pageout_page_count)
goto more;
/*
* we allow reads during pageouts...
*/
return vm_pageout_flush(&mc[page_base], pageout_count, 0);
}
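/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): the
 * bidirectional clustering performed by vm_pageout_clean() above, reduced
 * to plain page indices.  is_flushable() is a hypothetical stand-in for
 * the real dirty/inactive/unbusied tests, and CLUSTER stands in for
 * vm_pageout_page_count.  The kernel additionally resumes the reverse scan
 * if the forward scan leaves room; that refinement is omitted here.
 */
#include <stdbool.h>
#include <stdio.h>

#define CLUSTER	8

/* Hypothetical predicate: which page indices may be flushed together. */
static bool
is_flushable(long pindex)
{
	return (pindex >= 0 && pindex < 64 && pindex != 21);
}

int
main(void)
{
	long pindex = 27;		/* the page that triggered the flush */
	long mc[2 * CLUSTER];		/* cluster window, target in the middle */
	int page_base = CLUSTER, count = 1;
	long ib = 1, is = 1;

	mc[page_base] = pindex;

	/* Reverse scan first, stopping at a CLUSTER-aligned boundary. */
	while (ib != 0 && count < CLUSTER) {
		if (ib > pindex || !is_flushable(pindex - ib)) {
			ib = 0;
			break;
		}
		mc[--page_base] = pindex - ib;
		count++;
		ib++;
		if ((pindex - (ib - 1)) % CLUSTER == 0)
			break;
	}
	/* Then scan forward with whatever room remains. */
	while (count < CLUSTER && is_flushable(pindex + is)) {
		mc[page_base + count] = pindex + is;
		count++;
		is++;
	}

	printf("would flush %d pages starting at index %ld:", count, mc[page_base]);
	for (int i = 0; i < count; i++)
		printf(" %ld", mc[page_base + i]);
	printf("\n");
	return (0);
}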
/*
* vm_pageout_flush() - launder the given pages
*
* The given pages are laundered. Note that we setup for the start of
* I/O ( i.e. busy the page ), mark it read-only, and bump the object
* reference count all in here rather than in the parent. If we want
* the parent to do more sophisticated things we may have to change
* the ordering.
*/
int
vm_pageout_flush(mc, count, flags)
vm_page_t *mc;
int count;
int flags;
{
vm_object_t object;
int pageout_status[count];
int numpagedout = 0;
int i;
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
/*
* Initiate I/O. Bump the vm_page_t->busy counter and
* mark the pages read-only.
*
* We do not have to fixup the clean/dirty bits here... we can
* allow the pager to do it after the I/O completes.
*
* NOTE! mc[i]->dirty may be partial or fragmented due to an
* edge case with file fragments.
*/
for (i = 0; i < count; i++) {
KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially invalid page", mc[i], i, count));
vm_page_io_start(mc[i]);
vm_page_protect(mc[i], VM_PROT_READ);
}
object = mc[0]->object;
vm_page_unlock_queues();
vm_object_pip_add(object, count);
vm_pager_put_pages(object, mc, count,
(flags | ((object == kernel_object) ? OBJPC_SYNC : 0)),
pageout_status);
vm_page_lock_queues();
for (i = 0; i < count; i++) {
vm_page_t mt = mc[i];
switch (pageout_status[i]) {
case VM_PAGER_OK:
numpagedout++;
break;
case VM_PAGER_PEND:
numpagedout++;
break;
case VM_PAGER_BAD:
/*
* Page outside of range of object. Right now we
* essentially lose the changes by pretending it
* worked.
*/
pmap_clear_modify(mt);
vm_page_undirty(mt);
break;
case VM_PAGER_ERROR:
case VM_PAGER_FAIL:
/*
* If page couldn't be paged out, then reactivate the
* page so it doesn't clog the inactive list. (We
* will try paging it out again later).
*/
vm_page_activate(mt);
break;
case VM_PAGER_AGAIN:
break;
}
/*
* If the operation is still going, leave the page busy to
* block all other accesses. Also, leave the paging in
* progress indicator set so that we don't attempt an object
* collapse.
*/
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
vm_page_io_finish(mt);
if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
vm_page_protect(mt, VM_PROT_READ);
}
}
return numpagedout;
}
#if !defined(NO_SWAPPING)
/*
* vm_pageout_object_deactivate_pages
*
* deactivate enough pages to satisfy the inactive target
* requirements or, if vm_page_proc_limit is set, then
* deactivate all of the pages in the object and its
* backing_objects.
*
* The object and map must be locked.
*/
static void
vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
vm_map_t map;
vm_object_t object;
vm_pindex_t desired;
int map_remove_only;
{
vm_page_t p, next;
int actcount, rcount, remove_mode;
GIANT_REQUIRED;
if (object->type == OBJT_DEVICE || object->type == OBJT_PHYS)
return;
while (object) {
if (pmap_resident_count(vm_map_pmap(map)) <= desired)
return;
if (object->paging_in_progress)
return;
remove_mode = map_remove_only;
if (object->shadow_count > 1)
remove_mode = 1;
/*
* scan the object's entire memory queue
*/
rcount = object->resident_page_count;
p = TAILQ_FIRST(&object->memq);
vm_page_lock_queues();
while (p && (rcount-- > 0)) {
if (pmap_resident_count(map->pmap) <= desired) {
vm_page_unlock_queues();
return;
}
next = TAILQ_NEXT(p, listq);
cnt.v_pdpages++;
if (p->wire_count != 0 ||
p->hold_count != 0 ||
p->busy != 0 ||
(p->flags & (PG_BUSY|PG_UNMANAGED)) ||
!pmap_page_exists_quick(vm_map_pmap(map), p)) {
p = next;
continue;
}
actcount = pmap_ts_referenced(p);
if (actcount) {
vm_page_flag_set(p, PG_REFERENCED);
} else if (p->flags & PG_REFERENCED) {
actcount = 1;
}
if ((p->queue != PQ_ACTIVE) &&
(p->flags & PG_REFERENCED)) {
vm_page_activate(p);
p->act_count += actcount;
vm_page_flag_clear(p, PG_REFERENCED);
} else if (p->queue == PQ_ACTIVE) {
if ((p->flags & PG_REFERENCED) == 0) {
p->act_count -= min(p->act_count, ACT_DECLINE);
if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
vm_page_protect(p, VM_PROT_NONE);
vm_page_deactivate(p);
} else {
vm_pageq_requeue(p);
}
} else {
vm_page_activate(p);
vm_page_flag_clear(p, PG_REFERENCED);
if (p->act_count < (ACT_MAX - ACT_ADVANCE))
p->act_count += ACT_ADVANCE;
vm_pageq_requeue(p);
}
} else if (p->queue == PQ_INACTIVE) {
vm_page_protect(p, VM_PROT_NONE);
}
p = next;
}
vm_page_unlock_queues();
object = object->backing_object;
}
}
/*
* deactivate some number of pages in a map, try to do it fairly, but
* that is really hard to do.
*/
static void
vm_pageout_map_deactivate_pages(map, desired)
vm_map_t map;
vm_pindex_t desired;
{
vm_map_entry_t tmpe;
vm_object_t obj, bigobj;
int nothingwired;
GIANT_REQUIRED;
if (!vm_map_trylock(map))
return;
bigobj = NULL;
nothingwired = TRUE;
/*
* first, search out the biggest object, and try to free pages from
* that.
*/
tmpe = map->header.next;
while (tmpe != &map->header) {
if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
obj = tmpe->object.vm_object;
if ((obj != NULL) && (obj->shadow_count <= 1) &&
((bigobj == NULL) ||
(bigobj->resident_page_count < obj->resident_page_count))) {
bigobj = obj;
}
}
if (tmpe->wired_count > 0)
nothingwired = FALSE;
tmpe = tmpe->next;
}
if (bigobj)
vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);
/*
* Next, hunt around for other pages to deactivate. We actually
* do this search sort of wrong -- .text first is not the best idea.
*/
tmpe = map->header.next;
while (tmpe != &map->header) {
if (pmap_resident_count(vm_map_pmap(map)) <= desired)
break;
if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
obj = tmpe->object.vm_object;
if (obj)
vm_pageout_object_deactivate_pages(map, obj, desired, 0);
}
tmpe = tmpe->next;
};
/*
* Remove all mappings if a process is swapped out, this will free page
* table pages.
*/
if (desired == 0 && nothingwired)
- pmap_remove(vm_map_pmap(map),
- VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
+ pmap_remove(vm_map_pmap(map), vm_map_min(map),
+ vm_map_max(map));
vm_map_unlock(map);
return;
}
#endif /* !defined(NO_SWAPPING) */
/*
* Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore
* to vnode deadlocks. We only do it for OBJT_DEFAULT and OBJT_SWAP objects
* which we know can be trivially freed.
*/
void
vm_pageout_page_free(vm_page_t m) {
vm_object_t object = m->object;
int type = object->type;
GIANT_REQUIRED;
if (type == OBJT_SWAP || type == OBJT_DEFAULT)
vm_object_reference(object);
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
cnt.v_dfree++;
if (type == OBJT_SWAP || type == OBJT_DEFAULT)
vm_object_deallocate(object);
}
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*/
static void
vm_pageout_scan(int pass)
{
vm_page_t m, next;
struct vm_page marker;
int save_page_shortage;
int save_inactive_count;
int page_shortage, maxscan, pcount;
int addl_page_shortage, addl_page_shortage_init;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
int actcount;
int vnodes_skipped = 0;
int maxlaunder;
int s;
struct thread *td;
GIANT_REQUIRED;
/*
* Do whatever cleanup that the pmap code can.
*/
pmap_collect();
uma_reclaim();
addl_page_shortage_init = vm_pageout_deficit;
vm_pageout_deficit = 0;
/*
* Calculate the number of pages we want to either free or move
* to the cache.
*/
page_shortage = vm_paging_target() + addl_page_shortage_init;
save_page_shortage = page_shortage;
save_inactive_count = cnt.v_inactive_count;
/*
* Initialize our marker
*/
bzero(&marker, sizeof(marker));
marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
marker.queue = PQ_INACTIVE;
marker.wire_count = 1;
/*
* Start scanning the inactive queue for pages we can move to the
* cache or free. The scan will stop when the target is reached or
* we have scanned the entire inactive queue. Note that m->act_count
* is not used to form decisions for the inactive queue, only for the
* active queue.
*
* maxlaunder limits the number of dirty pages we flush per scan.
* For most systems a smaller value (16 or 32) is more robust under
* extreme memory and disk pressure because any unnecessary writes
* to disk can result in extreme performance degradation. However,
* systems with excessive dirty pages (especially when MAP_NOSYNC is
* used) will die horribly with limited laundering. If the pageout
* daemon cannot clean enough pages in the first pass, we let it go
* all out in succeeding passes.
*/
if ((maxlaunder = vm_max_launder) <= 1)
maxlaunder = 1;
if (pass)
maxlaunder = 10000;
rescan0:
addl_page_shortage = addl_page_shortage_init;
maxscan = cnt.v_inactive_count;
for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
m != NULL && maxscan-- > 0 && page_shortage > 0;
m = next) {
cnt.v_pdpages++;
if (m->queue != PQ_INACTIVE) {
goto rescan0;
}
next = TAILQ_NEXT(m, pageq);
/*
* skip marker pages
*/
if (m->flags & PG_MARKER)
continue;
/*
* A held page may be undergoing I/O, so skip it.
*/
if (m->hold_count) {
vm_pageq_requeue(m);
addl_page_shortage++;
continue;
}
/*
* Don't mess with busy pages; keep them at the front of the
* queue, as they are most likely being paged out.
*/
if (m->busy || (m->flags & PG_BUSY)) {
addl_page_shortage++;
continue;
}
/*
* If the object is not being used, we ignore previous
* references.
*/
if (m->object->ref_count == 0) {
vm_page_flag_clear(m, PG_REFERENCED);
pmap_clear_reference(m);
/*
* Otherwise, if the page has been referenced while in the
* inactive queue, we bump the "activation count" upwards,
* making it less likely that the page will be added back to
* the inactive queue prematurely again. Here we check the
* page tables (or emulated bits, if any), since the upper
* level VM system does not know anything about existing
* references.
*/
} else if (((m->flags & PG_REFERENCED) == 0) &&
(actcount = pmap_ts_referenced(m))) {
vm_page_lock_queues();
vm_page_activate(m);
vm_page_unlock_queues();
m->act_count += (actcount + ACT_ADVANCE);
continue;
}
/*
* If the upper level VM system knows about any page
* references, we activate the page. We also set the
* "activation count" higher than normal so that we will less
* likely place pages back onto the inactive queue again.
*/
if ((m->flags & PG_REFERENCED) != 0) {
vm_page_flag_clear(m, PG_REFERENCED);
actcount = pmap_ts_referenced(m);
vm_page_lock_queues();
vm_page_activate(m);
vm_page_unlock_queues();
m->act_count += (actcount + ACT_ADVANCE + 1);
continue;
}
/*
* If the upper level VM system doesn't know anything about
* the page being dirty, we have to check for it again. As
* far as the VM code knows, any partially dirty pages are
* fully dirty.
*/
if (m->dirty == 0) {
vm_page_test_dirty(m);
} else {
vm_page_dirty(m);
}
/*
* Invalid pages can be easily freed
*/
if (m->valid == 0) {
vm_page_lock_queues();
vm_pageout_page_free(m);
vm_page_unlock_queues();
--page_shortage;
/*
* Clean pages can be placed onto the cache queue. This
* effectively frees them.
*/
} else if (m->dirty == 0) {
vm_page_lock_queues();
vm_page_cache(m);
vm_page_unlock_queues();
--page_shortage;
} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
/*
* Dirty pages need to be paged out, but flushing
* a page is extremely expensive versus freeing
* a clean page. Rather than artificially limiting
* the number of pages we can flush, we instead give
* dirty pages extra priority on the inactive queue
* by forcing them to be cycled through the queue
* twice before being flushed, after which the
* (now clean) page will cycle through once more
* before being freed. This significantly extends
* the thrash point for a heavily loaded machine.
*/
vm_page_flag_set(m, PG_WINATCFLS);
vm_pageq_requeue(m);
} else if (maxlaunder > 0) {
/*
* We always want to try to flush some dirty pages if
* we encounter them, to keep the system stable.
* Normally this number is small, but under extreme
* pressure where there are insufficient clean pages
* on the inactive queue, we may have to go all out.
*/
int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp;
object = m->object;
if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
swap_pageouts_ok = 1;
} else {
swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
vm_page_count_min());
}
/*
* We don't bother paging objects that are "dead".
* Those objects are in a "rundown" state.
*/
if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
vm_pageq_requeue(m);
continue;
}
/*
* The object is already known NOT to be dead. It
* is possible for the vget() to block the whole
* pageout daemon, but the new low-memory handling
* code should prevent it.
*
* The previous code skipped locked vnodes and, worse,
* reordered pages in the queue. This results in
* completely non-deterministic operation and, on a
* busy system, can lead to extremely non-optimal
* pageouts. For example, it can cause clean pages
* to be freed and dirty pages to be moved to the end
* of the queue. Since dirty pages are also moved to
* the end of the queue once-cleaned, this gives
* way too large a weighting to deferring the freeing
* of dirty pages.
*
* We can't wait forever for the vnode lock, we might
* deadlock due to a vn_read() getting stuck in
* vm_wait while holding this vnode. We skip the
* vnode if we can't get it in a reasonable amount
* of time.
*/
if (object->type == OBJT_VNODE) {
vp = object->handle;
mp = NULL;
if (vp->v_type == VREG)
vn_start_write(vp, &mp, V_NOWAIT);
if (vget(vp, LK_EXCLUSIVE|LK_NOOBJ|LK_TIMELOCK, curthread)) {
++pageout_lock_miss;
vn_finished_write(mp);
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
continue;
}
/*
* The page might have been moved to another
* queue during potential blocking in vget()
* above. The page might have been freed and
* reused for another vnode. The object might
* have been reused for another vnode.
*/
if (m->queue != PQ_INACTIVE ||
m->object != object ||
object->handle != vp) {
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
vput(vp);
vn_finished_write(mp);
continue;
}
/*
* The page may have been busied during the
* blocking in vput(); we don't move the
* page back onto the end of the queue, which
* keeps the statistics more correct.
*/
if (m->busy || (m->flags & PG_BUSY)) {
vput(vp);
vn_finished_write(mp);
continue;
}
/*
* If the page has become held, it might
* be undergoing I/O, so skip it
*/
if (m->hold_count) {
vm_pageq_requeue(m);
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
vput(vp);
vn_finished_write(mp);
continue;
}
}
/*
* If a page is dirty, then it is either being washed
* (but not yet cleaned) or it is still in the
* laundry. If it is still in the laundry, then we
* start the cleaning operation.
*
* This operation may cluster, invalidating the 'next'
* pointer. To prevent an inordinate number of
* restarts we use our marker to remember our place.
*
* decrement page_shortage on success to account for
* the (future) cleaned page. Otherwise we could wind
* up laundering or cleaning too many pages.
*/
vm_page_lock_queues();
s = splvm();
TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
splx(s);
if (vm_pageout_clean(m) != 0) {
--page_shortage;
--maxlaunder;
}
s = splvm();
next = TAILQ_NEXT(&marker, pageq);
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
splx(s);
vm_page_unlock_queues();
if (vp) {
vput(vp);
vn_finished_write(mp);
}
}
}
/*
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
*/
page_shortage = vm_paging_target() +
cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
vm_page_lock_queues();
/*
* Scan the active queue for things we can deactivate. We nominally
* track the per-page activity counter and use it to locate
* deactivation candidates.
*/
pcount = cnt.v_active_count;
m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
/*
* This is a consistency check, and should likely be a panic
* or warning.
*/
if (m->queue != PQ_ACTIVE) {
break;
}
next = TAILQ_NEXT(m, pageq);
/*
* Don't deactivate pages that are busy.
*/
if ((m->busy != 0) ||
(m->flags & PG_BUSY) ||
(m->hold_count != 0)) {
vm_pageq_requeue(m);
m = next;
continue;
}
/*
* The count for pagedaemon pages is done after checking the
* page for eligibility...
*/
cnt.v_pdpages++;
/*
* Check to see "how much" the page has been used.
*/
actcount = 0;
if (m->object->ref_count != 0) {
if (m->flags & PG_REFERENCED) {
actcount += 1;
}
actcount += pmap_ts_referenced(m);
if (actcount) {
m->act_count += ACT_ADVANCE + actcount;
if (m->act_count > ACT_MAX)
m->act_count = ACT_MAX;
}
}
/*
* Since we have "tested" this bit, we need to clear it now.
*/
vm_page_flag_clear(m, PG_REFERENCED);
/*
* Only if an object is currently being used, do we use the
* page activation count stats.
*/
if (actcount && (m->object->ref_count != 0)) {
vm_pageq_requeue(m);
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
if (vm_pageout_algorithm ||
m->object->ref_count == 0 ||
m->act_count == 0) {
page_shortage--;
if (m->object->ref_count == 0) {
vm_page_protect(m, VM_PROT_NONE);
if (m->dirty == 0)
vm_page_cache(m);
else
vm_page_deactivate(m);
} else {
vm_page_deactivate(m);
}
} else {
vm_pageq_requeue(m);
}
}
m = next;
}
s = splvm();
/*
* We try to maintain some *really* free pages; this allows interrupt
* code to be guaranteed space. Since both cache and free queues
* are considered basically 'free', moving pages from cache to free
* does not affect other calculations.
*/
while (cnt.v_free_count < cnt.v_free_reserved) {
static int cache_rover = 0;
m = vm_pageq_find(PQ_CACHE, cache_rover, FALSE);
if (!m)
break;
if ((m->flags & (PG_BUSY|PG_UNMANAGED)) ||
m->busy ||
m->hold_count ||
m->wire_count) {
#ifdef INVARIANTS
printf("Warning: busy page %p found in cache\n", m);
#endif
vm_page_deactivate(m);
continue;
}
cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK;
vm_pageout_page_free(m);
}
splx(s);
vm_page_unlock_queues();
#if !defined(NO_SWAPPING)
/*
* Idle process swapout -- run once per second.
*/
if (vm_swap_idle_enabled) {
static long lsec;
if (time_second != lsec) {
vm_pageout_req_swapout |= VM_SWAP_IDLE;
vm_req_vmdaemon();
lsec = time_second;
}
}
#endif
/*
* If we didn't get enough free pages, and we have skipped a vnode
* in a writeable object, wakeup the sync daemon. And kick swapout
* if we did not get enough free pages.
*/
if (vm_paging_target() > 0) {
if (vnodes_skipped && vm_page_count_min())
(void) speedup_syncer();
#if !defined(NO_SWAPPING)
if (vm_swap_enabled && vm_page_count_target()) {
vm_req_vmdaemon();
vm_pageout_req_swapout |= VM_SWAP_NORMAL;
}
#endif
}
/*
* If we are out of swap and were not able to reach our paging
* target, kill the largest process.
*
* We keep the process bigproc locked once we find it to keep anyone
* from messing with it; however, there is a possibility of
* deadlock if process B is bigproc and one of its child processes
* attempts to propagate a signal to B while we are waiting for A's
* lock while walking this list. To avoid this, we don't block on
* the process lock but just skip a process if it is already locked.
*/
if ((vm_swap_size < 64 && vm_page_count_min()) ||
(swap_pager_full && vm_paging_target() > 0)) {
#if 0
if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) {
#endif
bigproc = NULL;
bigsize = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
int breakout;
/*
* If this process is already locked, skip it.
*/
if (PROC_TRYLOCK(p) == 0)
continue;
/*
* if this is a system process, skip it
*/
if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
((p->p_pid < 48) && (vm_swap_size != 0))) {
PROC_UNLOCK(p);
continue;
}
/*
* if the process is in a non-running type state,
* don't touch it. Check all the threads individually.
*/
mtx_lock_spin(&sched_lock);
breakout = 0;
FOREACH_THREAD_IN_PROC(p, td) {
if (!TD_ON_RUNQ(td) &&
!TD_IS_RUNNING(td) &&
!TD_IS_SLEEPING(td)) {
breakout = 1;
break;
}
}
if (breakout) {
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
continue;
}
mtx_unlock_spin(&sched_lock);
/*
* get the process size
*/
size = vmspace_resident_count(p->p_vmspace) +
vmspace_swap_count(p->p_vmspace);
/*
* if this process is bigger than the biggest one,
* remember it.
*/
if (size > bigsize) {
if (bigproc != NULL)
PROC_UNLOCK(bigproc);
bigproc = p;
bigsize = size;
} else
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
if (bigproc != NULL) {
struct ksegrp *kg;
killproc(bigproc, "out of swap space");
mtx_lock_spin(&sched_lock);
FOREACH_KSEGRP_IN_PROC(bigproc, kg) {
kg->kg_estcpu = 0;
kg->kg_nice = PRIO_MIN; /* XXXKSE ??? */
resetpriority(kg);
}
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(bigproc);
wakeup(&cnt.v_free_count);
}
}
}
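/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): the
 * "kill the largest process" selection described above, reduced to a
 * scan over a hypothetical process table.  The struct and the sample
 * data are inventions for the example; the kernel sizes a process as
 * resident pages plus swapped pages and skips system processes.
 */
#include <stdbool.h>
#include <stdio.h>

struct ex_proc {
	const char	*name;
	bool		 system;	/* P_SYSTEM analogue */
	unsigned long	 resident;	/* vmspace_resident_count() analogue */
	unsigned long	 swapped;	/* vmspace_swap_count() analogue */
};

int
main(void)
{
	struct ex_proc procs[] = {
		{ "pagedaemon",	true,	 64,	  0 },
		{ "editor",	false,	4096,	512 },
		{ "simulator",	false, 90000, 30000 },
		{ "shell",	false,	 256,	  0 },
	};
	const struct ex_proc *bigproc = NULL;
	unsigned long bigsize = 0;

	for (size_t i = 0; i < sizeof(procs) / sizeof(procs[0]); i++) {
		if (procs[i].system)		/* never pick system processes */
			continue;
		unsigned long size = procs[i].resident + procs[i].swapped;
		if (size > bigsize) {
			bigproc = &procs[i];
			bigsize = size;
		}
	}
	if (bigproc != NULL)
		printf("would kill %s (%lu pages)\n", bigproc->name, bigsize);
	return (0);
}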
/*
* This routine tries to maintain the pseudo LRU active queue,
* so that during long periods of time when there is no paging,
* some statistic accumulation still occurs. This code
* helps the situation where paging just starts to occur.
*/
static void
vm_pageout_page_stats()
{
vm_page_t m,next;
int pcount,tpcount; /* Number of pages to check */
static int fullintervalcount = 0;
int page_shortage;
int s0;
page_shortage =
(cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
if (page_shortage <= 0)
return;
s0 = splvm();
vm_page_lock_queues();
pcount = cnt.v_active_count;
fullintervalcount += vm_pageout_stats_interval;
if (fullintervalcount < vm_pageout_full_stats_interval) {
tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count;
if (pcount > tpcount)
pcount = tpcount;
} else {
fullintervalcount = 0;
}
m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
while ((m != NULL) && (pcount-- > 0)) {
int actcount;
if (m->queue != PQ_ACTIVE) {
break;
}
next = TAILQ_NEXT(m, pageq);
/*
* Don't deactivate pages that are busy.
*/
if ((m->busy != 0) ||
(m->flags & PG_BUSY) ||
(m->hold_count != 0)) {
vm_pageq_requeue(m);
m = next;
continue;
}
actcount = 0;
if (m->flags & PG_REFERENCED) {
vm_page_flag_clear(m, PG_REFERENCED);
actcount += 1;
}
actcount += pmap_ts_referenced(m);
if (actcount) {
m->act_count += ACT_ADVANCE + actcount;
if (m->act_count > ACT_MAX)
m->act_count = ACT_MAX;
vm_pageq_requeue(m);
} else {
if (m->act_count == 0) {
/*
* We turn off page access, so that we have
* more accurate RSS stats. We don't do this
* in the normal page deactivation when the
* system is loaded VM wise, because the
* cost of the large number of page protect
* operations would be higher than the value
* of doing the operation.
*/
vm_page_protect(m, VM_PROT_NONE);
vm_page_deactivate(m);
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
vm_pageq_requeue(m);
}
}
m = next;
}
vm_page_unlock_queues();
splx(s0);
}
static int
vm_pageout_free_page_calc(count)
vm_size_t count;
{
if (count < cnt.v_page_count)
return 0;
/*
* free_reserved needs to include enough for the largest swap pager
* structures plus enough for any pv_entry structs when paging.
*/
if (cnt.v_page_count > 1024)
cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
else
cnt.v_free_min = 4;
cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
cnt.v_interrupt_free_min;
cnt.v_free_reserved = vm_pageout_page_count +
cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
cnt.v_free_severe = cnt.v_free_min / 2;
cnt.v_free_min += cnt.v_free_reserved;
cnt.v_free_severe += cnt.v_free_reserved;
return 1;
}
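/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): the
 * threshold arithmetic of vm_pageout_free_page_calc() above, evaluated for
 * a hypothetical 256 MB machine.  The constants below (4 KiB pages, a
 * 64 KiB MAXBSIZE, PQ_L2_SIZE of 256, vm_pageout_page_count of 16 and an
 * interrupt_free_min of 2) are assumptions made for the example, not
 * values taken from any particular kernel configuration.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long page_count = (256UL * 1024 * 1024) / 4096; /* 65536 pages */
	unsigned long maxbsize = 65536, page_size = 4096;
	unsigned long pq_l2_size = 256, pageout_page_count = 16;
	unsigned long interrupt_free_min = 2;

	unsigned long free_min = (page_count > 1024) ?
	    4 + (page_count - 1024) / 200 : 4;
	unsigned long pageout_free_min = (2 * maxbsize) / page_size +
	    interrupt_free_min;
	unsigned long free_reserved = pageout_page_count + pageout_free_min +
	    page_count / 768 + pq_l2_size;
	unsigned long free_severe = free_min / 2 + free_reserved;

	free_min += free_reserved;

	printf("v_free_min      = %lu pages\n", free_min);
	printf("v_free_reserved = %lu pages\n", free_reserved);
	printf("v_free_severe   = %lu pages\n", free_severe);
	return (0);
}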
/*
* vm_pageout is the high level pageout daemon.
*/
static void
vm_pageout()
{
int pass;
mtx_lock(&Giant);
/*
* Initialize some paging parameters.
*/
cnt.v_interrupt_free_min = 2;
if (cnt.v_page_count < 2000)
vm_pageout_page_count = 8;
vm_pageout_free_page_calc(cnt.v_page_count);
/*
* v_free_target and v_cache_min control pageout hysteresis. Note
* that these are more a measure of the VM cache queue hysteresis
* than the VM free queue. Specifically, v_free_target is the
* high water mark (free+cache pages).
*
* v_free_reserved + v_cache_min (mostly means v_cache_min) is the
* low water mark, while v_free_min is the stop. v_cache_min must
* be big enough to handle memory needs while the pageout daemon
* is signalled and run to free more pages.
*/
if (cnt.v_free_count > 6144)
cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
else
cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
if (cnt.v_free_count > 2048) {
cnt.v_cache_min = cnt.v_free_target;
cnt.v_cache_max = 2 * cnt.v_cache_min;
cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
} else {
cnt.v_cache_min = 0;
cnt.v_cache_max = 0;
cnt.v_inactive_target = cnt.v_free_count / 4;
}
if (cnt.v_inactive_target > cnt.v_free_count / 3)
cnt.v_inactive_target = cnt.v_free_count / 3;
/* XXX does not really belong here */
if (vm_page_max_wired == 0)
vm_page_max_wired = cnt.v_free_count / 3;
if (vm_pageout_stats_max == 0)
vm_pageout_stats_max = cnt.v_free_target;
/*
* Set interval in seconds for stats scan.
*/
if (vm_pageout_stats_interval == 0)
vm_pageout_stats_interval = 5;
if (vm_pageout_full_stats_interval == 0)
vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
/*
* Set maximum free per pass
*/
if (vm_pageout_stats_free_max == 0)
vm_pageout_stats_free_max = 5;
swap_pager_swap_init();
pass = 0;
/*
* The pageout daemon is never done, so loop forever.
*/
while (TRUE) {
int error;
int s = splvm();
/*
* If we have enough free memory, wakeup waiters. Do
* not clear vm_pages_needed until we reach our target,
* otherwise we may be woken up over and over again and
* waste a lot of cpu.
*/
if (vm_pages_needed && !vm_page_count_min()) {
if (vm_paging_needed() <= 0)
vm_pages_needed = 0;
wakeup(&cnt.v_free_count);
}
if (vm_pages_needed) {
/*
* Still not done, take a second pass without waiting
* (unlimited dirty cleaning), otherwise sleep a bit
* and try again.
*/
++pass;
if (pass > 1)
tsleep(&vm_pages_needed, PVM,
"psleep", hz/2);
} else {
/*
* Good enough, sleep & handle stats. Prime the pass
* for the next run.
*/
if (pass > 1)
pass = 1;
else
pass = 0;
error = tsleep(&vm_pages_needed, PVM,
"psleep", vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
splx(s);
pass = 0;
vm_pageout_page_stats();
continue;
}
}
if (vm_pages_needed)
cnt.v_pdwakeups++;
splx(s);
vm_pageout_scan(pass);
vm_pageout_deficit = 0;
}
}
void
pagedaemon_wakeup()
{
if (!vm_pages_needed && curthread->td_proc != pageproc) {
vm_pages_needed++;
wakeup(&vm_pages_needed);
}
}
#if !defined(NO_SWAPPING)
static void
vm_req_vmdaemon()
{
static int lastrun = 0;
if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
wakeup(&vm_daemon_needed);
lastrun = ticks;
}
}
static void
vm_daemon()
{
struct proc *p;
int breakout;
struct thread *td;
mtx_lock(&Giant);
while (TRUE) {
tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
if (vm_pageout_req_swapout) {
swapout_procs(vm_pageout_req_swapout);
vm_pageout_req_swapout = 0;
}
/*
* scan the processes for those exceeding their rlimits or that
* are swapped out -- deactivate pages
*/
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
vm_pindex_t limit, size;
/*
* if this is a system process or if we have already
* looked at this process, skip it.
*/
if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
continue;
}
/*
* if the process is in a non-running type state,
* don't touch it.
*/
mtx_lock_spin(&sched_lock);
breakout = 0;
FOREACH_THREAD_IN_PROC(p, td) {
if (!TD_ON_RUNQ(td) &&
!TD_IS_RUNNING(td) &&
!TD_IS_SLEEPING(td)) {
breakout = 1;
break;
}
}
if (breakout) {
mtx_unlock_spin(&sched_lock);
continue;
}
/*
* get a limit
*/
limit = OFF_TO_IDX(
qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
p->p_rlimit[RLIMIT_RSS].rlim_max));
/*
* let processes that are swapped out really be
* swapped out: set the limit to nothing (this will
* force a swap-out).
*/
if ((p->p_sflag & PS_INMEM) == 0)
limit = 0; /* XXX */
mtx_unlock_spin(&sched_lock);
size = vmspace_resident_count(p->p_vmspace);
if (limit >= 0 && size >= limit) {
vm_pageout_map_deactivate_pages(
&p->p_vmspace->vm_map, limit);
}
}
sx_sunlock(&allproc_lock);
}
}
#endif /* !defined(NO_SWAPPING) */
Index: head/sys/vm/vm_unix.c
===================================================================
--- head/sys/vm/vm_unix.c (revision 103766)
+++ head/sys/vm/vm_unix.c (revision 103767)
@@ -1,153 +1,153 @@
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_unix.c 1.1 89/11/07$
*
* @(#)vm_unix.c 8.1 (Berkeley) 6/11/93
* $FreeBSD$
*/
/*
* Traditional sbrk/grow interface to VM
*/
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#ifndef _SYS_SYSPROTO_H_
struct obreak_args {
char *nsize;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
obreak(td, uap)
struct thread *td;
struct obreak_args *uap;
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_offset_t new, old, base;
int rv;
int error = 0;
new = round_page((vm_offset_t)uap->nsize);
vm_map_lock(&vm->vm_map);
base = round_page((vm_offset_t) vm->vm_daddr);
old = base + ctob(vm->vm_dsize);
if (new > base) {
/*
* Check the resource limit, but allow a process to reduce
* its usage, even if it remains over the limit.
*/
if (new - base > td->td_proc->p_rlimit[RLIMIT_DATA].rlim_cur &&
new > old) {
error = ENOMEM;
goto done;
}
- if (new > VM_MAXUSER_ADDRESS) {
+ if (new > vm_map_max(&vm->vm_map)) {
error = ENOMEM;
goto done;
}
} else if (new < base) {
/*
* This is simply an invalid value. If someone wants to
* do fancy address space manipulations, mmap and munmap
* can do most of what the user would want.
*/
error = EINVAL;
goto done;
}
if (new > old) {
if (vm->vm_map.size + (new - old) >
td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
error = ENOMEM;
goto done;
}
rv = vm_map_insert(&vm->vm_map, NULL, 0, old, new,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS) {
error = ENOMEM;
goto done;
}
vm->vm_dsize += btoc(new - old);
} else if (new < old) {
rv = vm_map_delete(&vm->vm_map, new, old);
if (rv != KERN_SUCCESS) {
error = ENOMEM;
goto done;
}
vm->vm_dsize -= btoc(old - new);
}
done:
vm_map_unlock(&vm->vm_map);
return (error);
}
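/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): a
 * userland view of the break interface that obreak() implements above.
 * sbrk() is the traditional wrapper; growing the data segment past
 * RLIMIT_DATA (or past the top of the map) makes the kernel return
 * ENOMEM, which sbrk() reports as (void *)-1.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	void *before = sbrk(0);			/* current break */
	void *grown = sbrk(64 * 1024);		/* ask obreak() for 64 KiB more */

	if (grown == (void *)-1) {
		printf("sbrk failed: %s\n", strerror(errno));
		return (1);
	}
	printf("break moved from %p to %p\n", before, sbrk(0));

	/* Shrinking the break releases the pages again via vm_map_delete(). */
	sbrk(-(64 * 1024));
	return (0);
}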
#ifndef _SYS_SYSPROTO_H_
struct ovadvise_args {
int anom;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
ovadvise(td, uap)
struct thread *td;
struct ovadvise_args *uap;
{
/* START_GIANT_OPTIONAL */
/* END_GIANT_OPTIONAL */
return (EINVAL);
}
