Index: head/sys/alpha/osf1/imgact_osf1.c
===================================================================
--- head/sys/alpha/osf1/imgact_osf1.c (revision 103766)
+++ head/sys/alpha/osf1/imgact_osf1.c (revision 103767)
@@ -1,251 +1,251 @@
/*
* Copyright (c) 1998-1999 Andrew Gallatin
* All rights reserved.
*
* Based heavily on imgact_linux.c which is
* Copyright (c) 1994-1996 Søren Schmidt.
* Which in turn is based heavily on /sys/kern/imgact_aout.c which is:
* Copyright (c) 1993, David Greenman
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>
#include <sys/exec.h>
#include <sys/mman.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <alpha/osf1/exec_ecoff.h>
extern struct sysentvec osf1_sysvec;
#ifdef DEBUG
#define DPRINTF(a) printf a;
#else
#define DPRINTF(a)
#endif
static int
exec_osf1_imgact(struct image_params *imgp)
{
int error;
int path_not_saved;
size_t bytes;
const struct ecoff_exechdr *execp;
const struct ecoff_aouthdr *eap;
struct vmspace *vmspace;
vm_offset_t baddr;
vm_offset_t bsize;
vm_offset_t bss_start;
vm_offset_t daddr;
vm_offset_t dend;
vm_offset_t dsize;
vm_offset_t raw_dend;
vm_offset_t taddr;
vm_offset_t tend;
vm_offset_t tsize;
struct nameidata *ndp;
Osf_Auxargs *osf_auxargs;
GIANT_REQUIRED;
execp = (const struct ecoff_exechdr*)imgp->image_header;
eap = &execp->a;
ndp = NULL;
/* check to make sure we have an alpha ecoff executable */
if (ECOFF_BADMAG(execp))
return -1;
/* verify it is an OSF/1 executable */
if (eap->magic != ECOFF_ZMAGIC) {
printf("unknown ecoff magic %x\n", eap->magic);
return ENOEXEC;
}
osf_auxargs = malloc(sizeof(Osf_Auxargs), M_TEMP, M_WAITOK | M_ZERO);
imgp->auxargs = osf_auxargs;
osf_auxargs->executable = osf_auxargs->exec_path;
path_not_saved = copyinstr(imgp->fname, osf_auxargs->executable,
PATH_MAX, &bytes);
if (execp->f.f_flags & DYNAMIC_FLAG) {
if (path_not_saved) {
uprintf("path to dynamic exectutable not found\n");
free(imgp->auxargs, M_TEMP);
return(path_not_saved);
}
/*
* Unmap the executable & attempt to slide in
* /sbin/loader in its place.
*/
if (imgp->firstpage)
exec_unmap_first_page(imgp);
/*
* Replicate what execve does, and map the first
* page of the loader.
*/
ndp = (struct nameidata *)malloc(sizeof(struct nameidata),
M_TEMP, M_WAITOK);
NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE,
"/compat/osf1/sbin/loader",
FIRST_THREAD_IN_PROC(imgp->proc));
error = namei(ndp);
if (error) {
uprintf("imgact_osf1: can't read /compat/osf1/sbin/loader\n");
free(imgp->auxargs, M_TEMP);
return(error);
}
if (imgp->vp) {
vput(imgp->vp);
/* leaking in the nameizone ??? XXX */
}
imgp->vp = ndp->ni_vp;
error = exec_map_first_page(imgp);
osf_auxargs->loader = "/compat/osf1/sbin/loader";
}
execp = (const struct ecoff_exechdr*)imgp->image_header;
eap = &execp->a;
taddr = ECOFF_SEGMENT_ALIGN(execp, eap->text_start);
tend = round_page(eap->text_start + eap->tsize);
tsize = tend - taddr;
daddr = ECOFF_SEGMENT_ALIGN(execp, eap->data_start);
dend = round_page(eap->data_start + eap->dsize);
dsize = dend - daddr;
bss_start = ECOFF_SEGMENT_ALIGN(execp, eap->bss_start);
bsize = eap->bsize;
imgp->entry_addr = eap->entry;
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(imgp);
if (error)
goto bail;
/*
* Destroy old process VM and create a new one (with a new stack).
*/
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &osf1_sysvec);
/*
* The vm space can now be changed.
*/
vmspace = imgp->proc->p_vmspace;
imgp->interpreted = 0;
imgp->proc->p_sysent = &osf1_sysvec;
/* set up text segment */
if ((error = vm_mmap(&vmspace->vm_map, &taddr, tsize,
VM_PROT_READ|VM_PROT_EXECUTE, VM_PROT_ALL, MAP_FIXED|MAP_COPY,
(caddr_t)imgp->vp, ECOFF_TXTOFF(execp)))) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
return error;
}
/* .. data .. */
if ((error = vm_mmap(&vmspace->vm_map, &daddr, dsize,
VM_PROT_READ|VM_PROT_EXECUTE|VM_PROT_WRITE, VM_PROT_ALL,
MAP_FIXED|MAP_COPY, (caddr_t)imgp->vp, ECOFF_DATOFF(execp)))) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
goto bail;
}
/* .. bss .. */
if (round_page(bsize)) {
baddr = bss_start;
if ((error = vm_map_find(&vmspace->vm_map, NULL,
(vm_offset_t) 0, &baddr, round_page(bsize), FALSE,
VM_PROT_ALL, VM_PROT_ALL, FALSE))) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__,
error));
goto bail;
}
}
raw_dend = (eap->data_start + eap->dsize);
if (dend > raw_dend) {
caddr_t zeros;
zeros = malloc(dend-raw_dend,M_TEMP,M_WAITOK|M_ZERO);
if ((error = copyout(zeros, (caddr_t)raw_dend,
dend-raw_dend))) {
uprintf("Can't zero start of bss, error %d\n",error);
free(zeros,M_TEMP);
goto bail;
}
free(zeros,M_TEMP);
}
vmspace->vm_tsize = btoc(round_page(tsize));
vmspace->vm_dsize = btoc((round_page(dsize) + round_page(bsize)));
vmspace->vm_taddr = (caddr_t)taddr;
vmspace->vm_daddr = (caddr_t)daddr;
return(0);
bail:
free(imgp->auxargs, M_TEMP);
if (ndp) {
VOP_CLOSE(ndp->ni_vp, FREAD, imgp->proc->p_ucred,
FIRST_THREAD_IN_PROC(imgp->proc));
vrele(ndp->ni_vp);
}
return(error);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
struct execsw osf1_execsw = { exec_osf1_imgact, "OSF/1 ECOFF" };
EXEC_SET(osf1_ecoff, osf1_execsw);
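The hunk above is the substance of this revision for imgact_osf1.c: exec_new_vmspace() now takes the ABI's struct sysentvec rather than explicit minimum/maximum user addresses and a stack address. A minimal sketch of the resulting call-site shape follows; exec_foo_imgact() and foo_sysvec are hypothetical names, and the idea that the sysentvec carries the per-ABI address-space layout is inferred from the hunks in this diff rather than shown as field names here.

/*
 * Illustrative sketch only, not part of the commit: the shape of an
 * image activator after the exec_new_vmspace() signature change.
 * exec_foo_imgact() and foo_sysvec are hypothetical.
 */
static struct sysentvec foo_sysvec;	/* filled in elsewhere (assumed) */

static int
exec_foo_imgact(struct image_params *imgp)
{
	int error;

	/* copy in arguments and environment from the old process */
	error = exec_extract_strings(imgp);
	if (error)
		return (error);

	/*
	 * Old form:
	 *	exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS,
	 *	    USRSTACK);
	 * New form: the address-space layout comes from the ABI's
	 * sysentvec, so the hard-wired constants drop out of each
	 * activator and live in one place per ABI.
	 */
	exec_new_vmspace(imgp, &foo_sysvec);
	imgp->proc->p_sysent = &foo_sysvec;

	return (0);
}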
Index: head/sys/compat/linprocfs/linprocfs.c
===================================================================
--- head/sys/compat/linprocfs/linprocfs.c (revision 103766)
+++ head/sys/compat/linprocfs/linprocfs.c (revision 103767)
@@ -1,856 +1,857 @@
/*
* Copyright (c) 2000 Dag-Erling Coïdan Smørgrav
* Copyright (c) 1999 Pierre Beyssac
* Copyright (c) 1993 Jan-Simon Pendry
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_status.c 8.4 (Berkeley) 6/15/94
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/blist.h>
#include <sys/conf.h>
#include <sys/dkstat.h>
#include <sys/exec.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <net/if.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/swap_pager.h>
#include <machine/clock.h>
#ifdef __alpha__
#include <machine/alpha_cpu.h>
#include <machine/cpuconf.h>
#include <machine/rpb.h>
extern int ncpus;
#endif /* __alpha__ */
#ifdef __i386__
#include <machine/cputypes.h>
#include <machine/md_var.h>
#endif /* __i386__ */
#include <machine/../linux/linux.h>
#include <compat/linux/linux_ioctl.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_util.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
/*
* Various conversion macros
*/
#define T2J(x) (((x) * 100UL) / (stathz ? stathz : hz)) /* ticks to jiffies */
#define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */
#define B2K(x) ((x) >> 10) /* bytes to kbytes */
#define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */
#define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */
#define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */
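The macros above rescale kernel counters into the units Linux's procfs reports: T2J and T2S convert statclock ticks to nominal 100 Hz jiffies and to seconds, and the B2*/P2* family shifts between bytes, pages and kilobytes. A small userland illustration of the shift arithmetic, assuming 4 kB pages (PAGE_SHIFT == 12, an assumption made only for this example):

#include <stdio.h>

#define PAGE_SHIFT 12				/* assumed: 4 kB pages */
#define B2K(x) ((x) >> 10)			/* bytes to kbytes */
#define P2K(x) ((x) << (PAGE_SHIFT - 10))	/* pages to kbytes */

int
main(void)
{
	unsigned long bytes = 8UL * 1024 * 1024;	/* 8 MB */
	unsigned long pages = 3;

	printf("%lu kB\n", B2K(bytes));	/* prints 8192: 8 MB in kbytes */
	printf("%lu kB\n", P2K(pages));	/* prints 12: 3 pages of 4 kB */
	return (0);
}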
/*
* Filler function for proc/meminfo
*/
static int
linprocfs_domeminfo(PFS_FILL_ARGS)
{
unsigned long memtotal; /* total memory in bytes */
unsigned long memused; /* used memory in bytes */
unsigned long memfree; /* free memory in bytes */
unsigned long memshared; /* shared memory ??? */
unsigned long buffers, cached; /* buffer / cache memory ??? */
u_quad_t swaptotal; /* total swap space in bytes */
u_quad_t swapused; /* used swap space in bytes */
u_quad_t swapfree; /* free swap space in bytes */
vm_object_t object;
memtotal = physmem * PAGE_SIZE;
/*
* The correct thing here would be:
*
memfree = cnt.v_free_count * PAGE_SIZE;
memused = memtotal - memfree;
*
* but it might mislead linux binaries into thinking there
* is very little memory left, so we cheat and tell them that
* all memory that isn't wired down is free.
*/
memused = cnt.v_wire_count * PAGE_SIZE;
memfree = memtotal - memused;
if (swapblist == NULL) {
swaptotal = 0;
swapfree = 0;
} else {
swaptotal = (u_quad_t)swapblist->bl_blocks * 1024; /* XXX why 1024? */
swapfree = (u_quad_t)swapblist->bl_root->u.bmu_avail * PAGE_SIZE;
}
swapused = swaptotal - swapfree;
memshared = 0;
TAILQ_FOREACH(object, &vm_object_list, object_list)
if (object->shadow_count > 1)
memshared += object->resident_page_count;
memshared *= PAGE_SIZE;
/*
* We'd love to be able to write:
*
buffers = bufspace;
*
* but bufspace is internal to vfs_bio.c and we don't feel
* like unstaticizing it just for linprocfs's sake.
*/
buffers = 0;
cached = cnt.v_cache_count * PAGE_SIZE;
sbuf_printf(sb,
" total: used: free: shared: buffers: cached:\n"
"Mem: %lu %lu %lu %lu %lu %lu\n"
"Swap: %llu %llu %llu\n"
"MemTotal: %9lu kB\n"
"MemFree: %9lu kB\n"
"MemShared:%9lu kB\n"
"Buffers: %9lu kB\n"
"Cached: %9lu kB\n"
"SwapTotal:%9llu kB\n"
"SwapFree: %9llu kB\n",
memtotal, memused, memfree, memshared, buffers, cached,
swaptotal, swapused, swapfree,
B2K(memtotal), B2K(memfree),
B2K(memshared), B2K(buffers), B2K(cached),
B2K(swaptotal), B2K(swapfree));
return (0);
}
#ifdef __alpha__
/*
* Filler function for proc/cpuinfo (Alpha version)
*/
static int
linprocfs_docpuinfo(PFS_FILL_ARGS)
{
u_int64_t type, major;
struct pcs *pcsp;
const char *model, *sysname;
static const char *cpuname[] = {
"EV3", "EV4", "Simulate", "LCA4", "EV5", "EV45", "EV56",
"EV6", "PCA56", "PCA57", "EV67", "EV68CB", "EV68AL"
};
pcsp = LOCATE_PCS(hwrpb, hwrpb->rpb_primary_cpu_id);
type = pcsp->pcs_proc_type;
major = (type & PCS_PROC_MAJOR) >> PCS_PROC_MAJORSHIFT;
if (major < sizeof(cpuname)/sizeof(char *)) {
model = cpuname[major - 1];
} else {
model = "unknown";
}
sysname = alpha_dsr_sysname();
sbuf_printf(sb,
"cpu\t\t\t: Alpha\n"
"cpu model\t\t: %s\n"
"cpu variation\t\t: %ld\n"
"cpu revision\t\t: %ld\n"
"cpu serial number\t: %s\n"
"system type\t\t: %s\n"
"system variation\t: %s\n"
"system revision\t\t: %ld\n"
"system serial number\t: %s\n"
"cycle frequency [Hz]\t: %lu\n"
"timer frequency [Hz]\t: %lu\n"
"page size [bytes]\t: %ld\n"
"phys. address bits\t: %ld\n"
"max. addr. space #\t: %ld\n"
"BogoMIPS\t\t: %lu.%02lu\n"
"kernel unaligned acc\t: %ld (pc=%lx,va=%lx)\n"
"user unaligned acc\t: %ld (pc=%lx,va=%lx)\n"
"platform string\t\t: %s\n"
"cpus detected\t\t: %d\n"
,
model,
pcsp->pcs_proc_var,
*(int *)hwrpb->rpb_revision,
" ",
" ",
"0",
0,
" ",
hwrpb->rpb_cc_freq,
hz,
hwrpb->rpb_page_size,
hwrpb->rpb_phys_addr_size,
hwrpb->rpb_max_asn,
0, 0,
0, 0, 0,
0, 0, 0,
sysname,
ncpus);
return (0);
}
#endif /* __alpha__ */
#ifdef __i386__
/*
* Filler function for proc/cpuinfo (i386 version)
*/
static int
linprocfs_docpuinfo(PFS_FILL_ARGS)
{
int class, i, fqmhz, fqkhz;
/*
* We default the flags to include all non-conflicting flags,
* and the Intel versions of conflicting flags.
*/
static char *flags[] = {
"fpu", "vme", "de", "pse", "tsc",
"msr", "pae", "mce", "cx8", "apic",
"sep", "sep", "mtrr", "pge", "mca",
"cmov", "pat", "pse36", "pn", "b19",
"b20", "b21", "mmxext", "mmx", "fxsr",
"xmm", "b26", "b27", "b28", "b29",
"3dnowext", "3dnow"
};
switch (cpu_class) {
case CPUCLASS_286:
class = 2;
break;
case CPUCLASS_386:
class = 3;
break;
case CPUCLASS_486:
class = 4;
break;
case CPUCLASS_586:
class = 5;
break;
case CPUCLASS_686:
class = 6;
break;
default:
class = 0;
break;
}
sbuf_printf(sb,
"processor\t: %d\n"
"vendor_id\t: %.20s\n"
"cpu family\t: %d\n"
"model\t\t: %d\n"
"stepping\t: %d\n",
0, cpu_vendor, class, cpu, cpu_id & 0xf);
sbuf_cat(sb,
"flags\t\t:");
if (!strcmp(cpu_vendor, "AuthenticAMD") && (class < 6)) {
flags[16] = "fcmov";
} else if (!strcmp(cpu_vendor, "CyrixInstead")) {
flags[24] = "cxmmx";
}
for (i = 0; i < 32; i++)
if (cpu_feature & (1 << i))
sbuf_printf(sb, " %s", flags[i]);
sbuf_cat(sb, "\n");
if (class >= 5) {
fqmhz = (tsc_freq + 4999) / 1000000;
fqkhz = ((tsc_freq + 4999) / 10000) % 100;
sbuf_printf(sb,
"cpu MHz\t\t: %d.%02d\n"
"bogomips\t: %d.%02d\n",
fqmhz, fqkhz, fqmhz, fqkhz);
}
return (0);
}
#endif /* __i386__ */
/*
* Filler function for proc/mtab
*
* This file doesn't exist in Linux' procfs, but is included here so
* users can symlink /compat/linux/etc/mtab to /proc/mtab
*/
static int
linprocfs_domtab(PFS_FILL_ARGS)
{
struct nameidata nd;
struct mount *mp;
const char *lep;
char *dlep, *flep, *mntto, *mntfrom, *fstype;
size_t lep_len;
int error;
/* resolve symlinks etc. in the emulation tree prefix */
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, linux_emul_path, td);
flep = NULL;
if (namei(&nd) != 0 || vn_fullpath(td, nd.ni_vp, &dlep, &flep) == -1)
lep = linux_emul_path;
else
lep = dlep;
lep_len = strlen(lep);
mtx_lock(&mountlist_mtx);
error = 0;
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
error = VFS_STATFS(mp, &mp->mnt_stat, td);
if (error)
break;
/* determine device name */
mntfrom = mp->mnt_stat.f_mntfromname;
/* determine mount point */
mntto = mp->mnt_stat.f_mntonname;
if (strncmp(mntto, lep, lep_len) == 0 &&
mntto[lep_len] == '/')
mntto += lep_len;
/* determine fs type */
fstype = mp->mnt_stat.f_fstypename;
if (strcmp(fstype, pn->pn_info->pi_name) == 0)
mntfrom = fstype = "proc";
else if (strcmp(fstype, "procfs") == 0)
continue;
sbuf_printf(sb, "%s %s %s %s", mntfrom, mntto, fstype,
mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw");
#define ADD_OPTION(opt, name) \
if (mp->mnt_stat.f_flags & (opt)) sbuf_printf(sb, "," name);
ADD_OPTION(MNT_SYNCHRONOUS, "sync");
ADD_OPTION(MNT_NOEXEC, "noexec");
ADD_OPTION(MNT_NOSUID, "nosuid");
ADD_OPTION(MNT_NODEV, "nodev");
ADD_OPTION(MNT_UNION, "union");
ADD_OPTION(MNT_ASYNC, "async");
ADD_OPTION(MNT_SUIDDIR, "suiddir");
ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow");
ADD_OPTION(MNT_NOATIME, "noatime");
#undef ADD_OPTION
/* a real Linux mtab will also show NFS options */
sbuf_printf(sb, " 0 0\n");
}
mtx_unlock(&mountlist_mtx);
if (flep != NULL)
free(flep, M_TEMP);
return (error);
}
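For reference, each line this filler emits has the form "<device> <mountpoint> <fstype> <options> 0 0", with the emulation prefix stripped from the mount point and the linprocfs mount itself reported as "proc". With an illustrative (not real) UFS root and a linprocfs mount under the emulation tree, the output would look like:

/dev/ad0s1a / ufs rw 0 0
proc /proc proc rw 0 0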
/*
* Filler function for proc/stat
*/
static int
linprocfs_dostat(PFS_FILL_ARGS)
{
sbuf_printf(sb,
"cpu %ld %ld %ld %ld\n"
"disk 0 0 0 0\n"
"page %u %u\n"
"swap %u %u\n"
"intr %u\n"
"ctxt %u\n"
"btime %lld\n",
T2J(cp_time[CP_USER]),
T2J(cp_time[CP_NICE]),
T2J(cp_time[CP_SYS] /*+ cp_time[CP_INTR]*/),
T2J(cp_time[CP_IDLE]),
cnt.v_vnodepgsin,
cnt.v_vnodepgsout,
cnt.v_swappgsin,
cnt.v_swappgsout,
cnt.v_intr,
cnt.v_swtch,
(quad_t)boottime.tv_sec);
return (0);
}
/*
* Filler function for proc/uptime
*/
static int
linprocfs_douptime(PFS_FILL_ARGS)
{
struct timeval tv;
getmicrouptime(&tv);
sbuf_printf(sb, "%lld.%02ld %ld.%02ld\n",
(quad_t)tv.tv_sec, tv.tv_usec / 10000,
T2S(cp_time[CP_IDLE]), T2J(cp_time[CP_IDLE]) % 100);
return (0);
}
/*
* Filler function for proc/version
*/
static int
linprocfs_doversion(PFS_FILL_ARGS)
{
char osname[LINUX_MAX_UTSNAME];
char osrelease[LINUX_MAX_UTSNAME];
linux_get_osname(td->td_proc, osname);
linux_get_osrelease(td->td_proc, osrelease);
sbuf_printf(sb,
"%s version %s (des@freebsd.org) (gcc version " __VERSION__ ")"
" #4 Sun Dec 18 04:30:00 CET 1977\n", osname, osrelease);
return (0);
}
/*
* Filler function for proc/loadavg
*/
static int
linprocfs_doloadavg(PFS_FILL_ARGS)
{
sbuf_printf(sb,
"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
(int)(averunnable.ldavg[0] / averunnable.fscale),
(int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100),
(int)(averunnable.ldavg[1] / averunnable.fscale),
(int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100),
(int)(averunnable.ldavg[2] / averunnable.fscale),
(int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100),
1, /* number of running tasks */
nprocs, /* number of tasks */
lastpid /* the last pid */
);
return (0);
}
/*
* Filler function for proc/pid/stat
*/
static int
linprocfs_doprocstat(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
sbuf_printf(sb, "%d", p->p_pid);
#define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg)
PS_ADD("comm", "(%s)", p->p_comm);
PS_ADD("statr", "%c", '0'); /* XXX */
PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0);
PS_ADD("pgrp", "%d", p->p_pgid);
PS_ADD("session", "%d", p->p_session->s_sid);
PROC_UNLOCK(p);
PS_ADD("tty", "%d", 0); /* XXX */
PS_ADD("tpgid", "%d", 0); /* XXX */
PS_ADD("flags", "%u", 0); /* XXX */
PS_ADD("minflt", "%u", 0); /* XXX */
PS_ADD("cminflt", "%u", 0); /* XXX */
PS_ADD("majflt", "%u", 0); /* XXX */
PS_ADD("cminflt", "%u", 0); /* XXX */
PS_ADD("utime", "%d", 0); /* XXX */
PS_ADD("stime", "%d", 0); /* XXX */
PS_ADD("cutime", "%d", 0); /* XXX */
PS_ADD("cstime", "%d", 0); /* XXX */
PS_ADD("counter", "%d", 0); /* XXX */
PS_ADD("priority", "%d", 0); /* XXX */
PS_ADD("timeout", "%u", 0); /* XXX */
PS_ADD("itrealvalue", "%u", 0); /* XXX */
PS_ADD("starttime", "%d", 0); /* XXX */
PS_ADD("vsize", "%u", kp.ki_size);
PS_ADD("rss", "%u", P2K(kp.ki_rssize));
PS_ADD("rlim", "%u", 0); /* XXX */
PS_ADD("startcode", "%u", (unsigned)0);
PS_ADD("endcode", "%u", 0); /* XXX */
PS_ADD("startstack", "%u", 0); /* XXX */
PS_ADD("esp", "%u", 0); /* XXX */
PS_ADD("eip", "%u", 0); /* XXX */
PS_ADD("signal", "%d", 0); /* XXX */
PS_ADD("blocked", "%d", 0); /* XXX */
PS_ADD("sigignore", "%d", 0); /* XXX */
PS_ADD("sigcatch", "%d", 0); /* XXX */
PS_ADD("wchan", "%u", 0); /* XXX */
PS_ADD("nswap", "%lu", (long unsigned)0); /* XXX */
PS_ADD("cnswap", "%lu", (long unsigned)0); /* XXX */
PS_ADD("exitsignal", "%d", 0); /* XXX */
PS_ADD("processor", "%d", 0); /* XXX */
#undef PS_ADD
sbuf_putc(sb, '\n');
return (0);
}
/*
* Filler function for proc/pid/status
*/
static int
linprocfs_doprocstatus(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
char *state;
segsz_t lsize;
struct thread *td2;
int i;
mtx_lock_spin(&sched_lock);
td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */
if (P_SHOULDSTOP(p)) {
state = "T (stopped)";
} else {
switch(p->p_state) {
case PRS_NEW:
state = "I (idle)";
break;
case PRS_NORMAL:
if (p->p_flag & P_WEXIT) {
state = "X (exiting)";
break;
}
switch(td2->td_state) {
case TDS_INHIBITED:
state = "S (sleeping)";
break;
case TDS_RUNQ:
case TDS_RUNNING:
state = "R (running)";
break;
default:
state = "? (unknown)";
break;
}
break;
case PRS_WAIT:
state = "W (waiting)";
break;
case PRS_ZOMBIE:
state = "Z (zombie)";
break;
default:
state = "? (unknown)";
break;
}
}
mtx_unlock_spin(&sched_lock);
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */
sbuf_printf(sb, "State:\t%s\n", state);
/*
* Credentials
*/
sbuf_printf(sb, "Pid:\t%d\n", p->p_pid);
sbuf_printf(sb, "PPid:\t%d\n", p->p_pptr ?
p->p_pptr->p_pid : 0);
sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid,
p->p_ucred->cr_uid,
p->p_ucred->cr_svuid,
/* FreeBSD doesn't have fsuid */
p->p_ucred->cr_uid);
sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid,
p->p_ucred->cr_gid,
p->p_ucred->cr_svgid,
/* FreeBSD doesn't have fsgid */
p->p_ucred->cr_gid);
sbuf_cat(sb, "Groups:\t");
for (i = 0; i < p->p_ucred->cr_ngroups; i++)
sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]);
PROC_UNLOCK(p);
sbuf_putc(sb, '\n');
/*
* Memory
*
* While our approximation of VmLib may not be accurate (I
* don't know of a simple way to verify it, and I'm not sure
* it has much meaning anyway), I believe it's good enough.
*
* The same code that could (I think) accurately compute VmLib
* could also compute VmLck, but I don't really care enough to
* implement it. Submissions are welcome.
*/
sbuf_printf(sb, "VmSize:\t%8u kB\n", B2K(kp.ki_size));
sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */
sbuf_printf(sb, "VmRss:\t%8u kB\n", P2K(kp.ki_rssize));
sbuf_printf(sb, "VmData:\t%8u kB\n", P2K(kp.ki_dsize));
sbuf_printf(sb, "VmStk:\t%8u kB\n", P2K(kp.ki_ssize));
sbuf_printf(sb, "VmExe:\t%8u kB\n", P2K(kp.ki_tsize));
lsize = B2P(kp.ki_size) - kp.ki_dsize -
kp.ki_ssize - kp.ki_tsize - 1;
sbuf_printf(sb, "VmLib:\t%8u kB\n", P2K(lsize));
/*
* Signal masks
*
* We support up to 128 signals, while Linux supports 32,
* but we only define 32 (the same 32 as Linux, to boot), so
* just show the lower 32 bits of each mask. XXX hack.
*
* NB: on certain platforms (Sparc at least) Linux actually
* supports 64 signals, but this code is a long way from
* running on anything but i386, so ignore that for now.
*/
PROC_LOCK(p);
sbuf_printf(sb, "SigPnd:\t%08x\n", p->p_siglist.__bits[0]);
/*
* I can't seem to find out where the signal mask is in
* relation to struct proc, so SigBlk is left unimplemented.
*/
sbuf_printf(sb, "SigBlk:\t%08x\n", 0); /* XXX */
sbuf_printf(sb, "SigIgn:\t%08x\n", p->p_sigignore.__bits[0]);
sbuf_printf(sb, "SigCgt:\t%08x\n", p->p_sigcatch.__bits[0]);
PROC_UNLOCK(p);
/*
* Linux also prints the capability masks, but we don't have
* capabilities yet, and when we do get them they're likely to
* be meaningless to Linux programs, so we lie. XXX
*/
sbuf_printf(sb, "CapInh:\t%016x\n", 0);
sbuf_printf(sb, "CapPrm:\t%016x\n", 0);
sbuf_printf(sb, "CapEff:\t%016x\n", 0);
return (0);
}
/*
* Filler function for proc/pid/cmdline
*/
static int
linprocfs_doproccmdline(PFS_FILL_ARGS)
{
struct ps_strings pstr;
int error, i;
/*
* If we are using the ps/cmdline caching, use that. Otherwise
* revert back to the old way which only implements full cmdline
* for the current process and just p->p_comm for all other
* processes.
* Note that if the argv is no longer available, we deliberately
* don't fall back on p->p_comm or return an error: the authentic
* Linux behaviour is to return zero-length in this case.
*/
PROC_LOCK(p);
if (p->p_args && (ps_argsopen || !p_cansee(td, p))) {
sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length);
PROC_UNLOCK(p);
} else if (p != td->td_proc) {
PROC_UNLOCK(p);
sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm);
} else {
PROC_UNLOCK(p);
- error = copyin((void*)PS_STRINGS, &pstr, sizeof(pstr));
+ error = copyin((void *)p->p_sysent->sv_psstrings, &pstr,
+ sizeof(pstr));
if (error)
return (error);
for (i = 0; i < pstr.ps_nargvstr; i++) {
sbuf_copyin(sb, pstr.ps_argvstr[i], 0);
sbuf_printf(sb, "%c", '\0');
}
}
return (0);
}
/*
* Filler function for proc/net/dev
*/
static int
linprocfs_donetdev(PFS_FILL_ARGS)
{
char ifname[16]; /* XXX LINUX_IFNAMSIZ */
struct ifnet *ifp;
sbuf_printf(sb, "%6s|%58s|%s\n%6s|%58s|%58s\n",
"Inter-", " Receive", " Transmit", " face",
"bytes packets errs drop fifo frame compressed",
"bytes packets errs drop fifo frame compressed");
TAILQ_FOREACH(ifp, &ifnet, if_link) {
linux_ifname(ifp, ifname, sizeof ifname);
sbuf_printf(sb, "%6.6s:", ifname);
sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu ",
0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
}
return (0);
}
#if 0
extern struct cdevsw *cdevsw[];
/*
* Filler function for proc/devices
*/
static int
linprocfs_dodevices(PFS_FILL_ARGS)
{
int i;
sbuf_printf(sb, "Character devices:\n");
for (i = 0; i < NUMCDEVSW; i++)
if (cdevsw[i] != NULL)
sbuf_printf(sb, "%3d %s\n", i, cdevsw[i]->d_name);
sbuf_printf(sb, "\nBlock devices:\n");
return (0);
}
#endif
/*
* Filler function for proc/cmdline
*/
static int
linprocfs_docmdline(PFS_FILL_ARGS)
{
sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname);
sbuf_printf(sb, " ro root=302\n");
return (0);
}
#if 0
/*
* Filler function for proc/modules
*/
static int
linprocfs_domodules(PFS_FILL_ARGS)
{
struct linker_file *lf;
TAILQ_FOREACH(lf, &linker_files, link) {
sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename,
(unsigned long)lf->size, lf->refs);
}
return (0);
}
#endif
/*
* Constructor
*/
static int
linprocfs_init(PFS_INIT_ARGS)
{
struct pfs_node *root;
struct pfs_node *dir;
root = pi->pi_root;
#define PFS_CREATE_FILE(name) \
pfs_create_file(root, #name, &linprocfs_do##name, NULL, NULL, PFS_RD)
PFS_CREATE_FILE(cmdline);
PFS_CREATE_FILE(cpuinfo);
#if 0
PFS_CREATE_FILE(devices);
#endif
PFS_CREATE_FILE(loadavg);
PFS_CREATE_FILE(meminfo);
#if 0
PFS_CREATE_FILE(modules);
#endif
PFS_CREATE_FILE(mtab);
PFS_CREATE_FILE(stat);
PFS_CREATE_FILE(uptime);
PFS_CREATE_FILE(version);
#undef PFS_CREATE_FILE
pfs_create_link(root, "self", &procfs_docurproc,
NULL, NULL, 0);
dir = pfs_create_dir(root, "net", NULL, NULL, 0);
pfs_create_file(dir, "dev", &linprocfs_donetdev,
NULL, NULL, PFS_RD);
dir = pfs_create_dir(root, "pid", NULL, NULL, PFS_PROCDEP);
pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline,
NULL, NULL, PFS_RD);
pfs_create_link(dir, "exe", &procfs_doprocfile,
NULL, &procfs_notsystem, 0);
pfs_create_file(dir, "mem", &procfs_doprocmem,
&procfs_attr, &procfs_candebug, PFS_RDWR|PFS_RAW);
pfs_create_file(dir, "stat", &linprocfs_doprocstat,
NULL, NULL, PFS_RD);
pfs_create_file(dir, "status", &linprocfs_doprocstatus,
NULL, NULL, PFS_RD);
return (0);
}
/*
* Destructor
*/
static int
linprocfs_uninit(PFS_INIT_ARGS)
{
/* nothing to do, pseudofs will GC */
return (0);
}
PSEUDOFS(linprocfs, 1);
MODULE_DEPEND(linprocfs, linux, 1, 1, 1);
MODULE_DEPEND(linprocfs, procfs, 1, 1, 1);
Index: head/sys/compat/pecoff/imgact_pecoff.c
===================================================================
--- head/sys/compat/pecoff/imgact_pecoff.c (revision 103766)
+++ head/sys/compat/pecoff/imgact_pecoff.c (revision 103767)
@@ -1,674 +1,674 @@
/* $NetBSD$ */
/* $FreeBSD$ */
/*
* Copyright (c) 2000 Masaru OKI
* Copyright (c) 1994, 1995, 1998 Scott Bartram
* Copyright (c) 1994 Adam Glass
* Copyright (c) 1993, 1994 Christopher G. Demetriou
*
* originally from NetBSD kern/exec_ecoff.c
*
* Copyright (c) 2000 Takanori Watanabe
* Copyright (c) 2000 KUROSAWA Takahiro
* Copyright (c) 1995-1996 Søren Schmidt
* Copyright (c) 1996 Peter Wemm
* All rights reserved.
*
* originally from FreeBSD kern/imgact_elf.c
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Masaru OKI.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <machine/reg.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <sys/user.h>
#include <sys/exec.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <machine/cpu.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <machine/md_var.h>
#include <machine/pecoff_machdep.h>
#include <compat/pecoff/imgact_pecoff.h>
#include "opt_pecoff.h"
#define PECOFF_PE_SIGNATURE "PE\0\0"
static int pecoff_fixup(register_t **, struct image_params *);
static int
pecoff_coredump(register struct thread *, register struct vnode *,
off_t);
#ifndef PECOFF_DEBUG
#define DPRINTF(a)
#else
#define DPRINTF(a) printf a
#endif
static struct sysentvec pecoff_sysvec = {
SYS_MAXSYSCALL,
sysent,
0,
0,
NULL,
0,
NULL,
NULL,
pecoff_fixup,
sendsig,
sigcode,
&szsigcode,
0,
"FreeBSD PECoff",
pecoff_coredump,
NULL,
MINSIGSTKSZ,
PAGE_SIZE,
VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS,
USRSTACK,
PS_STRINGS,
VM_PROT_ALL,
exec_copyout_strings,
exec_setregs
};
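The positional initializer above is hard to read; the entries VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK and PS_STRINGS are the per-ABI address-space layout slots that exec_new_vmspace() and the procfs hunks in this diff now consult through the sysentvec. A hedged designated-initializer sketch of just those slots; the field names other than sv_psstrings (which this diff reads elsewhere as p->p_sysent->sv_psstrings) are assumed rather than taken from the diff:

/*
 * Sketch only: sv_minuser, sv_maxuser and sv_usrstack are assumed
 * names; sv_psstrings is the field the procfs fillers read via
 * p->p_sysent->sv_psstrings.
 */
static struct sysentvec pecoff_sysvec_layout_sketch = {
	/* ... syscall table, signal glue, etc. as in the real table ... */
	.sv_minuser	= VM_MIN_ADDRESS,	/* lowest mappable user VA */
	.sv_maxuser	= VM_MAXUSER_ADDRESS,	/* top of user VM */
	.sv_usrstack	= USRSTACK,		/* initial stack top */
	.sv_psstrings	= PS_STRINGS,		/* ps_strings location */
	/* ... */
};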
static const char signature[] = PECOFF_PE_SIGNATURE;
static int
exec_pecoff_coff_prep_omagic(struct image_params *,
struct coff_filehdr *,
struct coff_aouthdr *, int peoffs);
static int
exec_pecoff_coff_prep_nmagic(struct image_params *,
struct coff_filehdr *,
struct coff_aouthdr *, int peoffs);
static int
exec_pecoff_coff_prep_zmagic(struct image_params *,
struct coff_filehdr *,
struct coff_aouthdr *, int peoffs);
static int
exec_pecoff_coff_makecmds(struct image_params *,
struct coff_filehdr *, int);
static int pecoff_signature(struct thread *, struct vnode *, const struct pecoff_dos_filehdr *);
static int pecoff_read_from(struct thread *, struct vnode *, int, caddr_t, int);
static int
pecoff_load_section(struct thread * td,
struct vmspace * vmspace, struct vnode * vp,
vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
vm_prot_t prot);
static int
pecoff_fixup(register_t ** stack_base, struct image_params * imgp)
{
int len = sizeof(struct pecoff_args);
struct pecoff_imghdr *ap;
register_t *pos;
pos = *stack_base + (imgp->argc + imgp->envc + 2);
ap = (struct pecoff_imghdr *) imgp->auxargs;
if (copyout(ap, pos, len)) {
return 0;
}
free(ap, M_TEMP);
imgp->auxargs = NULL;
(*stack_base)--;
suword(*stack_base, (long) imgp->argc);
return 0;
}
static int
pecoff_coredump(register struct thread * td, register struct vnode * vp,
off_t limit)
{
register struct ucred *cred = td->td_ucred;
struct proc *p = td->td_proc;
register struct vmspace *vm = p->p_vmspace;
char *tempuser;
int error;
#ifdef PECOFF_DEBUG
struct vm_map *map;
struct vm_map_entry *ent;
struct reg regs;
#endif
if (ctob((uarea_pages + kstack_pages) + vm->vm_dsize + vm->vm_ssize) >=
limit)
return (EFAULT);
tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
M_WAITOK | M_ZERO);
if (tempuser == NULL)
return (ENOMEM);
PROC_LOCK(p);
fill_kinfo_proc(p, &p->p_uarea->u_kproc);
PROC_UNLOCK(p);
bcopy(p->p_uarea, tempuser, sizeof(struct user));
bcopy(td->td_frame,
tempuser + ctob(uarea_pages) +
((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
sizeof(struct trapframe));
#if PECOFF_DEBUG
fill_regs(td, &regs);
printf("EIP%x\n", regs.r_eip);
printf("EAX%x EBX%x ECX%x EDI%x\n",
regs.r_eax, regs.r_ebx, regs.r_ecx, regs.r_edi);
map = &vm->vm_map;
ent = &map->header;
printf("%p %p %p\n", ent, ent->prev, ent->next);
#endif
error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
ctob(uarea_pages + kstack_pages),
(off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
(int *)NULL, td);
free(tempuser, M_TEMP);
if (error == 0)
error = vn_rdwr_inchunks(UIO_WRITE, vp, vm->vm_daddr,
(int)ctob(vm->vm_dsize),
(off_t)ctob((uarea_pages + kstack_pages)),
UIO_USERSPACE, IO_UNIT, cred, NOCRED, (int *)NULL, td);
if (error == 0)
error = vn_rdwr_inchunks(UIO_WRITE, vp,
(caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
round_page(ctob(vm->vm_ssize)),
(off_t)ctob((uarea_pages + kstack_pages)) +
ctob(vm->vm_dsize),
UIO_USERSPACE, IO_UNIT, cred, NOCRED, (int *)NULL, td);
return (error);
}
static int
pecoff_load_section(struct thread * td, struct vmspace * vmspace, struct vnode * vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
{
size_t map_len;
vm_offset_t map_addr;
int error, rv;
size_t copy_len;
size_t copy_map_len;
size_t copy_start;
vm_object_t object;
vm_offset_t copy_map_offset;
vm_offset_t file_addr;
vm_offset_t data_buf = 0;
object = vp->v_object;
error = 0;
map_addr = trunc_page((vm_offset_t) vmaddr);
file_addr = trunc_page(offset);
DPRINTF(("SECARG:%x %p %x %x\n", offset, vmaddr, memsz, filsz));
if (file_addr != offset) {
/*
* The section is not on page boundary. We can't use
* vm_map_insert(). Use copyin instead.
*/
map_len = round_page(memsz);
copy_len = filsz;
copy_map_offset = file_addr;
copy_map_len = round_page(offset + filsz) - file_addr;
copy_start = offset - file_addr;
DPRINTF(("offset=%x vmaddr=%lx filsz=%x memsz=%x\n",
offset, (long)vmaddr, filsz, memsz));
DPRINTF(("map_len=%x copy_len=%x copy_map_offset=%x"
" copy_map_len=%x copy_start=%x\n",
map_len, copy_len, copy_map_offset,
copy_map_len, copy_start));
} else {
map_len = trunc_page(filsz);
if (map_len != 0) {
vm_object_reference(object);
vm_map_lock(&vmspace->vm_map);
rv = vm_map_insert(&vmspace->vm_map,
object,
file_addr, /* file offset */
map_addr, /* virtual start */
map_addr + map_len, /* virtual end */
prot,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
vm_map_unlock(&vmspace->vm_map);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return EINVAL;
}
/* we can stop now if we've covered it all */
if (memsz == filsz)
return 0;
}
copy_map_offset = trunc_page(offset + filsz);
copy_map_len = PAGE_SIZE;
copy_start = 0;
copy_len = (offset + filsz) - trunc_page(offset + filsz);
map_addr = trunc_page((vm_offset_t) vmaddr + filsz);
map_len = round_page((vm_offset_t) vmaddr + memsz) - map_addr;
}
if (map_len != 0) {
vm_map_lock(&vmspace->vm_map);
rv = vm_map_insert(&vmspace->vm_map, NULL, 0,
map_addr, map_addr + map_len,
VM_PROT_ALL, VM_PROT_ALL, 0);
vm_map_unlock(&vmspace->vm_map);
DPRINTF(("EMP-rv:%d,%x %x\n", rv, map_addr, map_addr + map_len));
if (rv != KERN_SUCCESS) {
return EINVAL;
}
}
DPRINTF(("COPYARG %x %x\n", map_addr, copy_len));
if (copy_len != 0) {
vm_object_reference(object);
rv = vm_map_find(exec_map,
object,
copy_map_offset,
&data_buf,
copy_map_len,
TRUE,
VM_PROT_READ,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return EINVAL;
}
/* send the page fragment to user space */
error = copyout((caddr_t) data_buf + copy_start,
(caddr_t) map_addr, copy_len);
vm_map_remove(exec_map, data_buf, data_buf + copy_map_len);
DPRINTF(("%d\n", error));
if (error)
return (error);
}
/*
* set it to the specified protection
*/
vm_map_protect(&vmspace->vm_map, map_addr,
map_addr + map_len, prot,
FALSE);
return error;
}
static int
pecoff_load_file(struct thread * td, const char *file, u_long * addr, u_long * entry, u_long * ldexport)
{
struct nameidata nd;
struct pecoff_dos_filehdr dh;
struct coff_filehdr *fp = 0;
struct coff_aouthdr *ap;
struct pecoff_opthdr *wp;
struct coff_scnhdr *sh = 0;
struct vmspace *vmspace = td->td_proc->p_vmspace;
struct vattr attr;
struct image_params image_params, *imgp;
int peofs;
int error, i, scnsiz;
imgp = &image_params;
/*
* Initialize part of the common data
*/
imgp->proc = td->td_proc;
imgp->uap = NULL;
imgp->attr = &attr;
imgp->firstpage = NULL;
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, td);
if ((error = namei(&nd)) != 0) {
nd.ni_vp = NULL;
goto fail;
}
NDFREE(&nd, NDF_ONLY_PNBUF);
imgp->vp = nd.ni_vp;
/*
* Check permissions, modes, uid, etc on the file, and "open" it.
*/
error = exec_check_permissions(imgp);
if (error) {
VOP_UNLOCK(nd.ni_vp, 0, td);
goto fail;
}
VOP_UNLOCK(nd.ni_vp, 0, td);
if (error)
goto fail;
if ((error = pecoff_read_from(td, imgp->vp, 0, (caddr_t) & dh, sizeof(dh))) != 0)
goto fail;
if ((error = pecoff_signature(td, imgp->vp, &dh) != 0))
goto fail;
fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK);
peofs = dh.d_peofs + sizeof(signature) - 1;
if ((error = pecoff_read_from(td, imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE) != 0))
goto fail;
if (COFF_BADMAG(fp)) {
error = ENOEXEC;
goto fail;
}
ap = (void *) ((char *) fp + sizeof(struct coff_filehdr));
wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr));
/* read section header */
scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns;
sh = malloc(scnsiz, M_TEMP, M_WAITOK);
if ((error = pecoff_read_from(td, imgp->vp, peofs + PECOFF_HDR_SIZE,
(caddr_t) sh, scnsiz)) != 0)
goto fail;
/*
* Read section information and map sections.
*/
for (i = 0; i < fp->f_nscns; i++) {
int prot = 0;
if (sh[i].s_flags & COFF_STYP_DISCARD)
continue;
/* XXX ? */
if ((sh[i].s_flags & COFF_STYP_TEXT) &&
(sh[i].s_flags & COFF_STYP_EXEC) == 0)
continue;
if ((sh[i].s_flags & (COFF_STYP_TEXT | COFF_STYP_DATA | COFF_STYP_BSS)) == 0)
continue;
prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0;
prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0;
prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0;
sh[i].s_vaddr += wp->w_base; /* RVA --> VA */
if ((error = pecoff_load_section(td, vmspace, imgp->vp, sh[i].s_scnptr
,(caddr_t) sh[i].s_vaddr,
sh[i].s_paddr, sh[i].s_size
,prot)) != 0)
goto fail;
}
*entry = wp->w_base + ap->a_entry;
*addr = wp->w_base;
*ldexport = wp->w_imghdr[0].i_vaddr + wp->w_base;
fail:
if (fp)
free(fp, M_TEMP);
if (sh)
free(sh, M_TEMP);
if (nd.ni_vp)
vrele(nd.ni_vp);
return error;
}
static int
exec_pecoff_coff_prep_omagic(struct image_params * imgp,
struct coff_filehdr * fp,
struct coff_aouthdr * ap, int peofs)
{
return ENOEXEC;
}
static int
exec_pecoff_coff_prep_nmagic(struct image_params * imgp,
struct coff_filehdr * fp,
struct coff_aouthdr * ap, int peofs)
{
return ENOEXEC;
}
static int
exec_pecoff_coff_prep_zmagic(struct image_params * imgp,
struct coff_filehdr * fp,
struct coff_aouthdr * ap, int peofs)
{
int scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns;
int error = ENOEXEC, i;
int prot;
u_long text_size = 0, data_size = 0, dsize;
u_long text_addr = 0, data_addr = VM_MAXUSER_ADDRESS;
u_long ldexport, ldbase;
struct pecoff_opthdr *wp;
struct coff_scnhdr *sh;
struct vmspace *vmspace;
struct pecoff_args *argp = NULL;
sh = malloc(scnsiz, M_TEMP, M_WAITOK);
wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr));
error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp,
peofs + PECOFF_HDR_SIZE, (caddr_t) sh, scnsiz);
if ((error = exec_extract_strings(imgp)) != 0)
goto fail;
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &pecoff_sysvec);
vmspace = imgp->proc->p_vmspace;
for (i = 0; i < fp->f_nscns; i++) {
prot = VM_PROT_WRITE; /* XXX for relocation? */
prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0;
prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0;
prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0;
sh[i].s_vaddr += wp->w_base;
if (sh[i].s_flags & COFF_STYP_DISCARD)
continue;
if ((sh[i].s_flags & COFF_STYP_TEXT) != 0) {
error = pecoff_load_section(
FIRST_THREAD_IN_PROC(imgp->proc),
vmspace, imgp->vp, sh[i].s_scnptr,
(caddr_t) sh[i].s_vaddr, sh[i].s_paddr,
sh[i].s_size ,prot);
DPRINTF(("ERROR%d\n", error));
if (error)
goto fail;
text_addr = trunc_page(sh[i].s_vaddr);
text_size = trunc_page(sh[i].s_size + sh[i].s_vaddr - text_addr);
}
if ((sh[i].s_flags & (COFF_STYP_DATA|COFF_STYP_BSS)) != 0) {
if (pecoff_load_section(
FIRST_THREAD_IN_PROC(imgp->proc), vmspace,
imgp->vp, sh[i].s_scnptr, (caddr_t) sh[i].s_vaddr,
sh[i].s_paddr, sh[i].s_size, prot) != 0)
goto fail;
data_addr = min(trunc_page(sh[i].s_vaddr), data_addr);
dsize = round_page(sh[i].s_vaddr + sh[i].s_paddr)
- data_addr;
data_size = max(dsize, data_size);
}
}
vmspace->vm_tsize = text_size >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t) (uintptr_t) text_addr;
vmspace->vm_dsize = data_size >> PAGE_SHIFT;
vmspace->vm_daddr = (caddr_t) (uintptr_t) data_addr;
argp = malloc(sizeof(struct pecoff_args), M_TEMP, M_WAITOK);
if (argp == NULL) {
error = ENOMEM;
goto fail;
}
argp->a_base = wp->w_base;
argp->a_entry = wp->w_base + ap->a_entry;
argp->a_end = data_addr + data_size;
argp->a_subsystem = wp->w_subvers;
error = pecoff_load_file(FIRST_THREAD_IN_PROC(imgp->proc),
"/usr/libexec/ld.so.dll", &ldbase, &imgp->entry_addr, &ldexport);
if (error)
goto fail;
argp->a_ldbase = ldbase;
argp->a_ldexport = ldexport;
memcpy(argp->a_imghdr, wp->w_imghdr, sizeof(struct pecoff_imghdr) * 16);
for (i = 0; i < 16; i++) {
argp->a_imghdr[i].i_vaddr += wp->w_base;
}
imgp->proc->p_sysent = &pecoff_sysvec;
if (error)
goto fail;
imgp->auxargs = argp;
imgp->auxarg_size = sizeof(struct pecoff_args);
imgp->interpreted = 0;
if (sh != NULL)
free(sh, M_TEMP);
return 0;
fail:
error = (error) ? error : ENOEXEC;
if (sh != NULL)
free(sh, M_TEMP);
if (argp != NULL)
free(argp, M_TEMP);
return error;
}
int
exec_pecoff_coff_makecmds(struct image_params * imgp,
struct coff_filehdr * fp, int peofs)
{
struct coff_aouthdr *ap;
int error;
if (COFF_BADMAG(fp)) {
return ENOEXEC;
}
ap = (void *) ((char *) fp + sizeof(struct coff_filehdr));
switch (ap->a_magic) {
case COFF_OMAGIC:
error = exec_pecoff_coff_prep_omagic(imgp, fp, ap, peofs);
break;
case COFF_NMAGIC:
error = exec_pecoff_coff_prep_nmagic(imgp, fp, ap, peofs);
break;
case COFF_ZMAGIC:
error = exec_pecoff_coff_prep_zmagic(imgp, fp, ap, peofs);
break;
default:
return ENOEXEC;
}
return error;
}
static int
pecoff_signature(td, vp, dp)
struct thread *td;
struct vnode *vp;
const struct pecoff_dos_filehdr *dp;
{
int error;
char buf[512];
char *pesig;
if (DOS_BADMAG(dp)) {
return ENOEXEC;
}
error = pecoff_read_from(td, vp, dp->d_peofs, buf, sizeof(buf));
if (error) {
return error;
}
pesig = buf;
if (memcmp(pesig, signature, sizeof(signature) - 1) == 0) {
return 0;
}
return EFTYPE;
}
int
pecoff_read_from(td, vp, pos, buf, siz)
struct thread *td;
struct vnode *vp;
int pos;
caddr_t buf;
int siz;
{
int error;
size_t resid;
error = vn_rdwr(UIO_READ, vp, buf, siz, pos,
UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
&resid, td);
if (error)
return error;
if (resid != 0) {
return ENOEXEC;
}
return 0;
}
static int
imgact_pecoff(struct image_params * imgp)
{
const struct pecoff_dos_filehdr *dp = (const struct pecoff_dos_filehdr *)
imgp->image_header;
struct coff_filehdr *fp;
int error, peofs;
struct thread *td = curthread;
error = pecoff_signature(FIRST_THREAD_IN_PROC(imgp->proc),
imgp->vp, dp);
if (error) {
return -1;
}
VOP_UNLOCK(imgp->vp, 0, td);
peofs = dp->d_peofs + sizeof(signature) - 1;
fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK);
error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc),
imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE);
if (error)
goto fail;
error = exec_pecoff_coff_makecmds(imgp, fp, peofs);
fail:
free(fp, M_TEMP);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return error;
}
static struct execsw pecoff_execsw = {imgact_pecoff, "FreeBSD PEcoff"};
EXEC_SET(pecoff, pecoff_execsw);
Index: head/sys/compat/svr4/imgact_svr4.c
===================================================================
--- head/sys/compat/svr4/imgact_svr4.c (revision 103766)
+++ head/sys/compat/svr4/imgact_svr4.c (revision 103767)
@@ -1,243 +1,243 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994-1996 Søren Schmidt
* All rights reserved.
*
* Based heavily on /sys/kern/imgact_aout.c which is:
* Copyright (c) 1993, David Greenman
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <compat/svr4/svr4.h>
static int exec_svr4_imgact(struct image_params *iparams);
static int
exec_svr4_imgact(imgp)
struct image_params *imgp;
{
const struct exec *a_out = (const struct exec *) imgp->image_header;
struct vmspace *vmspace;
vm_offset_t vmaddr;
unsigned long virtual_offset, file_offset;
vm_offset_t buffer;
unsigned long bss_size;
int error;
struct thread *td = curthread;
if (((a_out->a_magic >> 16) & 0xff) != 0x64)
return -1;
/*
* Set file/virtual offset based on a.out variant.
*/
switch ((int)(a_out->a_magic & 0xffff)) {
case 0413:
virtual_offset = 0;
file_offset = 1024;
break;
case 0314:
virtual_offset = 4096;
file_offset = 0;
break;
default:
return (-1);
}
bss_size = round_page(a_out->a_bss);
#ifdef DEBUG
printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n", a_out->a_text, a_out->a_data, bss_size);
#endif
/*
* Check various fields in header for validity/bounds.
*/
if (a_out->a_entry < virtual_offset ||
a_out->a_entry >= virtual_offset + a_out->a_text ||
a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
return (-1);
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
return (EFAULT);
/* For p_rlimit below. */
mtx_assert(&Giant, MA_OWNED);
/*
* text/data/bss must not exceed limits
*/
if (a_out->a_text > maxtsiz ||
a_out->a_data + bss_size > imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
return (ENOMEM);
VOP_UNLOCK(imgp->vp, 0, td);
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(imgp);
if (error)
goto fail;
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &svr4_sysvec);
vmspace = imgp->proc->p_vmspace;
/*
* Check if file_offset is page aligned.
* Currently we cannot handle misaligned file offsets,
* and so we read in the entire image (what a waste).
*/
if (file_offset & PAGE_MASK) {
#ifdef DEBUG
printf("imgact: Non page aligned binary %lu\n", file_offset);
#endif
/*
* Map text+data+bss read/write/execute
*/
vmaddr = virtual_offset;
error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
a_out->a_text + a_out->a_data + bss_size, FALSE,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto fail;
error = vm_mmap(kernel_map, &buffer,
round_page(a_out->a_text + a_out->a_data + file_offset),
VM_PROT_READ, VM_PROT_READ, 0,
(caddr_t) imgp->vp, trunc_page(file_offset));
if (error)
goto fail;
error = copyout((caddr_t)(buffer + file_offset), (caddr_t)vmaddr,
a_out->a_text + a_out->a_data);
vm_map_remove(kernel_map, buffer,
buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
if (error)
goto fail;
/*
* remove write enable on the 'text' part
*/
error = vm_map_protect(&vmspace->vm_map,
vmaddr,
vmaddr + a_out->a_text,
VM_PROT_EXECUTE|VM_PROT_READ,
TRUE);
if (error)
goto fail;
}
else {
#ifdef DEBUG
printf("imgact: Page aligned binary %lu\n", file_offset);
#endif
/*
* Map text+data read/execute
*/
vmaddr = virtual_offset;
error = vm_mmap(&vmspace->vm_map, &vmaddr,
a_out->a_text + a_out->a_data,
VM_PROT_READ | VM_PROT_EXECUTE,
VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED,
(caddr_t)imgp->vp, file_offset);
if (error)
goto fail;
#ifdef DEBUG
printf("imgact: startaddr=%08lx, length=%08lx\n", (u_long)vmaddr,
a_out->a_text + a_out->a_data);
#endif
/*
* allow read/write of data
*/
error = vm_map_protect(&vmspace->vm_map,
vmaddr + a_out->a_text,
vmaddr + a_out->a_text + a_out->a_data,
VM_PROT_ALL,
FALSE);
if (error)
goto fail;
/*
* Allocate anon demand-zeroed area for uninitialized data
*/
if (bss_size != 0) {
vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto fail;
#ifdef DEBUG
printf("imgact: bssaddr=%08lx, length=%08lx\n",
(u_long)vmaddr, bss_size);
#endif
}
}
/* Fill in process VM information */
vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT;
vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t)virtual_offset;
vmspace->vm_daddr = (caddr_t)virtual_offset + a_out->a_text;
/* Fill in image_params */
imgp->interpreted = 0;
imgp->entry_addr = a_out->a_entry;
imgp->proc->p_sysent = &svr4_sysvec;
fail:
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return (error);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
struct execsw svr4_execsw = { exec_svr4_imgact, "svr4 ELF" };
EXEC_SET(execsw_set, svr4_execsw);
Index: head/sys/fs/procfs/procfs_status.c
===================================================================
--- head/sys/fs/procfs/procfs_status.c (revision 103766)
+++ head/sys/fs/procfs/procfs_status.c (revision 103767)
@@ -1,204 +1,206 @@
/*
* Copyright (c) 1993 Jan-Simon Pendry
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_status.c 8.4 (Berkeley) 6/15/94
*
* From:
* $Id: procfs_status.c,v 3.1 1993/12/15 09:40:17 jsp Exp $
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exec.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/jail.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
+#include <sys/sysent.h>
#include <sys/tty.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_param.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
int
procfs_doprocstatus(PFS_FILL_ARGS)
{
struct session *sess;
struct thread *tdfirst;
struct tty *tp;
struct ucred *cr;
char *pc;
char *sep;
int pid, ppid, pgid, sid;
int i;
pid = p->p_pid;
PROC_LOCK(p);
ppid = p->p_pptr ? p->p_pptr->p_pid : 0;
pgid = p->p_pgrp->pg_id;
sess = p->p_pgrp->pg_session;
SESS_LOCK(sess);
sid = sess->s_leader ? sess->s_leader->p_pid : 0;
/* comm pid ppid pgid sid maj,min ctty,sldr start ut st wmsg
euid ruid rgid,egid,groups[1 .. NGROUPS]
*/
pc = p->p_comm;
do {
if (*pc < 33 || *pc > 126 || *pc == '\\')
sbuf_printf(sb, "\\%03o", *pc);
else
sbuf_putc(sb, *pc);
} while (*++pc);
sbuf_printf(sb, " %d %d %d %d ", pid, ppid, pgid, sid);
if ((p->p_flag&P_CONTROLT) && (tp = sess->s_ttyp))
sbuf_printf(sb, "%d,%d ", major(tp->t_dev), minor(tp->t_dev));
else
sbuf_printf(sb, "%d,%d ", -1, -1);
sep = "";
if (sess->s_ttyvp) {
sbuf_printf(sb, "%sctty", sep);
sep = ",";
}
if (SESS_LEADER(p)) {
sbuf_printf(sb, "%ssldr", sep);
sep = ",";
}
SESS_UNLOCK(sess);
if (*sep != ',') {
sbuf_printf(sb, "noflags");
}
mtx_lock_spin(&sched_lock);
if (p->p_sflag & PS_INMEM) {
struct timeval ut, st;
calcru(p, &ut, &st, (struct timeval *) NULL);
mtx_unlock_spin(&sched_lock);
sbuf_printf(sb, " %lld,%ld %ld,%ld %ld,%ld",
(long long)p->p_stats->p_start.tv_sec,
p->p_stats->p_start.tv_usec,
ut.tv_sec, ut.tv_usec,
st.tv_sec, st.tv_usec);
} else {
mtx_unlock_spin(&sched_lock);
sbuf_printf(sb, " -1,-1 -1,-1 -1,-1");
}
if (p->p_flag & P_KSES)
sbuf_printf(sb, " %s", "-kse- ");
else {
tdfirst = FIRST_THREAD_IN_PROC(p); /* XXX diff from td? */
sbuf_printf(sb, " %s",
(tdfirst->td_wchan && tdfirst->td_wmesg) ?
tdfirst->td_wmesg : "nochan");
}
cr = p->p_ucred;
sbuf_printf(sb, " %lu %lu %lu",
(u_long)cr->cr_uid,
(u_long)cr->cr_ruid,
(u_long)cr->cr_rgid);
/* egid (cr->cr_svgid) is equal to cr_groups[0];
see also getegid(2) in /sys/kern/kern_prot.c */
for (i = 0; i < cr->cr_ngroups; i++) {
sbuf_printf(sb, ",%lu", (u_long)cr->cr_groups[i]);
}
if (jailed(p->p_ucred)) {
mtx_lock(&p->p_ucred->cr_prison->pr_mtx);
sbuf_printf(sb, " %s", p->p_ucred->cr_prison->pr_host);
mtx_unlock(&p->p_ucred->cr_prison->pr_mtx);
} else {
sbuf_printf(sb, " -");
}
PROC_UNLOCK(p);
sbuf_printf(sb, "\n");
return (0);
}
int
procfs_doproccmdline(PFS_FILL_ARGS)
{
struct ps_strings pstr;
int error, i;
/*
* If we are using the ps/cmdline caching, use that. Otherwise
* revert to the old way, which only implements the full cmdline
* for the current process and just p->p_comm for all other
* processes.
* Note that if the argv is no longer available, we deliberately
* don't fall back on p->p_comm or return an error: the authentic
* Linux behaviour is to return zero-length in this case.
*/
PROC_LOCK(p);
if (p->p_args && (ps_argsopen || !p_cansee(td, p))) {
sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length);
PROC_UNLOCK(p);
return (0);
}
PROC_UNLOCK(p);
if (p != td->td_proc) {
sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm);
} else {
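/*
* The ps_strings address now comes from the process's sysentvec
* (sv_psstrings) rather than the hard-coded native PS_STRINGS, so
* the per-ABI location is used for emulated binaries.
*/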
- error = copyin((void*)PS_STRINGS, &pstr, sizeof(pstr));
+ error = copyin((void *)p->p_sysent->sv_psstrings, &pstr,
+ sizeof(pstr));
if (error)
return (error);
for (i = 0; i < pstr.ps_nargvstr; i++) {
sbuf_copyin(sb, pstr.ps_argvstr[i], 0);
sbuf_printf(sb, "%c", '\0');
}
}
return (0);
}
Index: head/sys/i386/ibcs2/imgact_coff.c
===================================================================
--- head/sys/i386/ibcs2/imgact_coff.c (revision 103766)
+++ head/sys/i386/ibcs2/imgact_coff.c (revision 103767)
@@ -1,493 +1,493 @@
/*-
* Copyright (c) 1994 Sean Eric Fagan
* Copyright (c) 1994 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <i386/ibcs2/coff.h>
#include <i386/ibcs2/ibcs2_util.h>
MODULE_DEPEND(coff, ibcs2, 1, 1, 1);
extern struct sysentvec ibcs2_svr3_sysvec;
static int coff_load_file(struct thread *td, char *name);
static int exec_coff_imgact(struct image_params *imgp);
static int load_coff_section(struct vmspace *vmspace, struct vnode *vp,
vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
vm_prot_t prot);
static int
load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset,
caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
{
size_t map_len;
vm_offset_t map_offset;
vm_offset_t map_addr;
int error;
unsigned char *data_buf = 0;
size_t copy_len;
map_offset = trunc_page(offset);
map_addr = trunc_page((vm_offset_t)vmaddr);
if (memsz > filsz) {
/*
* We have the stupid situation that
* the section is longer than it is on file,
* which means it has zero-filled areas, and
* we have to work for it. Stupid iBCS!
*/
map_len = trunc_page(offset + filsz) - trunc_page(map_offset);
} else {
/*
* The only stuff we care about is on disk, and we
* don't care if we map in more than is really there.
*/
map_len = round_page(offset + filsz) - trunc_page(map_offset);
}
DPRINTF(("%s(%d): vm_mmap(&vmspace->vm_map, &0x%08lx, 0x%x, 0x%x, "
"VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, vp, 0x%x)\n",
__FILE__, __LINE__, map_addr, map_len, prot, map_offset));
if ((error = vm_mmap(&vmspace->vm_map,
&map_addr,
map_len,
prot,
VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED,
(caddr_t) vp,
map_offset)) != 0)
return error;
if (memsz == filsz) {
/* We're done! */
return 0;
}
/*
* Now we have screwball stuff, to accommodate stupid COFF.
* We have to map the remaining bit of the file into the kernel's
* memory map, allocate some anonymous memory, copy that last
* bit into it, and then we're done. *sigh*
* For clean-up reasons, we actually map in the file last.
*/
copy_len = (offset + filsz) - trunc_page(offset + filsz);
map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx,0x%x, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n", __FILE__, __LINE__, map_addr, map_len));
if (map_len != 0) {
error = vm_map_find(&vmspace->vm_map, NULL, 0, &map_addr,
map_len, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
return error;
}
if ((error = vm_mmap(kernel_map,
(vm_offset_t *) &data_buf,
PAGE_SIZE,
VM_PROT_READ,
VM_PROT_READ,
0,
(caddr_t) vp,
trunc_page(offset + filsz))) != 0)
return error;
error = copyout(data_buf, (caddr_t) map_addr, copy_len);
if (vm_map_remove(kernel_map,
(vm_offset_t) data_buf,
(vm_offset_t) data_buf + PAGE_SIZE))
panic("load_coff_section vm_map_remove failed");
return error;
}
static int
coff_load_file(struct thread *td, char *name)
{
struct proc *p = td->td_proc;
struct vmspace *vmspace = p->p_vmspace;
int error;
struct nameidata nd;
struct vnode *vp;
struct vattr attr;
struct filehdr *fhdr;
struct aouthdr *ahdr;
struct scnhdr *scns;
char *ptr = 0;
int nscns;
unsigned long text_offset = 0, text_address = 0, text_size = 0;
unsigned long data_offset = 0, data_address = 0, data_size = 0;
unsigned long bss_size = 0;
int i;
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, name, td);
error = namei(&nd);
if (error)
return error;
vp = nd.ni_vp;
if (vp == NULL)
return ENOEXEC;
if (vp->v_writecount) {
error = ETXTBSY;
goto fail;
}
if ((error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) != 0)
goto fail;
if ((vp->v_mount->mnt_flag & MNT_NOEXEC)
|| ((attr.va_mode & 0111) == 0)
|| (attr.va_type != VREG))
goto fail;
if (attr.va_size == 0) {
error = ENOEXEC;
goto fail;
}
if ((error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td)) != 0)
goto fail;
if ((error = VOP_OPEN(vp, FREAD, td->td_ucred, td)) != 0)
goto fail;
/*
* Lose the lock on the vnode. It's no longer needed, and must not
* exist for the pagefault paging to work below.
*/
VOP_UNLOCK(vp, 0, td);
if ((error = vm_mmap(kernel_map,
(vm_offset_t *) &ptr,
PAGE_SIZE,
VM_PROT_READ,
VM_PROT_READ,
0,
(caddr_t) vp,
0)) != 0)
goto unlocked_fail;
fhdr = (struct filehdr *)ptr;
if (fhdr->f_magic != I386_COFF) {
error = ENOEXEC;
goto dealloc_and_fail;
}
nscns = fhdr->f_nscns;
if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) {
/*
* XXX -- just fail. I'm so lazy.
*/
error = ENOEXEC;
goto dealloc_and_fail;
}
ahdr = (struct aouthdr*)(ptr + sizeof(struct filehdr));
scns = (struct scnhdr*)(ptr + sizeof(struct filehdr)
+ sizeof(struct aouthdr));
for (i = 0; i < nscns; i++) {
if (scns[i].s_flags & STYP_NOLOAD)
continue;
else if (scns[i].s_flags & STYP_TEXT) {
text_address = scns[i].s_vaddr;
text_size = scns[i].s_size;
text_offset = scns[i].s_scnptr;
}
else if (scns[i].s_flags & STYP_DATA) {
data_address = scns[i].s_vaddr;
data_size = scns[i].s_size;
data_offset = scns[i].s_scnptr;
} else if (scns[i].s_flags & STYP_BSS) {
bss_size = scns[i].s_size;
}
}
if ((error = load_coff_section(vmspace, vp, text_offset,
(caddr_t)(void *)(uintptr_t)text_address,
text_size, text_size,
VM_PROT_READ | VM_PROT_EXECUTE)) != 0) {
goto dealloc_and_fail;
}
if ((error = load_coff_section(vmspace, vp, data_offset,
(caddr_t)(void *)(uintptr_t)data_address,
data_size + bss_size, data_size,
VM_PROT_ALL)) != 0) {
goto dealloc_and_fail;
}
error = 0;
dealloc_and_fail:
if (vm_map_remove(kernel_map,
(vm_offset_t) ptr,
(vm_offset_t) ptr + PAGE_SIZE))
panic("%s vm_map_remove failed", __func__);
fail:
VOP_UNLOCK(vp, 0, td);
unlocked_fail:
NDFREE(&nd, NDF_ONLY_PNBUF);
vrele(nd.ni_vp);
return error;
}
static int
exec_coff_imgact(imgp)
struct image_params *imgp;
{
const struct filehdr *fhdr = (const struct filehdr*)imgp->image_header;
const struct aouthdr *ahdr;
const struct scnhdr *scns;
int i;
struct vmspace *vmspace;
int nscns;
int error;
unsigned long text_offset = 0, text_address = 0, text_size = 0;
unsigned long data_offset = 0, data_address = 0, data_size = 0;
unsigned long bss_size = 0;
caddr_t hole;
struct thread *td = curthread;
if (fhdr->f_magic != I386_COFF ||
!(fhdr->f_flags & F_EXEC)) {
DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__));
return -1;
}
nscns = fhdr->f_nscns;
if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) {
/*
* For now, return an error -- need to be able to
* read in all of the section structures.
*/
DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__));
return -1;
}
ahdr = (const struct aouthdr*)
((const char*)(imgp->image_header) + sizeof(struct filehdr));
imgp->entry_addr = ahdr->entry;
scns = (const struct scnhdr*)
((const char*)(imgp->image_header) + sizeof(struct filehdr) +
sizeof(struct aouthdr));
VOP_UNLOCK(imgp->vp, 0, td);
if ((error = exec_extract_strings(imgp)) != 0) {
DPRINTF(("%s(%d): return %d\n", __FILE__, __LINE__, error));
goto fail;
}
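/*
* exec_new_vmspace() now takes the ABI's sysentvec and derives the
* minimum/maximum user addresses and the stack top from it, replacing
* the explicit VM_MIN_ADDRESS/VM_MAXUSER_ADDRESS/USRSTACK arguments
* passed previously.
*/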
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &ibcs2_svr3_sysvec);
vmspace = imgp->proc->p_vmspace;
for (i = 0; i < nscns; i++) {
DPRINTF(("i = %d, scns[i].s_name = %s, scns[i].s_vaddr = %08lx, "
"scns[i].s_scnptr = %d\n", i, scns[i].s_name,
scns[i].s_vaddr, scns[i].s_scnptr));
if (scns[i].s_flags & STYP_NOLOAD) {
/*
* A section that is not loaded, for whatever
* reason. It takes precedence over other flag
* bits...
*/
continue;
} else if (scns[i].s_flags & STYP_TEXT) {
text_address = scns[i].s_vaddr;
text_size = scns[i].s_size;
text_offset = scns[i].s_scnptr;
} else if (scns[i].s_flags & STYP_DATA) {
/* .data section */
data_address = scns[i].s_vaddr;
data_size = scns[i].s_size;
data_offset = scns[i].s_scnptr;
} else if (scns[i].s_flags & STYP_BSS) {
/* .bss section */
bss_size = scns[i].s_size;
} else if (scns[i].s_flags & STYP_LIB) {
char *buf = 0;
int foff = trunc_page(scns[i].s_scnptr);
int off = scns[i].s_scnptr - foff;
int len = round_page(scns[i].s_size + PAGE_SIZE);
int j;
if ((error = vm_mmap(kernel_map,
(vm_offset_t *) &buf,
len,
VM_PROT_READ,
VM_PROT_READ,
0,
(caddr_t) imgp->vp,
foff)) != 0) {
error = ENOEXEC;
goto fail;
}
if(scns[i].s_size) {
char *libbuf;
int emul_path_len = strlen(ibcs2_emul_path);
libbuf = malloc(MAXPATHLEN + emul_path_len,
M_TEMP, M_WAITOK);
strcpy(libbuf, ibcs2_emul_path);
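/*
* Each STYP_LIB record starts with two longs: the record length and
* the offset of the library pathname, both counted in 4-byte words
* (hence the multiplication by 4 below).
*/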
for (j = off; j < scns[i].s_size + off;) {
long stroff, nextoff;
char *libname;
nextoff = 4 * *(long *)(buf + j);
stroff = 4 * *(long *)(buf + j + sizeof(long));
libname = buf + j + stroff;
j += nextoff;
DPRINTF(("%s(%d): shared library %s\n",
__FILE__, __LINE__, libname));
strcpy(&libbuf[emul_path_len], libname);
/* XXXKSE only 1:1 in coff */
error = coff_load_file(
FIRST_THREAD_IN_PROC(imgp->proc), libbuf);
if (error)
error = coff_load_file(
FIRST_THREAD_IN_PROC(imgp->proc),
libname);
if (error)
break;
}
free(libbuf, M_TEMP);
}
if (vm_map_remove(kernel_map,
(vm_offset_t) buf,
(vm_offset_t) buf + len))
panic("exec_coff_imgact vm_map_remove failed");
if (error)
goto fail;
}
}
/*
* Map in .text now
*/
DPRINTF(("%s(%d): load_coff_section(vmspace, "
"imgp->vp, %08lx, %08lx, 0x%x, 0x%x, 0x%x)\n",
__FILE__, __LINE__, text_offset, text_address,
text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE));
if ((error = load_coff_section(vmspace, imgp->vp,
text_offset,
(caddr_t)(void *)(uintptr_t)text_address,
text_size, text_size,
VM_PROT_READ | VM_PROT_EXECUTE)) != 0) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
goto fail;
}
/*
* Map in .data and .bss now
*/
DPRINTF(("%s(%d): load_coff_section(vmspace, "
"imgp->vp, 0x%08lx, 0x%08lx, 0x%x, 0x%x, 0x%x)\n",
__FILE__, __LINE__, data_offset, data_address,
data_size + bss_size, data_size, VM_PROT_ALL));
if ((error = load_coff_section(vmspace, imgp->vp,
data_offset,
(caddr_t)(void *)(uintptr_t)data_address,
data_size + bss_size, data_size,
VM_PROT_ALL)) != 0) {
DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
goto fail;
}
imgp->interpreted = 0;
imgp->proc->p_sysent = &ibcs2_svr3_sysvec;
vmspace->vm_tsize = round_page(text_size) >> PAGE_SHIFT;
vmspace->vm_dsize = round_page(data_size + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)text_address;
vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)data_address;
hole = (caddr_t)trunc_page((vm_offset_t)vmspace->vm_daddr) + ctob(vmspace->vm_dsize);
DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n",
__FILE__, __LINE__, hole));
DPRINTF(("imgact: error = %d\n", error));
error = vm_map_find(&vmspace->vm_map, NULL, 0,
(vm_offset_t *) &hole, PAGE_SIZE, FALSE,
VM_PROT_ALL, VM_PROT_ALL, 0);
DPRINTF(("IBCS2: start vm_dsize = 0x%x, vm_daddr = 0x%x end = 0x%x\n",
ctob(vmspace->vm_dsize), vmspace->vm_daddr,
ctob(vmspace->vm_dsize) + vmspace->vm_daddr ));
DPRINTF(("%s(%d): returning successfully!\n", __FILE__, __LINE__));
fail:
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return error;
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw coff_execsw = { exec_coff_imgact, "coff" };
EXEC_SET(coff, coff_execsw);
Index: head/sys/i386/linux/imgact_linux.c
===================================================================
--- head/sys/i386/linux/imgact_linux.c (revision 103766)
+++ head/sys/i386/linux/imgact_linux.c (revision 103767)
@@ -1,246 +1,246 @@
/*-
* Copyright (c) 1994-1996 Søren Schmidt
* All rights reserved.
*
* Based heavily on /sys/kern/imgact_aout.c which is:
* Copyright (c) 1993, David Greenman
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <i386/linux/linux.h>
static int exec_linux_imgact(struct image_params *iparams);
static int
exec_linux_imgact(imgp)
struct image_params *imgp;
{
const struct exec *a_out = (const struct exec *) imgp->image_header;
struct vmspace *vmspace;
vm_offset_t vmaddr;
unsigned long virtual_offset, file_offset;
vm_offset_t buffer;
unsigned long bss_size;
struct thread *td = curthread;
int error;
if (((a_out->a_magic >> 16) & 0xff) != 0x64)
return -1;
/*
* Set file/virtual offset based on a.out variant.
*/
switch ((int)(a_out->a_magic & 0xffff)) {
case 0413:
virtual_offset = 0;
file_offset = 1024;
break;
case 0314:
virtual_offset = 4096;
file_offset = 0;
break;
default:
return (-1);
}
bss_size = round_page(a_out->a_bss);
#ifdef DEBUG
printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n",
(u_long)a_out->a_text, (u_long)a_out->a_data, bss_size);
#endif
/*
* Check various fields in header for validity/bounds.
*/
if (a_out->a_entry < virtual_offset ||
a_out->a_entry >= virtual_offset + a_out->a_text ||
a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
return (-1);
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
return (EFAULT);
/*
* text/data/bss must not exceed limits
*/
mtx_assert(&Giant, MA_OWNED);
if (a_out->a_text > maxtsiz ||
a_out->a_data + bss_size > imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
return (ENOMEM);
VOP_UNLOCK(imgp->vp, 0, td);
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(imgp);
if (error)
goto fail;
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &linux_sysvec);
vmspace = imgp->proc->p_vmspace;
/*
* Check if file_offset is page aligned.
* Currently we cannot handle misaligned file offsets,
* and so we read in the entire image (what a waste).
*/
if (file_offset & PAGE_MASK) {
#ifdef DEBUG
printf("imgact: Non page aligned binary %lu\n", file_offset);
#endif
/*
* Map text+data+bss read/write/execute
*/
vmaddr = virtual_offset;
error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
a_out->a_text + a_out->a_data + bss_size, FALSE,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto fail;
error = vm_mmap(kernel_map, &buffer,
round_page(a_out->a_text + a_out->a_data + file_offset),
VM_PROT_READ, VM_PROT_READ, 0,
(caddr_t) imgp->vp, trunc_page(file_offset));
if (error)
goto fail;
error = copyout((caddr_t)(void *)(uintptr_t)(buffer + file_offset),
(caddr_t)vmaddr, a_out->a_text + a_out->a_data);
vm_map_remove(kernel_map, buffer,
buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
if (error)
goto fail;
/*
* remove write enable on the 'text' part
*/
error = vm_map_protect(&vmspace->vm_map,
vmaddr,
vmaddr + a_out->a_text,
VM_PROT_EXECUTE|VM_PROT_READ,
TRUE);
if (error)
goto fail;
}
else {
#ifdef DEBUG
printf("imgact: Page aligned binary %lu\n", file_offset);
#endif
/*
* Map text+data read/execute
*/
vmaddr = virtual_offset;
error = vm_mmap(&vmspace->vm_map, &vmaddr,
a_out->a_text + a_out->a_data,
VM_PROT_READ | VM_PROT_EXECUTE,
VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED,
(caddr_t)imgp->vp, file_offset);
if (error)
goto fail;
#ifdef DEBUG
printf("imgact: startaddr=%08lx, length=%08lx\n",
(u_long)vmaddr, a_out->a_text + a_out->a_data);
#endif
/*
* allow read/write of data
*/
error = vm_map_protect(&vmspace->vm_map,
vmaddr + a_out->a_text,
vmaddr + a_out->a_text + a_out->a_data,
VM_PROT_ALL,
FALSE);
if (error)
goto fail;
/*
* Allocate anon demand-zeroed area for uninitialized data
*/
if (bss_size != 0) {
vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto fail;
#ifdef DEBUG
printf("imgact: bssaddr=%08lx, length=%08lx\n",
(u_long)vmaddr, bss_size);
#endif
}
/* Indicate that this file should not be modified */
mp_fixme("Unlocked v_flag access");
imgp->vp->v_vflag |= VV_TEXT;
}
/* Fill in process VM information */
vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT;
vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)virtual_offset;
vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)
(virtual_offset + a_out->a_text);
/* Fill in image_params */
imgp->interpreted = 0;
imgp->entry_addr = a_out->a_entry;
imgp->proc->p_sysent = &linux_sysvec;
fail:
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return (error);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw linux_execsw = { exec_linux_imgact, "linux a.out" };
EXEC_SET(linuxaout, linux_execsw);
Index: head/sys/kern/imgact_aout.c
===================================================================
--- head/sys/kern/imgact_aout.c (revision 103766)
+++ head/sys/kern/imgact_aout.c (revision 103767)
@@ -1,314 +1,314 @@
/*
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/user.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_param.h>
static int exec_aout_imgact(struct image_params *imgp);
static int aout_fixup(register_t **stack_base, struct image_params *imgp);
struct sysentvec aout_sysvec = {
SYS_MAXSYSCALL,
sysent,
0,
0,
NULL,
0,
NULL,
NULL,
aout_fixup,
sendsig,
sigcode,
&szsigcode,
NULL,
"FreeBSD a.out",
aout_coredump,
NULL,
MINSIGSTKSZ,
PAGE_SIZE,
VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS,
USRSTACK,
PS_STRINGS,
VM_PROT_ALL,
exec_copyout_strings,
exec_setregs
};
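/*
* The address-related members of this vector (sv_minuser, sv_maxuser,
* sv_usrstack and sv_psstrings, initialized here with VM_MIN_ADDRESS,
* VM_MAXUSER_ADDRESS, USRSTACK and PS_STRINGS) are what the reworked
* exec_new_vmspace() and the coredump code below consult instead of
* the global constants.
*/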
static int
aout_fixup(stack_base, imgp)
register_t **stack_base;
struct image_params *imgp;
{
return (suword(--(*stack_base), imgp->argc));
}
static int
exec_aout_imgact(imgp)
struct image_params *imgp;
{
const struct exec *a_out = (const struct exec *) imgp->image_header;
struct vmspace *vmspace;
struct vnode *vp;
vm_map_t map;
vm_object_t object;
vm_offset_t text_end, data_end;
unsigned long virtual_offset;
unsigned long file_offset;
unsigned long bss_size;
int error;
GIANT_REQUIRED;
/*
* Linux and *BSD binaries look very much alike,
* only the machine id is different:
* 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
* NetBSD is in network byte order... ugh.
*/
if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
((a_out->a_magic >> 16) & 0xff) != 0 &&
((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
return -1;
/*
* Set file/virtual offset based on a.out variant.
* We do two cases: host byte order and network byte order
* (for NetBSD compatibility)
*/
switch ((int)(a_out->a_magic & 0xffff)) {
case ZMAGIC:
virtual_offset = 0;
if (a_out->a_text) {
file_offset = PAGE_SIZE;
} else {
/* Bill's "screwball mode" */
file_offset = 0;
}
break;
case QMAGIC:
virtual_offset = PAGE_SIZE;
file_offset = 0;
/* Pass PS_STRINGS for BSD/OS binaries only. */
if (N_GETMID(*a_out) == MID_ZERO)
- imgp->ps_strings = PS_STRINGS;
+ imgp->ps_strings = aout_sysvec.sv_psstrings;
break;
default:
/* NetBSD compatibility */
switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
case ZMAGIC:
case QMAGIC:
virtual_offset = PAGE_SIZE;
file_offset = 0;
break;
default:
return (-1);
}
}
bss_size = roundup(a_out->a_bss, PAGE_SIZE);
/*
* Check various fields in header for validity/bounds.
*/
if (/* entry point must lie within the text region */
a_out->a_entry < virtual_offset ||
a_out->a_entry >= virtual_offset + a_out->a_text ||
/* text and data size must each be page rounded */
a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
return (-1);
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
return (EFAULT);
/*
* text/data/bss must not exceed limits
*/
mtx_assert(&Giant, MA_OWNED);
if (/* text can't exceed maximum text size */
a_out->a_text > maxtsiz ||
/* data + bss can't exceed rlimit */
a_out->a_data + bss_size >
imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
return (ENOMEM);
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(imgp);
if (error)
return (error);
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(imgp, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(imgp, &aout_sysvec);
/*
* The vm space can be changed by exec_new_vmspace
*/
vmspace = imgp->proc->p_vmspace;
vp = imgp->vp;
object = imgp->object;
map = &vmspace->vm_map;
vm_map_lock(map);
vm_object_reference(object);
text_end = virtual_offset + a_out->a_text;
error = vm_map_insert(map, object,
file_offset,
virtual_offset, text_end,
VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (error) {
vm_map_unlock(map);
return (error);
}
data_end = text_end + a_out->a_data;
if (a_out->a_data) {
vm_object_reference(object);
error = vm_map_insert(map, object,
file_offset + a_out->a_text,
text_end, data_end,
VM_PROT_ALL, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (error) {
vm_map_unlock(map);
return (error);
}
}
if (bss_size) {
error = vm_map_insert(map, NULL, 0,
data_end, data_end + bss_size,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error) {
vm_map_unlock(map);
return (error);
}
}
vm_map_unlock(map);
/* Fill in process VM information */
vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
vmspace->vm_daddr = (caddr_t) (uintptr_t)
(virtual_offset + a_out->a_text);
/* Fill in image_params */
imgp->interpreted = 0;
imgp->entry_addr = a_out->a_entry;
imgp->proc->p_sysent = &aout_sysvec;
return (0);
}
/*
* Dump core, into a file named as described in the comments for
* expand_name(), unless the process was setuid/setgid.
*/
int
aout_coredump(td, vp, limit)
register struct thread *td;
register struct vnode *vp;
off_t limit;
{
struct proc *p = td->td_proc;
register struct ucred *cred = td->td_ucred;
register struct vmspace *vm = p->p_vmspace;
char *tempuser;
int error;
if (ctob((uarea_pages + kstack_pages)
+ vm->vm_dsize + vm->vm_ssize) >= limit)
return (EFAULT);
tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP,
M_WAITOK | M_ZERO);
if (tempuser == NULL)
return (ENOMEM);
PROC_LOCK(p);
fill_kinfo_proc(p, &p->p_uarea->u_kproc);
PROC_UNLOCK(p);
bcopy(p->p_uarea, tempuser, sizeof(struct user));
bcopy(td->td_frame,
tempuser + ctob(uarea_pages) +
((caddr_t)td->td_frame - (caddr_t)td->td_kstack),
sizeof(struct trapframe));
error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser,
ctob(uarea_pages + kstack_pages),
(off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED,
(int *)NULL, td);
free(tempuser, M_TEMP);
if (error == 0)
error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
(int)ctob(vm->vm_dsize),
(off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE,
IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
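/*
* The stack is dumped starting from p->p_sysent->sv_usrstack rather
* than the compile-time USRSTACK, so the dump follows the stack
* location of the process's actual ABI.
*/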
if (error == 0)
error = vn_rdwr_inchunks(UIO_WRITE, vp,
- (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)),
- round_page(ctob(vm->vm_ssize)),
+ (caddr_t)trunc_page(p->p_sysent->sv_usrstack -
+ ctob(vm->vm_ssize)), round_page(ctob(vm->vm_ssize)),
(off_t)ctob(uarea_pages + kstack_pages) +
ctob(vm->vm_dsize), UIO_USERSPACE,
IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td);
return (error);
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
EXEC_SET(aout, aout_execsw);
Index: head/sys/kern/imgact_elf.c
===================================================================
--- head/sys/kern/imgact_elf.c (revision 103766)
+++ head/sys/kern/imgact_elf.c (revision 103767)
@@ -1,1241 +1,1240 @@
/*-
* Copyright (c) 2000 David O'Brien
* Copyright (c) 1995-1996 Søren Schmidt
* Copyright (c) 1996 Peter Wemm
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/namei.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
#include <sys/procfs.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <machine/elf.h>
#include <machine/md_var.h>
#define OLD_EI_BRAND 8
__ElfType(Brandinfo);
__ElfType(Auxargs);
static int __elfN(check_header)(const Elf_Ehdr *hdr);
static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr,
const char *interp);
static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
u_long *entry, size_t pagesize);
static int __elfN(load_section)(struct proc *p,
struct vmspace *vmspace, struct vnode *vp, vm_object_t object,
vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
vm_prot_t prot, size_t pagesize);
static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
static int elf_trace = 0;
#if __ELF_WORD_SIZE == 32
SYSCTL_INT(_debug, OID_AUTO, elf32_trace, CTLFLAG_RW, &elf_trace, 0, "");
#else
SYSCTL_INT(_debug, OID_AUTO, elf64_trace, CTLFLAG_RW, &elf_trace, 0, "");
#endif
static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
extern int fallback_elf_brand;
int
__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
{
int i;
for (i = 0; i < MAX_BRANDS; i++) {
if (elf_brand_list[i] == NULL) {
elf_brand_list[i] = entry;
break;
}
}
if (i == MAX_BRANDS)
return (-1);
return (0);
}
int
__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
{
int i;
for (i = 0; i < MAX_BRANDS; i++) {
if (elf_brand_list[i] == entry) {
elf_brand_list[i] = NULL;
break;
}
}
if (i == MAX_BRANDS)
return (-1);
return (0);
}
int
__elfN(brand_inuse)(Elf_Brandinfo *entry)
{
struct proc *p;
int rval = FALSE;
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
if (p->p_sysent == entry->sysvec) {
rval = TRUE;
break;
}
}
sx_sunlock(&allproc_lock);
return (rval);
}
static Elf_Brandinfo *
__elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp)
{
Elf_Brandinfo *bi;
int i;
/*
* We support three types of branding -- (1) the ELF EI_OSABI field
* that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
* branding w/in the ELF header, and (3) the path in the `interp_path'
* field. We should also look for an ".note.ABI-tag" ELF section now
* in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones.
*/
/* If the executable has a brand, search for it in the brand list. */
for (i = 0; i < MAX_BRANDS; i++) {
bi = elf_brand_list[i];
if (bi != NULL && hdr->e_machine == bi->machine &&
(hdr->e_ident[EI_OSABI] == bi->brand ||
strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0))
return (bi);
}
/* Lacking a known brand, search for a recognized interpreter. */
if (interp != NULL) {
for (i = 0; i < MAX_BRANDS; i++) {
bi = elf_brand_list[i];
if (bi != NULL && hdr->e_machine == bi->machine &&
strcmp(interp, bi->interp_path) == 0)
return (bi);
}
}
/* Lacking a recognized interpreter, try the default brand */
for (i = 0; i < MAX_BRANDS; i++) {
bi = elf_brand_list[i];
if (bi != NULL && hdr->e_machine == bi->machine &&
fallback_elf_brand == bi->brand)
return (bi);
}
return (NULL);
}
static int
__elfN(check_header)(const Elf_Ehdr *hdr)
{
Elf_Brandinfo *bi;
int i;
if (!IS_ELF(*hdr) ||
hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
hdr->e_ident[EI_VERSION] != EV_CURRENT)
return (ENOEXEC);
/*
* Make sure we have at least one brand for this machine.
*/
for (i = 0; i < MAX_BRANDS; i++) {
bi = elf_brand_list[i];
if (bi != NULL && bi->machine == hdr->e_machine)
break;
}
if (i == MAX_BRANDS)
return (ENOEXEC);
if (hdr->e_version != ELF_TARG_VER)
return (ENOEXEC);
return (0);
}
static int
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t start, vm_offset_t end, vm_prot_t prot,
vm_prot_t max)
{
int error, rv;
vm_offset_t off;
vm_offset_t data_buf = 0;
/*
* Create the page if it doesn't exist yet. Ignore errors.
*/
vm_map_lock(map);
vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), max,
max, 0);
vm_map_unlock(map);
/*
* Find the page from the underlying object.
*/
if (object) {
vm_object_reference(object);
rv = vm_map_find(exec_map,
object,
trunc_page(offset),
&data_buf,
PAGE_SIZE,
TRUE,
VM_PROT_READ,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return (rv);
}
off = offset - trunc_page(offset);
error = copyout((caddr_t)data_buf + off, (caddr_t)start,
end - start);
vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
if (error) {
return (KERN_FAILURE);
}
}
return (KERN_SUCCESS);
}
static int
__elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t start, vm_offset_t end, vm_prot_t prot,
vm_prot_t max, int cow)
{
int rv;
if (start != trunc_page(start)) {
rv = __elfN(map_partial)(map, object, offset, start,
round_page(start), prot, max);
if (rv)
return (rv);
offset += round_page(start) - start;
start = round_page(start);
}
if (end != round_page(end)) {
rv = __elfN(map_partial)(map, object, offset +
trunc_page(end) - start, trunc_page(end), end, prot, max);
if (rv)
return (rv);
end = trunc_page(end);
}
if (end > start) {
if (offset & PAGE_MASK) {
vm_offset_t data_buf, off;
vm_size_t sz;
int error;
/*
* The mapping is not page aligned. This means we have
* to copy the data. Sigh.
*/
rv = vm_map_find(map, 0, 0, &start, end - start,
FALSE, prot, max, 0);
if (rv)
return (rv);
while (start < end) {
vm_object_reference(object);
rv = vm_map_find(exec_map,
object,
trunc_page(offset),
&data_buf,
2 * PAGE_SIZE,
TRUE,
VM_PROT_READ,
VM_PROT_ALL,
(MAP_COPY_ON_WRITE
| MAP_PREFAULT_PARTIAL));
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return (rv);
}
off = offset - trunc_page(offset);
sz = end - start;
if (sz > PAGE_SIZE)
sz = PAGE_SIZE;
error = copyout((caddr_t)data_buf + off,
(caddr_t)start, sz);
vm_map_remove(exec_map, data_buf,
data_buf + 2 * PAGE_SIZE);
if (error) {
return (KERN_FAILURE);
}
start += sz;
}
rv = KERN_SUCCESS;
} else {
vm_map_lock(map);
rv = vm_map_insert(map, object, offset, start, end,
prot, max, cow);
vm_map_unlock(map);
}
return (rv);
} else {
return (KERN_SUCCESS);
}
}
static int
__elfN(load_section)(struct proc *p, struct vmspace *vmspace,
struct vnode *vp, vm_object_t object, vm_offset_t offset,
caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
size_t pagesize)
{
size_t map_len;
vm_offset_t map_addr;
int error, rv;
size_t copy_len;
vm_offset_t file_addr;
vm_offset_t data_buf = 0;
GIANT_REQUIRED;
error = 0;
/*
* It's necessary to fail if the filsz + offset taken from the
* header is greater than the actual file pager object's size.
* If we were to allow this, then the vm_map_find() below would
* walk right off the end of the file object and into the ether.
*
* While I'm here, might as well check for something else that
* is invalid: filsz cannot be greater than memsz.
*/
if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size ||
filsz > memsz) {
uprintf("elf_load_section: truncated ELF file\n");
return (ENOEXEC);
}
#define trunc_page_ps(va, ps) ((va) & ~(ps - 1))
#define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1))
map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
file_addr = trunc_page_ps(offset, pagesize);
/*
* We have two choices. We can either clear the data in the last page
* of an oversized mapping, or we can start the anon mapping a page
* early and copy the initialized data into that first page. We
* choose the second.
*/
if (memsz > filsz)
map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
else
map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
if (map_len != 0) {
vm_object_reference(object);
rv = __elfN(map_insert)(&vmspace->vm_map,
object,
file_addr, /* file offset */
map_addr, /* virtual start */
map_addr + map_len,/* virtual end */
prot,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return (EINVAL);
}
/* we can stop now if we've covered it all */
if (memsz == filsz) {
return (0);
}
}
/*
* We have to get the remaining bit of the file into the first part
* of the oversized map segment. This is normally because the .data
* segment in the file is extended to provide bss. It's a neat idea
* to try and save a page, but it's a pain in the behind to implement.
*/
copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
map_addr;
/* This had damn well better be true! */
if (map_len != 0) {
rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr,
map_addr + map_len, VM_PROT_ALL, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS) {
return (EINVAL);
}
}
if (copy_len != 0) {
vm_offset_t off;
vm_object_reference(object);
rv = vm_map_find(exec_map,
object,
trunc_page(offset + filsz),
&data_buf,
PAGE_SIZE,
TRUE,
VM_PROT_READ,
VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
return (EINVAL);
}
/* send the page fragment to user space */
off = trunc_page_ps(offset + filsz, pagesize) -
trunc_page(offset + filsz);
error = copyout((caddr_t)data_buf + off, (caddr_t)map_addr,
copy_len);
vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE);
if (error) {
return (error);
}
}
/*
* set it to the specified protection.
* XXX had better undo the damage from pasting over the cracks here!
*/
vm_map_protect(&vmspace->vm_map, trunc_page(map_addr),
round_page(map_addr + map_len), prot, FALSE);
return (error);
}
/*
* Load the file "file" into memory. It may be either a shared object
* or an executable.
*
* The "addr" reference parameter is in/out. On entry, it specifies
* the address where a shared object should be loaded. If the file is
* an executable, this value is ignored. On exit, "addr" specifies
* where the file was actually loaded.
*
* The "entry" reference parameter is out only. On exit, it specifies
* the entry point for the loaded file.
*/
static int
__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
u_long *entry, size_t pagesize)
{
struct {
struct nameidata nd;
struct vattr attr;
struct image_params image_params;
} *tempdata;
const Elf_Ehdr *hdr = NULL;
const Elf_Phdr *phdr = NULL;
struct nameidata *nd;
struct vmspace *vmspace = p->p_vmspace;
struct vattr *attr;
struct image_params *imgp;
vm_prot_t prot;
u_long rbase;
u_long base_addr = 0;
int error, i, numsegs;
if (curthread->td_proc != p)
panic("elf_load_file - thread"); /* XXXKSE DIAGNOSTIC */
tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
nd = &tempdata->nd;
attr = &tempdata->attr;
imgp = &tempdata->image_params;
/*
* Initialize part of the common data
*/
imgp->proc = p;
imgp->uap = NULL;
imgp->attr = attr;
imgp->firstpage = NULL;
imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE);
imgp->object = NULL;
if (imgp->image_header == NULL) {
nd->ni_vp = NULL;
error = ENOMEM;
goto fail;
}
/* XXXKSE */
NDINIT(nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread);
if ((error = namei(nd)) != 0) {
nd->ni_vp = NULL;
goto fail;
}
NDFREE(nd, NDF_ONLY_PNBUF);
imgp->vp = nd->ni_vp;
/*
* Check permissions, modes, uid, etc on the file, and "open" it.
*/
error = exec_check_permissions(imgp);
if (error) {
VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
goto fail;
}
error = exec_map_first_page(imgp);
/*
* Also make certain that the interpreter stays the same, so set
* its VV_TEXT flag, too.
*/
if (error == 0)
nd->ni_vp->v_vflag |= VV_TEXT;
VOP_GETVOBJECT(nd->ni_vp, &imgp->object);
vm_object_reference(imgp->object);
VOP_UNLOCK(nd->ni_vp, 0, curthread); /* XXXKSE */
if (error)
goto fail;
hdr = (const Elf_Ehdr *)imgp->image_header;
if ((error = __elfN(check_header)(hdr)) != 0)
goto fail;
if (hdr->e_type == ET_DYN)
rbase = *addr;
else if (hdr->e_type == ET_EXEC)
rbase = 0;
else {
error = ENOEXEC;
goto fail;
}
/* Only support headers that fit within first page for now */
if ((hdr->e_phoff > PAGE_SIZE) ||
(hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
error = ENOEXEC;
goto fail;
}
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */
prot = 0;
if (phdr[i].p_flags & PF_X)
prot |= VM_PROT_EXECUTE;
if (phdr[i].p_flags & PF_W)
prot |= VM_PROT_WRITE;
if (phdr[i].p_flags & PF_R)
prot |= VM_PROT_READ;
if ((error = __elfN(load_section)(p, vmspace,
nd->ni_vp, imgp->object, phdr[i].p_offset,
(caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
phdr[i].p_memsz, phdr[i].p_filesz, prot,
pagesize)) != 0)
goto fail;
/*
* Establish the base address if this is the
* first segment.
*/
if (numsegs == 0)
base_addr = trunc_page(phdr[i].p_vaddr +
rbase);
numsegs++;
}
}
*addr = base_addr;
*entry = (unsigned long)hdr->e_entry + rbase;
fail:
if (imgp->firstpage)
exec_unmap_first_page(imgp);
if (imgp->image_header)
kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header,
PAGE_SIZE);
if (imgp->object)
vm_object_deallocate(imgp->object);
if (nd->ni_vp)
vrele(nd->ni_vp);
free(tempdata, M_TEMP);
return (error);
}
static int
__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
{
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
const Elf_Phdr *phdr;
Elf_Auxargs *elf_auxargs = NULL;
struct vmspace *vmspace;
vm_prot_t prot;
u_long text_size = 0, data_size = 0, total_size = 0;
u_long text_addr = 0, data_addr = 0;
u_long seg_size, seg_addr;
u_long addr, entry = 0, proghdr = 0;
int error, i;
const char *interp = NULL;
Elf_Brandinfo *brand_info;
char *path;
struct thread *td = curthread;
struct sysentvec *sv;
GIANT_REQUIRED;
/*
* Do we have a valid ELF header ?
*/
if (__elfN(check_header)(hdr) != 0 || hdr->e_type != ET_EXEC)
return (-1);
/*
* From here on down, we return an errno, not -1, as we've
* detected an ELF file.
*/
if ((hdr->e_phoff > PAGE_SIZE) ||
(hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
/* Only support headers in first page for now */
return (ENOEXEC);
}
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
/*
* From this point on, we may have resources that need to be freed.
*/
VOP_UNLOCK(imgp->vp, 0, td);
for (i = 0; i < hdr->e_phnum; i++) {
switch (phdr[i].p_type) {
case PT_INTERP: /* Path to interpreter */
if (phdr[i].p_filesz > MAXPATHLEN ||
phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) {
error = ENOEXEC;
goto fail;
}
interp = imgp->image_header + phdr[i].p_offset;
break;
default:
break;
}
}
brand_info = __elfN(get_brandinfo)(hdr, interp);
if (brand_info == NULL) {
uprintf("ELF binary type \"%u\" not known.\n",
hdr->e_ident[EI_OSABI]);
error = ENOEXEC;
goto fail;
}
sv = brand_info->sysvec;
if ((error = exec_extract_strings(imgp)) != 0)
goto fail;
- exec_new_vmspace(imgp, sv->sv_minuser, sv->sv_maxuser,
- sv->sv_usrstack);
+ exec_new_vmspace(imgp, sv);
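/*
* sv is the brand's sysentvec (brand_info->sysvec), so each ELF brand
* lays out the new address space with its own sv_minuser, sv_maxuser
* and sv_usrstack values.
*/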
vmspace = imgp->proc->p_vmspace;
for (i = 0; i < hdr->e_phnum; i++) {
switch (phdr[i].p_type) {
case PT_LOAD: /* Loadable segment */
prot = 0;
if (phdr[i].p_flags & PF_X)
prot |= VM_PROT_EXECUTE;
if (phdr[i].p_flags & PF_W)
prot |= VM_PROT_WRITE;
if (phdr[i].p_flags & PF_R)
prot |= VM_PROT_READ;
#if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER)
/*
* Some x86 binaries assume read == executable,
* notably the M3 runtime and therefore cvsup
*/
if (prot & VM_PROT_READ)
prot |= VM_PROT_EXECUTE;
#endif
if ((error = __elfN(load_section)(imgp->proc, vmspace,
imgp->vp, imgp->object, phdr[i].p_offset,
(caddr_t)(uintptr_t)phdr[i].p_vaddr,
phdr[i].p_memsz, phdr[i].p_filesz, prot,
sv->sv_pagesize)) != 0)
goto fail;
seg_addr = trunc_page(phdr[i].p_vaddr);
seg_size = round_page(phdr[i].p_memsz +
phdr[i].p_vaddr - seg_addr);
/*
* Is this .text or .data? We can't use
* VM_PROT_WRITE or VM_PROT_EXEC, it breaks the
* alpha terribly and possibly does other bad
* things so we stick to the old way of figuring
* it out: If the segment contains the program
* entry point, it's a text segment, otherwise it
* is a data segment.
*
* Note that obreak() assumes that data_addr +
* data_size == end of data load area, and the ELF
* file format expects segments to be sorted by
* address. If multiple data segments exist, the
* last one will be used.
*/
if (hdr->e_entry >= phdr[i].p_vaddr &&
hdr->e_entry < (phdr[i].p_vaddr +
phdr[i].p_memsz)) {
text_size = seg_size;
text_addr = seg_addr;
entry = (u_long)hdr->e_entry;
} else {
data_size = seg_size;
data_addr = seg_addr;
}
total_size += seg_size;
/*
* Check limits. It should be safe to check the
* limits after loading the segment since we do
* not actually fault in all the segment's pages.
*/
if (data_size >
imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur ||
text_size > maxtsiz ||
total_size >
imgp->proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
error = ENOMEM;
goto fail;
}
break;
case PT_PHDR: /* Program header table info */
proghdr = phdr[i].p_vaddr;
break;
default:
break;
}
}
vmspace->vm_tsize = text_size >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
vmspace->vm_dsize = data_size >> PAGE_SHIFT;
vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
addr = ELF_RTLD_ADDR(vmspace);
imgp->entry_addr = entry;
imgp->proc->p_sysent = sv;
if (interp != NULL) {
path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path,
interp);
if ((error = __elfN(load_file)(imgp->proc, path, &addr,
&imgp->entry_addr, sv->sv_pagesize)) != 0) {
if ((error = __elfN(load_file)(imgp->proc, interp,
&addr, &imgp->entry_addr, sv->sv_pagesize)) != 0) {
uprintf("ELF interpreter %s not found\n",
path);
free(path, M_TEMP);
goto fail;
}
}
free(path, M_TEMP);
}
/*
* Construct auxargs table (used by the fixup routine)
*/
elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
elf_auxargs->execfd = -1;
elf_auxargs->phdr = proghdr;
elf_auxargs->phent = hdr->e_phentsize;
elf_auxargs->phnum = hdr->e_phnum;
elf_auxargs->pagesz = PAGE_SIZE;
elf_auxargs->base = addr;
elf_auxargs->flags = 0;
elf_auxargs->entry = entry;
elf_auxargs->trace = elf_trace;
imgp->auxargs = elf_auxargs;
imgp->interpreted = 0;
fail:
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
return (error);
}
#if __ELF_WORD_SIZE == 32
#define suword suword32
#define stacktype u_int32_t
#else
#define suword suword64
#define stacktype u_int64_t
#endif
int
__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
{
Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
stacktype *base;
stacktype *pos;
base = (stacktype *)*stack_base;
pos = base + (imgp->argc + imgp->envc + 2);
if (args->trace) {
AUXARGS_ENTRY(pos, AT_DEBUG, 1);
}
if (args->execfd != -1) {
AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
}
AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
AUXARGS_ENTRY(pos, AT_BASE, args->base);
AUXARGS_ENTRY(pos, AT_NULL, 0);
free(imgp->auxargs, M_TEMP);
imgp->auxargs = NULL;
base--;
suword(base, (long)imgp->argc);
*stack_base = (register_t *)base;
return (0);
}
/*
* Code for generating ELF core dumps.
*/
typedef void (*segment_callback)(vm_map_entry_t, void *);
/* Closure for cb_put_phdr(). */
struct phdr_closure {
Elf_Phdr *phdr; /* Program header to fill in */
Elf_Off offset; /* Offset of segment in core file */
};
/* Closure for cb_size_segment(). */
struct sseg_closure {
int count; /* Count of writable segments. */
size_t size; /* Total size of all writable segments. */
};
static void cb_put_phdr(vm_map_entry_t, void *);
static void cb_size_segment(vm_map_entry_t, void *);
static void each_writable_segment(struct proc *, segment_callback, void *);
static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
int, void *, size_t);
static void __elfN(puthdr)(struct proc *, void *, size_t *,
const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int);
static void __elfN(putnote)(void *, size_t *, const char *, int,
const void *, size_t);
extern int osreldate;
int
__elfN(coredump)(td, vp, limit)
struct thread *td;
register struct vnode *vp;
off_t limit;
{
register struct proc *p = td->td_proc;
register struct ucred *cred = td->td_ucred;
int error = 0;
struct sseg_closure seginfo;
void *hdr;
size_t hdrsize;
/* Size the program segments. */
seginfo.count = 0;
seginfo.size = 0;
each_writable_segment(p, cb_size_segment, &seginfo);
/*
* Calculate the size of the core file header area by making
* a dry run of generating it. Nothing is written, but the
* size is calculated.
*/
hdrsize = 0;
__elfN(puthdr)((struct proc *)NULL, (void *)NULL, &hdrsize,
(const prstatus_t *)NULL, (const prfpregset_t *)NULL,
(const prpsinfo_t *)NULL, seginfo.count);
if (hdrsize + seginfo.size >= limit)
return (EFAULT);
/*
* Allocate memory for building the header, fill it up,
* and write it out.
*/
hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
if (hdr == NULL) {
return (EINVAL);
}
error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize);
/* Write the contents of all of the writable segments. */
if (error == 0) {
Elf_Phdr *php;
off_t offset;
int i;
php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
offset = hdrsize;
for (i = 0; i < seginfo.count; i++) {
error = vn_rdwr_inchunks(UIO_WRITE, vp,
(caddr_t)(uintptr_t)php->p_vaddr,
php->p_filesz, offset, UIO_USERSPACE,
IO_UNIT | IO_DIRECT, cred, NOCRED, (int *)NULL,
curthread); /* XXXKSE */
if (error != 0)
break;
offset += php->p_filesz;
php++;
}
}
free(hdr, M_TEMP);
return (error);
}
/*
* A callback for each_writable_segment() to write out the segment's
* program header entry.
*/
static void
cb_put_phdr(entry, closure)
vm_map_entry_t entry;
void *closure;
{
struct phdr_closure *phc = (struct phdr_closure *)closure;
Elf_Phdr *phdr = phc->phdr;
phc->offset = round_page(phc->offset);
phdr->p_type = PT_LOAD;
phdr->p_offset = phc->offset;
phdr->p_vaddr = entry->start;
phdr->p_paddr = 0;
phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
phdr->p_align = PAGE_SIZE;
phdr->p_flags = 0;
if (entry->protection & VM_PROT_READ)
phdr->p_flags |= PF_R;
if (entry->protection & VM_PROT_WRITE)
phdr->p_flags |= PF_W;
if (entry->protection & VM_PROT_EXECUTE)
phdr->p_flags |= PF_X;
phc->offset += phdr->p_filesz;
phc->phdr++;
}
/*
* A callback for each_writable_segment() to gather information about
* the number of segments and their total size.
*/
static void
cb_size_segment(entry, closure)
vm_map_entry_t entry;
void *closure;
{
struct sseg_closure *ssc = (struct sseg_closure *)closure;
ssc->count++;
ssc->size += entry->end - entry->start;
}
/*
* For each writable segment in the process's memory map, call the given
* function with a pointer to the map entry and some arbitrary
* caller-supplied data.
*/
static void
each_writable_segment(p, func, closure)
struct proc *p;
segment_callback func;
void *closure;
{
vm_map_t map = &p->p_vmspace->vm_map;
vm_map_entry_t entry;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
vm_object_t obj;
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) ||
(entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) !=
(VM_PROT_READ|VM_PROT_WRITE))
continue;
/*
** Don't include the memory segment in the coredump if
** MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
** madvise(2).
*/
if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
continue;
if ((obj = entry->object.vm_object) == NULL)
continue;
/* Find the deepest backing object. */
while (obj->backing_object != NULL)
obj = obj->backing_object;
/* Ignore memory-mapped devices and such things. */
if (obj->type != OBJT_DEFAULT &&
obj->type != OBJT_SWAP &&
obj->type != OBJT_VNODE)
continue;
(*func)(entry, closure);
}
}
/*
* Write the core file header to the file, including padding up to
* the page boundary.
*/
static int
__elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize)
struct thread *td;
struct vnode *vp;
struct ucred *cred;
int numsegs;
size_t hdrsize;
void *hdr;
{
struct {
prstatus_t status;
prfpregset_t fpregset;
prpsinfo_t psinfo;
} *tempdata;
struct proc *p = td->td_proc;
size_t off;
prstatus_t *status;
prfpregset_t *fpregset;
prpsinfo_t *psinfo;
tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO | M_WAITOK);
status = &tempdata->status;
fpregset = &tempdata->fpregset;
psinfo = &tempdata->psinfo;
/* Gather the information for the header. */
status->pr_version = PRSTATUS_VERSION;
status->pr_statussz = sizeof(prstatus_t);
status->pr_gregsetsz = sizeof(gregset_t);
status->pr_fpregsetsz = sizeof(fpregset_t);
status->pr_osreldate = osreldate;
status->pr_cursig = p->p_sig;
status->pr_pid = p->p_pid;
fill_regs(td, &status->pr_reg);
fill_fpregs(td, fpregset);
psinfo->pr_version = PRPSINFO_VERSION;
psinfo->pr_psinfosz = sizeof(prpsinfo_t);
strncpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname) - 1);
/* XXX - We don't fill in the command line arguments properly yet. */
strncpy(psinfo->pr_psargs, p->p_comm, PRARGSZ);
/* Fill in the header. */
bzero(hdr, hdrsize);
off = 0;
__elfN(puthdr)(p, hdr, &off, status, fpregset, psinfo, numsegs);
free(tempdata, M_TEMP);
/* Write it to the core file. */
return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
td)); /* XXXKSE */
}
static void
__elfN(puthdr)(struct proc *p, void *dst, size_t *off, const prstatus_t *status,
const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs)
{
size_t ehoff;
size_t phoff;
size_t noteoff;
size_t notesz;
ehoff = *off;
*off += sizeof(Elf_Ehdr);
phoff = *off;
*off += (numsegs + 1) * sizeof(Elf_Phdr);
noteoff = *off;
__elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status,
sizeof *status);
__elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
sizeof *fpregset);
__elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
sizeof *psinfo);
notesz = *off - noteoff;
/* Align up to a page boundary for the program segments. */
*off = round_page(*off);
if (dst != NULL) {
Elf_Ehdr *ehdr;
Elf_Phdr *phdr;
struct phdr_closure phc;
/*
* Fill in the ELF header.
*/
ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
ehdr->e_ident[EI_MAG0] = ELFMAG0;
ehdr->e_ident[EI_MAG1] = ELFMAG1;
ehdr->e_ident[EI_MAG2] = ELFMAG2;
ehdr->e_ident[EI_MAG3] = ELFMAG3;
ehdr->e_ident[EI_CLASS] = ELF_CLASS;
ehdr->e_ident[EI_DATA] = ELF_DATA;
ehdr->e_ident[EI_VERSION] = EV_CURRENT;
ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
ehdr->e_ident[EI_ABIVERSION] = 0;
ehdr->e_ident[EI_PAD] = 0;
ehdr->e_type = ET_CORE;
ehdr->e_machine = ELF_ARCH;
ehdr->e_version = EV_CURRENT;
ehdr->e_entry = 0;
ehdr->e_phoff = phoff;
ehdr->e_flags = 0;
ehdr->e_ehsize = sizeof(Elf_Ehdr);
ehdr->e_phentsize = sizeof(Elf_Phdr);
ehdr->e_phnum = numsegs + 1;
ehdr->e_shentsize = sizeof(Elf_Shdr);
ehdr->e_shnum = 0;
ehdr->e_shstrndx = SHN_UNDEF;
/*
* Fill in the program header entries.
*/
phdr = (Elf_Phdr *)((char *)dst + phoff);
/* The note segment. */
phdr->p_type = PT_NOTE;
phdr->p_offset = noteoff;
phdr->p_vaddr = 0;
phdr->p_paddr = 0;
phdr->p_filesz = notesz;
phdr->p_memsz = 0;
phdr->p_flags = 0;
phdr->p_align = 0;
phdr++;
/* All the writable segments from the program. */
phc.phdr = phdr;
phc.offset = *off;
each_writable_segment(p, cb_put_phdr, &phc);
}
}
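/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * __elfN(puthdr)() is driven twice by the coredump path above -- first with
 * dst == NULL purely to advance *off and learn the header size, then again
 * with the allocated buffer to fill it in.  A hypothetical helper expressing
 * the sizing pass (mirroring the NULL-argument call made by the coredump
 * code) would be:
 */
static size_t
__elfN(example_hdrsize)(struct proc *p, int numsegs)
{
	size_t hdrsize;

	hdrsize = 0;
	__elfN(puthdr)(p, (void *)NULL, &hdrsize, (const prstatus_t *)NULL,
	    (const prfpregset_t *)NULL, (const prpsinfo_t *)NULL, numsegs);
	return (hdrsize);
}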
static void
__elfN(putnote)(void *dst, size_t *off, const char *name, int type,
const void *desc, size_t descsz)
{
Elf_Note note;
note.n_namesz = strlen(name) + 1;
note.n_descsz = descsz;
note.n_type = type;
if (dst != NULL)
bcopy(&note, (char *)dst + *off, sizeof note);
*off += sizeof note;
if (dst != NULL)
bcopy(name, (char *)dst + *off, note.n_namesz);
*off += roundup2(note.n_namesz, sizeof(Elf_Size));
if (dst != NULL)
bcopy(desc, (char *)dst + *off, note.n_descsz);
*off += roundup2(note.n_descsz, sizeof(Elf_Size));
}
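/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * Each record emitted by __elfN(putnote)() above is the Elf_Note header
 * followed by the name and the descriptor, each padded to Elf_Size
 * alignment.  A hypothetical helper computing that on-disk size:
 */
static __inline size_t
__elfN(example_notesize)(const char *name, size_t descsz)
{

	return (sizeof(Elf_Note) +
	    roundup2(strlen(name) + 1, sizeof(Elf_Size)) +
	    roundup2(descsz, sizeof(Elf_Size)));
}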
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
#if __ELF_WORD_SIZE == 32
static struct execsw elf_execsw = {exec_elf32_imgact, "ELF32"};
EXEC_SET(elf32, elf_execsw);
#else
static struct execsw elf_execsw = {exec_elf64_imgact, "ELF64"};
EXEC_SET(elf64, elf_execsw);
#endif
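/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * Every image activator follows the same contract: its ex_imgact function
 * returns -1 if the image is not its format, 0 on success, or an errno
 * value, and the activator is announced to kern_execve.c with EXEC_SET().
 * A hypothetical minimal activator (names are made up):
 */
static int
exec_example_imgact(struct image_params *imgp)
{

	/* Claim nothing: let the other activators have a look. */
	return (-1);
}
static struct execsw example_execsw = {exec_example_imgact, "example"};
EXEC_SET(example, example_execsw);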
Index: head/sys/kern/imgact_gzip.c
===================================================================
--- head/sys/kern/imgact_gzip.c (revision 103766)
+++ head/sys/kern/imgact_gzip.c (revision 103767)
@@ -1,385 +1,385 @@
/*
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ----------------------------------------------------------------------------
*
* $FreeBSD$
*
* This module handles execution of a.out files which have been run through
* "gzip". This saves diskspace, but wastes cpu-cycles and VM.
*
* TODO:
* text-segments should be made R/O after being filled
* is the vm-stuff safe ?
* should handle the entire header of gzip'ed stuff.
* inflate isn't quite reentrant yet...
* error-handling is a mess...
* so is the rest...
* tidy up unnecessary includes
*/
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_aout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/inflate.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
struct imgact_gzip {
struct image_params *ip;
struct exec a_out;
int error;
int gotheader;
int where;
u_char *inbuf;
u_long offset;
u_long output;
u_long len;
int idx;
u_long virtual_offset, file_offset, file_end, bss_size;
};
static int exec_gzip_imgact(struct image_params *imgp);
static int NextByte(void *vp);
static int do_aout_hdr(struct imgact_gzip *);
static int Flush(void *vp, u_char *, u_long siz);
static int
exec_gzip_imgact(imgp)
struct image_params *imgp;
{
int error, error2 = 0;
const u_char *p = (const u_char *) imgp->image_header;
struct imgact_gzip igz;
struct inflate infl;
struct vmspace *vmspace;
/* If these four are not OK, it isn't a gzip file */
if (p[0] != 0x1f)
return -1; /* 0 Simply magic */
if (p[1] != 0x8b)
return -1; /* 1 Simply magic */
if (p[2] != 0x08)
return -1; /* 2 Compression method */
if (p[9] != 0x03)
return -1; /* 9 OS compressed on */
/*
* If this one contains anything but a comment or a filename marker,
* we don't want to chew on it
*/
if (p[3] & ~(0x18))
return ENOEXEC; /* 3 Flags */
/* These are of no use to us */
/* 4-7 Timestamp */
/* 8 Extra flags */
bzero(&igz, sizeof igz);
bzero(&infl, sizeof infl);
infl.gz_private = (void *) &igz;
infl.gz_input = NextByte;
infl.gz_output = Flush;
igz.ip = imgp;
igz.idx = 10;
if (p[3] & 0x08) { /* skip a filename */
while (p[igz.idx++])
if (igz.idx >= PAGE_SIZE)
return ENOEXEC;
}
if (p[3] & 0x10) { /* skip a comment */
while (p[igz.idx++])
if (igz.idx >= PAGE_SIZE)
return ENOEXEC;
}
igz.len = imgp->attr->va_size;
error = inflate(&infl);
/*
* The unzipped file may not even have been long enough to contain
* a header that would give Flush() a chance to return an error. Check for this.
*/
if ( !igz.gotheader )
return ENOEXEC;
if ( !error ) {
vmspace = imgp->proc->p_vmspace;
error = vm_map_protect(&vmspace->vm_map,
(vm_offset_t) vmspace->vm_taddr,
(vm_offset_t) (vmspace->vm_taddr +
(vmspace->vm_tsize << PAGE_SHIFT)) ,
VM_PROT_READ|VM_PROT_EXECUTE,0);
}
if (igz.inbuf) {
error2 =
vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf,
(vm_offset_t) igz.inbuf + PAGE_SIZE);
}
if (igz.error || error || error2) {
printf("Output=%lu ", igz.output);
printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n",
error, igz.error, error2, igz.where);
}
if (igz.error)
return igz.error;
if (error)
return ENOEXEC;
if (error2)
return error2;
return 0;
}
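/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * The byte tests above correspond to the fixed gzip member header:
 * p[0..1] magic (0x1f 0x8b), p[2] compression method (8 = deflate),
 * p[3] flags, p[4..7] mtime, p[8] extra flags, p[9] originating OS.
 * Flag bits 0x08 (FNAME) and 0x10 (FCOMMENT) each introduce a
 * NUL-terminated string that has to be skipped before the deflate stream,
 * which is what the two while loops above do.  A stand-alone check of just
 * the fixed part (hypothetical helper):
 */
static __inline int
example_is_gzip(const u_char *p, size_t len)
{

	return (len >= 10 && p[0] == 0x1f && p[1] == 0x8b && p[2] == 0x08);
}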
static int
do_aout_hdr(struct imgact_gzip * gz)
{
int error;
struct vmspace *vmspace;
vm_offset_t vmaddr;
/*
* Set file/virtual offset based on the a.out variant. We handle two cases:
* host byte order and network byte order (for NetBSD compatibility)
*/
switch ((int) (gz->a_out.a_magic & 0xffff)) {
case ZMAGIC:
gz->virtual_offset = 0;
if (gz->a_out.a_text) {
gz->file_offset = PAGE_SIZE;
} else {
/* Bill's "screwball mode" */
gz->file_offset = 0;
}
break;
case QMAGIC:
gz->virtual_offset = PAGE_SIZE;
gz->file_offset = 0;
break;
default:
/* NetBSD compatibility */
switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
case ZMAGIC:
case QMAGIC:
gz->virtual_offset = PAGE_SIZE;
gz->file_offset = 0;
break;
default:
gz->where = __LINE__;
return (-1);
}
}
gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
/*
* Check various fields in header for validity/bounds.
*/
if ( /* entry point must lie within the text region */
gz->a_out.a_entry < gz->virtual_offset ||
gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
/* text and data size must each be page rounded */
gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
gz->where = __LINE__;
return (-1);
}
/*
* text/data/bss must not exceed limits
*/
mtx_assert(&Giant, MA_OWNED);
if ( /* text can't exceed maximum text size */
gz->a_out.a_text > maxtsiz ||
/* data + bss can't exceed rlimit */
gz->a_out.a_data + gz->bss_size >
gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) {
gz->where = __LINE__;
return (ENOMEM);
}
/* Find out how far we should go */
gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
/* copy in arguments and/or environment from old process */
error = exec_extract_strings(gz->ip);
if (error) {
gz->where = __LINE__;
return (error);
}
/*
* Destroy old process VM and create a new one (with a new stack)
*/
- exec_new_vmspace(gz->ip, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK);
+ exec_new_vmspace(gz->ip, &aout_sysvec);
vmspace = gz->ip->proc->p_vmspace;
vmaddr = gz->virtual_offset;
error = vm_mmap(&vmspace->vm_map,
&vmaddr,
gz->a_out.a_text + gz->a_out.a_data,
VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
0,
0);
if (error) {
gz->where = __LINE__;
return (error);
}
if (gz->bss_size != 0) {
/*
* Allocate demand-zeroed area for uninitialized data.
* "bss" = 'block started by symbol' - named after the
* IBM 7090 instruction of the same name.
*/
vmaddr = gz->virtual_offset + gz->a_out.a_text +
gz->a_out.a_data;
error = vm_map_find(&vmspace->vm_map,
NULL,
0,
&vmaddr,
gz->bss_size,
FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error) {
gz->where = __LINE__;
return (error);
}
}
/* Fill in process VM information */
vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset;
vmspace->vm_daddr = (caddr_t) (uintptr_t)
(gz->virtual_offset + gz->a_out.a_text);
/* Fill in image_params */
gz->ip->interpreted = 0;
gz->ip->entry_addr = gz->a_out.a_entry;
gz->ip->proc->p_sysent = &aout_sysvec;
return 0;
}
static int
NextByte(void *vp)
{
int error;
struct imgact_gzip *igz = (struct imgact_gzip *) vp;
if (igz->idx >= igz->len) {
igz->where = __LINE__;
return GZ_EOF;
}
if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
return igz->inbuf[(igz->idx++) - igz->offset];
}
if (igz->inbuf) {
error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf,
(vm_offset_t) igz->inbuf + PAGE_SIZE);
if (error) {
igz->where = __LINE__;
igz->error = error;
return GZ_EOF;
}
}
igz->offset = igz->idx & ~PAGE_MASK;
error = vm_mmap(kernel_map, /* map */
(vm_offset_t *) & igz->inbuf, /* address */
PAGE_SIZE, /* size */
VM_PROT_READ, /* protection */
VM_PROT_READ, /* max protection */
0, /* flags */
(caddr_t) igz->ip->vp, /* vnode */
igz->offset); /* offset */
if (error) {
igz->where = __LINE__;
igz->error = error;
return GZ_EOF;
}
return igz->inbuf[(igz->idx++) - igz->offset];
}
static int
Flush(void *vp, u_char * ptr, u_long siz)
{
struct imgact_gzip *gz = (struct imgact_gzip *) vp;
u_char *p = ptr, *q;
int i;
/* First, find an a.out header */
if (gz->output < sizeof gz->a_out) {
q = (u_char *) & gz->a_out;
i = min(siz, sizeof gz->a_out - gz->output);
bcopy(p, q + gz->output, i);
gz->output += i;
p += i;
siz -= i;
if (gz->output == sizeof gz->a_out) {
gz->gotheader = 1;
i = do_aout_hdr(gz);
if (i == -1) {
if (!gz->where)
gz->where = __LINE__;
gz->error = ENOEXEC;
return ENOEXEC;
} else if (i) {
gz->where = __LINE__;
gz->error = i;
return ENOEXEC;
}
if (gz->file_offset == 0) {
q = (u_char *) (uintptr_t) gz->virtual_offset;
copyout(&gz->a_out, q, sizeof gz->a_out);
}
}
}
/* Skip over zero-padded first PAGE if needed */
if (gz->output < gz->file_offset &&
gz->output + siz > gz->file_offset) {
i = min(siz, gz->file_offset - gz->output);
gz->output += i;
p += i;
siz -= i;
}
if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
i = min(siz, gz->file_end - gz->output);
q = (u_char *) (uintptr_t)
(gz->virtual_offset + gz->output - gz->file_offset);
copyout(p, q, i);
gz->output += i;
p += i;
siz -= i;
}
gz->output += siz;
return 0;
}
/*
* Tell kern_execve.c about it, with a little help from the linker.
*/
static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
EXEC_SET(execgzip, gzip_execsw);
Index: head/sys/kern/init_main.c
===================================================================
--- head/sys/kern/init_main.c (revision 103766)
+++ head/sys/kern/init_main.c (revision 103767)
@@ -1,725 +1,725 @@
/*
* Copyright (c) 1995 Terrence R. Lambert
* All rights reserved.
*
* Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)init_main.c 8.9 (Berkeley) 1/21/94
* $FreeBSD$
*/
#include "opt_init_path.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/exec.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/sysent.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#include <sys/unistd.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <machine/cpu.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/user.h>
#include <sys/copyright.h>
void mi_startup(void); /* Should be elsewhere */
/* Components of the first process -- never freed. */
static struct session session0;
static struct pgrp pgrp0;
struct proc proc0;
struct thread thread0;
struct kse kse0;
struct ksegrp ksegrp0;
static struct procsig procsig0;
static struct filedesc0 filedesc0;
static struct plimit limit0;
static struct vmspace vmspace0;
struct proc *initproc;
int cmask = CMASK;
extern int fallback_elf_brand;
struct vnode *rootvp;
int boothowto = 0; /* initialized so that it can be patched */
SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "");
int bootverbose;
SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "");
/*
* This ensures that there is at least one entry so that the sysinit_set
* symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never
* executed.
*/
SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL)
/*
* The sysinit table itself. Items are checked off as they are run.
* If we want to register new sysinit types, add them to newsysinit.
*/
SET_DECLARE(sysinit_set, struct sysinit);
struct sysinit **sysinit, **sysinit_end;
struct sysinit **newsysinit, **newsysinit_end;
/*
* Merge a new sysinit set into the current set, reallocating it if
* necessary. This can only be called after malloc is running.
*/
void
sysinit_add(struct sysinit **set, struct sysinit **set_end)
{
struct sysinit **newset;
struct sysinit **sipp;
struct sysinit **xipp;
int count;
count = set_end - set;
if (newsysinit)
count += newsysinit_end - newsysinit;
else
count += sysinit_end - sysinit;
newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
if (newset == NULL)
panic("cannot malloc for sysinit");
xipp = newset;
if (newsysinit)
for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
*xipp++ = *sipp;
else
for (sipp = sysinit; sipp < sysinit_end; sipp++)
*xipp++ = *sipp;
for (sipp = set; sipp < set_end; sipp++)
*xipp++ = *sipp;
if (newsysinit)
free(newsysinit, M_TEMP);
newsysinit = newset;
newsysinit_end = newset + count;
}
/*
* System startup; initialize the world, create process 0, mount root
* filesystem, and fork to create init and pagedaemon. Most of the
* hard work is done in the lower-level initialization routines including
* startup(), which does memory initialization and autoconfiguration.
*
* This allows simple addition of new kernel subsystems that require
* boot-time initialization. It also allows the substitution of a subsystem
* (for instance, a scheduler, kernel profiler, or VM system) by an object
* module. Finally, it allows for optional "kernel threads".
*/
void
mi_startup(void)
{
register struct sysinit **sipp; /* system initialization*/
register struct sysinit **xipp; /* interior loop of sort*/
register struct sysinit *save; /* bubble*/
if (sysinit == NULL) {
sysinit = SET_BEGIN(sysinit_set);
sysinit_end = SET_LIMIT(sysinit_set);
}
restart:
/*
* Perform a bubble sort of the system initialization objects by
* their subsystem (primary key) and order (secondary key).
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) {
for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
if ((*sipp)->subsystem < (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order <= (*xipp)->order))
continue; /* skip*/
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*
* The last item on the list is expected to be the scheduler,
* which will not return.
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s)*/
if ((*sipp)->subsystem == SI_SUB_DONE)
continue;
/* Call function */
(*((*sipp)->func))((*sipp)->udata);
/* Check off the one we've just done */
(*sipp)->subsystem = SI_SUB_DONE;
/* Check if we've installed more sysinit items via KLD */
if (newsysinit != NULL) {
if (sysinit != SET_BEGIN(sysinit_set))
free(sysinit, M_TEMP);
sysinit = newsysinit;
sysinit_end = newsysinit_end;
newsysinit = NULL;
newsysinit_end = NULL;
goto restart;
}
}
panic("Shouldn't get here!");
/* NOTREACHED*/
}
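/*
 * Editor's note: illustrative sketch only, not part of this file or commit.
 * The bubble sort above keeps entry a ahead of entry b exactly when the
 * following (hypothetical) predicate holds, i.e. the subsystem is the
 * primary sort key and the order is the secondary key:
 */
static __inline int
example_sysinit_leq(const struct sysinit *a, const struct sysinit *b)
{

	if (a->subsystem != b->subsystem)
		return (a->subsystem < b->subsystem);
	return (a->order <= b->order);
}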
/*
***************************************************************************
****
**** The following SYSINIT's belong elsewhere, but have not yet
**** been moved.
****
***************************************************************************
*/
static void
print_caddr_t(void *data __unused)
{
printf("%s", (char *)data);
}
SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright)
SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version)
static void
set_boot_verbose(void *data __unused)
{
if (boothowto & RB_VERBOSE)
bootverbose++;
}
SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL)
struct sysentvec null_sysvec = {
0,
NULL,
0,
0,
NULL,
0,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
"null",
NULL,
NULL,
0,
PAGE_SIZE,
VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS,
USRSTACK,
PS_STRINGS,
VM_PROT_ALL,
NULL,
NULL
};
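/*
 * Editor's note: illustration only, not part of this file or commit.  The
 * positional initializer above is hard to audit by eye.  With C99 designated
 * initializers, the entries this commit actually depends on would read
 * roughly as follows (the field-to-position mapping is the editor's
 * assumption -- see <sys/sysent.h> for the authoritative order):
 *
 *	.sv_name      = "null",
 *	.sv_minuser   = VM_MIN_ADDRESS,
 *	.sv_maxuser   = VM_MAXUSER_ADDRESS,
 *	.sv_usrstack  = USRSTACK,
 *	.sv_psstrings = PS_STRINGS,
 *	.sv_stackprot = VM_PROT_ALL,
 */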
/*
***************************************************************************
****
**** The following two SYSINIT's are proc0-specific glue code. I am not
**** convinced that they cannot be safely combined, but their order of
**** operation has been kept the same as in the original init_main.c
**** for now.
****
**** These probably belong in init_proc.c or kern_proc.c, since they
**** deal with proc0 (the fork template process).
****
***************************************************************************
*/
/* ARGSUSED*/
static void
proc0_init(void *dummy __unused)
{
register struct proc *p;
register struct filedesc0 *fdp;
register unsigned i;
struct thread *td;
struct ksegrp *kg;
struct kse *ke;
GIANT_REQUIRED;
p = &proc0;
td = &thread0;
ke = &kse0;
kg = &ksegrp0;
/*
* Initialize magic number.
*/
p->p_magic = P_MAGIC;
/*
* Initialize thread, process and pgrp structures.
*/
procinit();
threadinit();
/*
* Initialize sleep queue hash table
*/
sleepinit();
/*
* additional VM structures
*/
vm_init2();
/*
* Create process 0 (the swapper).
*/
LIST_INSERT_HEAD(&allproc, p, p_list);
LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
p->p_pgrp = &pgrp0;
LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
LIST_INIT(&pgrp0.pg_members);
LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
pgrp0.pg_session = &session0;
mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
session0.s_count = 1;
session0.s_leader = p;
p->p_sysent = &null_sysvec;
/*
* proc_linkup was already done in init_i386() or alphainit() etc.
* because the earlier code needed to follow td->td_proc. Otherwise
* I would have done it here; maybe this means this should be
* done earlier too.
*/
p->p_flag = P_SYSTEM;
p->p_sflag = PS_INMEM;
p->p_state = PRS_NORMAL;
td->td_state = TDS_RUNNING;
kg->kg_nice = NZERO;
kg->kg_pri_class = PRI_TIMESHARE;
kg->kg_user_pri = PUSER;
td->td_priority = PVM;
td->td_base_pri = PUSER;
td->td_kse = ke; /* XXXKSE */
ke->ke_oncpu = 0;
ke->ke_state = KES_THREAD;
ke->ke_thread = td;
/* proc_linkup puts it in the idle queue, that's not what we want. */
TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
kg->kg_idle_kses--;
p->p_peers = 0;
p->p_leader = p;
KASSERT((ke->ke_kgrlist.tqe_next != ke), ("linked to self!"));
bcopy("swapper", p->p_comm, sizeof ("swapper"));
callout_init(&p->p_itcallout, 0);
callout_init(&td->td_slpcallout, 1);
/* Create credentials. */
p->p_ucred = crget();
p->p_ucred->cr_ngroups = 1; /* group 0 */
p->p_ucred->cr_uidinfo = uifind(0);
p->p_ucred->cr_ruidinfo = uifind(0);
p->p_ucred->cr_prison = NULL; /* Don't jail it. */
#ifdef MAC
mac_create_proc0(p->p_ucred);
#endif
td->td_ucred = crhold(p->p_ucred);
/* Create procsig. */
p->p_procsig = &procsig0;
p->p_procsig->ps_refcnt = 1;
/* Initialize signal state for process 0. */
siginit(&proc0);
/* Create the file descriptor table. */
fdp = &filedesc0;
p->p_fd = &fdp->fd_fd;
mtx_init(&fdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
fdp->fd_fd.fd_refcnt = 1;
fdp->fd_fd.fd_cmask = cmask;
fdp->fd_fd.fd_ofiles = fdp->fd_dfiles;
fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags;
fdp->fd_fd.fd_nfiles = NDFILE;
/* Create the limits structures. */
p->p_limit = &limit0;
for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++)
limit0.pl_rlimit[i].rlim_cur =
limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY;
limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur =
limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur =
limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
i = ptoa(cnt.v_free_count);
limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i;
limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i;
limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3;
limit0.p_cpulimit = RLIM_INFINITY;
limit0.p_refcnt = 1;
/* Allocate a prototype map so we have something to fork. */
pmap_pinit0(vmspace_pmap(&vmspace0));
p->p_vmspace = &vmspace0;
vmspace0.vm_refcnt = 1;
- vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS),
- trunc_page(VM_MAXUSER_ADDRESS));
+ vm_map_init(&vmspace0.vm_map, p->p_sysent->sv_minuser,
+ p->p_sysent->sv_maxuser);
vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0);
/*
* We continue to place resource usage info and signal
* actions in the user struct so they're pageable.
*/
p->p_stats = &p->p_uarea->u_stats;
p->p_sigacts = &p->p_uarea->u_sigacts;
/*
* Charge root for one process.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)
/* ARGSUSED*/
static void
proc0_post(void *dummy __unused)
{
struct timespec ts;
struct proc *p;
/*
* Now we can look at the time, having had a chance to verify the
* time from the filesystem. Pretend that proc0 started now.
*/
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
microtime(&p->p_stats->p_start);
p->p_runtime.sec = 0;
p->p_runtime.frac = 0;
}
sx_sunlock(&allproc_lock);
binuptime(PCPU_PTR(switchtime));
PCPU_SET(switchticks, ticks);
/*
* Give the ``random'' number generator a thump.
*/
nanotime(&ts);
srandom(ts.tv_sec ^ ts.tv_nsec);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)
/*
***************************************************************************
****
**** The following SYSINIT's and glue code should be moved to the
**** respective files on a per subsystem basis.
****
***************************************************************************
*/
/*
***************************************************************************
****
**** The following code probably belongs in another file, like
**** kern/init_init.c.
****
***************************************************************************
*/
/*
* List of paths to try when searching for "init".
*/
static char init_path[MAXPATHLEN] =
#ifdef INIT_PATH
__XSTRING(INIT_PATH);
#else
"/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall";
#endif
SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
"Path used to search the init process");
/*
* Start the initial user process; try exec'ing each pathname in init_path.
* The program is invoked with one argument containing the boot flags.
*/
static void
start_init(void *dummy)
{
vm_offset_t addr;
struct execve_args args;
int options, error;
char *var, *path, *next, *s;
char *ucp, **uap, *arg0, *arg1;
struct thread *td;
struct proc *p;
int init_does_devfs = 0;
mtx_lock(&Giant);
GIANT_REQUIRED;
td = curthread;
p = td->td_proc;
vfs_mountroot();
/* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. */
if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode))
panic("cannot find root vnode");
FILEDESC_LOCK(p->p_fd);
p->p_fd->fd_cdir = rootvnode;
VREF(p->p_fd->fd_cdir);
p->p_fd->fd_rdir = rootvnode;
VREF(p->p_fd->fd_rdir);
FILEDESC_UNLOCK(p->p_fd);
VOP_UNLOCK(rootvnode, 0, td);
#ifdef MAC
mac_create_root_mount(td->td_ucred, TAILQ_FIRST(&mountlist));
#endif
if (devfs_present) {
/*
* For disk-based systems, we probably cannot do this yet
* since the fs will be read-only. But an NFS root
* might be OK. It is worth a shot.
*/
error = kern_mkdir(td, "/dev", UIO_SYSSPACE, 0700);
if (error == EEXIST)
error = 0;
if (error == 0)
error = kernel_vmount(0, "fstype", "devfs",
"fspath", "/dev", NULL);
if (error != 0)
init_does_devfs = 1;
}
/*
* Need just enough stack to hold the faked-up "execve()" arguments.
*/
- addr = trunc_page(USRSTACK - PAGE_SIZE);
+ addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
panic("init: couldn't allocate argument space");
p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
p->p_vmspace->vm_ssize = 1;
if ((var = getenv("init_path")) != NULL) {
strncpy(init_path, var, sizeof init_path);
init_path[sizeof init_path - 1] = 0;
freeenv(var);
}
if ((var = getenv("kern.fallback_elf_brand")) != NULL) {
fallback_elf_brand = strtol(var, NULL, 0);
freeenv(var);
}
for (path = init_path; *path != '\0'; path = next) {
while (*path == ':')
path++;
if (*path == '\0')
break;
for (next = path; *next != '\0' && *next != ':'; next++)
/* nothing */ ;
if (bootverbose)
printf("start_init: trying %.*s\n", (int)(next - path),
path);
/*
* Move out the boot flag argument.
*/
options = 0;
- ucp = (char *)USRSTACK;
+ ucp = (char *)p->p_sysent->sv_usrstack;
(void)subyte(--ucp, 0); /* trailing zero */
if (boothowto & RB_SINGLE) {
(void)subyte(--ucp, 's');
options = 1;
}
#ifdef notyet
if (boothowto & RB_FASTBOOT) {
(void)subyte(--ucp, 'f');
options = 1;
}
#endif
#ifdef BOOTCDROM
(void)subyte(--ucp, 'C');
options = 1;
#endif
if (init_does_devfs) {
(void)subyte(--ucp, 'd');
options = 1;
}
if (options == 0)
(void)subyte(--ucp, '-');
(void)subyte(--ucp, '-'); /* leading hyphen */
arg1 = ucp;
/*
* Move out the file name (also arg 0).
*/
(void)subyte(--ucp, 0);
for (s = next - 1; s >= path; s--)
(void)subyte(--ucp, *s);
arg0 = ucp;
/*
* Move out the arg pointers.
*/
uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
(void)suword((caddr_t)--uap, (long)0); /* terminator */
(void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
(void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
/*
* Point at the arguments.
*/
args.fname = arg0;
args.argv = uap;
args.envv = NULL;
/*
* Now try to exec the program. If we can't for any reason
* other than it doesn't exist, complain.
*
* Otherwise, return via fork_trampoline() all the way
* to user mode as init!
*/
if ((error = execve(td, &args)) == 0) {
mtx_unlock(&Giant);
return;
}
if (error != ENOENT)
printf("exec %.*s: error %d\n", (int)(next - path),
path, error);
}
printf("init: not found in path %s\n", init_path);
panic("no init");
}
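/*
 * Editor's note: illustration only, not part of this file or commit.  The
 * user-stack construction above hands init an argument vector equivalent
 * to, e.g. for a single-user boot that found /sbin/init:
 *
 *	argv[0] = "/sbin/init"
 *	argv[1] = "-s"		(or "--" when no boot flags apply; the
 *				 'f', 'C' and 'd' letters may also appear)
 *	argv[2] = NULL
 *	envp    = NULL
 */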
/*
* Like kthread_create(), but runs in its own address space.
* We do this early to reserve pid 1.
*
* Note special case - do not make it runnable yet. Other work
* in progress will change this more.
*/
static void
create_init(const void *udata __unused)
{
struct ucred *newcred, *oldcred;
int error;
error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, &initproc);
if (error)
panic("cannot fork init: %d\n", error);
/* divorce init's credentials from the kernel's */
newcred = crget();
PROC_LOCK(initproc);
initproc->p_flag |= P_SYSTEM;
oldcred = initproc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
mac_create_proc1(newcred);
#endif
initproc->p_ucred = newcred;
PROC_UNLOCK(initproc);
crfree(oldcred);
cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
mtx_lock_spin(&sched_lock);
initproc->p_sflag |= PS_INMEM;
mtx_unlock_spin(&sched_lock);
cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)
/*
* Make it runnable now.
*/
static void
kick_init(const void *udata __unused)
{
struct thread *td;
td = FIRST_THREAD_IN_PROC(initproc);
mtx_lock_spin(&sched_lock);
TD_SET_CAN_RUN(td);
setrunqueue(td); /* XXXKSE */
mtx_unlock_spin(&sched_lock);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c (revision 103766)
+++ head/sys/kern/kern_exec.c (revision 103767)
@@ -1,1113 +1,1139 @@
/*
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include "opt_ktrace.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/acct.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/wait.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <machine/reg.h>
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
static MALLOC_DEFINE(M_ATEXEC, "atexec", "atexec callback");
+static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
+static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
+
/*
* callout list for things to do at exec time
*/
struct execlist {
execlist_fn function;
TAILQ_ENTRY(execlist) next;
};
TAILQ_HEAD(exec_list_head, execlist);
static struct exec_list_head exec_list = TAILQ_HEAD_INITIALIZER(exec_list);
/* XXX This should be vm_size_t. */
-static u_long ps_strings = PS_STRINGS;
-SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings,
- 0, "");
+SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_ps_strings, "LU", "");
/* XXX This should be vm_size_t. */
-static u_long usrstack = USRSTACK;
-SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, "");
+SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_usrstack, "LU", "");
u_long ps_arg_cache_limit = PAGE_SIZE / 16;
SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
&ps_arg_cache_limit, 0, "");
int ps_argsopen = 1;
SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
#ifdef __ia64__
/* XXX HACK */
static int regstkpages = 256;
SYSCTL_INT(_machdep, OID_AUTO, regstkpages, CTLFLAG_RW, &regstkpages, 0, "");
#endif
+static int
+sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+
+ p = curproc;
+ return (SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
+ sizeof(p->p_sysent->sv_psstrings)));
+}
+
+static int
+sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+
+ p = curproc;
+ return (SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
+ sizeof(p->p_sysent->sv_usrstack)));
+}
+
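/*
 * Editor's note: illustrative userland sketch, not part of this kernel file.
 * With the two handlers above, kern.ps_strings and kern.usrstack now report
 * the values from the calling process's sysentvec instead of the former
 * compile-time constants, so an existing consumer keeps working unchanged:
 */
#if 0	/* userland example program, shown here for illustration only */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	u_long usrstack;
	size_t len = sizeof(usrstack);

	if (sysctlbyname("kern.usrstack", &usrstack, &len, NULL, 0) == -1)
		return (1);
	printf("user stack top: %#lx\n", usrstack);
	return (0);
}
#endif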
/*
* Each of the items is a pointer to a `const struct execsw', hence the
* double pointer here.
*/
static const struct execsw **execsw;
#ifndef _SYS_SYSPROTO_H_
struct execve_args {
char *fname;
char **argv;
char **envv;
};
#endif
/*
* execve() system call.
*
* MPSAFE
*/
int
execve(td, uap)
struct thread *td;
register struct execve_args *uap;
{
struct proc *p = td->td_proc;
struct nameidata nd, *ndp;
struct ucred *newcred = NULL, *oldcred;
struct uidinfo *euip;
register_t *stack_base;
int error, len, i;
struct image_params image_params, *imgp;
struct vattr attr;
int (*img_first)(struct image_params *);
struct pargs *oldargs = NULL, *newargs = NULL;
struct procsig *oldprocsig, *newprocsig;
#ifdef KTRACE
struct vnode *tracevp = NULL;
#endif
struct vnode *textvp = NULL;
int credential_changing;
int textset;
imgp = &image_params;
/*
* Lock the process and set the P_INEXEC flag to indicate that
* it should be left alone until we're done here. This is
* necessary to avoid race conditions - e.g. in ptrace() -
* that might allow a local user to illicitly obtain elevated
* privileges.
*/
PROC_LOCK(p);
KASSERT((p->p_flag & P_INEXEC) == 0,
("%s(): process already has P_INEXEC flag", __func__));
if (p->p_flag & P_KSES) {
if (thread_single(SINGLE_EXIT)) {
PROC_UNLOCK(p);
return (ERESTART); /* Try again later. */
}
/*
* If we get here all other threads are dead,
* so unset the associated flags and lose KSE mode.
*/
p->p_flag &= ~P_KSES;
td->td_flags &= ~TDF_UNBOUND;
thread_single_end();
}
p->p_flag |= P_INEXEC;
PROC_UNLOCK(p);
/*
* Initialize part of the common data
*/
imgp->proc = p;
imgp->uap = uap;
imgp->attr = &attr;
imgp->argc = imgp->envc = 0;
imgp->argv0 = NULL;
imgp->entry_addr = 0;
imgp->vmspace_destroyed = 0;
imgp->interpreted = 0;
imgp->interpreter_name[0] = '\0';
imgp->auxargs = NULL;
imgp->vp = NULL;
imgp->object = NULL;
imgp->firstpage = NULL;
imgp->ps_strings = 0;
imgp->auxarg_size = 0;
/*
* Allocate temporary demand zeroed space for argument and
* environment strings
*/
imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX +
PAGE_SIZE);
if (imgp->stringbase == NULL) {
error = ENOMEM;
mtx_lock(&Giant);
goto exec_fail;
}
imgp->stringp = imgp->stringbase;
imgp->stringspace = ARG_MAX;
imgp->image_header = imgp->stringbase + ARG_MAX;
/*
* Translate the file name. namei() returns a vnode pointer
* in ni_vp among other things.
*/
ndp = &nd;
NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
UIO_USERSPACE, uap->fname, td);
mtx_lock(&Giant);
interpret:
error = namei(ndp);
if (error) {
kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
ARG_MAX + PAGE_SIZE);
goto exec_fail;
}
imgp->vp = ndp->ni_vp;
imgp->fname = uap->fname;
/*
* Check file permissions (also 'opens' file)
*/
error = exec_check_permissions(imgp);
if (error)
goto exec_fail_dealloc;
if (VOP_GETVOBJECT(imgp->vp, &imgp->object) == 0)
vm_object_reference(imgp->object);
/*
* Set VV_TEXT now so no one can write to the executable while we're
* activating it.
*
* Remember if this was set before and unset it in case this is not
* actually an executable image.
*/
textset = imgp->vp->v_vflag & VV_TEXT;
imgp->vp->v_vflag |= VV_TEXT;
error = exec_map_first_page(imgp);
if (error)
goto exec_fail_dealloc;
/*
* If the current process has a special image activator it
* wants to try first, call it. For example, emulating shell
* scripts differently.
*/
error = -1;
if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
error = img_first(imgp);
/*
* Loop through the list of image activators, calling each one.
* An activator returns -1 if there is no match, 0 on success,
* and an error otherwise.
*/
for (i = 0; error == -1 && execsw[i]; ++i) {
if (execsw[i]->ex_imgact == NULL ||
execsw[i]->ex_imgact == img_first) {
continue;
}
error = (*execsw[i]->ex_imgact)(imgp);
}
if (error) {
if (error == -1) {
if (textset == 0)
imgp->vp->v_vflag &= ~VV_TEXT;
error = ENOEXEC;
}
goto exec_fail_dealloc;
}
/*
* Special interpreter operation: clean up and loop back to try to
* activate the interpreter.
*/
if (imgp->interpreted) {
exec_unmap_first_page(imgp);
/*
* VV_TEXT needs to be unset for scripts. There is a short
* period before we determine that something is a script where
* VV_TEXT will be set. The vnode lock is held over this
* entire period so nothing should illegitimately be blocked.
*/
imgp->vp->v_vflag &= ~VV_TEXT;
/* free name buffer and old vnode */
NDFREE(ndp, NDF_ONLY_PNBUF);
vput(ndp->ni_vp);
vm_object_deallocate(imgp->object);
imgp->object = NULL;
/* set new name to that of the interpreter */
NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
UIO_SYSSPACE, imgp->interpreter_name, td);
goto interpret;
}
/*
* Copy out strings (args and env) and initialize stack base
*/
if (p->p_sysent->sv_copyout_strings)
stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
else
stack_base = exec_copyout_strings(imgp);
/*
* If custom stack fixup routine present for this process
* let it do the stack setup.
* Else stuff argument count as first item on stack
*/
if (p->p_sysent->sv_fixup)
(*p->p_sysent->sv_fixup)(&stack_base, imgp);
else
suword(--stack_base, imgp->argc);
/*
* For security and other reasons, the file descriptor table cannot
* be shared after an exec.
*/
FILEDESC_LOCK(p->p_fd);
if (p->p_fd->fd_refcnt > 1) {
struct filedesc *tmp;
tmp = fdcopy(td);
FILEDESC_UNLOCK(p->p_fd);
fdfree(td);
p->p_fd = tmp;
} else
FILEDESC_UNLOCK(p->p_fd);
/*
* Malloc things before we need locks.
*/
newcred = crget();
euip = uifind(attr.va_uid);
i = imgp->endargs - imgp->stringbase;
if (ps_arg_cache_limit >= i + sizeof(struct pargs))
newargs = pargs_alloc(i);
/* close files on exec */
fdcloseexec(td);
/* Get a reference to the vnode prior to locking the proc */
VREF(ndp->ni_vp);
/*
* For security and other reasons, signal handlers cannot
* be shared after an exec. The new process gets a copy of the old
* handlers. In execsigs(), the new process will have its signals
* reset.
*/
PROC_LOCK(p);
mp_fixme("procsig needs a lock");
if (p->p_procsig->ps_refcnt > 1) {
oldprocsig = p->p_procsig;
PROC_UNLOCK(p);
MALLOC(newprocsig, struct procsig *, sizeof(struct procsig),
M_SUBPROC, M_WAITOK);
bcopy(oldprocsig, newprocsig, sizeof(*newprocsig));
newprocsig->ps_refcnt = 1;
oldprocsig->ps_refcnt--;
PROC_LOCK(p);
p->p_procsig = newprocsig;
if (p->p_sigacts == &p->p_uarea->u_sigacts)
panic("shared procsig but private sigacts?");
p->p_uarea->u_sigacts = *p->p_sigacts;
p->p_sigacts = &p->p_uarea->u_sigacts;
}
/* Stop profiling */
stopprofclock(p);
/* reset caught signals */
execsigs(p);
/* name this process - nameiexec(p, ndp) */
len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
p->p_comm[len] = 0;
/*
* mark as execed, wakeup the process that vforked (if any) and tell
* it that it now has its own resources back
*/
p->p_flag |= P_EXEC;
if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
p->p_flag &= ~P_PPWAIT;
wakeup(p->p_pptr);
}
/*
* Implement image setuid/setgid.
*
* Don't honor setuid/setgid if the filesystem prohibits it or if
* the process is being traced.
*/
oldcred = p->p_ucred;
credential_changing = 0;
credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
attr.va_uid;
credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
attr.va_gid;
if (credential_changing &&
(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
(p->p_flag & P_TRACED) == 0) {
/*
* Turn off syscall tracing for set-id programs, except for
* root. Record any set-id flags first to make sure that
* we do not regain any tracing during a possible block.
*/
setsugid(p);
#ifdef KTRACE
if (p->p_tracep && suser_cred(oldcred, PRISON_ROOT)) {
mtx_lock(&ktrace_mtx);
p->p_traceflag = 0;
tracevp = p->p_tracep;
p->p_tracep = NULL;
mtx_unlock(&ktrace_mtx);
}
#endif
/*
* Close any file descriptors 0..2 that reference procfs,
* then make sure file descriptors 0..2 are in use.
*
* setugidsafety() may call closef() and then pfind()
* which may grab the process lock.
* fdcheckstd() may call falloc() which may block to
* allocate memory, so temporarily drop the process lock.
*/
PROC_UNLOCK(p);
setugidsafety(td);
error = fdcheckstd(td);
PROC_LOCK(p);
if (error != 0)
goto done1;
/*
* Set the new credentials.
*/
crcopy(newcred, oldcred);
if (attr.va_mode & VSUID)
change_euid(newcred, euip);
if (attr.va_mode & VSGID)
change_egid(newcred, attr.va_gid);
/*
* Implement correct POSIX saved-id behavior.
*/
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
} else {
if (oldcred->cr_uid == oldcred->cr_ruid &&
oldcred->cr_gid == oldcred->cr_rgid)
p->p_flag &= ~P_SUGID;
/*
* Implement correct POSIX saved-id behavior.
*
* XXX: It's not clear that the existing behavior is
* POSIX-compliant. A number of sources indicate that the
* saved uid/gid should only be updated if the new ruid is
* not equal to the old ruid, or the new euid is not equal
* to the old euid and the new euid is not equal to the old
* ruid. The FreeBSD code always updates the saved uid/gid.
* Also, this code uses the new (replaced) euid and egid as
* the source, which may or may not be the right ones to use.
*/
if (oldcred->cr_svuid != oldcred->cr_uid ||
oldcred->cr_svgid != oldcred->cr_gid) {
crcopy(newcred, oldcred);
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
}
}
/*
* Store the vp for use in procfs. This vnode was referenced prior
* to locking the proc lock.
*/
textvp = p->p_textvp;
p->p_textvp = ndp->ni_vp;
/*
* Notify others that we exec'd, and clear the P_INEXEC flag
* as we're now a bona fide freshly-execed process.
*/
KNOTE(&p->p_klist, NOTE_EXEC);
p->p_flag &= ~P_INEXEC;
/*
* If tracing the process, trap to debugger so breakpoints
* can be set before the program executes.
*/
_STOPEVENT(p, S_EXEC, 0);
if (p->p_flag & P_TRACED)
psignal(p, SIGTRAP);
/* clear "fork but no exec" flag, as we _are_ execing */
p->p_acflag &= ~AFORK;
/* Free any previous argument cache */
oldargs = p->p_args;
p->p_args = NULL;
/* Set values passed into the program in registers. */
if (p->p_sysent->sv_setregs)
(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
(u_long)(uintptr_t)stack_base, imgp->ps_strings);
else
exec_setregs(td, imgp->entry_addr,
(u_long)(uintptr_t)stack_base, imgp->ps_strings);
/* Cache arguments if they fit inside our allowance */
if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
bcopy(imgp->stringbase, newargs->ar_args, i);
p->p_args = newargs;
newargs = NULL;
}
done1:
PROC_UNLOCK(p);
/*
* Free any resources malloc'd earlier that we didn't use.
*/
uifree(euip);
if (newcred == NULL)
crfree(oldcred);
else
crfree(newcred);
/*
* Handle deferred decrement of ref counts.
*/
if (textvp != NULL)
vrele(textvp);
if (ndp->ni_vp && error != 0)
vrele(ndp->ni_vp);
#ifdef KTRACE
if (tracevp != NULL)
vrele(tracevp);
#endif
if (oldargs != NULL)
pargs_drop(oldargs);
if (newargs != NULL)
pargs_drop(newargs);
exec_fail_dealloc:
/*
* free various allocated resources
*/
if (imgp->firstpage)
exec_unmap_first_page(imgp);
if (imgp->stringbase != NULL)
kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
ARG_MAX + PAGE_SIZE);
if (imgp->vp) {
NDFREE(ndp, NDF_ONLY_PNBUF);
vput(imgp->vp);
}
if (imgp->object)
vm_object_deallocate(imgp->object);
if (error == 0)
goto done2;
exec_fail:
/* we're done here, clear P_INEXEC */
PROC_LOCK(p);
p->p_flag &= ~P_INEXEC;
PROC_UNLOCK(p);
if (imgp->vmspace_destroyed) {
/* Sorry, no process image left to return to; exit gracefully. */
exit1(td, W_EXITCODE(0, SIGABRT));
/* NOT REACHED */
error = 0;
}
done2:
mtx_unlock(&Giant);
return (error);
}
int
exec_map_first_page(imgp)
struct image_params *imgp;
{
int rv, i;
int initial_pagein;
vm_page_t ma[VM_INITIAL_PAGEIN];
vm_object_t object;
GIANT_REQUIRED;
if (imgp->firstpage) {
exec_unmap_first_page(imgp);
}
VOP_GETVOBJECT(imgp->vp, &object);
ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
initial_pagein = VM_INITIAL_PAGEIN;
if (initial_pagein > object->size)
initial_pagein = object->size;
for (i = 1; i < initial_pagein; i++) {
if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
if ((ma[i]->flags & PG_BUSY) || ma[i]->busy)
break;
if (ma[i]->valid)
break;
vm_page_busy(ma[i]);
} else {
ma[i] = vm_page_alloc(object, i,
VM_ALLOC_NORMAL);
if (ma[i] == NULL)
break;
}
}
initial_pagein = i;
rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
ma[0] = vm_page_lookup(object, 0);
if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
(ma[0]->valid == 0)) {
if (ma[0]) {
vm_page_lock_queues();
vm_page_protect(ma[0], VM_PROT_NONE);
vm_page_free(ma[0]);
vm_page_unlock_queues();
}
return (EIO);
}
}
vm_page_lock_queues();
vm_page_wire(ma[0]);
vm_page_wakeup(ma[0]);
vm_page_unlock_queues();
pmap_qenter((vm_offset_t)imgp->image_header, ma, 1);
imgp->firstpage = ma[0];
return (0);
}
void
exec_unmap_first_page(imgp)
struct image_params *imgp;
{
GIANT_REQUIRED;
if (imgp->firstpage) {
pmap_qremove((vm_offset_t)imgp->image_header, 1);
vm_page_lock_queues();
vm_page_unwire(imgp->firstpage, 1);
vm_page_unlock_queues();
imgp->firstpage = NULL;
}
}
/*
* Destroy old address space, and allocate a new stack
* The new stack is only SGROWSIZ large because it is grown
* automatically in trap.c.
*/
int
-exec_new_vmspace(imgp, minuser, maxuser, stack_addr)
+exec_new_vmspace(imgp, sv)
struct image_params *imgp;
- vm_offset_t minuser, maxuser, stack_addr;
+ struct sysentvec *sv;
{
int error;
struct execlist *ep;
struct proc *p = imgp->proc;
struct vmspace *vmspace = p->p_vmspace;
+ vm_offset_t stack_addr;
+ vm_map_t map;
GIANT_REQUIRED;
- stack_addr = stack_addr - maxssiz;
+ stack_addr = sv->sv_usrstack - maxssiz;
imgp->vmspace_destroyed = 1;
/*
* Perform functions registered with at_exec().
*/
TAILQ_FOREACH(ep, &exec_list, next)
(*ep->function)(p);
/*
* Blow away entire process VM, if address space not shared,
* otherwise, create a new VM space so that other threads are
* not disrupted
*/
- if (vmspace->vm_refcnt == 1 &&
- vm_map_min(&vmspace->vm_map) == minuser &&
- vm_map_max(&vmspace->vm_map) == maxuser) {
+ map = &vmspace->vm_map;
+ if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
+ vm_map_max(map) == sv->sv_maxuser) {
if (vmspace->vm_shm)
shmexit(p);
- pmap_remove_pages(vmspace_pmap(vmspace), minuser, maxuser);
- vm_map_remove(&vmspace->vm_map, minuser, maxuser);
+ pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map),
+ vm_map_max(map));
+ vm_map_remove(map, vm_map_min(map), vm_map_max(map));
} else {
- vmspace_exec(p, minuser, maxuser);
+ vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
vmspace = p->p_vmspace;
+ map = &vmspace->vm_map;
}
/* Allocate a new stack */
- error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
- VM_PROT_ALL, VM_PROT_ALL, 0);
+ error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
+ sv->sv_stackprot, VM_PROT_ALL, 0);
if (error)
return (error);
#ifdef __ia64__
{
/*
* Allocate backing store. We really need something
* similar to vm_map_stack which can allow the backing
* store to grow upwards. This will do for now.
*/
vm_offset_t bsaddr;
- bsaddr = USRSTACK - 2 * maxssiz;
- error = vm_map_find(&vmspace->vm_map, 0, 0, &bsaddr,
+ bsaddr = p->p_sysent->sv_usrstack - 2 * maxssiz;
+ error = vm_map_find(map, 0, 0, &bsaddr,
regstkpages * PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0);
FIRST_THREAD_IN_PROC(p)->td_md.md_bspstore = bsaddr;
}
#endif
/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
* VM_STACK case, but they are still used to monitor the size of the
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
- vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz;
+ vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;
return (0);
}
/*
* Copy out argument and environment strings from the old process
* address space into the temporary string buffer.
*/
int
exec_extract_strings(imgp)
struct image_params *imgp;
{
char **argv, **envv;
char *argp, *envp;
int error;
size_t length;
/*
* extract arguments first
*/
argv = imgp->uap->argv;
if (argv) {
argp = (caddr_t)(intptr_t)fuword(argv);
if (argp == (caddr_t)-1)
return (EFAULT);
if (argp)
argv++;
if (imgp->argv0)
argp = imgp->argv0;
if (argp) {
do {
if (argp == (caddr_t)-1)
return (EFAULT);
if ((error = copyinstr(argp, imgp->stringp,
imgp->stringspace, &length))) {
if (error == ENAMETOOLONG)
return (E2BIG);
return (error);
}
imgp->stringspace -= length;
imgp->stringp += length;
imgp->argc++;
} while ((argp = (caddr_t)(intptr_t)fuword(argv++)));
}
}
imgp->endargs = imgp->stringp;
/*
* extract environment strings
*/
envv = imgp->uap->envv;
if (envv) {
while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
if (envp == (caddr_t)-1)
return (EFAULT);
if ((error = copyinstr(envp, imgp->stringp,
imgp->stringspace, &length))) {
if (error == ENAMETOOLONG)
return (E2BIG);
return (error);
}
imgp->stringspace -= length;
imgp->stringp += length;
imgp->envc++;
}
}
return (0);
}
/*
* Copy strings out to the new process address space, constructing
* new arg and env vector tables. Return a pointer to the base
* so that it can be used as the initial stack pointer.
*/
register_t *
exec_copyout_strings(imgp)
struct image_params *imgp;
{
int argc, envc;
char **vectp;
char *stringp, *destp;
register_t *stack_base;
struct ps_strings *arginfo;
struct proc *p;
int szsigcode;
/*
* Calculate string base and vector table pointers.
* Also deal with signal trampoline code for this exec type.
*/
p = imgp->proc;
szsigcode = 0;
- arginfo = (struct ps_strings *)PS_STRINGS;
+ arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
if (p->p_sysent->sv_szsigcode != NULL)
szsigcode = *(p->p_sysent->sv_szsigcode);
destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
/*
* install sigcode
*/
if (szsigcode)
copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
szsigcode), szsigcode);
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
*/
if (imgp->auxargs) {
/*
* 'AT_COUNT*2' is the size of the ELF Auxargs data. This is for
* backward compatibility.
*/
imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
(AT_COUNT * 2);
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets, and imgp->auxarg_size is room
* for the arguments of the runtime loader.
*/
vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 +
imgp->auxarg_size) * sizeof(char *));
} else
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
vectp = (char **)(destp - (imgp->argc + imgp->envc + 2) *
sizeof(char *));
/*
* vectp also becomes our initial stack base
*/
stack_base = (register_t *)vectp;
stringp = imgp->stringbase;
argc = imgp->argc;
envc = imgp->envc;
/*
* Copy out strings - arguments and environment.
*/
copyout(stringp, destp, ARG_MAX - imgp->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
*/
suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
suword(&arginfo->ps_nargvstr, argc);
/*
* Fill in argument portion of vector table.
*/
for (; argc > 0; --argc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* a null vector table pointer separates the argp's from the envp's */
suword(vectp++, 0);
suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
suword(&arginfo->ps_nenvstr, envc);
/*
* Fill in environment portion of vector table.
*/
for (; envc > 0; --envc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* end of vector table is a null pointer */
suword(vectp, 0);
return (stack_base);
}
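/*
 * Editor's note: illustration only, not part of this file or commit.  After
 * exec_copyout_strings() the top of the new user stack looks roughly like
 * this (highest addresses first; exact spacing is ABI-dependent):
 *
 *	sv_psstrings:	struct ps_strings (argv/envv pointers and counts)
 *			signal trampoline (sv_szsigcode bytes of sv_sigcode)
 *			SPARE_USRSPACE
 *	destp:		argument and environment strings
 *	stack_base:	argv[0..argc-1], NULL, envp[0..envc-1], NULL
 *			(plus auxarg space when imgp->auxargs is set)
 *
 * The caller then pushes imgp->argc below stack_base, either directly or
 * via the sysentvec's sv_fixup hook.
 */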
/*
* Check permissions of file to execute.
* Called with imgp->vp locked.
* Return 0 for success or error code on failure.
*/
int
exec_check_permissions(imgp)
struct image_params *imgp;
{
struct vnode *vp = imgp->vp;
struct vattr *attr = imgp->attr;
struct thread *td;
int error;
td = curthread; /* XXXKSE */
#ifdef MAC
error = mac_check_vnode_exec(td->td_ucred, imgp->vp);
if (error)
return (error);
#endif
/* Get file attributes */
error = VOP_GETATTR(vp, attr, td->td_ucred, td);
if (error)
return (error);
/*
* 1) Check if file execution is disabled for the filesystem that this
* file resides on.
* 2) Ensure that at least one execute bit is on - otherwise root
* will always succeed, and we don't want that to happen unless the
* file really is executable.
* 3) Ensure that the file is a regular file.
*/
if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
((attr->va_mode & 0111) == 0) ||
(attr->va_type != VREG))
return (EACCES);
/*
* Zero length files can't be exec'd
*/
if (attr->va_size == 0)
return (ENOEXEC);
/*
* Check for execute permission to file based on current credentials.
*/
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
if (error)
return (error);
/*
* Check number of open-for-writes on the file and deny execution
* if there are any.
*/
if (vp->v_writecount)
return (ETXTBSY);
/*
* Call filesystem specific open routine (which does nothing in the
* general case).
*/
error = VOP_OPEN(vp, FREAD, td->td_ucred, td);
return (error);
}
/*
* Exec handler registration
*/
int
exec_register(execsw_arg)
const struct execsw *execsw_arg;
{
const struct execsw **es, **xs, **newexecsw;
int count = 2; /* New slot and trailing NULL */
if (execsw)
for (es = execsw; *es; es++)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
if (newexecsw == NULL)
return (ENOMEM);
xs = newexecsw;
if (execsw)
for (es = execsw; *es; es++)
*xs++ = *es;
*xs++ = execsw_arg;
*xs = NULL;
if (execsw)
free(execsw, M_TEMP);
execsw = newexecsw;
return (0);
}
int
exec_unregister(execsw_arg)
const struct execsw *execsw_arg;
{
const struct execsw **es, **xs, **newexecsw;
int count = 1;
if (execsw == NULL)
panic("unregister with no handlers left?\n");
for (es = execsw; *es; es++) {
if (*es == execsw_arg)
break;
}
if (*es == NULL)
return (ENOENT);
for (es = execsw; *es; es++)
if (*es != execsw_arg)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
if (newexecsw == NULL)
return (ENOMEM);
xs = newexecsw;
for (es = execsw; *es; es++)
if (*es != execsw_arg)
*xs++ = *es;
*xs = NULL;
if (execsw)
free(execsw, M_TEMP);
execsw = newexecsw;
return (0);
}
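/*
 * A minimal registration sketch (hedged): image activators normally do not
 * call exec_register() by hand; they declare a struct execsw and hook it up
 * through the EXEC_SET() module glue (assumed here to come from
 * <sys/sysent.h>), roughly:
 *
 *	static struct execsw example_execsw = {
 *		exec_example_imgact,		(hypothetical activator)
 *		"example"
 *	};
 *	EXEC_SET(example, example_execsw);
 */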
int
at_exec(function)
execlist_fn function;
{
struct execlist *ep;
#ifdef INVARIANTS
/* Be noisy if the programmer has lost track of things */
if (rm_at_exec(function))
printf("WARNING: exec callout entry (%p) already present\n",
function);
#endif
ep = malloc(sizeof(*ep), M_ATEXEC, M_NOWAIT);
if (ep == NULL)
return (ENOMEM);
ep->function = function;
TAILQ_INSERT_TAIL(&exec_list, ep, next);
return (0);
}
/*
* Scan the exec callout list for the given item and remove it.
* Returns the number of items removed (0 or 1)
*/
int
rm_at_exec(function)
execlist_fn function;
{
struct execlist *ep;
TAILQ_FOREACH(ep, &exec_list, next) {
if (ep->function == function) {
TAILQ_REMOVE(&exec_list, ep, next);
free(ep, M_ATEXEC);
return (1);
}
}
return (0);
}
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c (revision 103766)
+++ head/sys/kern/kern_exit.c (revision 103767)
@@ -1,883 +1,883 @@
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
* $FreeBSD$
*/
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/tty.h>
#include <sys/wait.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sx.h>
#include <sys/ptrace.h>
#include <sys/acct.h> /* for acct_process() function prototype */
#include <sys/filedesc.h>
#include <sys/shm.h>
#include <sys/sem.h>
#include <sys/jail.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <sys/user.h>
/* Required to be non-static for SysVR4 emulator */
MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status");
static MALLOC_DEFINE(M_ATEXIT, "atexit", "atexit callback");
static int wait1(struct thread *, struct wait_args *, int);
/*
* callout list for things to do at exit time
*/
struct exitlist {
exitlist_fn function;
TAILQ_ENTRY(exitlist) next;
};
TAILQ_HEAD(exit_list_head, exitlist);
static struct exit_list_head exit_list = TAILQ_HEAD_INITIALIZER(exit_list);
/*
* exit --
* Death of process.
*
* MPSAFE
*/
void
sys_exit(td, uap)
struct thread *td;
struct sys_exit_args /* {
int rval;
} */ *uap;
{
mtx_lock(&Giant);
exit1(td, W_EXITCODE(uap->rval, 0));
/* NOTREACHED */
}
/*
* Exit: deallocate address space and other resources, change proc state
* to zombie, and unlink proc from allproc and parent's lists. Save exit
* status and rusage for wait(). Check for child processes and orphan them.
*/
void
exit1(td, rv)
register struct thread *td;
int rv;
{
struct exitlist *ep;
struct proc *p, *nq, *q;
struct tty *tp;
struct vnode *ttyvp;
register struct vmspace *vm;
struct vnode *vtmp;
#ifdef KTRACE
struct vnode *tracevp;
#endif
GIANT_REQUIRED;
p = td->td_proc;
if (p == initproc) {
printf("init died (signal %d, exit %d)\n",
WTERMSIG(rv), WEXITSTATUS(rv));
panic("Going nowhere without my init!");
}
/*
* XXXXKSE: MUST abort all other threads before proceeding past here.
*/
PROC_LOCK(p);
if (p->p_flag & P_KSES) {
/*
* First check if some other thread got here before us;
* if so, act appropriately (exit or suspend).
*/
thread_suspend_check(0);
/*
* Here is a trick:
* We need to free up our KSE to process other threads
* so that we can safely set the UNBOUND flag
* (whether or not we have a mailbox) as we are NEVER
* going to return to the user.
* The flag will not be set yet if we are exiting
* because of a signal, pagefault, or similar
* (or even an exit(2) from the UTS).
*/
td->td_flags |= TDF_UNBOUND;
/*
* Kill off the other threads. This requires
* some cooperation from other parts of the kernel,
* so it may not be instant.
* With this state set:
* Any thread entering the kernel from userspace will
* thread_exit() in trap(). Any thread attempting to
* sleep will return immediately
* with EINTR or EWOULDBLOCK, which will hopefully force them
* to back out to userland, freeing resources as they go, and
* anything attempting to return to userland will thread_exit()
* from userret(). thread_exit() will unsuspend us
* when the last other thread exits.
*/
if (thread_single(SINGLE_EXIT)) {
panic ("Exit: Single threading fouled up");
}
/*
* All other activity in this process is now stopped.
* Remove excess KSEs and KSEGRPS. XXXKSE (when we have them)
* ...
* Turn off threading support.
*/
p->p_flag &= ~P_KSES;
td->td_flags &= ~TDF_UNBOUND;
thread_single_end(); /* Don't need this any more. */
}
/*
* With this state set:
* Any thread entering the kernel from userspace will thread_exit()
* in trap(). Any thread attempting to sleep will return immediately
* with EINTR or EWOULDBLOCK, which will hopefully force them
* to back out to userland, freeing resources as they go, and
* anything attempting to return to userland will thread_exit()
* from userret(). thread_exit() will do a wakeup on p->p_numthreads
* if it transitions to 1.
*/
p->p_flag |= P_WEXIT;
PROC_UNLOCK(p);
/* Are we a task leader? */
PROC_LOCK(p);
if (p == p->p_leader) {
q = p->p_peers;
while (q != NULL) {
PROC_LOCK(q);
psignal(q, SIGKILL);
PROC_UNLOCK(q);
q = q->p_peers;
}
while (p->p_peers)
msleep(p, &p->p_mtx, PWAIT, "exit1", 0);
}
PROC_UNLOCK(p);
#ifdef PGINPROF
vmsizmon();
#endif
STOPEVENT(p, S_EXIT, rv);
wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */
/*
* Check if any loadable modules need anything done at process exit.
* e.g. SYSV IPC stuff
* XXX what if one of these generates an error?
*/
TAILQ_FOREACH(ep, &exit_list, next)
(*ep->function)(p);
stopprofclock(p);
MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
M_ZOMBIE, M_WAITOK);
/*
* If parent is waiting for us to exit or exec,
* P_PPWAIT is set; we will wakeup the parent below.
*/
PROC_LOCK(p);
p->p_flag &= ~(P_TRACED | P_PPWAIT);
SIGEMPTYSET(p->p_siglist);
PROC_UNLOCK(p);
if (timevalisset(&p->p_realtimer.it_value))
callout_stop(&p->p_itcallout);
/*
* Reset any sigio structures pointing to us as a result of
* F_SETOWN with our pid.
*/
funsetownlst(&p->p_sigiolst);
/*
* Close open files and release open-file table.
* This may block!
*/
fdfree(td); /* XXXKSE *//* may not be the one in proc */
/*
* Remove ourself from our leader's peer list and wake our leader.
*/
PROC_LOCK(p->p_leader);
if (p->p_leader->p_peers) {
q = p->p_leader;
while (q->p_peers != p)
q = q->p_peers;
q->p_peers = p->p_peers;
wakeup(p->p_leader);
}
PROC_UNLOCK(p->p_leader);
/* The next two chunks should probably be moved to vmspace_exit. */
vm = p->p_vmspace;
/*
* Release user portion of address space.
* This releases references to vnodes,
* which could cause I/O if the file has been unlinked.
* Need to do this early enough that we can still sleep.
* Can't free the entire vmspace as the kernel stack
* may be mapped within that space also.
*/
if (--vm->vm_refcnt == 0) {
if (vm->vm_shm)
shmexit(p);
- pmap_remove_pages(vmspace_pmap(vm), VM_MIN_ADDRESS,
- VM_MAXUSER_ADDRESS);
- (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS,
- VM_MAXUSER_ADDRESS);
+ pmap_remove_pages(vmspace_pmap(vm), vm_map_min(&vm->vm_map),
+ vm_map_max(&vm->vm_map));
+ (void) vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
+ vm_map_max(&vm->vm_map));
vm->vm_freer = p;
}
sx_xlock(&proctree_lock);
if (SESS_LEADER(p)) {
register struct session *sp;
sp = p->p_session;
if (sp->s_ttyvp) {
/*
* Controlling process.
* Signal foreground pgrp,
* drain controlling terminal
* and revoke access to controlling terminal.
*/
if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) {
tp = sp->s_ttyp;
if (sp->s_ttyp->t_pgrp) {
PGRP_LOCK(sp->s_ttyp->t_pgrp);
pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
PGRP_UNLOCK(sp->s_ttyp->t_pgrp);
}
/* XXX tp should be locked. */
sx_xunlock(&proctree_lock);
(void) ttywait(tp);
sx_xlock(&proctree_lock);
/*
* The tty could have been revoked
* if we blocked.
*/
if (sp->s_ttyvp) {
ttyvp = sp->s_ttyvp;
SESS_LOCK(p->p_session);
sp->s_ttyvp = NULL;
SESS_UNLOCK(p->p_session);
sx_xunlock(&proctree_lock);
VOP_REVOKE(ttyvp, REVOKEALL);
vrele(ttyvp);
sx_xlock(&proctree_lock);
}
}
if (sp->s_ttyvp) {
ttyvp = sp->s_ttyvp;
SESS_LOCK(p->p_session);
sp->s_ttyvp = NULL;
SESS_UNLOCK(p->p_session);
vrele(ttyvp);
}
/*
* s_ttyp is not zero'd; we use this to indicate
* that the session once had a controlling terminal.
* (for logging and informational purposes)
*/
}
SESS_LOCK(p->p_session);
sp->s_leader = NULL;
SESS_UNLOCK(p->p_session);
}
fixjobc(p, p->p_pgrp, 0);
sx_xunlock(&proctree_lock);
(void)acct_process(td);
#ifdef KTRACE
/*
* release trace file
*/
PROC_LOCK(p);
mtx_lock(&ktrace_mtx);
p->p_traceflag = 0; /* don't trace the vrele() */
tracevp = p->p_tracep;
p->p_tracep = NULL;
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p);
if (tracevp != NULL)
vrele(tracevp);
#endif
/*
* Release reference to text vnode
*/
if ((vtmp = p->p_textvp) != NULL) {
p->p_textvp = NULL;
vrele(vtmp);
}
/*
* Release our limits structure.
*/
mtx_assert(&Giant, MA_OWNED);
if (--p->p_limit->p_refcnt == 0) {
FREE(p->p_limit, M_SUBPROC);
p->p_limit = NULL;
}
/*
* Release this thread's reference to the ucred. The actual proc
* reference will stay around until the proc is harvested by
* wait(). At this point the ucred is immutable (no other threads
* from this proc are around that can change it) so we leave the
* per-thread ucred pointer intact in case it is needed although
* in theory nothing should be using it at this point.
*/
crfree(td->td_ucred);
/*
* Remove proc from allproc queue and pidhash chain.
* Place onto zombproc. Unlink from parent's child list.
*/
sx_xlock(&allproc_lock);
LIST_REMOVE(p, p_list);
LIST_INSERT_HEAD(&zombproc, p, p_list);
LIST_REMOVE(p, p_hash);
sx_xunlock(&allproc_lock);
sx_xlock(&proctree_lock);
q = LIST_FIRST(&p->p_children);
if (q != NULL) /* only need this if any child is S_ZOMB */
wakeup(initproc);
for (; q != NULL; q = nq) {
nq = LIST_NEXT(q, p_sibling);
PROC_LOCK(q);
proc_reparent(q, initproc);
q->p_sigparent = SIGCHLD;
/*
* Traced processes are killed
* since their existence means someone is screwing up.
*/
if (q->p_flag & P_TRACED) {
q->p_flag &= ~P_TRACED;
psignal(q, SIGKILL);
}
PROC_UNLOCK(q);
}
/*
* Save exit status and final rusage info, adding in child rusage
* info and self times.
*/
PROC_LOCK(p);
p->p_xstat = rv;
*p->p_ru = p->p_stats->p_ru;
mtx_lock_spin(&sched_lock);
calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL);
mtx_unlock_spin(&sched_lock);
ruadd(p->p_ru, &p->p_stats->p_cru);
/*
* Notify interested parties of our demise.
*/
KNOTE(&p->p_klist, NOTE_EXIT);
/*
* Notify parent that we're gone. If parent has the PS_NOCLDWAIT
* flag set, or if the handler is set to SIG_IGN, notify process
* 1 instead (and hope it will handle this situation).
*/
PROC_LOCK(p->p_pptr);
if (p->p_pptr->p_procsig->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
struct proc *pp;
pp = p->p_pptr;
PROC_UNLOCK(pp);
proc_reparent(p, initproc);
PROC_LOCK(p->p_pptr);
/*
* If this was the last child of our parent, notify
* the parent so that, if it was wait(2)ing, it will
* continue.
*/
if (LIST_EMPTY(&pp->p_children))
wakeup(pp);
}
if (p->p_sigparent && p->p_pptr != initproc)
psignal(p->p_pptr, p->p_sigparent);
else
psignal(p->p_pptr, SIGCHLD);
PROC_UNLOCK(p->p_pptr);
/*
* If this is a kthread, then wakeup anyone waiting for it to exit.
*/
if (p->p_flag & P_KTHREAD)
wakeup(p);
PROC_UNLOCK(p);
/*
* Finally, call machine-dependent code to release the remaining
* resources including address space, the kernel stack and pcb.
* The address space is released by "vmspace_exitfree(p)" in
* vm_waitproc().
*/
cpu_exit(td);
PROC_LOCK(p);
PROC_LOCK(p->p_pptr);
sx_xunlock(&proctree_lock);
mtx_lock_spin(&sched_lock);
while (mtx_owned(&Giant))
mtx_unlock(&Giant);
/*
* We have to wait until after releasing all locks before
* changing p_state. If we block on a mutex then we will be
* back at SRUN when we resume and our parent will never
* harvest us.
*/
p->p_state = PRS_ZOMBIE;
wakeup(p->p_pptr);
PROC_UNLOCK(p->p_pptr);
cnt.v_swtch++;
binuptime(PCPU_PTR(switchtime));
PCPU_SET(switchticks, ticks);
cpu_sched_exit(td); /* XXXKSE check if this should be in thread_exit */
/*
* Make sure this thread is discarded from the zombie.
* This will also release this thread's reference to the ucred.
*/
thread_exit();
panic("exit1");
}
#ifdef COMPAT_43
/*
* MPSAFE. The dirty work is handled by wait1().
*/
int
owait(td, uap)
struct thread *td;
register struct owait_args /* {
int dummy;
} */ *uap;
{
struct wait_args w;
w.options = 0;
w.rusage = NULL;
w.pid = WAIT_ANY;
w.status = NULL;
return (wait1(td, &w, 1));
}
#endif /* COMPAT_43 */
/*
* MPSAFE. The dirty work is handled by wait1().
*/
int
wait4(td, uap)
struct thread *td;
struct wait_args *uap;
{
return (wait1(td, uap, 0));
}
/*
* MPSAFE
*/
static int
wait1(td, uap, compat)
register struct thread *td;
register struct wait_args /* {
int pid;
int *status;
int options;
struct rusage *rusage;
} */ *uap;
int compat;
{
struct rusage ru;
register int nfound;
register struct proc *p, *q, *t;
int status, error;
struct kse *ke;
struct ksegrp *kg;
q = td->td_proc;
if (uap->pid == 0) {
PROC_LOCK(q);
uap->pid = -q->p_pgid;
PROC_UNLOCK(q);
}
if (uap->options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE))
return (EINVAL);
mtx_lock(&Giant);
loop:
nfound = 0;
sx_xlock(&proctree_lock);
LIST_FOREACH(p, &q->p_children, p_sibling) {
PROC_LOCK(p);
if (uap->pid != WAIT_ANY &&
p->p_pid != uap->pid && p->p_pgid != -uap->pid) {
PROC_UNLOCK(p);
continue;
}
/*
* This special case handles a kthread spawned by linux_clone
* (see linux_misc.c). The linux_wait4 and linux_waitpid
* functions need to be able to distinguish between waiting
* on a process and waiting on a thread. It is a thread if
* p_sigparent is not SIGCHLD, and the WLINUXCLONE option
* signifies we want to wait for threads and not processes.
*/
if ((p->p_sigparent != SIGCHLD) ^
((uap->options & WLINUXCLONE) != 0)) {
PROC_UNLOCK(p);
continue;
}
nfound++;
if (p->p_state == PRS_ZOMBIE) {
/*
* Charge the child's scheduling CPU usage to the parent.
* XXXKSE assume only one thread & kse & ksegrp;
* keep estcpu in each ksegrp,
* so charge it to the ksegrp that did the wait.
* Since process estcpu is the sum of all ksegrps,
* this is strictly as expected.
* Assume that the child process aggregated all
* the estcpu into the 'built-in' ksegrp.
* XXXKSE
*/
if (curthread->td_proc->p_pid != 1) {
mtx_lock_spin(&sched_lock);
curthread->td_ksegrp->kg_estcpu =
ESTCPULIM(curthread->td_ksegrp->kg_estcpu +
FIRST_KSEGRP_IN_PROC(p)->kg_estcpu);
mtx_unlock_spin(&sched_lock);
}
td->td_retval[0] = p->p_pid;
#ifdef COMPAT_43
if (compat)
td->td_retval[1] = p->p_xstat;
else
#endif
if (uap->status) {
status = p->p_xstat; /* convert to int */
PROC_UNLOCK(p);
if ((error = copyout(&status,
uap->status, sizeof(status)))) {
sx_xunlock(&proctree_lock);
mtx_unlock(&Giant);
return (error);
}
PROC_LOCK(p);
}
if (uap->rusage) {
bcopy(p->p_ru, &ru, sizeof(ru));
PROC_UNLOCK(p);
if ((error = copyout(&ru,
uap->rusage, sizeof (struct rusage)))) {
sx_xunlock(&proctree_lock);
mtx_unlock(&Giant);
return (error);
}
} else
PROC_UNLOCK(p);
/*
* If we got the child via a ptrace 'attach',
* we need to give it back to the old parent.
*/
if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
PROC_LOCK(p);
p->p_oppid = 0;
proc_reparent(p, t);
PROC_UNLOCK(p);
psignal(t, SIGCHLD);
wakeup(t);
PROC_UNLOCK(t);
sx_xunlock(&proctree_lock);
mtx_unlock(&Giant);
return (0);
}
/*
* Remove other references to this process to ensure
* we have an exclusive reference.
*/
leavepgrp(p);
sx_xlock(&allproc_lock);
LIST_REMOVE(p, p_list); /* off zombproc */
sx_xunlock(&allproc_lock);
LIST_REMOVE(p, p_sibling);
sx_xunlock(&proctree_lock);
/*
* As a side effect of this lock, we know that
* all other writes to this proc are visible now, so
* no more locking is needed for p.
*/
PROC_LOCK(p);
p->p_xstat = 0; /* XXX: why? */
PROC_UNLOCK(p);
PROC_LOCK(q);
ruadd(&q->p_stats->p_cru, p->p_ru);
PROC_UNLOCK(q);
FREE(p->p_ru, M_ZOMBIE);
p->p_ru = NULL;
/*
* Decrement the count of procs running with this uid.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
/*
* Free up credentials.
*/
crfree(p->p_ucred);
p->p_ucred = NULL; /* XXX: why? */
/*
* Remove unused arguments
*/
pargs_drop(p->p_args);
p->p_args = NULL;
if (--p->p_procsig->ps_refcnt == 0) {
if (p->p_sigacts != &p->p_uarea->u_sigacts)
FREE(p->p_sigacts, M_SUBPROC);
FREE(p->p_procsig, M_SUBPROC);
p->p_procsig = NULL;
}
/*
* There should only be one KSE/KSEGRP but
* do it right anyhow.
*/
FOREACH_KSEGRP_IN_PROC(p, kg) {
FOREACH_KSE_IN_GROUP(kg, ke) {
/* Free the KSE spare thread. */
if (ke->ke_tdspare != NULL) {
thread_free(ke->ke_tdspare);
ke->ke_tdspare = NULL;
}
}
}
thread_reap(); /* check for zombie threads */
/*
* Give vm and machine-dependent layer a chance
* to free anything that cpu_exit couldn't
* release while still running in process context.
*/
vm_waitproc(p);
mtx_destroy(&p->p_mtx);
KASSERT(FIRST_THREAD_IN_PROC(p),
("wait1: no residual thread!"));
uma_zfree(proc_zone, p);
sx_xlock(&allproc_lock);
nprocs--;
sx_xunlock(&allproc_lock);
mtx_unlock(&Giant);
return (0);
}
if (P_SHOULDSTOP(p) && ((p->p_flag & P_WAITED) == 0) &&
(p->p_flag & P_TRACED || uap->options & WUNTRACED)) {
p->p_flag |= P_WAITED;
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
#ifdef COMPAT_43
if (compat) {
td->td_retval[1] = W_STOPCODE(p->p_xstat);
PROC_UNLOCK(p);
error = 0;
} else
#endif
if (uap->status) {
status = W_STOPCODE(p->p_xstat);
PROC_UNLOCK(p);
error = copyout(&status,
uap->status, sizeof(status));
} else {
PROC_UNLOCK(p);
error = 0;
}
mtx_unlock(&Giant);
return (error);
}
if (uap->options & WCONTINUED && (p->p_flag & P_CONTINUED)) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
p->p_flag &= ~P_CONTINUED;
PROC_UNLOCK(p);
if (uap->status) {
status = SIGCONT;
error = copyout(&status,
uap->status, sizeof(status));
} else
error = 0;
mtx_unlock(&Giant);
return (error);
}
PROC_UNLOCK(p);
}
if (nfound == 0) {
sx_xunlock(&proctree_lock);
mtx_unlock(&Giant);
return (ECHILD);
}
if (uap->options & WNOHANG) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = 0;
mtx_unlock(&Giant);
return (0);
}
PROC_LOCK(q);
sx_xunlock(&proctree_lock);
error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
PROC_UNLOCK(q);
if (error) {
mtx_unlock(&Giant);
return (error);
}
goto loop;
}
/*
* Make process 'parent' the new parent of process 'child'.
* Must be called with an exclusive hold of proctree lock.
*/
void
proc_reparent(child, parent)
register struct proc *child;
register struct proc *parent;
{
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(child, MA_OWNED);
if (child->p_pptr == parent)
return;
LIST_REMOVE(child, p_sibling);
LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
}
/*
* The next two functions are to handle adding/deleting items on the
* exit callout list
*
* at_exit():
* Take the arguments given and put them onto the exit callout list;
* first, however, make sure that it's not already there.
* Returns 0 on success. (A usage sketch follows rm_at_exit() below.)
*/
int
at_exit(function)
exitlist_fn function;
{
struct exitlist *ep;
#ifdef INVARIANTS
/* Be noisy if the programmer has lost track of things */
if (rm_at_exit(function))
printf("WARNING: exit callout entry (%p) already present\n",
function);
#endif
ep = malloc(sizeof(*ep), M_ATEXIT, M_NOWAIT);
if (ep == NULL)
return (ENOMEM);
ep->function = function;
TAILQ_INSERT_TAIL(&exit_list, ep, next);
return (0);
}
/*
* Scan the exit callout list for the given item and remove it.
* Returns the number of items removed (0 or 1)
*/
int
rm_at_exit(function)
exitlist_fn function;
{
struct exitlist *ep;
TAILQ_FOREACH(ep, &exit_list, next) {
if (ep->function == function) {
TAILQ_REMOVE(&exit_list, ep, next);
free(ep, M_ATEXIT);
return (1);
}
}
return (0);
}
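/*
 * A minimal usage sketch (hypothetical module, hedged): a subsystem that
 * keeps per-process state registers a cleanup callback once and removes it
 * again before it goes away, roughly:
 *
 *	static void example_proc_exit(struct proc *p);
 *
 *	error = at_exit(example_proc_exit);	(0, or ENOMEM)
 *	...
 *	(void)rm_at_exit(example_proc_exit);	(1 if removed, 0 if absent)
 */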
Index: head/sys/kern/kern_resource.c
===================================================================
--- head/sys/kern/kern_resource.c (revision 103766)
+++ head/sys/kern/kern_resource.c (revision 103767)
@@ -1,1045 +1,1048 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
* $FreeBSD$
*/
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sx.h>
+#include <sys/sysent.h>
#include <sys/time.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
static int donice(struct thread *td, struct proc *chgp, int n);
static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
static struct mtx uihashtbl_mtx;
static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
static u_long uihash; /* size of hash table - 1 */
static struct uidinfo *uilookup(uid_t uid);
/*
* Resource controls and accounting.
*/
#ifndef _SYS_SYSPROTO_H_
struct getpriority_args {
int which;
int who;
};
#endif
/*
* MPSAFE
*/
int
getpriority(td, uap)
struct thread *td;
register struct getpriority_args *uap;
{
struct proc *p;
int low = PRIO_MAX + 1;
int error = 0;
struct ksegrp *kg;
mtx_lock(&Giant);
switch (uap->which) {
case PRIO_PROCESS:
if (uap->who == 0)
low = td->td_ksegrp->kg_nice;
else {
p = pfind(uap->who);
if (p == NULL)
break;
if (p_cansee(td, p) == 0) {
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_nice < low)
low = kg->kg_nice;
}
}
PROC_UNLOCK(p);
}
break;
case PRIO_PGRP: {
register struct pgrp *pg;
sx_slock(&proctree_lock);
if (uap->who == 0) {
pg = td->td_proc->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
break;
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (!p_cansee(td, p)) {
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_nice < low)
low = kg->kg_nice;
}
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pg);
break;
}
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
PROC_LOCK(p);
if (!p_cansee(td, p) &&
p->p_ucred->cr_uid == uap->who) {
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_nice < low)
low = kg->kg_nice;
}
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
break;
default:
error = EINVAL;
break;
}
if (low == PRIO_MAX + 1 && error == 0)
error = ESRCH;
td->td_retval[0] = low;
mtx_unlock(&Giant);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setpriority_args {
int which;
int who;
int prio;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
setpriority(td, uap)
struct thread *td;
register struct setpriority_args *uap;
{
struct proc *curp = td->td_proc;
register struct proc *p;
int found = 0, error = 0;
mtx_lock(&Giant);
switch (uap->which) {
case PRIO_PROCESS:
if (uap->who == 0) {
PROC_LOCK(curp);
error = donice(td, curp, uap->prio);
PROC_UNLOCK(curp);
} else {
p = pfind(uap->who);
if (p == 0)
break;
if (p_cansee(td, p) == 0)
error = donice(td, p, uap->prio);
PROC_UNLOCK(p);
}
found++;
break;
case PRIO_PGRP: {
register struct pgrp *pg;
sx_slock(&proctree_lock);
if (uap->who == 0) {
pg = curp->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
break;
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (!p_cansee(td, p)) {
error = donice(td, p, uap->prio);
found++;
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pg);
break;
}
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_ucred->cr_uid == uap->who &&
!p_cansee(td, p)) {
error = donice(td, p, uap->prio);
found++;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
break;
default:
error = EINVAL;
break;
}
if (found == 0 && error == 0)
error = ESRCH;
mtx_unlock(&Giant);
return (error);
}
/*
* Set "nice" for a process. Doesn't really understand threaded processes well
* but does try. Has the unfortunate side effect of making all the NICE
* values for a process's ksegrps the same. This suggests that
* NICE values should be stored as a process nice and deltas for the ksegrps
* (but not yet).
*/
static int
donice(struct thread *td, struct proc *p, int n)
{
int error;
int low = PRIO_MAX + 1;
struct ksegrp *kg;
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((error = p_cansched(td, p)))
return (error);
if (n > PRIO_MAX)
n = PRIO_MAX;
if (n < PRIO_MIN)
n = PRIO_MIN;
/*
* Without privilege, only allow renicing down to the lowest nice
* already in the process; e.g. nices of 4,3,2 allow nice to 3 but not to 1.
*/
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (kg->kg_nice < low)
low = kg->kg_nice;
}
if (n < low && suser(td))
return (EACCES);
FOREACH_KSEGRP_IN_PROC(p, kg) {
kg->kg_nice = n;
(void)resetpriority(kg);
}
return (0);
}
/* rtprio system call */
#ifndef _SYS_SYSPROTO_H_
struct rtprio_args {
int function;
pid_t pid;
struct rtprio *rtp;
};
#endif
/*
* Set realtime priority
*/
/*
* MPSAFE
*/
/* ARGSUSED */
int
rtprio(td, uap)
struct thread *td;
register struct rtprio_args *uap;
{
struct proc *curp = td->td_proc;
register struct proc *p;
struct rtprio rtp;
int error, cierror = 0;
/* Perform copyin before acquiring locks if needed. */
if (uap->function == RTP_SET)
cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
if (uap->pid == 0) {
p = curp;
PROC_LOCK(p);
} else {
p = pfind(uap->pid);
if (p == NULL)
return (ESRCH);
}
switch (uap->function) {
case RTP_LOOKUP:
if ((error = p_cansee(td, p)))
break;
mtx_lock_spin(&sched_lock);
pri_to_rtp(FIRST_KSEGRP_IN_PROC(p), &rtp);
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
case RTP_SET:
if ((error = p_cansched(td, p)) || (error = cierror))
break;
/* disallow setting rtprio in most cases if not superuser */
if (suser(td) != 0) {
/* can't set someone else's */
if (uap->pid) {
error = EPERM;
break;
}
/* can't set realtime priority */
/*
* Realtime priority has to be restricted for reasons which should be
* obvious. However, for idle priority, there is a potential for
* system deadlock if an idleprio process gains a lock on a resource
* that other processes need (and the idleprio process can't run
* due to a CPU-bound normal process). Fix me! XXX
*/
#if 0
if (RTP_PRIO_IS_REALTIME(rtp.type))
#endif
if (rtp.type != RTP_PRIO_NORMAL) {
error = EPERM;
break;
}
}
mtx_lock_spin(&sched_lock);
error = rtp_to_pri(&rtp, FIRST_KSEGRP_IN_PROC(p));
mtx_unlock_spin(&sched_lock);
break;
default:
error = EINVAL;
break;
}
PROC_UNLOCK(p);
return (error);
}
int
rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg)
{
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
switch (RTP_PRIO_BASE(rtp->type)) {
case RTP_PRIO_REALTIME:
kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio;
break;
case RTP_PRIO_NORMAL:
kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio;
break;
case RTP_PRIO_IDLE:
kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio;
break;
default:
return (EINVAL);
}
kg->kg_pri_class = rtp->type;
if (curthread->td_ksegrp == kg) {
curthread->td_base_pri = kg->kg_user_pri;
curthread->td_priority = kg->kg_user_pri; /* XXX dubious */
}
return (0);
}
void
pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp)
{
switch (PRI_BASE(kg->kg_pri_class)) {
case PRI_REALTIME:
rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME;
break;
case PRI_TIMESHARE:
rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE;
break;
case PRI_IDLE:
rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE;
break;
default:
break;
}
rtp->type = kg->kg_pri_class;
}
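/*
 * A minimal userland sketch (hedged): asking for realtime priority 5 for
 * the current process, which rtp_to_pri() above maps onto kernel priority
 * PRI_MIN_REALTIME + 5 (the RTP_SET path above requires superuser for
 * anything other than RTP_PRIO_NORMAL):
 *
 *	struct rtprio rtp;
 *
 *	rtp.type = RTP_PRIO_REALTIME;
 *	rtp.prio = 5;
 *	if (rtprio(RTP_SET, 0, &rtp) != 0)
 *		err(1, "rtprio");
 */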
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct osetrlimit_args {
u_int which;
struct orlimit *rlp;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
osetrlimit(td, uap)
struct thread *td;
register struct osetrlimit_args *uap;
{
struct orlimit olim;
struct rlimit lim;
int error;
if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
return (error);
lim.rlim_cur = olim.rlim_cur;
lim.rlim_max = olim.rlim_max;
mtx_lock(&Giant);
error = dosetrlimit(td, uap->which, &lim);
mtx_unlock(&Giant);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ogetrlimit_args {
u_int which;
struct orlimit *rlp;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
ogetrlimit(td, uap)
struct thread *td;
register struct ogetrlimit_args *uap;
{
struct proc *p = td->td_proc;
struct orlimit olim;
int error;
if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
mtx_lock(&Giant);
olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur;
if (olim.rlim_cur == -1)
olim.rlim_cur = 0x7fffffff;
olim.rlim_max = p->p_rlimit[uap->which].rlim_max;
if (olim.rlim_max == -1)
olim.rlim_max = 0x7fffffff;
error = copyout(&olim, uap->rlp, sizeof(olim));
mtx_unlock(&Giant);
return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
#ifndef _SYS_SYSPROTO_H_
struct __setrlimit_args {
u_int which;
struct rlimit *rlp;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
setrlimit(td, uap)
struct thread *td;
register struct __setrlimit_args *uap;
{
struct rlimit alim;
int error;
if ((error = copyin(uap->rlp, &alim, sizeof (struct rlimit))))
return (error);
mtx_lock(&Giant);
error = dosetrlimit(td, uap->which, &alim);
mtx_unlock(&Giant);
return (error);
}
int
dosetrlimit(td, which, limp)
struct thread *td;
u_int which;
struct rlimit *limp;
{
struct proc *p = td->td_proc;
register struct rlimit *alimp;
int error;
GIANT_REQUIRED;
if (which >= RLIM_NLIMITS)
return (EINVAL);
alimp = &p->p_rlimit[which];
/*
* Preserve historical bugs by treating negative limits as unsigned.
*/
if (limp->rlim_cur < 0)
limp->rlim_cur = RLIM_INFINITY;
if (limp->rlim_max < 0)
limp->rlim_max = RLIM_INFINITY;
if (limp->rlim_cur > alimp->rlim_max ||
limp->rlim_max > alimp->rlim_max)
if ((error = suser_cred(td->td_ucred, PRISON_ROOT)))
return (error);
if (limp->rlim_cur > limp->rlim_max)
limp->rlim_cur = limp->rlim_max;
if (p->p_limit->p_refcnt > 1 &&
(p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
p->p_limit->p_refcnt--;
p->p_limit = limcopy(p->p_limit);
alimp = &p->p_rlimit[which];
}
switch (which) {
case RLIMIT_CPU:
if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000)
p->p_limit->p_cpulimit = RLIM_INFINITY;
else
p->p_limit->p_cpulimit =
(rlim_t)1000000 * limp->rlim_cur;
break;
case RLIMIT_DATA:
if (limp->rlim_cur > maxdsiz)
limp->rlim_cur = maxdsiz;
if (limp->rlim_max > maxdsiz)
limp->rlim_max = maxdsiz;
break;
case RLIMIT_STACK:
if (limp->rlim_cur > maxssiz)
limp->rlim_cur = maxssiz;
if (limp->rlim_max > maxssiz)
limp->rlim_max = maxssiz;
/*
* Stack is allocated to the max at exec time with only
* "rlim_cur" bytes accessible. If stack limit is going
* up make more accessible, if going down make inaccessible.
*/
if (limp->rlim_cur != alimp->rlim_cur) {
vm_offset_t addr;
vm_size_t size;
vm_prot_t prot;
if (limp->rlim_cur > alimp->rlim_cur) {
- prot = VM_PROT_ALL;
+ prot = p->p_sysent->sv_stackprot;
size = limp->rlim_cur - alimp->rlim_cur;
- addr = USRSTACK - limp->rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ limp->rlim_cur;
} else {
prot = VM_PROT_NONE;
size = alimp->rlim_cur - limp->rlim_cur;
- addr = USRSTACK - alimp->rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ alimp->rlim_cur;
}
addr = trunc_page(addr);
size = round_page(size);
(void) vm_map_protect(&p->p_vmspace->vm_map,
addr, addr+size, prot, FALSE);
}
break;
case RLIMIT_NOFILE:
if (limp->rlim_cur > maxfilesperproc)
limp->rlim_cur = maxfilesperproc;
if (limp->rlim_max > maxfilesperproc)
limp->rlim_max = maxfilesperproc;
break;
case RLIMIT_NPROC:
if (limp->rlim_cur > maxprocperuid)
limp->rlim_cur = maxprocperuid;
if (limp->rlim_max > maxprocperuid)
limp->rlim_max = maxprocperuid;
if (limp->rlim_cur < 1)
limp->rlim_cur = 1;
if (limp->rlim_max < 1)
limp->rlim_max = 1;
break;
}
*alimp = *limp;
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct __getrlimit_args {
u_int which;
struct rlimit *rlp;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
getrlimit(td, uap)
struct thread *td;
register struct __getrlimit_args *uap;
{
int error;
struct proc *p = td->td_proc;
if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
mtx_lock(&Giant);
error = copyout(&p->p_rlimit[uap->which], uap->rlp,
sizeof (struct rlimit));
mtx_unlock(&Giant);
return(error);
}
/*
* Transform the running time and tick information in proc p into user,
* system, and interrupt time usage.
*/
void
calcru(p, up, sp, ip)
struct proc *p;
struct timeval *up;
struct timeval *sp;
struct timeval *ip;
{
/* {user, system, interrupt, total} {ticks, usec}; previous tu: */
u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
u_int64_t uut = 0, sut = 0, iut = 0;
int s;
struct timeval tv;
struct bintime bt;
struct kse *ke;
struct ksegrp *kg;
mtx_assert(&sched_lock, MA_OWNED);
/* XXX: why spl-protect ? worst case is an off-by-one report */
FOREACH_KSEGRP_IN_PROC(p, kg) {
/* we could accumulate per ksegrp and per process here */
FOREACH_KSE_IN_GROUP(kg, ke) {
s = splstatclock();
ut = ke->ke_uticks;
st = ke->ke_sticks;
it = ke->ke_iticks;
splx(s);
tt = ut + st + it;
if (tt == 0) {
st = 1;
tt = 1;
}
if (ke == curthread->td_kse) {
/*
* Adjust for the current time slice. This is actually fairly
* important since the error here is on the order of a time
* quantum, which is much greater than the sampling error.
* XXXKSE use a different test due to threads on other
* processors also being 'current'.
*/
binuptime(&bt);
bintime_sub(&bt, PCPU_PTR(switchtime));
bintime_add(&bt, &p->p_runtime);
} else {
bt = p->p_runtime;
}
bintime2timeval(&bt, &tv);
tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
ptu = ke->ke_uu + ke->ke_su + ke->ke_iu;
if (tu < ptu || (int64_t)tu < 0) {
/* XXX no %qd in kernel. Truncate. */
printf("calcru: negative time of %ld usec for pid %d (%s)\n",
(long)tu, p->p_pid, p->p_comm);
tu = ptu;
}
/* Subdivide tu. */
uu = (tu * ut) / tt;
su = (tu * st) / tt;
iu = tu - uu - su;
/* Enforce monotonicity. */
if (uu < ke->ke_uu || su < ke->ke_su || iu < ke->ke_iu) {
if (uu < ke->ke_uu)
uu = ke->ke_uu;
else if (uu + ke->ke_su + ke->ke_iu > tu)
uu = tu - ke->ke_su - ke->ke_iu;
if (st == 0)
su = ke->ke_su;
else {
su = ((tu - uu) * st) / (st + it);
if (su < ke->ke_su)
su = ke->ke_su;
else if (uu + su + ke->ke_iu > tu)
su = tu - uu - ke->ke_iu;
}
KASSERT(uu + su + ke->ke_iu <= tu,
("calcru: monotonisation botch 1"));
iu = tu - uu - su;
KASSERT(iu >= ke->ke_iu,
("calcru: monotonisation botch 2"));
}
ke->ke_uu = uu;
ke->ke_su = su;
ke->ke_iu = iu;
uut += uu;
sut += su;
iut += iu;
} /* end kse loop */
} /* end kseg loop */
up->tv_sec = uut / 1000000;
up->tv_usec = uut % 1000000;
sp->tv_sec = sut / 1000000;
sp->tv_usec = sut % 1000000;
if (ip != NULL) {
ip->tv_sec = iut / 1000000;
ip->tv_usec = iut % 1000000;
}
}
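/*
 * A worked example of the subdivision above (illustrative numbers): with
 * ut = 300, st = 100, it = 0 statclock ticks (so tt = 400) and a measured
 * runtime of tu = 4,000,000 usec, the split is
 *	uu = (tu * ut) / tt = 3,000,000 usec of user time,
 *	su = (tu * st) / tt = 1,000,000 usec of system time,
 *	iu = tu - uu - su   =         0 usec of interrupt time,
 * subject to the monotonicity clamping against the previous ke_uu, ke_su
 * and ke_iu values.
 */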
#ifndef _SYS_SYSPROTO_H_
struct getrusage_args {
int who;
struct rusage *rusage;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
getrusage(td, uap)
register struct thread *td;
register struct getrusage_args *uap;
{
struct proc *p = td->td_proc;
register struct rusage *rup;
int error = 0;
mtx_lock(&Giant);
switch (uap->who) {
case RUSAGE_SELF:
rup = &p->p_stats->p_ru;
mtx_lock_spin(&sched_lock);
calcru(p, &rup->ru_utime, &rup->ru_stime, NULL);
mtx_unlock_spin(&sched_lock);
break;
case RUSAGE_CHILDREN:
rup = &p->p_stats->p_cru;
break;
default:
rup = NULL;
error = EINVAL;
break;
}
mtx_unlock(&Giant);
if (error == 0) {
error = copyout(rup, uap->rusage, sizeof (struct rusage));
}
return(error);
}
void
ruadd(ru, ru2)
register struct rusage *ru, *ru2;
{
register long *ip, *ip2;
register int i;
timevaladd(&ru->ru_utime, &ru2->ru_utime);
timevaladd(&ru->ru_stime, &ru2->ru_stime);
if (ru->ru_maxrss < ru2->ru_maxrss)
ru->ru_maxrss = ru2->ru_maxrss;
ip = &ru->ru_first; ip2 = &ru2->ru_first;
for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
*ip++ += *ip2++;
}
/*
* Make a copy of the plimit structure.
* We share these structures copy-on-write after fork,
* and copy when a limit is changed.
*/
struct plimit *
limcopy(lim)
struct plimit *lim;
{
register struct plimit *copy;
MALLOC(copy, struct plimit *, sizeof(struct plimit),
M_SUBPROC, M_WAITOK);
bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit));
copy->p_lflags = 0;
copy->p_refcnt = 1;
return (copy);
}
/*
* Find the uidinfo structure for a uid. This structure is used to
* track the total resource consumption (process count, socket buffer
* size, etc.) for the uid and impose limits.
*/
void
uihashinit()
{
uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
}
/*
* Look up a uidinfo struct for the parameter uid.
* uihashtbl_mtx must be locked.
*/
static struct uidinfo *
uilookup(uid)
uid_t uid;
{
struct uihashhead *uipp;
struct uidinfo *uip;
mtx_assert(&uihashtbl_mtx, MA_OWNED);
uipp = UIHASH(uid);
LIST_FOREACH(uip, uipp, ui_hash)
if (uip->ui_uid == uid)
break;
return (uip);
}
/*
* Find or allocate a struct uidinfo for a particular uid.
* Increase refcount on uidinfo struct returned.
* uifree() should be called on a struct uidinfo when released.
*/
struct uidinfo *
uifind(uid)
uid_t uid;
{
struct uidinfo *uip;
mtx_lock(&uihashtbl_mtx);
uip = uilookup(uid);
if (uip == NULL) {
struct uidinfo *old_uip;
mtx_unlock(&uihashtbl_mtx);
uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
mtx_lock(&uihashtbl_mtx);
/*
* There's a chance someone created our uidinfo while we
* were in malloc and not holding the lock, so we have to
* make sure we don't insert a duplicate uidinfo
*/
if ((old_uip = uilookup(uid)) != NULL) {
/* someone else beat us to it */
free(uip, M_UIDINFO);
uip = old_uip;
} else {
uip->ui_mtxp = mtx_pool_alloc();
uip->ui_uid = uid;
LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
}
}
uihold(uip);
mtx_unlock(&uihashtbl_mtx);
return (uip);
}
/*
* Place another refcount on a uidinfo struct.
*/
void
uihold(uip)
struct uidinfo *uip;
{
UIDINFO_LOCK(uip);
uip->ui_ref++;
UIDINFO_UNLOCK(uip);
}
/*-
* Since uidinfo structs have a long lifetime, we use an
* opportunistic refcounting scheme to avoid locking the lookup hash
* for each release.
*
* If the refcount hits 0, we need to free the structure,
* which means we need to lock the hash.
* Optimal case:
* After locking the struct and lowering the refcount, if we find
* that we don't need to free, simply unlock and return.
* Suboptimal case:
* If lowering the refcount means we need to free, bump the count
* back up, drop the lock and acquire the locks in the proper
* order to try again.
*/
void
uifree(uip)
struct uidinfo *uip;
{
/* Prepare for optimal case. */
UIDINFO_LOCK(uip);
if (--uip->ui_ref != 0) {
UIDINFO_UNLOCK(uip);
return;
}
/* Prepare for suboptimal case. */
uip->ui_ref++;
UIDINFO_UNLOCK(uip);
mtx_lock(&uihashtbl_mtx);
UIDINFO_LOCK(uip);
/*
* We must subtract one from the count again because we backed out
* our initial subtraction before dropping the lock.
* Since another thread may have added a reference after we dropped the
* initial lock we have to test for zero again.
*/
if (--uip->ui_ref == 0) {
LIST_REMOVE(uip, ui_hash);
mtx_unlock(&uihashtbl_mtx);
if (uip->ui_sbsize != 0)
/* XXX no %qd in kernel. Truncate. */
printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
uip->ui_uid, (long)uip->ui_sbsize);
if (uip->ui_proccnt != 0)
printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
uip->ui_uid, uip->ui_proccnt);
UIDINFO_UNLOCK(uip);
FREE(uip, M_UIDINFO);
return;
}
mtx_unlock(&uihashtbl_mtx);
UIDINFO_UNLOCK(uip);
}
/*
* Change the count associated with the number of processes
* a given user is using. When 'max' is 0, don't enforce a limit.
*/
int
chgproccnt(uip, diff, max)
struct uidinfo *uip;
int diff;
int max;
{
UIDINFO_LOCK(uip);
/* don't allow them to exceed max, but allow subtraction */
if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
UIDINFO_UNLOCK(uip);
return (0);
}
uip->ui_proccnt += diff;
if (uip->ui_proccnt < 0)
printf("negative proccnt for uid = %d\n", uip->ui_uid);
UIDINFO_UNLOCK(uip);
return (1);
}
/*
* Change the total socket buffer size a user has used.
*/
int
chgsbsize(uip, hiwat, to, max)
struct uidinfo *uip;
u_int *hiwat;
u_int to;
rlim_t max;
{
rlim_t new;
int s;
s = splnet();
UIDINFO_LOCK(uip);
new = uip->ui_sbsize + to - *hiwat;
/* don't allow them to exceed max, but allow subtraction */
if (to > *hiwat && new > max) {
splx(s);
UIDINFO_UNLOCK(uip);
return (0);
}
uip->ui_sbsize = new;
*hiwat = to;
if (uip->ui_sbsize < 0)
printf("negative sbsize for uid = %d\n", uip->ui_uid);
splx(s);
UIDINFO_UNLOCK(uip);
return (1);
}
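/*
 * A minimal usage sketch (hedged) of the uidinfo interfaces above, as a
 * caller such as the fork path might use them: look up the uid (which
 * takes a reference), charge a resource against it, and release the
 * reference when the process goes away:
 *
 *	struct uidinfo *uip;
 *
 *	uip = uifind(uid);			(adds a reference)
 *	if (!chgproccnt(uip, 1, maxprocperuid))
 *		...				(per-uid process limit hit)
 *	...
 *	(void)chgproccnt(uip, -1, 0);		(max of 0: no limit check)
 *	uifree(uip);				(drops the reference)
 */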
Index: head/sys/sys/imgact.h
===================================================================
--- head/sys/sys/imgact.h (revision 103766)
+++ head/sys/sys/imgact.h (revision 103767)
@@ -1,78 +1,78 @@
/*-
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SYS_IMGACT_H_
#define _SYS_IMGACT_H_
#define MAXSHELLCMDLEN 128
+struct sysentvec;
struct thread;
struct vm_object;
struct image_params {
struct proc *proc; /* our process struct */
struct execve_args *uap; /* syscall arguments */
struct vnode *vp; /* pointer to vnode of file to exec */
struct vm_object *object; /* The vm object for this vp */
struct vattr *attr; /* attributes of file */
const char *image_header; /* head of file to exec */
char *stringbase; /* base address of tmp string storage */
char *stringp; /* current 'end' pointer of tmp strings */
char *endargs; /* end of argv vector */
int stringspace; /* space left in tmp string storage area */
int argc, envc; /* count of argument and environment strings */
char *argv0; /* Replacement for argv[0] when interpreting */
unsigned long entry_addr; /* entry address of target executable */
char vmspace_destroyed; /* flag - we've blown away original vm space */
char interpreted; /* flag - this executable is interpreted */
char interpreter_name[MAXSHELLCMDLEN]; /* name of the interpreter */
void *auxargs; /* ELF Auxinfo structure pointer */
struct vm_page *firstpage; /* first page that we mapped */
char *fname; /* pointer to filename of executable (user space) */
unsigned long ps_strings; /* PS_STRINGS for BSD/OS binaries */
size_t auxarg_size;
};
#ifdef _KERNEL
int exec_check_permissions(struct image_params *);
register_t *exec_copyout_strings(struct image_params *);
int exec_extract_strings(struct image_params *);
-int exec_new_vmspace(struct image_params *, vm_offset_t, vm_offset_t,
- vm_offset_t);
+int exec_new_vmspace(struct image_params *, struct sysentvec *);
void exec_setregs(struct thread *, u_long, u_long, u_long);
int exec_shell_imgact(struct image_params *);
#endif
#endif /* !_SYS_IMGACT_H_ */
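/*
 * A note on the exec_new_vmspace() change above (hedged sketch, not part
 * of this header): image activators now pass the sysentvec for the ABI
 * they implement instead of explicit address-space bounds, so the user VM
 * layout (including the sv_usrstack and sv_stackprot fields used in the
 * kern_resource.c hunk) comes from one place.  A caller, assuming a
 * hypothetical example_sysvec, would look roughly like:
 *
 *	extern struct sysentvec example_sysvec;
 *
 *	error = exec_new_vmspace(imgp, &example_sysvec);
 *	if (error)
 *		return (error);
 */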
Index: head/sys/vm/vm_glue.c
===================================================================
--- head/sys/vm/vm_glue.c (revision 103766)
+++ head/sys/vm/vm_glue.c (revision 103767)
@@ -1,873 +1,865 @@
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $FreeBSD$
*/
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/shm.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/unistd.h>
#include <machine/limits.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <sys/user.h>
extern int maxslp;
/*
* System initialization
*
* Note: proc0 from proc.h
*/
static void vm_init_limits(void *);
SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)
/*
* THIS MUST BE THE LAST INITIALIZATION ITEM!!!
*
* Note: run scheduling should be divorced from the vm system.
*/
static void scheduler(void *);
SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL)
#ifndef NO_SWAPPING
static void swapout(struct proc *);
static void vm_proc_swapin(struct proc *p);
static void vm_proc_swapout(struct proc *p);
#endif
/*
* MPSAFE
*/
int
kernacc(addr, len, rw)
caddr_t addr;
int len, rw;
{
boolean_t rv;
vm_offset_t saddr, eaddr;
vm_prot_t prot;
KASSERT((rw & ~VM_PROT_ALL) == 0,
("illegal ``rw'' argument to kernacc (%x)\n", rw));
prot = rw;
saddr = trunc_page((vm_offset_t)addr);
eaddr = round_page((vm_offset_t)addr + len);
rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
return (rv == TRUE);
}
/*
* MPSAFE
*/
int
useracc(addr, len, rw)
caddr_t addr;
int len, rw;
{
boolean_t rv;
vm_prot_t prot;
+ vm_map_t map;
KASSERT((rw & ~VM_PROT_ALL) == 0,
("illegal ``rw'' argument to useracc (%x)\n", rw));
prot = rw;
- /*
- * XXX - check separately to disallow access to user area and user
- * page tables - they are in the map.
- *
- * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was once
- * only used (as an end address) in trap.c. Use it as an end address
- * here too. This bogusness has spread. I just fixed where it was
- * used as a max in vm_mmap.c.
- */
- if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS
- || (vm_offset_t) addr + len < (vm_offset_t) addr) {
+ map = &curproc->p_vmspace->vm_map;
+ if ((vm_offset_t)addr + len > vm_map_max(map) ||
+ (vm_offset_t)addr + len < (vm_offset_t)addr) {
return (FALSE);
}
- rv = vm_map_check_protection(&curproc->p_vmspace->vm_map,
- trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
- prot);
+ rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
+ round_page((vm_offset_t)addr + len), prot);
return (rv == TRUE);
}
/*
* MPSAFE
*/
void
vslock(addr, len)
caddr_t addr;
u_int len;
{
vm_map_wire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr),
round_page((vm_offset_t)addr + len), FALSE);
}
/*
* MPSAFE
*/
void
vsunlock(addr, len)
caddr_t addr;
u_int len;
{
vm_map_unwire(&curproc->p_vmspace->vm_map,
trunc_page((vm_offset_t)addr),
round_page((vm_offset_t)addr + len), FALSE);
}
/*
* Create the U area for a new process.
* This routine directly affects the fork perf for a process.
*/
void
vm_proc_new(struct proc *p)
{
vm_page_t ma[UAREA_PAGES];
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
u_int i;
/*
* Allocate object for the upage.
*/
upobj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
p->p_upages_obj = upobj;
/*
* Get a kernel virtual address for the U area for this process.
*/
up = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
if (up == 0)
panic("vm_proc_new: upage allocation failed");
p->p_uarea = (struct user *)up;
for (i = 0; i < UAREA_PAGES; i++) {
/*
* Get a uarea page.
*/
m = vm_page_grab(upobj, i,
VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
ma[i] = m;
vm_page_wakeup(m);
vm_page_flag_clear(m, PG_ZERO);
m->valid = VM_PAGE_BITS_ALL;
}
/*
* Enter the pages into the kernel address space.
*/
pmap_qenter(up, ma, UAREA_PAGES);
}
/*
* Dispose the U area for a process that has exited.
* This routine directly impacts the exit perf of a process.
* XXX proc_zone is marked UMA_ZONE_NOFREE, so this should never be called.
*/
void
vm_proc_dispose(struct proc *p)
{
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
upobj = p->p_upages_obj;
if (upobj->resident_page_count != UAREA_PAGES)
panic("vm_proc_dispose: incorrect number of pages in upobj");
vm_page_lock_queues();
while ((m = TAILQ_FIRST(&upobj->memq)) != NULL) {
vm_page_busy(m);
vm_page_unwire(m, 0);
vm_page_free(m);
}
vm_page_unlock_queues();
up = (vm_offset_t)p->p_uarea;
pmap_qremove(up, UAREA_PAGES);
kmem_free(kernel_map, up, UAREA_PAGES * PAGE_SIZE);
vm_object_deallocate(upobj);
}
#ifndef NO_SWAPPING
/*
* Allow the U area for a process to be prejudicially paged out.
*/
void
vm_proc_swapout(struct proc *p)
{
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
upobj = p->p_upages_obj;
if (upobj->resident_page_count != UAREA_PAGES)
panic("vm_proc_dispose: incorrect number of pages in upobj");
vm_page_lock_queues();
TAILQ_FOREACH(m, &upobj->memq, listq) {
vm_page_dirty(m);
vm_page_unwire(m, 0);
}
vm_page_unlock_queues();
up = (vm_offset_t)p->p_uarea;
pmap_qremove(up, UAREA_PAGES);
}
/*
* Bring the U area for a specified process back in.
*/
void
vm_proc_swapin(struct proc *p)
{
vm_page_t ma[UAREA_PAGES];
vm_object_t upobj;
vm_offset_t up;
vm_page_t m;
int rv;
int i;
upobj = p->p_upages_obj;
for (i = 0; i < UAREA_PAGES; i++) {
m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
if (m->valid != VM_PAGE_BITS_ALL) {
rv = vm_pager_get_pages(upobj, &m, 1, 0);
if (rv != VM_PAGER_OK)
panic("vm_proc_swapin: cannot get upage");
}
ma[i] = m;
}
if (upobj->resident_page_count != UAREA_PAGES)
panic("vm_proc_swapin: lost pages from upobj");
vm_page_lock_queues();
TAILQ_FOREACH(m, &upobj->memq, listq) {
m->valid = VM_PAGE_BITS_ALL;
vm_page_wire(m);
vm_page_wakeup(m);
}
vm_page_unlock_queues();
up = (vm_offset_t)p->p_uarea;
pmap_qenter(up, ma, UAREA_PAGES);
}
#endif
/*
* Implement fork's actions on an address space.
* Here we arrange for the address space to be copied or referenced,
* allocate a user struct (pcb and kernel stack), then call the
* machine-dependent layer to fill those in and make the new process
* ready to run. The new process is set up so that it returns directly
* to user mode to avoid stack copying and relocation problems.
*/
void
vm_forkproc(td, p2, td2, flags)
struct thread *td;
struct proc *p2;
struct thread *td2;
int flags;
{
struct proc *p1 = td->td_proc;
struct user *up;
GIANT_REQUIRED;
if ((flags & RFPROC) == 0) {
/*
* Divorce the memory, if it is shared, essentially
* this changes shared memory amongst threads, into
* COW locally.
*/
if ((flags & RFMEM) == 0) {
if (p1->p_vmspace->vm_refcnt > 1) {
vmspace_unshare(p1);
}
}
cpu_fork(td, p2, td2, flags);
return;
}
if (flags & RFMEM) {
p2->p_vmspace = p1->p_vmspace;
p1->p_vmspace->vm_refcnt++;
}
while (vm_page_count_severe()) {
VM_WAIT;
}
if ((flags & RFMEM) == 0) {
p2->p_vmspace = vmspace_fork(p1->p_vmspace);
pmap_pinit2(vmspace_pmap(p2->p_vmspace));
if (p1->p_vmspace->vm_shm)
shmfork(p1, p2);
}
/* XXXKSE this is unsatisfactory but should be adequate */
up = p2->p_uarea;
/*
* p_stats currently points at fields in the user struct, not at &u;
* it is instead reached through p_addr.  Copy parts of p_stats; zero
* the rest of p_stats (statistics).
*
* If procsig->ps_refcnt is 1 and p2->p_sigacts is NULL, we don't need
* to share sigacts, so we use up->u_sigacts.
*/
p2->p_stats = &up->u_stats;
if (p2->p_sigacts == NULL) {
if (p2->p_procsig->ps_refcnt != 1)
printf ("PID:%d NULL sigacts with refcnt not 1!\n",p2->p_pid);
p2->p_sigacts = &up->u_sigacts;
up->u_sigacts = *p1->p_sigacts;
}
bzero(&up->u_stats.pstat_startzero,
(unsigned) ((caddr_t) &up->u_stats.pstat_endzero -
(caddr_t) &up->u_stats.pstat_startzero));
bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy,
((caddr_t) &up->u_stats.pstat_endcopy -
(caddr_t) &up->u_stats.pstat_startcopy));
/*
* cpu_fork will copy and update the pcb, set up the kernel stack,
* and make the child ready to run.
*/
cpu_fork(td, p2, td2, flags);
}
/*
* Called after a process has been wait(2)ed upon and is being reaped.
* The idea is to reclaim resources that we could not reclaim while
* the process was still executing.
*/
void
vm_waitproc(p)
struct proc *p;
{
GIANT_REQUIRED;
cpu_wait(p);
vmspace_exitfree(p); /* and clean-out the vmspace */
}
/*
* Set default limits for VM system.
* Called for proc 0, and then inherited by all others.
*
* XXX should probably act directly on proc0.
*/
static void
vm_init_limits(udata)
void *udata;
{
struct proc *p = udata;
int rss_limit;
/*
* Set up the initial limits on process VM. Set the maximum resident
* set size to be half of (reasonably) available memory. Since this
* is a soft limit, it comes into effect only when the system is out
* of memory - half of main memory helps to favor smaller processes,
* and reduces thrashing of the object cache.
*/
p->p_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
p->p_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
p->p_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
p->p_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
/* limit the limit to no less than 2MB */
rss_limit = max(cnt.v_free_count, 512);
p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
}
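/*
 * Worked example for the 2MB floor above, assuming 4KB pages: the limit is
 * at least max(cnt.v_free_count, 512) == 512 pages, and
 * ptoa(512) == 512 << PAGE_SHIFT == 512 * 4096 == 2097152 bytes == 2MB.
 * On platforms with larger pages the floor is correspondingly larger.
 */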
void
faultin(p)
struct proc *p;
{
GIANT_REQUIRED;
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&sched_lock, MA_OWNED);
#ifdef NO_SWAPPING
if ((p->p_sflag & PS_INMEM) == 0)
panic("faultin: proc swapped out with NO_SWAPPING!");
#else
if ((p->p_sflag & PS_INMEM) == 0) {
struct thread *td;
++p->p_lock;
/*
* If another process is swapping in this process,
* just wait until it finishes.
*/
if (p->p_sflag & PS_SWAPPINGIN) {
mtx_unlock_spin(&sched_lock);
msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0);
mtx_lock_spin(&sched_lock);
--p->p_lock;
return;
}
p->p_sflag |= PS_SWAPPINGIN;
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
vm_proc_swapin(p);
FOREACH_THREAD_IN_PROC (p, td) {
pmap_swapin_thread(td);
TD_CLR_SWAPPED(td);
}
PROC_LOCK(p);
mtx_lock_spin(&sched_lock);
p->p_sflag &= ~PS_SWAPPINGIN;
p->p_sflag |= PS_INMEM;
FOREACH_THREAD_IN_PROC (p, td)
if (TD_CAN_RUN(td))
setrunnable(td);
wakeup(&p->p_sflag);
/* undo the effect of setting SLOCK above */
--p->p_lock;
}
#endif
}
/*
* This swapin algorithm attempts to swap in processes only if there
* is enough space for them. Of course, if a process waits for a long
* time, it will be swapped in anyway.
*
* XXXKSE - the process with the highest-priority thread counts.
*
* Giant is still held at this point, to be released in tsleep.
*/
/* ARGSUSED*/
static void
scheduler(dummy)
void *dummy;
{
struct proc *p;
struct thread *td;
int pri;
struct proc *pp;
int ppri;
mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
/* GIANT_REQUIRED */
loop:
if (vm_page_count_min()) {
VM_WAIT;
goto loop;
}
pp = NULL;
ppri = INT_MIN;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
struct ksegrp *kg;
if (p->p_sflag & (PS_INMEM | PS_SWAPPING | PS_SWAPPINGIN)) {
continue;
}
mtx_lock_spin(&sched_lock);
FOREACH_THREAD_IN_PROC(p, td) {
/*
* An otherwise runnable thread of a process
* swapped out has only the TDI_SWAPPED bit set.
*
*/
if (td->td_inhibitors == TDI_SWAPPED) {
kg = td->td_ksegrp;
pri = p->p_swtime + kg->kg_slptime;
if ((p->p_sflag & PS_SWAPINREQ) == 0) {
pri -= kg->kg_nice * 8;
}
/*
* if this ksegrp is higher priority
* and there is enough space, then select
* this process instead of the previous
* selection.
*/
if (pri > ppri) {
pp = p;
ppri = pri;
}
}
}
mtx_unlock_spin(&sched_lock);
}
sx_sunlock(&allproc_lock);
/*
* Nothing to do, back to sleep.
*/
if ((p = pp) == NULL) {
tsleep(&proc0, PVM, "sched", maxslp * hz / 2);
goto loop;
}
PROC_LOCK(p);
mtx_lock_spin(&sched_lock);
/*
* Another process may be bringing or may have already
* brought this process in while we traverse all threads.
* Or, this process may even be being swapped out again.
*/
if (p->p_sflag & (PS_INMEM|PS_SWAPPING|PS_SWAPPINGIN)) {
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
goto loop;
}
p->p_sflag &= ~PS_SWAPINREQ;
/*
* We would like to bring someone in (only if there is space).
* [What checks the space?]
*/
faultin(p);
PROC_UNLOCK(p);
p->p_swtime = 0;
mtx_unlock_spin(&sched_lock);
goto loop;
}
#ifndef NO_SWAPPING
/*
* swap_idle_threshold1 is the guaranteed swapped-in time for a process
*/
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1,
CTLFLAG_RW, &swap_idle_threshold1, 0, "");
/*
* Swap_idle_threshold2 is the time that a process can be idle before
* it will be swapped out, if idle swapping is enabled.
*/
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2,
CTLFLAG_RW, &swap_idle_threshold2, 0, "");
/*
* Swapout is driven by the pageout daemon.  Very simply, we find eligible
* procs and unwire their u-areas. We try to always "swap" at least one
* process in case we need the room for a swapin.
* If any procs have been sleeping/stopped for at least maxslp seconds,
* they are swapped. Else, we swap the longest-sleeping or stopped process,
* if any, otherwise the longest-resident process.
*/
void
swapout_procs(action)
int action;
{
struct proc *p;
struct thread *td;
struct ksegrp *kg;
struct proc *outp, *outp2;
int outpri, outpri2;
int didswap = 0;
GIANT_REQUIRED;
outp = outp2 = NULL;
outpri = outpri2 = INT_MIN;
retry:
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
struct vmspace *vm;
int minslptime = 100000;
/*
* Do not swap out a process that
* is waiting for VM data
* structures; there is a possible
* deadlock.  Test this first as
* this may block.
*
* Lock the map until swapout
* finishes, or a thread of this
* process may attempt to alter
* the map.
*
* Watch out for a process in
* creation. It may have no
* address space yet.
*
* An aio daemon switches its
* address space while running.
* Perform a quick check whether
* a process has P_SYSTEM.
*/
PROC_LOCK(p);
if ((p->p_flag & P_SYSTEM) != 0) {
PROC_UNLOCK(p);
continue;
}
mtx_lock_spin(&sched_lock);
if (p->p_state == PRS_NEW) {
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
continue;
}
vm = p->p_vmspace;
KASSERT(vm != NULL,
("swapout_procs: a process has no address space"));
++vm->vm_refcnt;
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
if (!vm_map_trylock(&vm->vm_map))
goto nextproc1;
PROC_LOCK(p);
if (p->p_lock != 0 ||
(p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
) != 0) {
goto nextproc2;
}
/*
* Only aiod changes the vmspace; however, it will be
* skipped because of the P_SYSTEM check in the
* if statement above.
*/
mtx_lock_spin(&sched_lock);
if ((p->p_sflag & (PS_INMEM|PS_SWAPPING|PS_SWAPPINGIN)) != PS_INMEM)
goto nextproc;
switch (p->p_state) {
default:
/* Don't swap out processes in any sort
* of 'special' state. */
goto nextproc;
case PRS_NORMAL:
/*
* Do not swap out a realtime process.
* Check all of its thread groups.
*/
FOREACH_KSEGRP_IN_PROC(p, kg) {
if (PRI_IS_REALTIME(kg->kg_pri_class))
goto nextproc;
/*
* Guarantee swap_idle_threshold1
* time in memory.
*/
if (kg->kg_slptime < swap_idle_threshold1)
goto nextproc;
/*
* Do not swap out a process if it is
* waiting on a critical event of some
* kind or if there is a thread whose
* pageable memory may be accessed.
*
* This could be refined to support
* swapping out a thread.
*/
FOREACH_THREAD_IN_GROUP(kg, td) {
if ((td->td_priority) < PSOCK ||
!thread_safetoswapout(td))
goto nextproc;
}
/*
* If the system is under memory stress,
* or if we are swapping
* idle processes >= swap_idle_threshold2,
* then swap the process out.
*/
if (((action & VM_SWAP_NORMAL) == 0) &&
(((action & VM_SWAP_IDLE) == 0) ||
(kg->kg_slptime < swap_idle_threshold2)))
goto nextproc;
if (minslptime > kg->kg_slptime)
minslptime = kg->kg_slptime;
}
/*
* If the process has been asleep for a while and had
* most of its pages taken away already, swap it out.
*/
if ((action & VM_SWAP_NORMAL) ||
((action & VM_SWAP_IDLE) &&
(minslptime > swap_idle_threshold2))) {
swapout(p);
didswap++;
/*
* swapout() unlocks the proc lock.  This is
* ugly, but avoids a superfluous lock operation.
*/
mtx_unlock_spin(&sched_lock);
vm_map_unlock(&vm->vm_map);
vmspace_free(vm);
sx_sunlock(&allproc_lock);
goto retry;
}
}
nextproc:
mtx_unlock_spin(&sched_lock);
nextproc2:
PROC_UNLOCK(p);
vm_map_unlock(&vm->vm_map);
nextproc1:
vmspace_free(vm);
continue;
}
sx_sunlock(&allproc_lock);
/*
* If we swapped something out, and another process needed memory,
* then wake up the scheduler process.
*/
if (didswap)
wakeup(&proc0);
}
static void
swapout(p)
struct proc *p;
{
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
#if defined(SWAP_DEBUG)
printf("swapping out %d\n", p->p_pid);
#endif
/*
* The states of this process and its threads may have changed
* by now. Assuming that there is only one pageout daemon thread,
* this process should still be in memory.
*/
KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPING|PS_SWAPPINGIN)) == PS_INMEM,
("swapout: lost a swapout race?"));
#if defined(INVARIANTS)
/*
* Make sure that all threads are safe to be swapped out.
*
* Alternatively, we could swap out only safe threads.
*/
FOREACH_THREAD_IN_PROC(p, td) {
KASSERT(thread_safetoswapout(td),
("swapout: there is a thread not safe for swapout"));
}
#endif /* INVARIANTS */
++p->p_stats->p_ru.ru_nswap;
/*
* remember the process resident count
*/
p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
PROC_UNLOCK(p);
FOREACH_THREAD_IN_PROC (p, td) /* shouldn't be possible, but..... */
if (TD_ON_RUNQ(td)) { /* XXXKSE */
panic("swapping out runnable process");
remrunqueue(td); /* XXXKSE */
}
p->p_sflag &= ~PS_INMEM;
p->p_sflag |= PS_SWAPPING;
mtx_unlock_spin(&sched_lock);
vm_proc_swapout(p);
FOREACH_THREAD_IN_PROC(p, td) {
pmap_swapout_thread(td);
TD_SET_SWAPPED(td);
}
mtx_lock_spin(&sched_lock);
p->p_sflag &= ~PS_SWAPPING;
p->p_swtime = 0;
}
#endif /* !NO_SWAPPING */
Index: head/sys/vm/vm_map.c
===================================================================
--- head/sys/vm/vm_map.c (revision 103766)
+++ head/sys/vm/vm_map.c (revision 103767)
@@ -1,3159 +1,3158 @@
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $FreeBSD$
*/
/*
* Virtual memory mapping module.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
+#include <sys/sysent.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>
/*
* Virtual memory maps provide for the mapping, protection,
* and sharing of virtual memory objects. In addition,
* this module provides for an efficient virtual copy of
* memory from one map to another.
*
* Synchronization is required prior to most operations.
*
* Maps consist of an ordered doubly-linked list of simple
* entries; a single hint is used to speed up lookups.
*
* Since portions of maps are specified by start/end addresses,
* which may not align with existing map entries, all
* routines merely "clip" entries to these start/end values.
* [That is, an entry is split into two, bordering at a
* start or end value.] Note that these clippings may not
* always be necessary (as the two resulting entries are then
* not changed); however, the clipping is done for convenience.
*
* As mentioned above, virtual copy operations are performed
* by copying VM object references from one map to
* another, and then marking both regions as copy-on-write.
*/
/*
* vm_map_startup:
*
* Initialize the vm_map module. Must be called before
* any other vm_map routines.
*
* Map and entry structures are allocated from the general
* purpose memory pool with some exceptions:
*
* - The kernel map and kmem submap are allocated statically.
* - Kernel map entries are allocated out of a static pool.
*
* These restrictions are necessary since malloc() uses the
* maps and requires map entries.
*/
static uma_zone_t mapentzone;
static uma_zone_t kmapentzone;
static uma_zone_t mapzone;
static uma_zone_t vmspace_zone;
static struct vm_object kmapentobj;
static void vmspace_zinit(void *mem, int size);
static void vmspace_zfini(void *mem, int size);
static void vm_map_zinit(void *mem, int size);
static void vm_map_zfini(void *mem, int size);
static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max);
#ifdef INVARIANTS
static void vm_map_zdtor(void *mem, int size, void *arg);
static void vmspace_zdtor(void *mem, int size, void *arg);
#endif
void
vm_map_startup(void)
{
mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
#ifdef INVARIANTS
vm_map_zdtor,
#else
NULL,
#endif
vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_prealloc(mapzone, MAX_KMAP);
kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
uma_prealloc(kmapentzone, MAX_KMAPENT);
mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_prealloc(mapentzone, MAX_MAPENT);
}
static void
vmspace_zfini(void *mem, int size)
{
struct vmspace *vm;
vm = (struct vmspace *)mem;
vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
}
static void
vmspace_zinit(void *mem, int size)
{
struct vmspace *vm;
vm = (struct vmspace *)mem;
vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map));
}
static void
vm_map_zfini(void *mem, int size)
{
vm_map_t map;
map = (vm_map_t)mem;
lockdestroy(&map->lock);
}
static void
vm_map_zinit(void *mem, int size)
{
vm_map_t map;
map = (vm_map_t)mem;
map->nentries = 0;
map->size = 0;
map->infork = 0;
lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE);
}
#ifdef INVARIANTS
static void
vmspace_zdtor(void *mem, int size, void *arg)
{
struct vmspace *vm;
vm = (struct vmspace *)mem;
vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
}
static void
vm_map_zdtor(void *mem, int size, void *arg)
{
vm_map_t map;
map = (vm_map_t)mem;
KASSERT(map->nentries == 0,
("map %p nentries == %d on free.",
map, map->nentries));
KASSERT(map->size == 0,
("map %p size == %lu on free.",
map, (unsigned long)map->size));
KASSERT(map->infork == 0,
("map %p infork == %d on free.",
map, map->infork));
}
#endif /* INVARIANTS */
/*
* Allocate a vmspace structure, including a vm_map and pmap,
* and initialize those structures. The refcnt is set to 1.
* The remaining fields must be initialized by the caller.
*/
struct vmspace *
vmspace_alloc(min, max)
vm_offset_t min, max;
{
struct vmspace *vm;
GIANT_REQUIRED;
vm = uma_zalloc(vmspace_zone, M_WAITOK);
CTR1(KTR_VM, "vmspace_alloc: %p", vm);
_vm_map_init(&vm->vm_map, min, max);
pmap_pinit(vmspace_pmap(vm));
vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */
vm->vm_refcnt = 1;
vm->vm_shm = NULL;
vm->vm_freer = NULL;
return (vm);
}
void
vm_init2(void)
{
uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
(VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8);
vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
#ifdef INVARIANTS
vmspace_zdtor,
#else
NULL,
#endif
vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
pmap_init2();
vm_object_init2();
}
static __inline void
vmspace_dofree(struct vmspace *vm)
{
CTR1(KTR_VM, "vmspace_free: %p", vm);
/*
* Lock the map, to wait out all other references to it.
* Delete all of the mappings and pages they hold, then call
* the pmap module to reclaim anything left.
*/
vm_map_lock(&vm->vm_map);
(void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset,
vm->vm_map.max_offset);
vm_map_unlock(&vm->vm_map);
pmap_release(vmspace_pmap(vm));
uma_zfree(vmspace_zone, vm);
}
void
vmspace_free(struct vmspace *vm)
{
GIANT_REQUIRED;
if (vm->vm_refcnt == 0)
panic("vmspace_free: attempt to free already freed vmspace");
if (--vm->vm_refcnt == 0)
vmspace_dofree(vm);
}
void
vmspace_exitfree(struct proc *p)
{
struct vmspace *vm;
GIANT_REQUIRED;
if (p == p->p_vmspace->vm_freer) {
vm = p->p_vmspace;
p->p_vmspace = NULL;
vmspace_dofree(vm);
}
}
/*
* vmspace_swap_count() - count the approximate swap usage in pages for a
* vmspace.
*
* Swap usage is determined by taking the proportional swap used by
* VM objects backing the VM map.  To make up for fractional losses,
* if the VM object has any swap use at all, the associated map entries
* count for at least 1 swap page.
*/
int
vmspace_swap_count(struct vmspace *vmspace)
{
vm_map_t map = &vmspace->vm_map;
vm_map_entry_t cur;
int count = 0;
vm_map_lock_read(map);
for (cur = map->header.next; cur != &map->header; cur = cur->next) {
vm_object_t object;
if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
(object = cur->object.vm_object) != NULL &&
object->type == OBJT_SWAP
) {
int n = (cur->end - cur->start) / PAGE_SIZE;
if (object->un_pager.swp.swp_bcount) {
count += object->un_pager.swp.swp_bcount *
SWAP_META_PAGES * n / object->size + 1;
}
}
}
vm_map_unlock_read(map);
return (count);
}
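/*
 * Worked example of the proportional estimate above, with illustrative
 * numbers only: for a 1000-page OBJT_SWAP object with swp_bcount == 4 and
 * SWAP_META_PAGES == 16, a map entry covering 250 of its pages contributes
 * 4 * 16 * 250 / 1000 + 1 == 17 pages to the count.
 */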
void
_vm_map_lock(vm_map_t map, const char *file, int line)
{
int error;
if (map->system_map)
GIANT_REQUIRED;
error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread);
KASSERT(error == 0, ("%s: failed to get lock", __func__));
map->timestamp++;
}
void
_vm_map_unlock(vm_map_t map, const char *file, int line)
{
lockmgr(&map->lock, LK_RELEASE, NULL, curthread);
}
void
_vm_map_lock_read(vm_map_t map, const char *file, int line)
{
int error;
if (map->system_map)
GIANT_REQUIRED;
error = lockmgr(&map->lock, LK_EXCLUSIVE, NULL, curthread);
KASSERT(error == 0, ("%s: failed to get lock", __func__));
}
void
_vm_map_unlock_read(vm_map_t map, const char *file, int line)
{
lockmgr(&map->lock, LK_RELEASE, NULL, curthread);
}
int
_vm_map_trylock(vm_map_t map, const char *file, int line)
{
int error;
if (map->system_map)
GIANT_REQUIRED;
error = lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, NULL, curthread);
return (error == 0);
}
int
_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
{
KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE,
("%s: lock not held", __func__));
map->timestamp++;
return (0);
}
void
_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
{
KASSERT(lockstatus(&map->lock, curthread) == LK_EXCLUSIVE,
("%s: lock not held", __func__));
}
/*
* vm_map_unlock_and_wait:
*/
int
vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait)
{
int retval;
mtx_lock(&Giant);
vm_map_unlock(map);
retval = tsleep(&map->root, PVM, "vmmapw", 0);
mtx_unlock(&Giant);
return (retval);
}
/*
* vm_map_wakeup:
*/
void
vm_map_wakeup(vm_map_t map)
{
/*
* Acquire and release Giant to prevent a wakeup() from being
* performed (and lost) between the vm_map_unlock() and the
* tsleep() in vm_map_unlock_and_wait().
*/
mtx_lock(&Giant);
mtx_unlock(&Giant);
wakeup(&map->root);
}
long
vmspace_resident_count(struct vmspace *vmspace)
{
return pmap_resident_count(vmspace_pmap(vmspace));
}
/*
* vm_map_create:
*
* Creates and returns a new empty VM map with
* the given physical map structure, and having
* the given lower and upper address bounds.
*/
vm_map_t
vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
{
vm_map_t result;
result = uma_zalloc(mapzone, M_WAITOK);
CTR1(KTR_VM, "vm_map_create: %p", result);
_vm_map_init(result, min, max);
result->pmap = pmap;
return (result);
}
/*
* Initialize an existing vm_map structure
* such as that in the vmspace structure.
* The pmap is set elsewhere.
*/
static void
_vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
{
map->header.next = map->header.prev = &map->header;
map->needs_wakeup = FALSE;
map->system_map = 0;
map->min_offset = min;
map->max_offset = max;
map->first_free = &map->header;
map->root = NULL;
map->timestamp = 0;
}
void
vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
{
_vm_map_init(map, min, max);
lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE);
}
/*
* vm_map_entry_dispose: [ internal use only ]
*
* Inverse of vm_map_entry_create.
*/
static void
vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
{
uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
}
/*
* vm_map_entry_create: [ internal use only ]
*
* Allocates a VM map entry for insertion.
* No entry fields are filled in.
*/
static vm_map_entry_t
vm_map_entry_create(vm_map_t map)
{
vm_map_entry_t new_entry;
if (map->system_map)
new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
else
new_entry = uma_zalloc(mapentzone, M_WAITOK);
if (new_entry == NULL)
panic("vm_map_entry_create: kernel resources exhausted");
return (new_entry);
}
/*
* vm_map_entry_set_behavior:
*
* Set the expected access behavior, either normal, random, or
* sequential.
*/
static __inline void
vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
{
entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
(behavior & MAP_ENTRY_BEHAV_MASK);
}
/*
* vm_map_entry_splay:
*
* Implements Sleator and Tarjan's top-down splay algorithm. Returns
* the vm_map_entry containing the given address. If, however, that
* address is not found in the vm_map, returns a vm_map_entry that is
* adjacent to the address, coming before or after it.
*/
static vm_map_entry_t
vm_map_entry_splay(vm_offset_t address, vm_map_entry_t root)
{
struct vm_map_entry dummy;
vm_map_entry_t lefttreemax, righttreemin, y;
if (root == NULL)
return (root);
lefttreemax = righttreemin = &dummy;
for (;; root = y) {
if (address < root->start) {
if ((y = root->left) == NULL)
break;
if (address < y->start) {
/* Rotate right. */
root->left = y->right;
y->right = root;
root = y;
if ((y = root->left) == NULL)
break;
}
/* Link into the new root's right tree. */
righttreemin->left = root;
righttreemin = root;
} else if (address >= root->end) {
if ((y = root->right) == NULL)
break;
if (address >= y->end) {
/* Rotate left. */
root->right = y->left;
y->left = root;
root = y;
if ((y = root->right) == NULL)
break;
}
/* Link into the new root's left tree. */
lefttreemax->right = root;
lefttreemax = root;
} else
break;
}
/* Assemble the new root. */
lefttreemax->right = root->left;
righttreemin->left = root->right;
root->left = dummy.right;
root->right = dummy.left;
return (root);
}
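/*
 * Sketch of how callers use the splay: after
 *
 *	map->root = vm_map_entry_splay(addr, map->root);
 *
 * the new root either contains addr (root->start <= addr < root->end) or
 * is an entry adjacent to addr, so a lookup needs only a comparison or two
 * against the root; see vm_map_lookup_entry() below.
 */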
/*
* vm_map_entry_{un,}link:
*
* Insert/remove entries from maps.
*/
static void
vm_map_entry_link(vm_map_t map,
vm_map_entry_t after_where,
vm_map_entry_t entry)
{
CTR4(KTR_VM,
"vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
map->nentries, entry, after_where);
map->nentries++;
entry->prev = after_where;
entry->next = after_where->next;
entry->next->prev = entry;
after_where->next = entry;
if (after_where != &map->header) {
if (after_where != map->root)
vm_map_entry_splay(after_where->start, map->root);
entry->right = after_where->right;
entry->left = after_where;
after_where->right = NULL;
} else {
entry->right = map->root;
entry->left = NULL;
}
map->root = entry;
}
static void
vm_map_entry_unlink(vm_map_t map,
vm_map_entry_t entry)
{
vm_map_entry_t next, prev, root;
if (entry != map->root)
vm_map_entry_splay(entry->start, map->root);
if (entry->left == NULL)
root = entry->right;
else {
root = vm_map_entry_splay(entry->start, entry->left);
root->right = entry->right;
}
map->root = root;
prev = entry->prev;
next = entry->next;
next->prev = prev;
prev->next = next;
map->nentries--;
CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
map->nentries, entry);
}
/*
* vm_map_lookup_entry: [ internal use only ]
*
* Finds the map entry containing (or
* immediately preceding) the specified address
* in the given map; the entry is returned
* in the "entry" parameter. The boolean
* result indicates whether the address is
* actually contained in the map.
*/
boolean_t
vm_map_lookup_entry(
vm_map_t map,
vm_offset_t address,
vm_map_entry_t *entry) /* OUT */
{
vm_map_entry_t cur;
cur = vm_map_entry_splay(address, map->root);
if (cur == NULL)
*entry = &map->header;
else {
map->root = cur;
if (address >= cur->start) {
*entry = cur;
if (cur->end > address)
return (TRUE);
} else
*entry = cur->prev;
}
return (FALSE);
}
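/*
 * Typical caller sketch: with the map locked, most users of
 * vm_map_lookup_entry() in this file clip on a hit and step forward on a
 * miss, e.g.
 *
 *	if (vm_map_lookup_entry(map, start, &entry))
 *		vm_map_clip_start(map, entry, start);
 *	else
 *		entry = entry->next;
 */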
/*
* vm_map_insert:
*
* Inserts the given whole VM object into the target
* map at the specified address range. The object's
* size should match that of the address range.
*
* Requires that the map be locked, and leaves it so.
*
* If object is non-NULL, ref count must be bumped by caller
* prior to making the call to account for the new entry.
*/
int
vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
int cow)
{
vm_map_entry_t new_entry;
vm_map_entry_t prev_entry;
vm_map_entry_t temp_entry;
vm_eflags_t protoeflags;
/*
* Check that the start and end points are not bogus.
*/
if ((start < map->min_offset) || (end > map->max_offset) ||
(start >= end))
return (KERN_INVALID_ADDRESS);
/*
* Find the entry prior to the proposed starting address; if it's part
* of an existing entry, this range is bogus.
*/
if (vm_map_lookup_entry(map, start, &temp_entry))
return (KERN_NO_SPACE);
prev_entry = temp_entry;
/*
* Assert that the next entry doesn't overlap the end point.
*/
if ((prev_entry->next != &map->header) &&
(prev_entry->next->start < end))
return (KERN_NO_SPACE);
protoeflags = 0;
if (cow & MAP_COPY_ON_WRITE)
protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
if (cow & MAP_NOFAULT) {
protoeflags |= MAP_ENTRY_NOFAULT;
KASSERT(object == NULL,
("vm_map_insert: paradoxical MAP_NOFAULT request"));
}
if (cow & MAP_DISABLE_SYNCER)
protoeflags |= MAP_ENTRY_NOSYNC;
if (cow & MAP_DISABLE_COREDUMP)
protoeflags |= MAP_ENTRY_NOCOREDUMP;
if (object) {
/*
* When object is non-NULL, it could be shared with another
* process. We have to set or clear OBJ_ONEMAPPING
* appropriately.
*/
vm_object_lock(object);
if ((object->ref_count > 1) || (object->shadow_count != 0)) {
vm_object_clear_flag(object, OBJ_ONEMAPPING);
}
vm_object_unlock(object);
}
else if ((prev_entry != &map->header) &&
(prev_entry->eflags == protoeflags) &&
(prev_entry->end == start) &&
(prev_entry->wired_count == 0) &&
((prev_entry->object.vm_object == NULL) ||
vm_object_coalesce(prev_entry->object.vm_object,
OFF_TO_IDX(prev_entry->offset),
(vm_size_t)(prev_entry->end - prev_entry->start),
(vm_size_t)(end - prev_entry->end)))) {
/*
* We were able to extend the object. Determine if we
* can extend the previous map entry to include the
* new range as well.
*/
if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
(prev_entry->protection == prot) &&
(prev_entry->max_protection == max)) {
map->size += (end - prev_entry->end);
prev_entry->end = end;
vm_map_simplify_entry(map, prev_entry);
return (KERN_SUCCESS);
}
/*
* If we can extend the object but cannot extend the
* map entry, we have to create a new map entry. We
* must bump the ref count on the extended object to
* account for it. object may be NULL.
*/
object = prev_entry->object.vm_object;
offset = prev_entry->offset +
(prev_entry->end - prev_entry->start);
vm_object_reference(object);
}
/*
* NOTE: if conditionals fail, object can be NULL here. This occurs
* in things like the buffer map where we manage kva but do not manage
* backing objects.
*/
/*
* Create a new entry
*/
new_entry = vm_map_entry_create(map);
new_entry->start = start;
new_entry->end = end;
new_entry->eflags = protoeflags;
new_entry->object.vm_object = object;
new_entry->offset = offset;
new_entry->avail_ssize = 0;
new_entry->inheritance = VM_INHERIT_DEFAULT;
new_entry->protection = prot;
new_entry->max_protection = max;
new_entry->wired_count = 0;
/*
* Insert the new entry into the list
*/
vm_map_entry_link(map, prev_entry, new_entry);
map->size += new_entry->end - new_entry->start;
/*
* Update the free space hint
*/
if ((map->first_free == prev_entry) &&
(prev_entry->end >= new_entry->start)) {
map->first_free = new_entry;
}
#if 0
/*
* Temporarily removed to avoid MAP_STACK panic, due to
* MAP_STACK being a huge hack. Will be added back in
* when MAP_STACK (and the user stack mapping) is fixed.
*/
/*
* It may be possible to simplify the entry
*/
vm_map_simplify_entry(map, new_entry);
#endif
if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
mtx_lock(&Giant);
pmap_object_init_pt(map->pmap, start,
object, OFF_TO_IDX(offset), end - start,
cow & MAP_PREFAULT_PARTIAL);
mtx_unlock(&Giant);
}
return (KERN_SUCCESS);
}
/*
* Find sufficient space for `length' bytes in the given map, starting at
* `start'. The map must be locked. Returns 0 on success, 1 on no space.
*/
int
vm_map_findspace(
vm_map_t map,
vm_offset_t start,
vm_size_t length,
vm_offset_t *addr)
{
vm_map_entry_t entry, next;
vm_offset_t end;
if (start < map->min_offset)
start = map->min_offset;
if (start > map->max_offset)
return (1);
/*
* Look for the first possible address; if there's already something
* at this address, we have to start after it.
*/
if (start == map->min_offset) {
if ((entry = map->first_free) != &map->header)
start = entry->end;
} else {
vm_map_entry_t tmp;
if (vm_map_lookup_entry(map, start, &tmp))
start = tmp->end;
entry = tmp;
}
/*
* Look through the rest of the map, trying to fit a new region in the
* gap between existing regions, or after the very last region.
*/
for (;; start = (entry = next)->end) {
/*
* Find the end of the proposed new region. Be sure we didn't
* go beyond the end of the map, or wrap around the address;
* if so, we lose. Otherwise, if this is the last entry, or
* if the proposed new region fits before the next entry, we
* win.
*/
end = start + length;
if (end > map->max_offset || end < start)
return (1);
next = entry->next;
if (next == &map->header || next->start >= end)
break;
}
*addr = start;
if (map == kernel_map) {
vm_offset_t ksize;
if ((ksize = round_page(start + length)) > kernel_vm_end) {
mtx_lock(&Giant);
pmap_growkernel(ksize);
mtx_unlock(&Giant);
}
}
return (0);
}
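/*
 * Minimal usage sketch (vm_map_find() below is the real caller): with the
 * map locked,
 *
 *	if (vm_map_findspace(map, start, length, &addr) == 0)
 *		(void) vm_map_insert(map, NULL, 0, addr, addr + length,
 *		    VM_PROT_ALL, VM_PROT_ALL, 0);
 *
 * allocates an anonymous region at the first fit at or after start.
 */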
/*
* vm_map_find finds an unallocated region in the target address
* map with the given length. The search is defined to be
* first-fit from the specified address; the region found is
* returned in the same parameter.
*
* If object is non-NULL, ref count must be bumped by caller
* prior to making the call to account for the new entry.
*/
int
vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t *addr, /* IN/OUT */
vm_size_t length, boolean_t find_space, vm_prot_t prot,
vm_prot_t max, int cow)
{
vm_offset_t start;
int result, s = 0;
start = *addr;
if (map == kmem_map)
s = splvm();
vm_map_lock(map);
if (find_space) {
if (vm_map_findspace(map, start, length, addr)) {
vm_map_unlock(map);
if (map == kmem_map)
splx(s);
return (KERN_NO_SPACE);
}
start = *addr;
}
result = vm_map_insert(map, object, offset,
start, start + length, prot, max, cow);
vm_map_unlock(map);
if (map == kmem_map)
splx(s);
return (result);
}
/*
* vm_map_simplify_entry:
*
* Simplify the given map entry by merging with either neighbor. This
* routine also has the ability to merge with both neighbors.
*
* The map must be locked.
*
* This routine guarantees that the passed entry remains valid (though
* possibly extended). When merging, this routine may delete one or
* both neighbors.
*/
void
vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
{
vm_map_entry_t next, prev;
vm_size_t prevsize, esize;
if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP))
return;
prev = entry->prev;
if (prev != &map->header) {
prevsize = prev->end - prev->start;
if ( (prev->end == entry->start) &&
(prev->object.vm_object == entry->object.vm_object) &&
(!prev->object.vm_object ||
(prev->offset + prevsize == entry->offset)) &&
(prev->eflags == entry->eflags) &&
(prev->protection == entry->protection) &&
(prev->max_protection == entry->max_protection) &&
(prev->inheritance == entry->inheritance) &&
(prev->wired_count == entry->wired_count)) {
if (map->first_free == prev)
map->first_free = entry;
vm_map_entry_unlink(map, prev);
entry->start = prev->start;
entry->offset = prev->offset;
if (prev->object.vm_object)
vm_object_deallocate(prev->object.vm_object);
vm_map_entry_dispose(map, prev);
}
}
next = entry->next;
if (next != &map->header) {
esize = entry->end - entry->start;
if ((entry->end == next->start) &&
(next->object.vm_object == entry->object.vm_object) &&
(!entry->object.vm_object ||
(entry->offset + esize == next->offset)) &&
(next->eflags == entry->eflags) &&
(next->protection == entry->protection) &&
(next->max_protection == entry->max_protection) &&
(next->inheritance == entry->inheritance) &&
(next->wired_count == entry->wired_count)) {
if (map->first_free == next)
map->first_free = entry;
vm_map_entry_unlink(map, next);
entry->end = next->end;
if (next->object.vm_object)
vm_object_deallocate(next->object.vm_object);
vm_map_entry_dispose(map, next);
}
}
}
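/*
 * Illustrative example of a merge: two adjacent entries [A, B) and [B, C)
 * backed by the same object, with prev->offset + (B - A) == entry->offset
 * and identical eflags, protection, max_protection, inheritance and
 * wired_count, collapse into a single [A, C) entry; the discarded entry's
 * object reference is dropped via vm_object_deallocate().
 */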
/*
* vm_map_clip_start: [ internal use only ]
*
* Asserts that the given entry begins at or after
* the specified address; if necessary,
* it splits the entry into two.
*/
#define vm_map_clip_start(map, entry, startaddr) \
{ \
if (startaddr > entry->start) \
_vm_map_clip_start(map, entry, startaddr); \
}
/*
* This routine is called only when it is known that
* the entry must be split.
*/
static void
_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
{
vm_map_entry_t new_entry;
/*
* Split off the front portion -- note that we must insert the new
* entry BEFORE this one, so that this entry has the specified
* starting address.
*/
vm_map_simplify_entry(map, entry);
/*
* If there is no object backing this entry, we might as well create
* one now. If we defer it, an object can get created after the map
* is clipped, and individual objects will be created for the split-up
* map. This is a bit of a hack, but is also about the best place to
* put this improvement.
*/
if (entry->object.vm_object == NULL && !map->system_map) {
vm_object_t object;
object = vm_object_allocate(OBJT_DEFAULT,
atop(entry->end - entry->start));
entry->object.vm_object = object;
entry->offset = 0;
}
new_entry = vm_map_entry_create(map);
*new_entry = *entry;
new_entry->end = start;
entry->offset += (start - entry->start);
entry->start = start;
vm_map_entry_link(map, entry->prev, new_entry);
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
vm_object_reference(new_entry->object.vm_object);
}
}
/*
* vm_map_clip_end: [ internal use only ]
*
* Asserts that the given entry ends at or before
* the specified address; if necessary,
* it splits the entry into two.
*/
#define vm_map_clip_end(map, entry, endaddr) \
{ \
if (endaddr < entry->end) \
_vm_map_clip_end(map, entry, endaddr); \
}
/*
* This routine is called only when it is known that
* the entry must be split.
*/
static void
_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
{
vm_map_entry_t new_entry;
/*
* If there is no object backing this entry, we might as well create
* one now. If we defer it, an object can get created after the map
* is clipped, and individual objects will be created for the split-up
* map. This is a bit of a hack, but is also about the best place to
* put this improvement.
*/
if (entry->object.vm_object == NULL && !map->system_map) {
vm_object_t object;
object = vm_object_allocate(OBJT_DEFAULT,
atop(entry->end - entry->start));
entry->object.vm_object = object;
entry->offset = 0;
}
/*
* Create a new entry and insert it AFTER the specified entry
*/
new_entry = vm_map_entry_create(map);
*new_entry = *entry;
new_entry->start = entry->end = end;
new_entry->offset += (end - entry->start);
vm_map_entry_link(map, entry, new_entry);
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
vm_object_reference(new_entry->object.vm_object);
}
}
/*
* VM_MAP_RANGE_CHECK: [ internal use only ]
*
* Asserts that the starting and ending region
* addresses fall within the valid range of the map.
*/
#define VM_MAP_RANGE_CHECK(map, start, end) \
{ \
if (start < vm_map_min(map)) \
start = vm_map_min(map); \
if (end > vm_map_max(map)) \
end = vm_map_max(map); \
if (start > end) \
start = end; \
}
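/*
 * Worked example: with vm_map_min(map) == 0x1000 and
 * vm_map_max(map) == 0x8000, a request for [0x0, 0x9000) is clamped to
 * [0x1000, 0x8000), and a request lying entirely outside the map
 * degenerates to an empty range (start == end), so the per-entry loops
 * below simply do nothing.
 */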
/*
* vm_map_submap: [ kernel use only ]
*
* Mark the given range as handled by a subordinate map.
*
* This range must have been created with vm_map_find,
* and no other operations may have been performed on this
* range prior to calling vm_map_submap.
*
* Only a limited number of operations can be performed
* within this range after calling vm_map_submap:
* vm_fault
* [Don't try vm_map_copy!]
*
* To remove a submapping, one must first remove the
* range from the superior map, and then destroy the
* submap (if desired). [Better yet, don't try it.]
*/
int
vm_map_submap(
vm_map_t map,
vm_offset_t start,
vm_offset_t end,
vm_map_t submap)
{
vm_map_entry_t entry;
int result = KERN_INVALID_ARGUMENT;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &entry)) {
vm_map_clip_start(map, entry, start);
} else
entry = entry->next;
vm_map_clip_end(map, entry, end);
if ((entry->start == start) && (entry->end == end) &&
((entry->eflags & MAP_ENTRY_COW) == 0) &&
(entry->object.vm_object == NULL)) {
entry->object.sub_map = submap;
entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
result = KERN_SUCCESS;
}
vm_map_unlock(map);
return (result);
}
/*
* vm_map_protect:
*
* Sets the protection of the specified address
* region in the target map. If "set_max" is
* specified, the maximum protection is to be set;
* otherwise, only the current protection is affected.
*/
int
vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
vm_prot_t new_prot, boolean_t set_max)
{
vm_map_entry_t current;
vm_map_entry_t entry;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &entry)) {
vm_map_clip_start(map, entry, start);
} else {
entry = entry->next;
}
/*
* Make a first pass to check for protection violations.
*/
current = entry;
while ((current != &map->header) && (current->start < end)) {
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_unlock(map);
return (KERN_INVALID_ARGUMENT);
}
if ((new_prot & current->max_protection) != new_prot) {
vm_map_unlock(map);
return (KERN_PROTECTION_FAILURE);
}
current = current->next;
}
/*
* Go back and fix up protections. [Note that clipping is not
* necessary the second time.]
*/
current = entry;
while ((current != &map->header) && (current->start < end)) {
vm_prot_t old_prot;
vm_map_clip_end(map, current, end);
old_prot = current->protection;
if (set_max)
current->protection =
(current->max_protection = new_prot) &
old_prot;
else
current->protection = new_prot;
/*
* Update physical map if necessary. Worry about copy-on-write
* here -- CHECK THIS XXX
*/
if (current->protection != old_prot) {
mtx_lock(&Giant);
#define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
VM_PROT_ALL)
pmap_protect(map->pmap, current->start,
current->end,
current->protection & MASK(current));
#undef MASK
mtx_unlock(&Giant);
}
vm_map_simplify_entry(map, current);
current = current->next;
}
vm_map_unlock(map);
return (KERN_SUCCESS);
}
/*
* vm_map_madvise:
*
* This routine traverses a process's map handling the madvise
* system call.  Advisories are classified as either those affecting
* the vm_map_entry structure or those affecting the underlying
* objects.
*/
int
vm_map_madvise(
vm_map_t map,
vm_offset_t start,
vm_offset_t end,
int behav)
{
vm_map_entry_t current, entry;
int modify_map = 0;
/*
* Some madvise calls directly modify the vm_map_entry, in which case
* we need to use an exclusive lock on the map and we need to perform
* various clipping operations. Otherwise we only need a read-lock
* on the map.
*/
switch(behav) {
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
case MADV_NOSYNC:
case MADV_AUTOSYNC:
case MADV_NOCORE:
case MADV_CORE:
modify_map = 1;
vm_map_lock(map);
break;
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
vm_map_lock_read(map);
break;
default:
return (KERN_INVALID_ARGUMENT);
}
/*
* Locate starting entry and clip if necessary.
*/
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &entry)) {
if (modify_map)
vm_map_clip_start(map, entry, start);
} else {
entry = entry->next;
}
if (modify_map) {
/*
* madvise behaviors that are implemented in the vm_map_entry.
*
* We clip the vm_map_entry so that behavioral changes are
* limited to the specified address range.
*/
for (current = entry;
(current != &map->header) && (current->start < end);
current = current->next
) {
if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
vm_map_clip_end(map, current, end);
switch (behav) {
case MADV_NORMAL:
vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
break;
case MADV_SEQUENTIAL:
vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
break;
case MADV_RANDOM:
vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
break;
case MADV_NOSYNC:
current->eflags |= MAP_ENTRY_NOSYNC;
break;
case MADV_AUTOSYNC:
current->eflags &= ~MAP_ENTRY_NOSYNC;
break;
case MADV_NOCORE:
current->eflags |= MAP_ENTRY_NOCOREDUMP;
break;
case MADV_CORE:
current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
break;
default:
break;
}
vm_map_simplify_entry(map, current);
}
vm_map_unlock(map);
} else {
vm_pindex_t pindex;
int count;
/*
* madvise behaviors that are implemented in the underlying
* vm_object.
*
* Since we don't clip the vm_map_entry, we have to clip
* the vm_object pindex and count.
*/
for (current = entry;
(current != &map->header) && (current->start < end);
current = current->next
) {
vm_offset_t useStart;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
pindex = OFF_TO_IDX(current->offset);
count = atop(current->end - current->start);
useStart = current->start;
if (current->start < start) {
pindex += atop(start - current->start);
count -= atop(start - current->start);
useStart = start;
}
if (current->end > end)
count -= atop(current->end - end);
if (count <= 0)
continue;
vm_object_madvise(current->object.vm_object,
pindex, count, behav);
if (behav == MADV_WILLNEED) {
mtx_lock(&Giant);
pmap_object_init_pt(
map->pmap,
useStart,
current->object.vm_object,
pindex,
(count << PAGE_SHIFT),
MAP_PREFAULT_MADVISE
);
mtx_unlock(&Giant);
}
}
vm_map_unlock_read(map);
}
return (0);
}
/*
* vm_map_inherit:
*
* Sets the inheritance of the specified address
* range in the target map. Inheritance
* affects how the map will be shared with
* child maps at the time of vm_map_fork.
*/
int
vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
vm_inherit_t new_inheritance)
{
vm_map_entry_t entry;
vm_map_entry_t temp_entry;
switch (new_inheritance) {
case VM_INHERIT_NONE:
case VM_INHERIT_COPY:
case VM_INHERIT_SHARE:
break;
default:
return (KERN_INVALID_ARGUMENT);
}
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (vm_map_lookup_entry(map, start, &temp_entry)) {
entry = temp_entry;
vm_map_clip_start(map, entry, start);
} else
entry = temp_entry->next;
while ((entry != &map->header) && (entry->start < end)) {
vm_map_clip_end(map, entry, end);
entry->inheritance = new_inheritance;
vm_map_simplify_entry(map, entry);
entry = entry->next;
}
vm_map_unlock(map);
return (KERN_SUCCESS);
}
/*
* vm_map_unwire:
*
* Implements both kernel and user unwiring.
*/
int
vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
boolean_t user_unwire)
{
vm_map_entry_t entry, first_entry, tmp_entry;
vm_offset_t saved_start;
unsigned int last_timestamp;
int rv;
boolean_t need_wakeup, result;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!vm_map_lookup_entry(map, start, &first_entry)) {
vm_map_unlock(map);
return (KERN_INVALID_ADDRESS);
}
last_timestamp = map->timestamp;
entry = first_entry;
while (entry != &map->header && entry->start < end) {
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
/*
* We have not yet clipped the entry.
*/
saved_start = (start >= entry->start) ? start :
entry->start;
entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
if (vm_map_unlock_and_wait(map, user_unwire)) {
/*
* Allow interruption of user unwiring?
*/
}
vm_map_lock(map);
if (last_timestamp+1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked.
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
if (!vm_map_lookup_entry(map, saved_start,
&tmp_entry)) {
if (saved_start == start) {
/*
* First_entry has been deleted.
*/
vm_map_unlock(map);
return (KERN_INVALID_ADDRESS);
}
end = saved_start;
rv = KERN_INVALID_ADDRESS;
goto done;
}
if (entry == first_entry)
first_entry = tmp_entry;
else
first_entry = NULL;
entry = tmp_entry;
}
last_timestamp = map->timestamp;
continue;
}
vm_map_clip_start(map, entry, start);
vm_map_clip_end(map, entry, end);
/*
* Mark the entry in case the map lock is released. (See
* above.)
*/
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
/*
* Check the map for holes in the specified region.
*/
if (entry->end < end && (entry->next == &map->header ||
entry->next->start > entry->end)) {
end = entry->end;
rv = KERN_INVALID_ADDRESS;
goto done;
}
/*
* Require that the entry is wired.
*/
if (entry->wired_count == 0 || (user_unwire &&
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0)) {
end = entry->end;
rv = KERN_INVALID_ARGUMENT;
goto done;
}
entry = entry->next;
}
rv = KERN_SUCCESS;
done:
need_wakeup = FALSE;
if (first_entry == NULL) {
result = vm_map_lookup_entry(map, start, &first_entry);
KASSERT(result, ("vm_map_unwire: lookup failed"));
}
entry = first_entry;
while (entry != &map->header && entry->start < end) {
if (rv == KERN_SUCCESS) {
if (user_unwire)
entry->eflags &= ~MAP_ENTRY_USER_WIRED;
entry->wired_count--;
if (entry->wired_count == 0) {
/*
* Retain the map lock.
*/
vm_fault_unwire(map, entry->start, entry->end);
}
}
KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
("vm_map_unwire: in-transition flag missing"));
entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
need_wakeup = TRUE;
}
vm_map_simplify_entry(map, entry);
entry = entry->next;
}
vm_map_unlock(map);
if (need_wakeup)
vm_map_wakeup(map);
return (rv);
}
/*
* vm_map_wire:
*
* Implements both kernel and user wiring.
*/
int
vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
boolean_t user_wire)
{
vm_map_entry_t entry, first_entry, tmp_entry;
vm_offset_t saved_end, saved_start;
unsigned int last_timestamp;
int rv;
boolean_t need_wakeup, result;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!vm_map_lookup_entry(map, start, &first_entry)) {
vm_map_unlock(map);
return (KERN_INVALID_ADDRESS);
}
last_timestamp = map->timestamp;
entry = first_entry;
while (entry != &map->header && entry->start < end) {
if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
/*
* We have not yet clipped the entry.
*/
saved_start = (start >= entry->start) ? start :
entry->start;
entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
if (vm_map_unlock_and_wait(map, user_wire)) {
/*
* Allow interruption of user wiring?
*/
}
vm_map_lock(map);
if (last_timestamp + 1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked.
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
if (!vm_map_lookup_entry(map, saved_start,
&tmp_entry)) {
if (saved_start == start) {
/*
* first_entry has been deleted.
*/
vm_map_unlock(map);
return (KERN_INVALID_ADDRESS);
}
end = saved_start;
rv = KERN_INVALID_ADDRESS;
goto done;
}
if (entry == first_entry)
first_entry = tmp_entry;
else
first_entry = NULL;
entry = tmp_entry;
}
last_timestamp = map->timestamp;
continue;
}
vm_map_clip_start(map, entry, start);
vm_map_clip_end(map, entry, end);
/*
* Mark the entry in case the map lock is released. (See
* above.)
*/
entry->eflags |= MAP_ENTRY_IN_TRANSITION;
/*
*
*/
if (entry->wired_count == 0) {
entry->wired_count++;
saved_start = entry->start;
saved_end = entry->end;
/*
* Release the map lock, relying on the in-transition
* mark.
*/
vm_map_unlock(map);
rv = vm_fault_wire(map, saved_start, saved_end,
user_wire);
vm_map_lock(map);
if (last_timestamp + 1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked. The entry
* may have been clipped, but NOT merged or
* deleted.
*/
result = vm_map_lookup_entry(map, saved_start,
&tmp_entry);
KASSERT(result, ("vm_map_wire: lookup failed"));
if (entry == first_entry)
first_entry = tmp_entry;
else
first_entry = NULL;
entry = tmp_entry;
while (entry->end < saved_end) {
if (rv != KERN_SUCCESS) {
KASSERT(entry->wired_count == 1,
("vm_map_wire: bad count"));
entry->wired_count = -1;
}
entry = entry->next;
}
}
last_timestamp = map->timestamp;
if (rv != KERN_SUCCESS) {
KASSERT(entry->wired_count == 1,
("vm_map_wire: bad count"));
/*
* Assign an out-of-range value to represent
* the failure to wire this entry.
*/
entry->wired_count = -1;
end = entry->end;
goto done;
}
} else if (!user_wire ||
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
entry->wired_count++;
}
/*
* Check the map for holes in the specified region.
*/
if (entry->end < end && (entry->next == &map->header ||
entry->next->start > entry->end)) {
end = entry->end;
rv = KERN_INVALID_ADDRESS;
goto done;
}
entry = entry->next;
}
rv = KERN_SUCCESS;
done:
need_wakeup = FALSE;
if (first_entry == NULL) {
result = vm_map_lookup_entry(map, start, &first_entry);
KASSERT(result, ("vm_map_wire: lookup failed"));
}
entry = first_entry;
while (entry != &map->header && entry->start < end) {
if (rv == KERN_SUCCESS) {
if (user_wire)
entry->eflags |= MAP_ENTRY_USER_WIRED;
} else if (entry->wired_count == -1) {
/*
* Wiring failed on this entry. Thus, unwiring is
* unnecessary.
*/
entry->wired_count = 0;
} else {
if (!user_wire || (entry->wired_count == 1 &&
(entry->eflags & MAP_ENTRY_USER_WIRED) == 0))
entry->wired_count--;
if (entry->wired_count == 0) {
/*
* Retain the map lock.
*/
vm_fault_unwire(map, entry->start, entry->end);
}
}
KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
("vm_map_wire: in-transition flag missing"));
entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
need_wakeup = TRUE;
}
vm_map_simplify_entry(map, entry);
entry = entry->next;
}
vm_map_unlock(map);
if (need_wakeup)
vm_map_wakeup(map);
return (rv);
}
/*
* vm_map_clean
*
* Push any dirty cached pages in the address range to their pager.
* If syncio is TRUE, dirty pages are written synchronously.
* If invalidate is TRUE, any cached pages are freed as well.
*
* Returns an error if any part of the specified range is not mapped.
*/
int
vm_map_clean(
vm_map_t map,
vm_offset_t start,
vm_offset_t end,
boolean_t syncio,
boolean_t invalidate)
{
vm_map_entry_t current;
vm_map_entry_t entry;
vm_size_t size;
vm_object_t object;
vm_ooffset_t offset;
GIANT_REQUIRED;
vm_map_lock_read(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!vm_map_lookup_entry(map, start, &entry)) {
vm_map_unlock_read(map);
return (KERN_INVALID_ADDRESS);
}
/*
* Make a first pass to check for holes.
*/
for (current = entry; current->start < end; current = current->next) {
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_unlock_read(map);
return (KERN_INVALID_ARGUMENT);
}
if (end > current->end &&
(current->next == &map->header ||
current->end != current->next->start)) {
vm_map_unlock_read(map);
return (KERN_INVALID_ADDRESS);
}
}
if (invalidate)
pmap_remove(vm_map_pmap(map), start, end);
/*
* Make a second pass, cleaning/uncaching pages from the indicated
* objects as we go.
*/
for (current = entry; current->start < end; current = current->next) {
offset = current->offset + (start - current->start);
size = (end <= current->end ? end : current->end) - start;
if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_t smap;
vm_map_entry_t tentry;
vm_size_t tsize;
smap = current->object.sub_map;
vm_map_lock_read(smap);
(void) vm_map_lookup_entry(smap, offset, &tentry);
tsize = tentry->end - offset;
if (tsize < size)
size = tsize;
object = tentry->object.vm_object;
offset = tentry->offset + (offset - tentry->start);
vm_map_unlock_read(smap);
} else {
object = current->object.vm_object;
}
/*
* Note that there is absolutely no sense in writing out
* anonymous objects, so we track down the vnode object
* to write out.
* We invalidate (remove) all pages from the address space
* anyway, for semantic correctness.
*
* note: certain anonymous maps, such as MAP_NOSYNC maps,
* may start out with a NULL object.
*/
while (object && object->backing_object) {
object = object->backing_object;
offset += object->backing_object_offset;
if (object->size < OFF_TO_IDX(offset + size))
size = IDX_TO_OFF(object->size) - offset;
}
if (object && (object->type == OBJT_VNODE) &&
(current->protection & VM_PROT_WRITE)) {
/*
* Flush pages if writing is allowed, invalidate them
* if invalidation requested. Pages undergoing I/O
* will be ignored by vm_object_page_remove().
*
* We cannot lock the vnode and then wait for paging
* to complete without deadlocking against vm_fault.
* Instead we simply call vm_object_page_remove() and
* allow it to block internally on a page-by-page
* basis when it encounters pages undergoing async
* I/O.
*/
int flags;
vm_object_reference(object);
vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY, curthread);
flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
flags |= invalidate ? OBJPC_INVAL : 0;
vm_object_page_clean(object,
OFF_TO_IDX(offset),
OFF_TO_IDX(offset + size + PAGE_MASK),
flags);
if (invalidate) {
/*vm_object_pip_wait(object, "objmcl");*/
vm_object_page_remove(object,
OFF_TO_IDX(offset),
OFF_TO_IDX(offset + size + PAGE_MASK),
FALSE);
}
VOP_UNLOCK(object->handle, 0, curthread);
vm_object_deallocate(object);
}
start += size;
}
vm_map_unlock_read(map);
return (KERN_SUCCESS);
}
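/*
 * Illustrative sketch (not part of the original file): msync(2), in
 * vm_mmap.c below, translates its MS_* flags into the syncio/invalidate
 * arguments of vm_map_clean() roughly like this.
 */
#if 0
rv = vm_map_clean(map, addr, addr + size,
    (flags & MS_ASYNC) == 0,		/* syncio: write synchronously */
    (flags & MS_INVALIDATE) != 0);	/* invalidate cached pages too */
#endif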
/*
* vm_map_entry_unwire: [ internal use only ]
*
* Make the region specified by this entry pageable.
*
* The map in question should be locked.
* [This is the reason for this routine's existence.]
*/
static void
vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
{
vm_fault_unwire(map, entry->start, entry->end);
entry->wired_count = 0;
}
/*
* vm_map_entry_delete: [ internal use only ]
*
* Deallocate the given entry from the target map.
*/
static void
vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
{
vm_map_entry_unlink(map, entry);
map->size -= entry->end - entry->start;
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
vm_object_deallocate(entry->object.vm_object);
}
vm_map_entry_dispose(map, entry);
}
/*
* vm_map_delete: [ internal use only ]
*
* Deallocates the given address range from the target
* map.
*/
int
vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
vm_object_t object;
vm_map_entry_t entry;
vm_map_entry_t first_entry;
/*
* Find the start of the region, and clip it
*/
if (!vm_map_lookup_entry(map, start, &first_entry))
entry = first_entry->next;
else {
entry = first_entry;
vm_map_clip_start(map, entry, start);
}
/*
* Save the free space hint
*/
if (entry == &map->header) {
map->first_free = &map->header;
} else if (map->first_free->start >= start) {
map->first_free = entry->prev;
}
/*
* Step through all entries in this region
*/
while ((entry != &map->header) && (entry->start < end)) {
vm_map_entry_t next;
vm_offset_t s, e;
vm_pindex_t offidxstart, offidxend, count;
/*
* Wait for wiring or unwiring of an entry to complete.
*/
if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0) {
unsigned int last_timestamp;
vm_offset_t saved_start;
vm_map_entry_t tmp_entry;
saved_start = entry->start;
entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
last_timestamp = map->timestamp;
(void) vm_map_unlock_and_wait(map, FALSE);
vm_map_lock(map);
if (last_timestamp + 1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked.
* Specifically, the entry may have been
* clipped, merged, or deleted.
*/
if (!vm_map_lookup_entry(map, saved_start,
&tmp_entry))
entry = tmp_entry->next;
else {
entry = tmp_entry;
vm_map_clip_start(map, entry,
saved_start);
}
}
continue;
}
vm_map_clip_end(map, entry, end);
s = entry->start;
e = entry->end;
next = entry->next;
offidxstart = OFF_TO_IDX(entry->offset);
count = OFF_TO_IDX(e - s);
object = entry->object.vm_object;
/*
* Unwire before removing addresses from the pmap; otherwise,
* unwiring will put the entries back in the pmap.
*/
if (entry->wired_count != 0) {
vm_map_entry_unwire(map, entry);
}
offidxend = offidxstart + count;
if ((object == kernel_object) || (object == kmem_object)) {
vm_object_page_remove(object, offidxstart, offidxend, FALSE);
} else {
mtx_lock(&Giant);
pmap_remove(map->pmap, s, e);
if (object != NULL &&
object->ref_count != 1 &&
(object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING &&
(object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
vm_object_collapse(object);
vm_object_page_remove(object, offidxstart, offidxend, FALSE);
if (object->type == OBJT_SWAP) {
swap_pager_freespace(object, offidxstart, count);
}
if (offidxend >= object->size &&
offidxstart < object->size) {
object->size = offidxstart;
}
}
mtx_unlock(&Giant);
}
/*
* Delete the entry (which may delete the object) only after
* removing all pmap entries pointing to its pages.
* (Otherwise, its page frames may be reallocated, and any
* modify bits will be set in the wrong object!)
*/
vm_map_entry_delete(map, entry);
entry = next;
}
return (KERN_SUCCESS);
}
/*
* vm_map_remove:
*
* Remove the given address range from the target map.
* This is the exported form of vm_map_delete.
*/
int
vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
int result, s = 0;
if (map == kmem_map)
s = splvm();
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
result = vm_map_delete(map, start, end);
vm_map_unlock(map);
if (map == kmem_map)
splx(s);
return (result);
}
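/*
 * Illustrative sketch (not part of the original file): vm_mmap(), in
 * vm_mmap.c below, uses vm_map_remove() to discard any existing mapping
 * before a MAP_FIXED request is installed.
 */
#if 0
if ((flags & MAP_FIXED) != 0)
(void) vm_map_remove(map, *addr, *addr + size);
#endif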
/*
* vm_map_check_protection:
*
* Assert that the target map allows the specified
* privilege on the entire address region given.
* The entire region must be allocated.
*/
boolean_t
vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
vm_prot_t protection)
{
vm_map_entry_t entry;
vm_map_entry_t tmp_entry;
vm_map_lock_read(map);
if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
vm_map_unlock_read(map);
return (FALSE);
}
entry = tmp_entry;
while (start < end) {
if (entry == &map->header) {
vm_map_unlock_read(map);
return (FALSE);
}
/*
* No holes allowed!
*/
if (start < entry->start) {
vm_map_unlock_read(map);
return (FALSE);
}
/*
* Check protection associated with entry.
*/
if ((entry->protection & protection) != protection) {
vm_map_unlock_read(map);
return (FALSE);
}
/* go to next entry */
start = entry->end;
entry = entry->next;
}
vm_map_unlock_read(map);
return (TRUE);
}
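/*
 * Illustrative sketch (not part of the original file): munmap(2), in
 * vm_mmap.c below, calls vm_map_check_protection() with VM_PROT_NONE
 * simply to verify that the whole range is allocated before removing it.
 */
#if 0
if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
return (EINVAL);
(void) vm_map_remove(map, addr, addr + size);
#endif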
/*
* vm_map_copy_entry:
*
* Copies the contents of the source entry to the destination
* entry. The entries *must* be aligned properly.
*/
static void
vm_map_copy_entry(
vm_map_t src_map,
vm_map_t dst_map,
vm_map_entry_t src_entry,
vm_map_entry_t dst_entry)
{
vm_object_t src_object;
if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
return;
if (src_entry->wired_count == 0) {
/*
* If the source entry is marked needs_copy, it is already
* write-protected.
*/
if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
pmap_protect(src_map->pmap,
src_entry->start,
src_entry->end,
src_entry->protection & ~VM_PROT_WRITE);
}
/*
* Make a copy of the object.
*/
if ((src_object = src_entry->object.vm_object) != NULL) {
if ((src_object->handle == NULL) &&
(src_object->type == OBJT_DEFAULT ||
src_object->type == OBJT_SWAP)) {
vm_object_collapse(src_object);
if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
vm_object_split(src_entry);
src_object = src_entry->object.vm_object;
}
}
vm_object_reference(src_object);
vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
dst_entry->object.vm_object = src_object;
src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
dst_entry->offset = src_entry->offset;
} else {
dst_entry->object.vm_object = NULL;
dst_entry->offset = 0;
}
pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
dst_entry->end - dst_entry->start, src_entry->start);
} else {
/*
* Wired-down pages cannot be made copy-on-write, so copy them
* into the new map by simulating faults (the new pages are
* pageable).
*/
vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
}
}
/*
* vmspace_fork:
* Create a new process vmspace structure and vm_map
* based on those of an existing process. The new map
* is based on the old map, according to the inheritance
* values on the regions in that map.
*
* The source map must not be locked.
*/
struct vmspace *
vmspace_fork(struct vmspace *vm1)
{
struct vmspace *vm2;
vm_map_t old_map = &vm1->vm_map;
vm_map_t new_map;
vm_map_entry_t old_entry;
vm_map_entry_t new_entry;
vm_object_t object;
GIANT_REQUIRED;
vm_map_lock(old_map);
old_map->infork = 1;
vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
(caddr_t) &vm1->vm_endcopy - (caddr_t) &vm1->vm_startcopy);
new_map = &vm2->vm_map; /* XXX */
new_map->timestamp = 1;
old_entry = old_map->header.next;
while (old_entry != &old_map->header) {
if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
panic("vm_map_fork: encountered a submap");
switch (old_entry->inheritance) {
case VM_INHERIT_NONE:
break;
case VM_INHERIT_SHARE:
/*
* Clone the entry, creating the shared object if necessary.
*/
object = old_entry->object.vm_object;
if (object == NULL) {
object = vm_object_allocate(OBJT_DEFAULT,
atop(old_entry->end - old_entry->start));
old_entry->object.vm_object = object;
old_entry->offset = (vm_offset_t) 0;
}
/*
* Add the reference before calling vm_object_shadow
* to ensure that a shadow object is created.
*/
vm_object_reference(object);
if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
vm_object_shadow(&old_entry->object.vm_object,
&old_entry->offset,
atop(old_entry->end - old_entry->start));
old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
/* Transfer the second reference too. */
vm_object_reference(
old_entry->object.vm_object);
vm_object_deallocate(object);
object = old_entry->object.vm_object;
}
vm_object_clear_flag(object, OBJ_ONEMAPPING);
/*
* Clone the entry, referencing the shared object.
*/
new_entry = vm_map_entry_create(new_map);
*new_entry = *old_entry;
new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
new_entry->wired_count = 0;
/*
* Insert the entry into the new map -- we know we're
* inserting at the end of the new map.
*/
vm_map_entry_link(new_map, new_map->header.prev,
new_entry);
/*
* Update the physical map
*/
pmap_copy(new_map->pmap, old_map->pmap,
new_entry->start,
(old_entry->end - old_entry->start),
old_entry->start);
break;
case VM_INHERIT_COPY:
/*
* Clone the entry and link into the map.
*/
new_entry = vm_map_entry_create(new_map);
*new_entry = *old_entry;
new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
new_entry->wired_count = 0;
new_entry->object.vm_object = NULL;
vm_map_entry_link(new_map, new_map->header.prev,
new_entry);
vm_map_copy_entry(old_map, new_map, old_entry,
new_entry);
break;
}
old_entry = old_entry->next;
}
new_map->size = old_map->size;
old_map->infork = 0;
vm_map_unlock(old_map);
return (vm2);
}
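/*
 * Illustrative sketch (not part of the original file): a caller such as
 * the fork path would typically give the child either a reference to the
 * parent's vmspace or a copy produced by vmspace_fork(). The surrounding
 * names and the RFMEM test are hypothetical.
 */
#if 0
if (shared_address_space) {	/* e.g. rfork(RFMEM) */
p1->p_vmspace->vm_refcnt++;
p2->p_vmspace = p1->p_vmspace;
} else
p2->p_vmspace = vmspace_fork(p1->p_vmspace);
#endif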
int
vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
vm_prot_t prot, vm_prot_t max, int cow)
{
vm_map_entry_t prev_entry;
vm_map_entry_t new_stack_entry;
vm_size_t init_ssize;
int rv;
- if (VM_MIN_ADDRESS > 0 && addrbos < VM_MIN_ADDRESS)
+ if (addrbos < vm_map_min(map))
return (KERN_NO_SPACE);
if (max_ssize < sgrowsiz)
init_ssize = max_ssize;
else
init_ssize = sgrowsiz;
vm_map_lock(map);
/* If addr is already mapped, no go */
if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
vm_map_unlock(map);
return (KERN_NO_SPACE);
}
/* If we would blow our VMEM resource limit, no go */
if (map->size + init_ssize >
curthread->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
vm_map_unlock(map);
return (KERN_NO_SPACE);
}
/* If we can't accommodate max_ssize in the current mapping,
* no go. However, we need to be aware that subsequent user
* mappings might map into the space we have reserved for
* stack, and currently this space is not protected.
*
* Hopefully we will at least detect this condition
* when we try to grow the stack.
*/
if ((prev_entry->next != &map->header) &&
(prev_entry->next->start < addrbos + max_ssize)) {
vm_map_unlock(map);
return (KERN_NO_SPACE);
}
/* We initially map a stack of only init_ssize. We will
* grow as needed later. Since this is to be a grow
* down stack, we map at the top of the range.
*
* Note: we would normally expect prot and max to be
* VM_PROT_ALL, and cow to be 0. Possibly we should
* eliminate these as input parameters, and just
* pass these values here in the insert call.
*/
rv = vm_map_insert(map, NULL, 0, addrbos + max_ssize - init_ssize,
addrbos + max_ssize, prot, max, cow);
/* Now set the avail_ssize amount */
if (rv == KERN_SUCCESS){
if (prev_entry != &map->header)
vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize);
new_stack_entry = prev_entry->next;
if (new_stack_entry->end != addrbos + max_ssize ||
new_stack_entry->start != addrbos + max_ssize - init_ssize)
panic ("Bad entry start/end for new stack entry");
else
new_stack_entry->avail_ssize = max_ssize - init_ssize;
}
vm_map_unlock(map);
return (rv);
}
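/*
 * Illustrative sketch (not part of the original file): vm_mmap(), in
 * vm_mmap.c below, dispatches MAP_STACK requests to vm_map_stack()
 * instead of vm_map_find().
 */
#if 0
if (flags & MAP_STACK)
rv = vm_map_stack(map, *addr, size, prot, maxprot, docow);
else
rv = vm_map_find(map, object, foff, addr, size, fitit,
    prot, maxprot, docow);
#endif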
/* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the
* desired address is already mapped, or if we successfully grow
* the stack. Also returns KERN_SUCCESS if addr is outside the
* stack range (this is strange, but preserves compatibility with
* the grow function in vm_machdep.c).
*/
int
vm_map_growstack (struct proc *p, vm_offset_t addr)
{
vm_map_entry_t prev_entry;
vm_map_entry_t stack_entry;
vm_map_entry_t new_stack_entry;
struct vmspace *vm = p->p_vmspace;
vm_map_t map = &vm->vm_map;
vm_offset_t end;
int grow_amount;
int rv;
int is_procstack;
GIANT_REQUIRED;
Retry:
vm_map_lock_read(map);
/* If addr is already in the entry range, no need to grow.*/
if (vm_map_lookup_entry(map, addr, &prev_entry)) {
vm_map_unlock_read(map);
return (KERN_SUCCESS);
}
if ((stack_entry = prev_entry->next) == &map->header) {
vm_map_unlock_read(map);
return (KERN_SUCCESS);
}
if (prev_entry == &map->header)
end = stack_entry->start - stack_entry->avail_ssize;
else
end = prev_entry->end;
/* This next test mimics the old grow function in vm_machdep.c.
* It really doesn't quite make sense, but we do it anyway
* for compatibility.
*
* If the stack is not growable, return success. This signals the
* caller to proceed as it normally would with ordinary VM.
*/
if (stack_entry->avail_ssize < 1 ||
addr >= stack_entry->start ||
addr < stack_entry->start - stack_entry->avail_ssize) {
vm_map_unlock_read(map);
return (KERN_SUCCESS);
}
/* Find the minimum grow amount */
grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
if (grow_amount > stack_entry->avail_ssize) {
vm_map_unlock_read(map);
return (KERN_NO_SPACE);
}
/* If there is no longer enough space between the entries,
* fail and adjust the available space. Note: this
* should only happen if the user has mapped into the
* stack area after the stack was created, and is
* probably an error.
*
* This also effectively destroys any guard page the user
* might have intended by limiting the stack size.
*/
if (grow_amount > stack_entry->start - end) {
if (vm_map_lock_upgrade(map))
goto Retry;
stack_entry->avail_ssize = stack_entry->start - end;
vm_map_unlock(map);
return (KERN_NO_SPACE);
}
is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
/* If this is the main process stack, see if we're over the
* stack limit.
*/
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
vm_map_unlock_read(map);
return (KERN_NO_SPACE);
}
/* Round up the grow amount to a multiple of sgrowsiz */
grow_amount = roundup (grow_amount, sgrowsiz);
if (grow_amount > stack_entry->avail_ssize) {
grow_amount = stack_entry->avail_ssize;
}
if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur -
ctob(vm->vm_ssize);
}
/* If we would blow our VMEM resource limit, no go */
if (map->size + grow_amount >
curthread->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
vm_map_unlock_read(map);
return (KERN_NO_SPACE);
}
if (vm_map_lock_upgrade(map))
goto Retry;
/* Get the preliminary new entry start value */
addr = stack_entry->start - grow_amount;
/* If this puts us into the previous entry, cut back our growth
* to the available space. Also, see the note above.
*/
if (addr < end) {
stack_entry->avail_ssize = stack_entry->start - end;
addr = end;
}
rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
- VM_PROT_ALL,
- VM_PROT_ALL,
- 0);
+ p->p_sysent->sv_stackprot, VM_PROT_ALL, 0);
/* Adjust the available stack space by the amount we grew. */
if (rv == KERN_SUCCESS) {
if (prev_entry != &map->header)
vm_map_clip_end(map, prev_entry, addr);
new_stack_entry = prev_entry->next;
if (new_stack_entry->end != stack_entry->start ||
new_stack_entry->start != addr)
panic ("Bad stack grow start/end in new stack entry");
else {
new_stack_entry->avail_ssize = stack_entry->avail_ssize -
(new_stack_entry->end -
new_stack_entry->start);
if (is_procstack)
vm->vm_ssize += btoc(new_stack_entry->end -
new_stack_entry->start);
}
}
vm_map_unlock(map);
return (rv);
}
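/*
 * Illustrative sketch (not part of the original file): a page-fault
 * handler would typically give the stack a chance to grow before
 * resolving the fault. The surrounding variables are hypothetical.
 */
#if 0
if (map != kernel_map)
(void) vm_map_growstack(p, va);
rv = vm_fault(map, trunc_page(va), ftype, VM_FAULT_NORMAL);
#endif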
/*
* Unshare the specified VM space for exec. If other processes are
* sharing it, create a new one. The new vmspace is empty (it has
* no mappings).
*/
void
vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
{
struct vmspace *oldvmspace = p->p_vmspace;
struct vmspace *newvmspace;
GIANT_REQUIRED;
newvmspace = vmspace_alloc(minuser, maxuser);
bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
(caddr_t) (newvmspace + 1) - (caddr_t) &newvmspace->vm_startcopy);
/*
* This code is written like this for prototype purposes. The
* goal is to avoid running down the vmspace here, but to let the
* other processes that are still using the vmspace run it down
* eventually. Even though there is little or no chance of blocking
* here, it is a good idea to keep this form for future mods.
*/
p->p_vmspace = newvmspace;
pmap_pinit2(vmspace_pmap(newvmspace));
vmspace_free(oldvmspace);
if (p == curthread->td_proc) /* XXXKSE ? */
pmap_activate(curthread);
}
/*
* Unshare the specified VM space for forcing COW. This
* is called by rfork, for the (RFMEM|RFPROC) == 0 case.
*/
void
vmspace_unshare(struct proc *p)
{
struct vmspace *oldvmspace = p->p_vmspace;
struct vmspace *newvmspace;
GIANT_REQUIRED;
if (oldvmspace->vm_refcnt == 1)
return;
newvmspace = vmspace_fork(oldvmspace);
p->p_vmspace = newvmspace;
pmap_pinit2(vmspace_pmap(newvmspace));
vmspace_free(oldvmspace);
if (p == curthread->td_proc) /* XXXKSE ? */
pmap_activate(curthread);
}
/*
* vm_map_lookup:
*
* Finds the VM object, offset, and
* protection for a given virtual address in the
* specified map, assuming a page fault of the
* type specified.
*
* Leaves the map in question locked for read; return
* values are guaranteed until a vm_map_lookup_done
* call is performed. Note that the map argument
* is in/out; the returned map must be used in
* the call to vm_map_lookup_done.
*
* A handle (out_entry) is returned for use in
* vm_map_lookup_done, to make that fast.
*
* If a lookup is requested with "write protection"
* specified, the map may be changed to perform virtual
* copying operations, although the data referenced will
* remain the same.
*/
int
vm_map_lookup(vm_map_t *var_map, /* IN/OUT */
vm_offset_t vaddr,
vm_prot_t fault_typea,
vm_map_entry_t *out_entry, /* OUT */
vm_object_t *object, /* OUT */
vm_pindex_t *pindex, /* OUT */
vm_prot_t *out_prot, /* OUT */
boolean_t *wired) /* OUT */
{
vm_map_entry_t entry;
vm_map_t map = *var_map;
vm_prot_t prot;
vm_prot_t fault_type = fault_typea;
RetryLookup:;
/*
* Lookup the faulting address.
*/
vm_map_lock_read(map);
#define RETURN(why) \
{ \
vm_map_unlock_read(map); \
return (why); \
}
/*
* If the map has an interesting hint, try it before calling full
* blown lookup routine.
*/
entry = map->root;
*out_entry = entry;
if (entry == NULL ||
(vaddr < entry->start) || (vaddr >= entry->end)) {
/*
* Entry was either not a valid hint, or the vaddr was not
* contained in the entry, so do a full lookup.
*/
if (!vm_map_lookup_entry(map, vaddr, out_entry))
RETURN(KERN_INVALID_ADDRESS);
entry = *out_entry;
}
/*
* Handle submaps.
*/
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
vm_map_t old_map = map;
*var_map = map = entry->object.sub_map;
vm_map_unlock_read(old_map);
goto RetryLookup;
}
/*
* Check whether this task is allowed to have this page.
* Note the special case for MAP_ENTRY_COW
* pages with an override. This is to implement a forced
* COW for debuggers.
*/
if (fault_type & VM_PROT_OVERRIDE_WRITE)
prot = entry->max_protection;
else
prot = entry->protection;
fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
if ((fault_type & prot) != fault_type) {
RETURN(KERN_PROTECTION_FAILURE);
}
if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
(entry->eflags & MAP_ENTRY_COW) &&
(fault_type & VM_PROT_WRITE) &&
(fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
RETURN(KERN_PROTECTION_FAILURE);
}
/*
* If this page is not pageable, we have to get it for all possible
* accesses.
*/
*wired = (entry->wired_count != 0);
if (*wired)
prot = fault_type = entry->protection;
/*
* If the entry was copy-on-write, we either ...
*/
if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
/*
* If we want to write the page, we may as well handle that
* now since we've got the map locked.
*
* If we don't need to write the page, we just demote the
* permissions allowed.
*/
if (fault_type & VM_PROT_WRITE) {
/*
* Make a new object, and place it in the object
* chain. Note that no new references have appeared
* -- one just moved from the map to the new
* object.
*/
if (vm_map_lock_upgrade(map))
goto RetryLookup;
vm_object_shadow(
&entry->object.vm_object,
&entry->offset,
atop(entry->end - entry->start));
entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
vm_map_lock_downgrade(map);
} else {
/*
* We're attempting to read a copy-on-write page --
* don't allow writes.
*/
prot &= ~VM_PROT_WRITE;
}
}
/*
* Create an object if necessary.
*/
if (entry->object.vm_object == NULL &&
!map->system_map) {
if (vm_map_lock_upgrade(map))
goto RetryLookup;
entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
atop(entry->end - entry->start));
entry->offset = 0;
vm_map_lock_downgrade(map);
}
/*
* Return the object/offset from this entry. If the entry was
* copy-on-write or empty, it has been fixed up.
*/
*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
*object = entry->object.vm_object;
/*
* Return whether this is the only map sharing this data.
*/
*out_prot = prot;
return (KERN_SUCCESS);
#undef RETURN
}
/*
* vm_map_lookup_done:
*
* Releases locks acquired by a vm_map_lookup
* (according to the handle returned by that lookup).
*/
void
vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
{
/*
* Unlock the main-level map
*/
vm_map_unlock_read(map);
}
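/*
 * Illustrative sketch (not part of the original file): the usual pairing
 * of vm_map_lookup() and vm_map_lookup_done(), as a fault handler would
 * use it. Variable declarations are omitted for brevity.
 */
#if 0
rv = vm_map_lookup(&map, vaddr, fault_type, &entry, &object,
    &pindex, &prot, &wired);
if (rv != KERN_SUCCESS)
return (rv);
/* ... use object/pindex while the read lock is held ... */
vm_map_lookup_done(map, entry);
#endif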
#ifdef ENABLE_VFS_IOOPT
/*
* Experimental support for zero-copy I/O
*
* Implement uiomove with VM operations. This code (and its collateral
* changes) supports every combination of source object modification
* and COW-type operations.
*/
int
vm_uiomove(
vm_map_t mapa,
vm_object_t srcobject,
off_t cp,
int cnta,
vm_offset_t uaddra,
int *npages)
{
vm_map_t map;
vm_object_t first_object, oldobject, object;
vm_map_entry_t entry;
vm_prot_t prot;
boolean_t wired;
int tcnt, rv;
vm_offset_t uaddr, start, end, tend;
vm_pindex_t first_pindex, oindex;
vm_size_t osize;
off_t ooffset;
int cnt;
GIANT_REQUIRED;
if (npages)
*npages = 0;
cnt = cnta;
uaddr = uaddra;
while (cnt > 0) {
map = mapa;
if ((vm_map_lookup(&map, uaddr,
VM_PROT_READ, &entry, &first_object,
&first_pindex, &prot, &wired)) != KERN_SUCCESS) {
return EFAULT;
}
vm_map_clip_start(map, entry, uaddr);
tcnt = cnt;
tend = uaddr + tcnt;
if (tend > entry->end) {
tcnt = entry->end - uaddr;
tend = entry->end;
}
vm_map_clip_end(map, entry, tend);
start = entry->start;
end = entry->end;
osize = atop(tcnt);
oindex = OFF_TO_IDX(cp);
if (npages) {
vm_size_t idx;
for (idx = 0; idx < osize; idx++) {
vm_page_t m;
if ((m = vm_page_lookup(srcobject, oindex + idx)) == NULL) {
vm_map_lookup_done(map, entry);
return 0;
}
/*
* disallow busy or invalid pages, but allow
* m->busy pages if they are entirely valid.
*/
if ((m->flags & PG_BUSY) ||
((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) {
vm_map_lookup_done(map, entry);
return 0;
}
}
}
/*
* If we are changing an existing map entry, just redirect
* the object, and change mappings.
*/
if ((first_object->type == OBJT_VNODE) &&
((oldobject = entry->object.vm_object) == first_object)) {
if ((entry->offset != cp) || (oldobject != srcobject)) {
/*
* Remove old window into the file
*/
pmap_remove (map->pmap, uaddr, tend);
/*
* Force copy on write for mmaped regions
*/
vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
/*
* Point the object appropriately
*/
if (oldobject != srcobject) {
/*
* Set the object optimization hint flag
*/
vm_object_set_flag(srcobject, OBJ_OPT);
vm_object_reference(srcobject);
entry->object.vm_object = srcobject;
if (oldobject) {
vm_object_deallocate(oldobject);
}
}
entry->offset = cp;
map->timestamp++;
} else {
pmap_remove (map->pmap, uaddr, tend);
}
} else if ((first_object->ref_count == 1) &&
(first_object->size == osize) &&
((first_object->type == OBJT_DEFAULT) ||
(first_object->type == OBJT_SWAP)) ) {
oldobject = first_object->backing_object;
if ((first_object->backing_object_offset != cp) ||
(oldobject != srcobject)) {
/*
* Remove old window into the file
*/
pmap_remove (map->pmap, uaddr, tend);
/*
* Remove unneeded old pages
*/
vm_object_page_remove(first_object, 0, 0, 0);
/*
* Invalidate swap space
*/
if (first_object->type == OBJT_SWAP) {
swap_pager_freespace(first_object,
0,
first_object->size);
}
/*
* Force copy on write for mmaped regions
*/
vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
/*
* Point the object appropriately
*/
if (oldobject != srcobject) {
/*
* Set the object optimization hint flag
*/
vm_object_set_flag(srcobject, OBJ_OPT);
vm_object_reference(srcobject);
if (oldobject) {
TAILQ_REMOVE(&oldobject->shadow_head,
first_object, shadow_list);
oldobject->shadow_count--;
/* XXX bump generation? */
vm_object_deallocate(oldobject);
}
TAILQ_INSERT_TAIL(&srcobject->shadow_head,
first_object, shadow_list);
srcobject->shadow_count++;
/* XXX bump generation? */
first_object->backing_object = srcobject;
}
first_object->backing_object_offset = cp;
map->timestamp++;
} else {
pmap_remove (map->pmap, uaddr, tend);
}
/*
* Otherwise, we have to do a logical mmap.
*/
} else {
vm_object_set_flag(srcobject, OBJ_OPT);
vm_object_reference(srcobject);
pmap_remove (map->pmap, uaddr, tend);
vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize);
vm_map_lock_upgrade(map);
if (entry == &map->header) {
map->first_free = &map->header;
} else if (map->first_free->start >= start) {
map->first_free = entry->prev;
}
vm_map_entry_delete(map, entry);
object = srcobject;
ooffset = cp;
rv = vm_map_insert(map, object, ooffset, start, tend,
VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE);
if (rv != KERN_SUCCESS)
panic("vm_uiomove: could not insert new entry: %d", rv);
}
/*
* Map the window directly, if it is already in memory
*/
pmap_object_init_pt(map->pmap, uaddr,
srcobject, oindex, tcnt, 0);
map->timestamp++;
vm_map_unlock(map);
cnt -= tcnt;
uaddr += tcnt;
cp += tcnt;
if (npages)
*npages += osize;
}
return 0;
}
#endif
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>
#include <ddb/ddb.h>
/*
* vm_map_print: [ debug ]
*/
DB_SHOW_COMMAND(map, vm_map_print)
{
static int nlines;
/* XXX convert args. */
vm_map_t map = (vm_map_t)addr;
boolean_t full = have_addr;
vm_map_entry_t entry;
db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
(void *)map,
(void *)map->pmap, map->nentries, map->timestamp);
nlines++;
if (!full && db_indent)
return;
db_indent += 2;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
db_iprintf("map entry %p: start=%p, end=%p\n",
(void *)entry, (void *)entry->start, (void *)entry->end);
nlines++;
{
static char *inheritance_name[4] =
{"share", "copy", "none", "donate_copy"};
db_iprintf(" prot=%x/%x/%s",
entry->protection,
entry->max_protection,
inheritance_name[(int)(unsigned char)entry->inheritance]);
if (entry->wired_count != 0)
db_printf(", wired");
}
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
/* XXX no %qd in kernel. Truncate entry->offset. */
db_printf(", share=%p, offset=0x%lx\n",
(void *)entry->object.sub_map,
(long)entry->offset);
nlines++;
if ((entry->prev == &map->header) ||
(entry->prev->object.sub_map !=
entry->object.sub_map)) {
db_indent += 2;
vm_map_print((db_expr_t)(intptr_t)
entry->object.sub_map,
full, 0, (char *)0);
db_indent -= 2;
}
} else {
/* XXX no %qd in kernel. Truncate entry->offset. */
db_printf(", object=%p, offset=0x%lx",
(void *)entry->object.vm_object,
(long)entry->offset);
if (entry->eflags & MAP_ENTRY_COW)
db_printf(", copy (%s)",
(entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
db_printf("\n");
nlines++;
if ((entry->prev == &map->header) ||
(entry->prev->object.vm_object !=
entry->object.vm_object)) {
db_indent += 2;
vm_object_print((db_expr_t)(intptr_t)
entry->object.vm_object,
full, 0, (char *)0);
nlines += 4;
db_indent -= 2;
}
}
}
db_indent -= 2;
if (db_indent == 0)
nlines = 0;
}
DB_SHOW_COMMAND(procvm, procvm)
{
struct proc *p;
if (have_addr) {
p = (struct proc *) addr;
} else {
p = curproc;
}
db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
(void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
(void *)vmspace_pmap(p->p_vmspace));
vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
}
#endif /* DDB */
Index: head/sys/vm/vm_mmap.c
===================================================================
--- head/sys/vm/vm_mmap.c (revision 103766)
+++ head/sys/vm/vm_mmap.c (revision 103767)
@@ -1,1275 +1,1262 @@
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
*
* @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94
* $FreeBSD$
*/
/*
* Mapped file (mmap) interface to VM
*/
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
int incr;
};
#endif
static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
/*
* Set the maximum number of vm_map_entry structures per process. Roughly
* speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
* of our KVM malloc space still results in generous limits. We want a
* default that is good enough to prevent the kernel from running out of
* resources if attacked from a compromised user account, but generous
* enough that multi-threaded processes are not unduly inconvenienced.
*/
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)
static void
vmmapentry_rsrc_init(dummy)
void *dummy;
{
max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
max_proc_mmap /= 100;
}
/*
* MPSAFE
*/
/* ARGSUSED */
int
sbrk(td, uap)
struct thread *td;
struct sbrk_args *uap;
{
/* Not yet implemented */
/* mtx_lock(&Giant); */
/* mtx_unlock(&Giant); */
return (EOPNOTSUPP);
}
#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
int incr;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
sstk(td, uap)
struct thread *td;
struct sstk_args *uap;
{
/* Not yet implemented */
/* mtx_lock(&Giant); */
/* mtx_unlock(&Giant); */
return (EOPNOTSUPP);
}
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
int dummy;
};
#endif
/* ARGSUSED */
int
ogetpagesize(td, uap)
struct thread *td;
struct getpagesize_args *uap;
{
/* MP SAFE */
td->td_retval[0] = PAGE_SIZE;
return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
/*
* Memory Map (mmap) system call. Note that the file offset
* and address are allowed to be NOT page aligned, though if
* the MAP_FIXED flag is set, both must have the same remainder
* modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
* page-aligned, the actual mapping starts at trunc_page(addr)
* and the return value is adjusted up by the page offset.
*
* Generally speaking, only character devices which are themselves
* memory-based, such as a video framebuffer, can be mmap'd. Otherwise
* there would be no cache coherency between a descriptor and a VM mapping
* both to the same character device.
*
* Block devices can be mmap'd no matter what they represent. Cache coherency
* is maintained as long as you do not write directly to the underlying
* character device.
*/
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
void *addr;
size_t len;
int prot;
int flags;
int fd;
long pad;
off_t pos;
};
#endif
/*
* MPSAFE
*/
int
mmap(td, uap)
struct thread *td;
struct mmap_args *uap;
{
struct file *fp = NULL;
struct vnode *vp;
vm_offset_t addr;
vm_size_t size, pageoff;
vm_prot_t prot, maxprot;
void *handle;
int flags, error;
int disablexworkaround;
off_t pos;
struct vmspace *vms = td->td_proc->p_vmspace;
vm_object_t obj;
addr = (vm_offset_t) uap->addr;
size = uap->len;
prot = uap->prot & VM_PROT_ALL;
flags = uap->flags;
pos = uap->pos;
vp = NULL;
fp = NULL;
/* make sure mapping fits into numeric range etc */
if ((ssize_t) uap->len < 0 ||
((flags & MAP_ANON) && uap->fd != -1))
return (EINVAL);
if (flags & MAP_STACK) {
if ((uap->fd != -1) ||
((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
return (EINVAL);
flags |= MAP_ANON;
pos = 0;
}
/*
* Align the file position to a page boundary,
* and save its page offset component.
*/
pageoff = (pos & PAGE_MASK);
pos -= pageoff;
/* Adjust size for rounding (on both ends). */
size += pageoff; /* low end... */
size = (vm_size_t) round_page(size); /* hi end */
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
if (flags & MAP_FIXED) {
/*
* The specified address must have the same remainder
* as the file offset taken modulo PAGE_SIZE, so it
* should be aligned after adjustment by pageoff.
*/
addr -= pageoff;
if (addr & PAGE_MASK)
return (EINVAL);
/* Address range must be all in user VM space. */
- if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
+ if (addr < vm_map_min(&vms->vm_map) ||
+ addr + size > vm_map_max(&vms->vm_map))
return (EINVAL);
-#ifndef __i386__
- if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
- return (EINVAL);
-#endif
if (addr + size < addr)
return (EINVAL);
}
/*
* XXX for non-fixed mappings where no hint is provided or
* the hint would fall in the potential heap space,
* place it after the end of the largest possible heap.
*
* There should really be a pmap call to determine a reasonable
* location.
*/
else if (addr == 0 ||
(addr >= round_page((vm_offset_t)vms->vm_taddr) &&
addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
mtx_lock(&Giant); /* syscall marked mp-safe but isn't */
if (flags & MAP_ANON) {
/*
* Mapping blank space is trivial.
*/
handle = NULL;
maxprot = VM_PROT_ALL;
pos = 0;
} else {
/*
* Mapping file, get fp for validation. Obtain vnode and make
* sure it is of appropriate type.
* Don't let the descriptor disappear on us if we block.
*/
if ((error = fget(td, uap->fd, &fp)) != 0)
goto done;
if (fp->f_type != DTYPE_VNODE) {
error = EINVAL;
goto done;
}
/*
* POSIX shared-memory objects are defined to have
* kernel persistence, and are not defined to support
* read(2)/write(2) -- or even open(2). Thus, we can
* use MAP_ASYNC to trade on-disk coherence for speed.
* The shm_open(3) library routine turns on the FPOSIXSHM
* flag to request this behavior.
*/
if (fp->f_flag & FPOSIXSHM)
flags |= MAP_NOSYNC;
vp = (struct vnode *) fp->f_data;
error = vget(vp, LK_EXCLUSIVE, td);
if (error)
goto done;
if (vp->v_type != VREG && vp->v_type != VCHR) {
error = EINVAL;
goto done;
}
if (vp->v_type == VREG) {
/*
* Get the proper underlying object
*/
if (VOP_GETVOBJECT(vp, &obj) != 0) {
error = EINVAL;
goto done;
}
if (obj->handle != vp) {
vput(vp);
vp = (struct vnode*)obj->handle;
vget(vp, LK_EXCLUSIVE, td);
}
}
/*
* XXX hack to handle use of /dev/zero to map anon memory (ala
* SunOS).
*/
if ((vp->v_type == VCHR) &&
(vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
handle = NULL;
maxprot = VM_PROT_ALL;
flags |= MAP_ANON;
pos = 0;
} else {
/*
* cdevs do not provide private mappings of any kind.
*/
/*
* However, for the XIG X server to continue to work,
* we should allow the superuser to do it anyway.
* We only allow it at securelevel < 1.
* (Because the XIG X server writes directly to video
* memory via /dev/mem, it should never work at any
* other securelevel.)
* XXX this will have to go
*/
if (securelevel_ge(td->td_ucred, 1))
disablexworkaround = 1;
else
disablexworkaround = suser(td);
if (vp->v_type == VCHR && disablexworkaround &&
(flags & (MAP_PRIVATE|MAP_COPY))) {
error = EINVAL;
goto done;
}
/*
* Ensure that file and memory protections are
* compatible. Note that we only worry about
* writability if mapping is shared; in this case,
* current and max prot are dictated by the open file.
* XXX use the vnode instead? Problem is: what
* credentials do we use for determination? What if
* proc does a setuid?
*/
maxprot = VM_PROT_EXECUTE; /* ??? */
if (fp->f_flag & FREAD) {
maxprot |= VM_PROT_READ;
} else if (prot & PROT_READ) {
error = EACCES;
goto done;
}
/*
* If we are sharing potential changes (either via
* MAP_SHARED or via the implicit sharing of character
* device mappings), and we are trying to get write
* permission although we opened it without asking
* for it, bail out. Check for superuser, only if
* we're at securelevel < 1, to allow the XIG X server
* to continue to work.
*/
if ((flags & MAP_SHARED) != 0 ||
(vp->v_type == VCHR && disablexworkaround)) {
if ((fp->f_flag & FWRITE) != 0) {
struct vattr va;
if ((error =
VOP_GETATTR(vp, &va,
td->td_ucred, td))) {
goto done;
}
if ((va.va_flags &
(SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
maxprot |= VM_PROT_WRITE;
} else if (prot & PROT_WRITE) {
error = EPERM;
goto done;
}
} else if ((prot & PROT_WRITE) != 0) {
error = EACCES;
goto done;
}
} else {
maxprot |= VM_PROT_WRITE;
}
handle = (void *)vp;
}
}
/*
* Do not allow more than a certain number of vm_map_entry structures
* per process. Scale with the number of rforks sharing the map
* to make the limit reasonable for threads.
*/
if (max_proc_mmap &&
vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
error = ENOMEM;
goto done;
}
mtx_unlock(&Giant);
error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
flags, handle, pos);
mtx_lock(&Giant);
if (error == 0)
td->td_retval[0] = (register_t) (addr + pageoff);
done:
if (vp)
vput(vp);
mtx_unlock(&Giant);
if (fp)
fdrop(fp, td);
return (error);
}
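/*
 * Illustrative userland sketch (not part of the original file): the
 * alignment rule described in the header comment above. With MAP_FIXED
 * the address and file offset must share the same remainder modulo
 * PAGE_SIZE; without MAP_FIXED an unaligned offset merely shifts the
 * returned pointer by the page offset. All names below are hypothetical.
 */
#if 0
char *p;
p = mmap(fixed_addr, len, PROT_READ, MAP_SHARED | MAP_FIXED, fd, off);
/* Fails with EINVAL unless
 * ((uintptr_t)fixed_addr & PAGE_MASK) == (off & PAGE_MASK). */
#endif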
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
caddr_t addr;
int len;
int prot;
int flags;
int fd;
long pos;
};
#endif
int
ommap(td, uap)
struct thread *td;
struct ommap_args *uap;
{
struct mmap_args nargs;
static const char cvtbsdprot[8] = {
0,
PROT_EXEC,
PROT_WRITE,
PROT_EXEC | PROT_WRITE,
PROT_READ,
PROT_EXEC | PROT_READ,
PROT_WRITE | PROT_READ,
PROT_EXEC | PROT_WRITE | PROT_READ,
};
#define OMAP_ANON 0x0002
#define OMAP_COPY 0x0020
#define OMAP_SHARED 0x0010
#define OMAP_FIXED 0x0100
nargs.addr = uap->addr;
nargs.len = uap->len;
nargs.prot = cvtbsdprot[uap->prot & 0x7];
nargs.flags = 0;
if (uap->flags & OMAP_ANON)
nargs.flags |= MAP_ANON;
if (uap->flags & OMAP_COPY)
nargs.flags |= MAP_COPY;
if (uap->flags & OMAP_SHARED)
nargs.flags |= MAP_SHARED;
else
nargs.flags |= MAP_PRIVATE;
if (uap->flags & OMAP_FIXED)
nargs.flags |= MAP_FIXED;
nargs.fd = uap->fd;
nargs.pos = uap->pos;
return (mmap(td, &nargs));
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct msync_args {
void *addr;
int len;
int flags;
};
#endif
/*
* MPSAFE
*/
int
msync(td, uap)
struct thread *td;
struct msync_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
int flags;
vm_map_t map;
int rv;
addr = (vm_offset_t) uap->addr;
size = uap->len;
flags = uap->flags;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
return (EINVAL);
mtx_lock(&Giant);
map = &td->td_proc->p_vmspace->vm_map;
/*
* XXX Gak! If size is zero we are supposed to sync "all modified
* pages within the region containing addr". Unfortunately, we don't
* really keep track of individual mmaps so we approximate by flushing
* the range of the map entry containing addr. This can be incorrect
* if the region splits or is coalesced with a neighbor.
*/
if (size == 0) {
vm_map_entry_t entry;
vm_map_lock_read(map);
rv = vm_map_lookup_entry(map, addr, &entry);
vm_map_unlock_read(map);
if (rv == FALSE) {
rv = -1;
goto done2;
}
addr = entry->start;
size = entry->end - entry->start;
}
/*
* Clean the pages and interpret the return value.
*/
rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
(flags & MS_INVALIDATE) != 0);
done2:
mtx_unlock(&Giant);
switch (rv) {
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
return (EINVAL); /* Sun returns ENOMEM? */
case KERN_FAILURE:
return (EIO);
default:
return (EINVAL);
}
}
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
munmap(td, uap)
struct thread *td;
struct munmap_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_map_t map;
addr = (vm_offset_t) uap->addr;
size = uap->len;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
if (size == 0)
return (0);
/*
- * Check for illegal addresses. Watch out for address wrap... Note
- * that VM_*_ADDRESS are not constants due to casts (argh).
+ * Check for illegal addresses. Watch out for address wrap...
*/
- if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
- return (EINVAL);
-#ifndef __i386__
- if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
- return (EINVAL);
-#endif
map = &td->td_proc->p_vmspace->vm_map;
+ if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
+ return (EINVAL);
/*
* Make sure entire range is allocated.
*/
if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
return (EINVAL);
/* returns nothing but KERN_SUCCESS anyway */
(void) vm_map_remove(map, addr, addr + size);
return (0);
}
#if 0
void
munmapfd(td, fd)
struct thread *td;
int fd;
{
/*
* XXX should unmap any regions mapped to this file
*/
FILEDESC_LOCK(p->p_fd);
td->td_proc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
FILEDESC_UNLOCK(p->p_fd);
}
#endif
#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
const void *addr;
size_t len;
int prot;
};
#endif
/*
* MPSAFE
*/
int
mprotect(td, uap)
struct thread *td;
struct mprotect_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_prot_t prot;
addr = (vm_offset_t) uap->addr;
size = uap->len;
prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
if (prot & VM_PROT_READ)
prot |= VM_PROT_EXECUTE;
#endif
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, prot, FALSE)) {
case KERN_SUCCESS:
return (0);
case KERN_PROTECTION_FAILURE:
return (EACCES);
}
return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
void *addr;
size_t len;
int inherit;
};
#endif
/*
* MPSAFE
*/
int
minherit(td, uap)
struct thread *td;
struct minherit_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_inherit_t inherit;
addr = (vm_offset_t)uap->addr;
size = uap->len;
inherit = uap->inherit;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, inherit)) {
case KERN_SUCCESS:
return (0);
case KERN_PROTECTION_FAILURE:
return (EACCES);
}
return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
void *addr;
size_t len;
int behav;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
madvise(td, uap)
struct thread *td;
struct madvise_args *uap;
{
vm_offset_t start, end;
+ vm_map_t map;
/*
* Check for illegal behavior
*/
if (uap->behav < 0 || uap->behav > MADV_CORE)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
- if (VM_MAXUSER_ADDRESS > 0 &&
- ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS)
+ map = &td->td_proc->p_vmspace->vm_map;
+ if ((vm_offset_t)uap->addr < vm_map_min(map) ||
+ (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
return (EINVAL);
-#ifndef __i386__
- if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS)
- return (EINVAL);
-#endif
if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
return (EINVAL);
/*
* Since this routine is only advisory, we default to conservative
* behavior.
*/
start = trunc_page((vm_offset_t) uap->addr);
end = round_page((vm_offset_t) uap->addr + uap->len);
- if (vm_map_madvise(&td->td_proc->p_vmspace->vm_map, start, end,
- uap->behav))
+ if (vm_map_madvise(map, start, end, uap->behav))
return (EINVAL);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
const void *addr;
size_t len;
char *vec;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
mincore(td, uap)
struct thread *td;
struct mincore_args *uap;
{
vm_offset_t addr, first_addr;
vm_offset_t end, cend;
pmap_t pmap;
vm_map_t map;
char *vec;
int error = 0;
int vecindex, lastvecindex;
vm_map_entry_t current;
vm_map_entry_t entry;
int mincoreinfo;
unsigned int timestamp;
/*
* Make sure that the addresses presented are valid for user
* mode.
*/
first_addr = addr = trunc_page((vm_offset_t) uap->addr);
end = addr + (vm_size_t)round_page(uap->len);
- if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS)
+ map = &td->td_proc->p_vmspace->vm_map;
+ if (end > vm_map_max(map) || end < addr)
return (EINVAL);
- if (end < addr)
- return (EINVAL);
/*
* Address of byte vector
*/
vec = uap->vec;
mtx_lock(&Giant);
- map = &td->td_proc->p_vmspace->vm_map;
pmap = vmspace_pmap(td->td_proc->p_vmspace);
vm_map_lock_read(map);
RestartScan:
timestamp = map->timestamp;
if (!vm_map_lookup_entry(map, addr, &entry))
entry = entry->next;
/*
* Do this on a map entry basis so that if the pages are not
* in the current process's address space, we can easily look
* up the pages elsewhere.
*/
lastvecindex = -1;
for (current = entry;
(current != &map->header) && (current->start < end);
current = current->next) {
/*
* ignore submaps (for now) or null objects
*/
if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
current->object.vm_object == NULL)
continue;
/*
* limit this scan to the current map entry and the
* limits for the mincore call
*/
if (addr < current->start)
addr = current->start;
cend = current->end;
if (cend > end)
cend = end;
/*
* scan this entry one page at a time
*/
while (addr < cend) {
/*
* Check pmap first, it is likely faster, also
* it can provide info as to whether we are the
* one referencing or modifying the page.
*/
mincoreinfo = pmap_mincore(pmap, addr);
if (!mincoreinfo) {
vm_pindex_t pindex;
vm_ooffset_t offset;
vm_page_t m;
/*
* calculate the page index into the object
*/
offset = current->offset + (addr - current->start);
pindex = OFF_TO_IDX(offset);
m = vm_page_lookup(current->object.vm_object,
pindex);
/*
* if the page is resident, then gather information about
* it.
*/
if (m) {
mincoreinfo = MINCORE_INCORE;
if (m->dirty ||
pmap_is_modified(m))
mincoreinfo |= MINCORE_MODIFIED_OTHER;
if ((m->flags & PG_REFERENCED) ||
pmap_ts_referenced(m)) {
vm_page_flag_set(m, PG_REFERENCED);
mincoreinfo |= MINCORE_REFERENCED_OTHER;
}
}
}
/*
* subyte may page fault. In case it needs to modify
* the map, we release the lock.
*/
vm_map_unlock_read(map);
/*
* calculate index into user supplied byte vector
*/
vecindex = OFF_TO_IDX(addr - first_addr);
/*
* If we have skipped map entries, we need to make sure that
* the byte vector is zeroed for those skipped entries.
*/
while ((lastvecindex + 1) < vecindex) {
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
++lastvecindex;
}
/*
* Pass the page information to the user
*/
error = subyte(vec + vecindex, mincoreinfo);
if (error) {
error = EFAULT;
goto done2;
}
/*
* If the map has changed, due to the subyte, the previous
* output may be invalid.
*/
vm_map_lock_read(map);
if (timestamp != map->timestamp)
goto RestartScan;
lastvecindex = vecindex;
addr += PAGE_SIZE;
}
}
/*
* subyte may page fault. In case it needs to modify
* the map, we release the lock.
*/
vm_map_unlock_read(map);
/*
* Zero the last entries in the byte vector.
*/
vecindex = OFF_TO_IDX(end - first_addr);
while ((lastvecindex + 1) < vecindex) {
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
++lastvecindex;
}
/*
* If the map has changed, due to the subyte, the previous
* output may be invalid.
*/
vm_map_lock_read(map);
if (timestamp != map->timestamp)
goto RestartScan;
vm_map_unlock_read(map);
done2:
mtx_unlock(&Giant);
return (error);
}
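/*
 * Illustrative userland sketch (not part of the original file):
 * mincore(2) fills one status byte per page of the queried range.
 * The names below are hypothetical.
 */
#if 0
char vec[npages];
if (mincore(base, npages * PAGE_SIZE, vec) == 0 &&
    (vec[0] & MINCORE_INCORE) != 0)
/* the first page is resident */ ;
#endif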
#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
const void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
mlock(td, uap)
struct thread *td;
struct mlock_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
int error;
addr = (vm_offset_t) uap->addr;
size = uap->len;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
/* disable wrap around */
if (addr + size < addr)
return (EINVAL);
if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
return (EAGAIN);
#ifdef pmap_wired_count
if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) >
td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
return (ENOMEM);
#else
error = suser(td);
if (error)
return (error);
#endif
error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, TRUE);
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
int how;
};
#endif
/*
* MPSAFE
*/
int
mlockall(td, uap)
struct thread *td;
struct mlockall_args *uap;
{
/* mtx_lock(&Giant); */
/* mtx_unlock(&Giant); */
return 0;
}
#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
int how;
};
#endif
/*
* MPSAFE
*/
int
munlockall(td, uap)
struct thread *td;
struct munlockall_args *uap;
{
/* mtx_lock(&Giant); */
/* mtx_unlock(&Giant); */
return 0;
}
#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
const void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
munlock(td, uap)
struct thread *td;
struct munlock_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
int error;
addr = (vm_offset_t) uap->addr;
size = uap->len;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
/* disable wrap around */
if (addr + size < addr)
return (EINVAL);
#ifndef pmap_wired_count
error = suser(td);
if (error)
return (error);
#endif
error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, TRUE);
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
/*
* vm_mmap()
*
* MPSAFE
*
* Internal version of mmap. Currently used by mmap, exec, and sys5
* shared memory. Handle is either a vnode pointer or NULL for MAP_ANON.
*/
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
vm_prot_t maxprot, int flags,
void *handle,
vm_ooffset_t foff)
{
boolean_t fitit;
vm_object_t object;
struct vnode *vp = NULL;
objtype_t type;
int rv = KERN_SUCCESS;
vm_ooffset_t objsize;
int docow;
struct thread *td = curthread;
if (size == 0)
return (0);
objsize = size = round_page(size);
if (td->td_proc->p_vmspace->vm_map.size + size >
td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
return(ENOMEM);
}
/*
* We currently can only deal with page aligned file offsets.
* The check is here rather than in the syscall because the
* kernel calls this function internally for other mmapping
* operations (such as in exec) and non-aligned offsets will
* cause pmap inconsistencies...so we want to be sure to
* disallow this in all cases.
*/
if (foff & PAGE_MASK)
return (EINVAL);
if ((flags & MAP_FIXED) == 0) {
fitit = TRUE;
*addr = round_page(*addr);
} else {
if (*addr != trunc_page(*addr))
return (EINVAL);
fitit = FALSE;
(void) vm_map_remove(map, *addr, *addr + size);
}
/*
* Lookup/allocate object.
*/
if (flags & MAP_ANON) {
type = OBJT_DEFAULT;
/*
* Unnamed anonymous regions always start at 0.
*/
if (handle == 0)
foff = 0;
} else {
vp = (struct vnode *) handle;
mtx_lock(&Giant);
ASSERT_VOP_LOCKED(vp, "vm_mmap");
if (vp->v_type == VCHR) {
type = OBJT_DEVICE;
handle = (void *)(intptr_t)vp->v_rdev;
} else {
struct vattr vat;
int error;
error = VOP_GETATTR(vp, &vat, td->td_ucred, td);
if (error) {
mtx_unlock(&Giant);
return (error);
}
objsize = round_page(vat.va_size);
type = OBJT_VNODE;
/*
* if it is a regular file without any references
* we do not need to sync it.
*/
if (vp->v_type == VREG && vat.va_nlink == 0) {
flags |= MAP_NOSYNC;
}
}
mtx_unlock(&Giant);
}
if (handle == NULL) {
object = NULL;
docow = 0;
} else {
object = vm_pager_allocate(type,
handle, objsize, prot, foff);
if (object == NULL) {
return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
}
docow = MAP_PREFAULT_PARTIAL;
}
/*
* Force device mappings to be shared.
*/
if (type == OBJT_DEVICE || type == OBJT_PHYS) {
flags &= ~(MAP_PRIVATE|MAP_COPY);
flags |= MAP_SHARED;
}
if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
docow |= MAP_COPY_ON_WRITE;
if (flags & MAP_NOSYNC)
docow |= MAP_DISABLE_SYNCER;
if (flags & MAP_NOCORE)
docow |= MAP_DISABLE_COREDUMP;
#if defined(VM_PROT_READ_IS_EXEC)
if (prot & VM_PROT_READ)
prot |= VM_PROT_EXECUTE;
if (maxprot & VM_PROT_READ)
maxprot |= VM_PROT_EXECUTE;
#endif
if (fitit)
*addr = pmap_addr_hint(object, *addr, size);
if (flags & MAP_STACK)
rv = vm_map_stack (map, *addr, size, prot,
maxprot, docow);
else
rv = vm_map_find(map, object, foff, addr, size, fitit,
prot, maxprot, docow);
if (rv != KERN_SUCCESS) {
/*
* Lose the object reference. Will destroy the
* object if it's an unnamed anonymous mapping
* or named anonymous without other references.
*/
vm_object_deallocate(object);
} else if (flags & MAP_SHARED) {
/*
* Shared memory is also shared with children.
*/
rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
if (rv != KERN_SUCCESS)
(void) vm_map_remove(map, *addr, *addr + size);
}
switch (rv) {
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
case KERN_NO_SPACE:
return (ENOMEM);
case KERN_PROTECTION_FAILURE:
return (EACCES);
default:
return (EINVAL);
}
}
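/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): a userland
 * caller showing the constraint vm_mmap() enforces above -- file offsets
 * handed to mmap(2) must be page aligned, otherwise the call fails with
 * EINVAL.  The file used here ("/etc/services") is an arbitrary readable
 * file chosen for the example.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	int fd = open("/etc/services", O_RDONLY);

	if (fd == -1 || pagesz <= 0)
		return (1);

	/* Page-aligned offset (0): expected to succeed. */
	void *ok = mmap(NULL, (size_t)pagesz, PROT_READ, MAP_SHARED, fd, 0);
	printf("aligned offset:   %s\n",
	    ok == MAP_FAILED ? strerror(errno) : "mapped");

	/* Unaligned offset: rejected before any pager object is looked up. */
	void *bad = mmap(NULL, (size_t)pagesz, PROT_READ, MAP_SHARED, fd,
	    (off_t)(pagesz / 2));
	printf("unaligned offset: %s\n",
	    bad == MAP_FAILED ? strerror(errno) : "mapped");

	if (ok != MAP_FAILED)
		munmap(ok, (size_t)pagesz);
	close(fd);
	return (0);
}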
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c (revision 103766)
+++ head/sys/vm/vm_pageout.c (revision 103767)
@@ -1,1537 +1,1537 @@
/*
* Copyright (c) 1991 Regents of the University of California.
* All rights reserved.
* Copyright (c) 1994 John S. Dyson
* All rights reserved.
* Copyright (c) 1994 David Greenman
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Authors: Avadis Tevanian, Jr., Michael Wayne Young
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*
* $FreeBSD$
*/
/*
* The proverbial page-out daemon.
*/
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <machine/mutex.h>
/*
* System initialization
*/
/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static int vm_pageout_clean(vm_page_t);
static void vm_pageout_scan(int pass);
static int vm_pageout_free_page_calc(vm_size_t count);
struct proc *pageproc;
static struct kproc_desc page_kp = {
"pagedaemon",
vm_pageout,
&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon(void);
static struct proc *vmproc;
static struct kproc_desc vm_kp = {
"vmdaemon",
vm_daemon,
&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
#endif
int vm_pages_needed=0; /* Event on which pageout daemon sleeps */
int vm_pageout_deficit=0; /* Estimated number of pages deficit */
int vm_pageout_pages_needed=0; /* flag saying that the pageout daemon needs pages */
#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout; /* XXX */
static int vm_daemon_needed;
#endif
extern int vm_swap_size;
static int vm_max_launder = 32;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
#else
static int vm_swap_enabled=1;
static int vm_swap_idle_enabled=0;
#endif
SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
SYSCTL_INT(_vm, OID_AUTO, max_launder,
CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif
SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
int vm_page_max_wired; /* XXX max # of wired pages system-wide */
#if !defined(NO_SWAPPING)
typedef void freeer_fcn_t(vm_map_t, vm_object_t, vm_pindex_t, int);
static void vm_pageout_map_deactivate_pages(vm_map_t, vm_pindex_t);
static freeer_fcn_t vm_pageout_object_deactivate_pages;
static void vm_req_vmdaemon(void);
#endif
static void vm_pageout_page_stats(void);
/*
* vm_pageout_clean:
*
* Clean the page and remove it from the laundry.
*
* We set the busy bit to cause potential page faults on this page to
* block. Note the careful timing, however: the busy bit isn't set until
* late, and we cannot do anything that would mess with the page.
*/
static int
vm_pageout_clean(m)
vm_page_t m;
{
vm_object_t object;
vm_page_t mc[2*vm_pageout_page_count];
int pageout_count;
int ib, is, page_base;
vm_pindex_t pindex = m->pindex;
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
object = m->object;
/*
* It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
* with the new swapper, but we could have serious problems paging
* out other object types if there is insufficient memory.
*
* Unfortunately, checking free memory here is far too late, so the
* check has been moved up a procedural level.
*/
/*
* Don't mess with the page if it's busy, held, or special
*/
if ((m->hold_count != 0) ||
((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
return 0;
}
mc[vm_pageout_page_count] = m;
pageout_count = 1;
page_base = vm_pageout_page_count;
ib = 1;
is = 1;
/*
* Scan object for clusterable pages.
*
* We can cluster ONLY if: ->> the page is NOT
* clean, wired, busy, held, or mapped into a
* buffer, and one of the following:
* 1) The page is inactive, or a seldom used
* active page.
* -or-
* 2) we force the issue.
*
* During heavy mmap/modification loads the pageout
* daemon can really fragment the underlying file
* due to flushing pages out of order and not trying to
* align the clusters (which leaves sporadic out-of-order
* holes). To solve this problem we do the reverse scan
* first and attempt to align our cluster, then do a
* forward scan if room remains.
*/
more:
while (ib && pageout_count < vm_pageout_page_count) {
vm_page_t p;
if (ib > pindex) {
ib = 0;
break;
}
if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
ib = 0;
break;
}
if (((p->queue - p->pc) == PQ_CACHE) ||
(p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
ib = 0;
break;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) == 0 ||
p->queue != PQ_INACTIVE ||
p->wire_count != 0 || /* may be held by buf cache */
p->hold_count != 0) { /* may be undergoing I/O */
ib = 0;
break;
}
mc[--page_base] = p;
++pageout_count;
++ib;
/*
* alignment boundary, stop here and switch directions. Do
* not clear ib.
*/
if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
break;
}
while (pageout_count < vm_pageout_page_count &&
pindex + is < object->size) {
vm_page_t p;
if ((p = vm_page_lookup(object, pindex + is)) == NULL)
break;
if (((p->queue - p->pc) == PQ_CACHE) ||
(p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
break;
}
vm_page_test_dirty(p);
if ((p->dirty & p->valid) == 0 ||
p->queue != PQ_INACTIVE ||
p->wire_count != 0 || /* may be held by buf cache */
p->hold_count != 0) { /* may be undergoing I/O */
break;
}
mc[page_base + pageout_count] = p;
++pageout_count;
++is;
}
/*
* If we exhausted our forward scan, continue with the reverse scan
* when possible, even past a page boundary. This catches boundary
* conditions.
*/
if (ib && pageout_count < vm_pageout_page_count)
goto more;
/*
* we allow reads during pageouts...
*/
return vm_pageout_flush(&mc[page_base], pageout_count, 0);
}
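/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): the
 * bidirectional clustering performed by vm_pageout_clean() above, reduced
 * to plain page indices.  is_flushable() is a hypothetical stand-in for
 * the real dirty/inactive/unbusied tests, and CLUSTER stands in for
 * vm_pageout_page_count.  The kernel additionally resumes the reverse scan
 * if the forward scan leaves room; that refinement is omitted here.
 */
#include <stdbool.h>
#include <stdio.h>

#define CLUSTER	8

/* Hypothetical predicate: which page indices may be flushed together. */
static bool
is_flushable(long pindex)
{
	return (pindex >= 0 && pindex < 64 && pindex != 21);
}

int
main(void)
{
	long pindex = 27;		/* the page that triggered the flush */
	long mc[2 * CLUSTER];		/* cluster window, target in the middle */
	int page_base = CLUSTER, count = 1;
	long ib = 1, is = 1;

	mc[page_base] = pindex;

	/* Reverse scan first, stopping at a CLUSTER-aligned boundary. */
	while (ib != 0 && count < CLUSTER) {
		if (ib > pindex || !is_flushable(pindex - ib)) {
			ib = 0;
			break;
		}
		mc[--page_base] = pindex - ib;
		count++;
		ib++;
		if ((pindex - (ib - 1)) % CLUSTER == 0)
			break;
	}
	/* Then scan forward with whatever room remains. */
	while (count < CLUSTER && is_flushable(pindex + is)) {
		mc[page_base + count] = pindex + is;
		count++;
		is++;
	}

	printf("would flush %d pages starting at index %ld:", count, mc[page_base]);
	for (int i = 0; i < count; i++)
		printf(" %ld", mc[page_base + i]);
	printf("\n");
	return (0);
}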
/*
* vm_pageout_flush() - launder the given pages
*
* The given pages are laundered. Note that we setup for the start of
* I/O ( i.e. busy the page ), mark it read-only, and bump the object
* reference count all in here rather than in the parent. If we want
* the parent to do more sophisticated things we may have to change
* the ordering.
*/
int
vm_pageout_flush(mc, count, flags)
vm_page_t *mc;
int count;
int flags;
{
vm_object_t object;
int pageout_status[count];
int numpagedout = 0;
int i;
mtx_assert(&vm_page_queue_mtx, MA_OWNED);
/*
* Initiate I/O. Bump the vm_page_t->busy counter and
* mark the pages read-only.
*
* We do not have to fixup the clean/dirty bits here... we can
* allow the pager to do it after the I/O completes.
*
* NOTE! mc[i]->dirty may be partial or fragmented due to an
* edge case with file fragments.
*/
for (i = 0; i < count; i++) {
KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially invalid page", mc[i], i, count));
vm_page_io_start(mc[i]);
vm_page_protect(mc[i], VM_PROT_READ);
}
object = mc[0]->object;
vm_page_unlock_queues();
vm_object_pip_add(object, count);
vm_pager_put_pages(object, mc, count,
(flags | ((object == kernel_object) ? OBJPC_SYNC : 0)),
pageout_status);
vm_page_lock_queues();
for (i = 0; i < count; i++) {
vm_page_t mt = mc[i];
switch (pageout_status[i]) {
case VM_PAGER_OK:
numpagedout++;
break;
case VM_PAGER_PEND:
numpagedout++;
break;
case VM_PAGER_BAD:
/*
* Page outside of range of object. Right now we
* essentially lose the changes by pretending it
* worked.
*/
pmap_clear_modify(mt);
vm_page_undirty(mt);
break;
case VM_PAGER_ERROR:
case VM_PAGER_FAIL:
/*
* If page couldn't be paged out, then reactivate the
* page so it doesn't clog the inactive list. (We
* will try paging it out again later).
*/
vm_page_activate(mt);
break;
case VM_PAGER_AGAIN:
break;
}
/*
* If the operation is still going, leave the page busy to
* block all other accesses. Also, leave the paging in
* progress indicator set so that we don't attempt an object
* collapse.
*/
if (pageout_status[i] != VM_PAGER_PEND) {
vm_object_pip_wakeup(object);
vm_page_io_finish(mt);
if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
vm_page_protect(mt, VM_PROT_READ);
}
}
return numpagedout;
}
#if !defined(NO_SWAPPING)
/*
* vm_pageout_object_deactivate_pages
*
* deactivate enough pages to satisfy the inactive target
* requirements or, if vm_page_proc_limit is set, then
* deactivate all of the pages in the object and its
* backing_objects.
*
* The object and map must be locked.
*/
static void
vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
vm_map_t map;
vm_object_t object;
vm_pindex_t desired;
int map_remove_only;
{
vm_page_t p, next;
int actcount, rcount, remove_mode;
GIANT_REQUIRED;
if (object->type == OBJT_DEVICE || object->type == OBJT_PHYS)
return;
while (object) {
if (pmap_resident_count(vm_map_pmap(map)) <= desired)
return;
if (object->paging_in_progress)
return;
remove_mode = map_remove_only;
if (object->shadow_count > 1)
remove_mode = 1;
/*
* scan the object's entire memory queue
*/
rcount = object->resident_page_count;
p = TAILQ_FIRST(&object->memq);
vm_page_lock_queues();
while (p && (rcount-- > 0)) {
if (pmap_resident_count(map->pmap) <= desired) {
vm_page_unlock_queues();
return;
}
next = TAILQ_NEXT(p, listq);
cnt.v_pdpages++;
if (p->wire_count != 0 ||
p->hold_count != 0 ||
p->busy != 0 ||
(p->flags & (PG_BUSY|PG_UNMANAGED)) ||
!pmap_page_exists_quick(vm_map_pmap(map), p)) {
p = next;
continue;
}
actcount = pmap_ts_referenced(p);
if (actcount) {
vm_page_flag_set(p, PG_REFERENCED);
} else if (p->flags & PG_REFERENCED) {
actcount = 1;
}
if ((p->queue != PQ_ACTIVE) &&
(p->flags & PG_REFERENCED)) {
vm_page_activate(p);
p->act_count += actcount;
vm_page_flag_clear(p, PG_REFERENCED);
} else if (p->queue == PQ_ACTIVE) {
if ((p->flags & PG_REFERENCED) == 0) {
p->act_count -= min(p->act_count, ACT_DECLINE);
if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
vm_page_protect(p, VM_PROT_NONE);
vm_page_deactivate(p);
} else {
vm_pageq_requeue(p);
}
} else {
vm_page_activate(p);
vm_page_flag_clear(p, PG_REFERENCED);
if (p->act_count < (ACT_MAX - ACT_ADVANCE))
p->act_count += ACT_ADVANCE;
vm_pageq_requeue(p);
}
} else if (p->queue == PQ_INACTIVE) {
vm_page_protect(p, VM_PROT_NONE);
}
p = next;
}
vm_page_unlock_queues();
object = object->backing_object;
}
}
/*
* deactivate some number of pages in a map, try to do it fairly, but
* that is really hard to do.
*/
static void
vm_pageout_map_deactivate_pages(map, desired)
vm_map_t map;
vm_pindex_t desired;
{
vm_map_entry_t tmpe;
vm_object_t obj, bigobj;
int nothingwired;
GIANT_REQUIRED;
if (!vm_map_trylock(map))
return;
bigobj = NULL;
nothingwired = TRUE;
/*
* first, search out the biggest object, and try to free pages from
* that.
*/
tmpe = map->header.next;
while (tmpe != &map->header) {
if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
obj = tmpe->object.vm_object;
if ((obj != NULL) && (obj->shadow_count <= 1) &&
((bigobj == NULL) ||
(bigobj->resident_page_count < obj->resident_page_count))) {
bigobj = obj;
}
}
if (tmpe->wired_count > 0)
nothingwired = FALSE;
tmpe = tmpe->next;
}
if (bigobj)
vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);
/*
* Next, hunt around for other pages to deactivate. We actually
* do this search sort of wrong -- .text first is not the best idea.
*/
tmpe = map->header.next;
while (tmpe != &map->header) {
if (pmap_resident_count(vm_map_pmap(map)) <= desired)
break;
if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
obj = tmpe->object.vm_object;
if (obj)
vm_pageout_object_deactivate_pages(map, obj, desired, 0);
}
tmpe = tmpe->next;
};
/*
* Remove all mappings if a process is swapped out, this will free page
* table pages.
*/
if (desired == 0 && nothingwired)
- pmap_remove(vm_map_pmap(map),
- VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
+ pmap_remove(vm_map_pmap(map), vm_map_min(map),
+ vm_map_max(map));
vm_map_unlock(map);
return;
}
#endif /* !defined(NO_SWAPPING) */
/*
* Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore
* to vnode deadlocks. We only do it for OBJT_DEFAULT and OBJT_SWAP objects
* which we know can be trivially freed.
*/
void
vm_pageout_page_free(vm_page_t m) {
vm_object_t object = m->object;
int type = object->type;
GIANT_REQUIRED;
if (type == OBJT_SWAP || type == OBJT_DEFAULT)
vm_object_reference(object);
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
cnt.v_dfree++;
if (type == OBJT_SWAP || type == OBJT_DEFAULT)
vm_object_deallocate(object);
}
/*
* vm_pageout_scan does the dirty work for the pageout daemon.
*/
static void
vm_pageout_scan(int pass)
{
vm_page_t m, next;
struct vm_page marker;
int save_page_shortage;
int save_inactive_count;
int page_shortage, maxscan, pcount;
int addl_page_shortage, addl_page_shortage_init;
struct proc *p, *bigproc;
vm_offset_t size, bigsize;
vm_object_t object;
int actcount;
int vnodes_skipped = 0;
int maxlaunder;
int s;
struct thread *td;
GIANT_REQUIRED;
/*
* Do whatever cleanup that the pmap code can.
*/
pmap_collect();
uma_reclaim();
addl_page_shortage_init = vm_pageout_deficit;
vm_pageout_deficit = 0;
/*
* Calculate the number of pages we want to either free or move
* to the cache.
*/
page_shortage = vm_paging_target() + addl_page_shortage_init;
save_page_shortage = page_shortage;
save_inactive_count = cnt.v_inactive_count;
/*
* Initialize our marker
*/
bzero(&marker, sizeof(marker));
marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
marker.queue = PQ_INACTIVE;
marker.wire_count = 1;
/*
* Start scanning the inactive queue for pages we can move to the
* cache or free. The scan will stop when the target is reached or
* we have scanned the entire inactive queue. Note that m->act_count
* is not used to form decisions for the inactive queue, only for the
* active queue.
*
* maxlaunder limits the number of dirty pages we flush per scan.
* For most systems a smaller value (16 or 32) is more robust under
* extreme memory and disk pressure because any unnecessary writes
* to disk can result in extreme performance degradation. However,
* systems with excessive dirty pages (especially when MAP_NOSYNC is
* used) will die horribly with limited laundering. If the pageout
* daemon cannot clean enough pages in the first pass, we let it go
* all out in succeeding passes.
*/
if ((maxlaunder = vm_max_launder) <= 1)
maxlaunder = 1;
if (pass)
maxlaunder = 10000;
rescan0:
addl_page_shortage = addl_page_shortage_init;
maxscan = cnt.v_inactive_count;
for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
m != NULL && maxscan-- > 0 && page_shortage > 0;
m = next) {
cnt.v_pdpages++;
if (m->queue != PQ_INACTIVE) {
goto rescan0;
}
next = TAILQ_NEXT(m, pageq);
/*
* skip marker pages
*/
if (m->flags & PG_MARKER)
continue;
/*
* A held page may be undergoing I/O, so skip it.
*/
if (m->hold_count) {
vm_pageq_requeue(m);
addl_page_shortage++;
continue;
}
/*
* Don't mess with busy pages; keep them at the front of the
* queue, as they are most likely being paged out.
*/
if (m->busy || (m->flags & PG_BUSY)) {
addl_page_shortage++;
continue;
}
/*
* If the object is not being used, we ignore previous
* references.
*/
if (m->object->ref_count == 0) {
vm_page_flag_clear(m, PG_REFERENCED);
pmap_clear_reference(m);
/*
* Otherwise, if the page has been referenced while in the
* inactive queue, we bump the "activation count" upwards,
* making it less likely that the page will be added back to
* the inactive queue prematurely again. Here we check the
* page tables (or emulated bits, if any), since the upper
* level VM system does not know anything about existing
* references.
*/
} else if (((m->flags & PG_REFERENCED) == 0) &&
(actcount = pmap_ts_referenced(m))) {
vm_page_lock_queues();
vm_page_activate(m);
vm_page_unlock_queues();
m->act_count += (actcount + ACT_ADVANCE);
continue;
}
/*
* If the upper level VM system knows about any page
* references, we activate the page. We also set the
* "activation count" higher than normal so that we will less
* likely place pages back onto the inactive queue again.
*/
if ((m->flags & PG_REFERENCED) != 0) {
vm_page_flag_clear(m, PG_REFERENCED);
actcount = pmap_ts_referenced(m);
vm_page_lock_queues();
vm_page_activate(m);
vm_page_unlock_queues();
m->act_count += (actcount + ACT_ADVANCE + 1);
continue;
}
/*
* If the upper level VM system doesn't know anything about
* the page being dirty, we have to check for it again. As
* far as the VM code knows, any partially dirty pages are
* fully dirty.
*/
if (m->dirty == 0) {
vm_page_test_dirty(m);
} else {
vm_page_dirty(m);
}
/*
* Invalid pages can be easily freed
*/
if (m->valid == 0) {
vm_page_lock_queues();
vm_pageout_page_free(m);
vm_page_unlock_queues();
--page_shortage;
/*
* Clean pages can be placed onto the cache queue. This
* effectively frees them.
*/
} else if (m->dirty == 0) {
vm_page_lock_queues();
vm_page_cache(m);
vm_page_unlock_queues();
--page_shortage;
} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
/*
* Dirty pages need to be paged out, but flushing
* a page is extremely expensive versus freeing
* a clean page. Rather than artificially limiting
* the number of pages we can flush, we instead give
* dirty pages extra priority on the inactive queue
* by forcing them to be cycled through the queue
* twice before being flushed, after which the
* (now clean) page will cycle through once more
* before being freed. This significantly extends
* the thrash point for a heavily loaded machine.
*/
vm_page_flag_set(m, PG_WINATCFLS);
vm_pageq_requeue(m);
} else if (maxlaunder > 0) {
/*
* We always want to try to flush some dirty pages if
* we encounter them, to keep the system stable.
* Normally this number is small, but under extreme
* pressure where there are insufficient clean pages
* on the inactive queue, we may have to go all out.
*/
int swap_pageouts_ok;
struct vnode *vp = NULL;
struct mount *mp;
object = m->object;
if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
swap_pageouts_ok = 1;
} else {
swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
vm_page_count_min());
}
/*
* We don't bother paging objects that are "dead".
* Those objects are in a "rundown" state.
*/
if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
vm_pageq_requeue(m);
continue;
}
/*
* The object is already known NOT to be dead. It
* is possible for the vget() to block the whole
* pageout daemon, but the new low-memory handling
* code should prevent it.
*
* The previous code skipped locked vnodes and, worse,
* reordered pages in the queue. This results in
* completely non-deterministic operation and, on a
* busy system, can lead to extremely non-optimal
* pageouts. For example, it can cause clean pages
* to be freed and dirty pages to be moved to the end
* of the queue. Since dirty pages are also moved to
* the end of the queue once-cleaned, this gives
* way too large a weighting to deferring the freeing
* of dirty pages.
*
* We can't wait forever for the vnode lock, we might
* deadlock due to a vn_read() getting stuck in
* vm_wait while holding this vnode. We skip the
* vnode if we can't get it in a reasonable amount
* of time.
*/
if (object->type == OBJT_VNODE) {
vp = object->handle;
mp = NULL;
if (vp->v_type == VREG)
vn_start_write(vp, &mp, V_NOWAIT);
if (vget(vp, LK_EXCLUSIVE|LK_NOOBJ|LK_TIMELOCK, curthread)) {
++pageout_lock_miss;
vn_finished_write(mp);
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
continue;
}
/*
* The page might have been moved to another
* queue during potential blocking in vget()
* above. The page might have been freed and
* reused for another vnode. The object might
* have been reused for another vnode.
*/
if (m->queue != PQ_INACTIVE ||
m->object != object ||
object->handle != vp) {
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
vput(vp);
vn_finished_write(mp);
continue;
}
/*
* The page may have been busied during the
* blocking in vput(); we don't move the
* page back onto the end of the queue, which
* keeps the statistics more correct.
*/
if (m->busy || (m->flags & PG_BUSY)) {
vput(vp);
vn_finished_write(mp);
continue;
}
/*
* If the page has become held, it might
* be undergoing I/O, so skip it
*/
if (m->hold_count) {
vm_pageq_requeue(m);
if (object->flags & OBJ_MIGHTBEDIRTY)
vnodes_skipped++;
vput(vp);
vn_finished_write(mp);
continue;
}
}
/*
* If a page is dirty, then it is either being washed
* (but not yet cleaned) or it is still in the
* laundry. If it is still in the laundry, then we
* start the cleaning operation.
*
* This operation may cluster, invalidating the 'next'
* pointer. To prevent an inordinate number of
* restarts we use our marker to remember our place.
*
* decrement page_shortage on success to account for
* the (future) cleaned page. Otherwise we could wind
* up laundering or cleaning too many pages.
*/
vm_page_lock_queues();
s = splvm();
TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
splx(s);
if (vm_pageout_clean(m) != 0) {
--page_shortage;
--maxlaunder;
}
s = splvm();
next = TAILQ_NEXT(&marker, pageq);
TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
splx(s);
vm_page_unlock_queues();
if (vp) {
vput(vp);
vn_finished_write(mp);
}
}
}
/*
* Compute the number of pages we want to try to move from the
* active queue to the inactive queue.
*/
page_shortage = vm_paging_target() +
cnt.v_inactive_target - cnt.v_inactive_count;
page_shortage += addl_page_shortage;
vm_page_lock_queues();
/*
* Scan the active queue for things we can deactivate. We nominally
* track the per-page activity counter and use it to locate
* deactivation candidates.
*/
pcount = cnt.v_active_count;
m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
/*
* This is a consistency check, and should likely be a panic
* or warning.
*/
if (m->queue != PQ_ACTIVE) {
break;
}
next = TAILQ_NEXT(m, pageq);
/*
* Don't deactivate pages that are busy.
*/
if ((m->busy != 0) ||
(m->flags & PG_BUSY) ||
(m->hold_count != 0)) {
vm_pageq_requeue(m);
m = next;
continue;
}
/*
* The count for pagedaemon pages is done after checking the
* page for eligibility...
*/
cnt.v_pdpages++;
/*
* Check to see "how much" the page has been used.
*/
actcount = 0;
if (m->object->ref_count != 0) {
if (m->flags & PG_REFERENCED) {
actcount += 1;
}
actcount += pmap_ts_referenced(m);
if (actcount) {
m->act_count += ACT_ADVANCE + actcount;
if (m->act_count > ACT_MAX)
m->act_count = ACT_MAX;
}
}
/*
* Since we have "tested" this bit, we need to clear it now.
*/
vm_page_flag_clear(m, PG_REFERENCED);
/*
* Only if an object is currently being used, do we use the
* page activation count stats.
*/
if (actcount && (m->object->ref_count != 0)) {
vm_pageq_requeue(m);
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
if (vm_pageout_algorithm ||
m->object->ref_count == 0 ||
m->act_count == 0) {
page_shortage--;
if (m->object->ref_count == 0) {
vm_page_protect(m, VM_PROT_NONE);
if (m->dirty == 0)
vm_page_cache(m);
else
vm_page_deactivate(m);
} else {
vm_page_deactivate(m);
}
} else {
vm_pageq_requeue(m);
}
}
m = next;
}
s = splvm();
/*
* We try to maintain some *really* free pages; this allows interrupt
* code to be guaranteed space. Since both cache and free queues
* are considered basically 'free', moving pages from cache to free
* does not affect other calculations.
*/
while (cnt.v_free_count < cnt.v_free_reserved) {
static int cache_rover = 0;
m = vm_pageq_find(PQ_CACHE, cache_rover, FALSE);
if (!m)
break;
if ((m->flags & (PG_BUSY|PG_UNMANAGED)) ||
m->busy ||
m->hold_count ||
m->wire_count) {
#ifdef INVARIANTS
printf("Warning: busy page %p found in cache\n", m);
#endif
vm_page_deactivate(m);
continue;
}
cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK;
vm_pageout_page_free(m);
}
splx(s);
vm_page_unlock_queues();
#if !defined(NO_SWAPPING)
/*
* Idle process swapout -- run once per second.
*/
if (vm_swap_idle_enabled) {
static long lsec;
if (time_second != lsec) {
vm_pageout_req_swapout |= VM_SWAP_IDLE;
vm_req_vmdaemon();
lsec = time_second;
}
}
#endif
/*
* If we didn't get enough free pages, and we have skipped a vnode
* in a writeable object, wakeup the sync daemon. And kick swapout
* if we did not get enough free pages.
*/
if (vm_paging_target() > 0) {
if (vnodes_skipped && vm_page_count_min())
(void) speedup_syncer();
#if !defined(NO_SWAPPING)
if (vm_swap_enabled && vm_page_count_target()) {
vm_req_vmdaemon();
vm_pageout_req_swapout |= VM_SWAP_NORMAL;
}
#endif
}
/*
* If we are out of swap and were not able to reach our paging
* target, kill the largest process.
*
* We keep the process bigproc locked once we find it to keep anyone
* from messing with it; however, there is a possibility of
* deadlock if process B is bigproc and one of its child processes
* attempts to propagate a signal to B while we are waiting for A's
* lock while walking this list. To avoid this, we don't block on
* the process lock but just skip a process if it is already locked.
*/
if ((vm_swap_size < 64 && vm_page_count_min()) ||
(swap_pager_full && vm_paging_target() > 0)) {
#if 0
if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) {
#endif
bigproc = NULL;
bigsize = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
int breakout;
/*
* If this process is already locked, skip it.
*/
if (PROC_TRYLOCK(p) == 0)
continue;
/*
* if this is a system process, skip it
*/
if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
((p->p_pid < 48) && (vm_swap_size != 0))) {
PROC_UNLOCK(p);
continue;
}
/*
* if the process is in a non-running type state,
* don't touch it. Check all the threads individually.
*/
mtx_lock_spin(&sched_lock);
breakout = 0;
FOREACH_THREAD_IN_PROC(p, td) {
if (!TD_ON_RUNQ(td) &&
!TD_IS_RUNNING(td) &&
!TD_IS_SLEEPING(td)) {
breakout = 1;
break;
}
}
if (breakout) {
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(p);
continue;
}
mtx_unlock_spin(&sched_lock);
/*
* get the process size
*/
size = vmspace_resident_count(p->p_vmspace) +
vmspace_swap_count(p->p_vmspace);
/*
* if this process is bigger than the biggest one,
* remember it.
*/
if (size > bigsize) {
if (bigproc != NULL)
PROC_UNLOCK(bigproc);
bigproc = p;
bigsize = size;
} else
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
if (bigproc != NULL) {
struct ksegrp *kg;
killproc(bigproc, "out of swap space");
mtx_lock_spin(&sched_lock);
FOREACH_KSEGRP_IN_PROC(bigproc, kg) {
kg->kg_estcpu = 0;
kg->kg_nice = PRIO_MIN; /* XXXKSE ??? */
resetpriority(kg);
}
mtx_unlock_spin(&sched_lock);
PROC_UNLOCK(bigproc);
wakeup(&cnt.v_free_count);
}
}
}
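/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): the
 * "kill the largest process" selection described above, reduced to a
 * scan over a hypothetical process table.  The struct and the sample
 * data are inventions for the example; the kernel sizes a process as
 * resident pages plus swapped pages and skips system processes.
 */
#include <stdbool.h>
#include <stdio.h>

struct ex_proc {
	const char	*name;
	bool		 system;	/* P_SYSTEM analogue */
	unsigned long	 resident;	/* vmspace_resident_count() analogue */
	unsigned long	 swapped;	/* vmspace_swap_count() analogue */
};

int
main(void)
{
	struct ex_proc procs[] = {
		{ "pagedaemon",	true,	 64,	  0 },
		{ "editor",	false,	4096,	512 },
		{ "simulator",	false, 90000, 30000 },
		{ "shell",	false,	 256,	  0 },
	};
	const struct ex_proc *bigproc = NULL;
	unsigned long bigsize = 0;

	for (size_t i = 0; i < sizeof(procs) / sizeof(procs[0]); i++) {
		if (procs[i].system)		/* never pick system processes */
			continue;
		unsigned long size = procs[i].resident + procs[i].swapped;
		if (size > bigsize) {
			bigproc = &procs[i];
			bigsize = size;
		}
	}
	if (bigproc != NULL)
		printf("would kill %s (%lu pages)\n", bigproc->name, bigsize);
	return (0);
}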
/*
* This routine tries to maintain the pseudo LRU active queue,
* so that during long periods of time when there is no paging,
* some statistic accumulation still occurs. This code
* helps the situation where paging just starts to occur.
*/
static void
vm_pageout_page_stats()
{
vm_page_t m,next;
int pcount,tpcount; /* Number of pages to check */
static int fullintervalcount = 0;
int page_shortage;
int s0;
page_shortage =
(cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
(cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
if (page_shortage <= 0)
return;
s0 = splvm();
vm_page_lock_queues();
pcount = cnt.v_active_count;
fullintervalcount += vm_pageout_stats_interval;
if (fullintervalcount < vm_pageout_full_stats_interval) {
tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count;
if (pcount > tpcount)
pcount = tpcount;
} else {
fullintervalcount = 0;
}
m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
while ((m != NULL) && (pcount-- > 0)) {
int actcount;
if (m->queue != PQ_ACTIVE) {
break;
}
next = TAILQ_NEXT(m, pageq);
/*
* Don't deactivate pages that are busy.
*/
if ((m->busy != 0) ||
(m->flags & PG_BUSY) ||
(m->hold_count != 0)) {
vm_pageq_requeue(m);
m = next;
continue;
}
actcount = 0;
if (m->flags & PG_REFERENCED) {
vm_page_flag_clear(m, PG_REFERENCED);
actcount += 1;
}
actcount += pmap_ts_referenced(m);
if (actcount) {
m->act_count += ACT_ADVANCE + actcount;
if (m->act_count > ACT_MAX)
m->act_count = ACT_MAX;
vm_pageq_requeue(m);
} else {
if (m->act_count == 0) {
/*
* We turn off page access, so that we have
* more accurate RSS stats. We don't do this
* in the normal page deactivation when the
* system is loaded VM wise, because the
* cost of the large number of page protect
* operations would be higher than the value
* of doing the operation.
*/
vm_page_protect(m, VM_PROT_NONE);
vm_page_deactivate(m);
} else {
m->act_count -= min(m->act_count, ACT_DECLINE);
vm_pageq_requeue(m);
}
}
m = next;
}
vm_page_unlock_queues();
splx(s0);
}
static int
vm_pageout_free_page_calc(count)
vm_size_t count;
{
if (count < cnt.v_page_count)
return 0;
/*
* free_reserved needs to include enough for the largest swap pager
* structures plus enough for any pv_entry structs when paging.
*/
if (cnt.v_page_count > 1024)
cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
else
cnt.v_free_min = 4;
cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
cnt.v_interrupt_free_min;
cnt.v_free_reserved = vm_pageout_page_count +
cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
cnt.v_free_severe = cnt.v_free_min / 2;
cnt.v_free_min += cnt.v_free_reserved;
cnt.v_free_severe += cnt.v_free_reserved;
return 1;
}
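/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): the
 * threshold arithmetic of vm_pageout_free_page_calc() above, evaluated for
 * a hypothetical 256 MB machine.  The constants below (4 KiB pages, a
 * 64 KiB MAXBSIZE, PQ_L2_SIZE of 256, vm_pageout_page_count of 16 and an
 * interrupt_free_min of 2) are assumptions made for the example, not
 * values taken from any particular kernel configuration.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long page_count = (256UL * 1024 * 1024) / 4096; /* 65536 pages */
	unsigned long maxbsize = 65536, page_size = 4096;
	unsigned long pq_l2_size = 256, pageout_page_count = 16;
	unsigned long interrupt_free_min = 2;

	unsigned long free_min = (page_count > 1024) ?
	    4 + (page_count - 1024) / 200 : 4;
	unsigned long pageout_free_min = (2 * maxbsize) / page_size +
	    interrupt_free_min;
	unsigned long free_reserved = pageout_page_count + pageout_free_min +
	    page_count / 768 + pq_l2_size;
	unsigned long free_severe = free_min / 2 + free_reserved;

	free_min += free_reserved;

	printf("v_free_min      = %lu pages\n", free_min);
	printf("v_free_reserved = %lu pages\n", free_reserved);
	printf("v_free_severe   = %lu pages\n", free_severe);
	return (0);
}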
/*
* vm_pageout is the high level pageout daemon.
*/
static void
vm_pageout()
{
int pass;
mtx_lock(&Giant);
/*
* Initialize some paging parameters.
*/
cnt.v_interrupt_free_min = 2;
if (cnt.v_page_count < 2000)
vm_pageout_page_count = 8;
vm_pageout_free_page_calc(cnt.v_page_count);
/*
* v_free_target and v_cache_min control pageout hysteresis. Note
* that these are more a measure of the VM cache queue hysteresis
* than the VM free queue. Specifically, v_free_target is the
* high water mark (free+cache pages).
*
* v_free_reserved + v_cache_min (mostly means v_cache_min) is the
* low water mark, while v_free_min is the stop. v_cache_min must
* be big enough to handle memory needs while the pageout daemon
* is signalled and run to free more pages.
*/
if (cnt.v_free_count > 6144)
cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
else
cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
if (cnt.v_free_count > 2048) {
cnt.v_cache_min = cnt.v_free_target;
cnt.v_cache_max = 2 * cnt.v_cache_min;
cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
} else {
cnt.v_cache_min = 0;
cnt.v_cache_max = 0;
cnt.v_inactive_target = cnt.v_free_count / 4;
}
if (cnt.v_inactive_target > cnt.v_free_count / 3)
cnt.v_inactive_target = cnt.v_free_count / 3;
/* XXX does not really belong here */
if (vm_page_max_wired == 0)
vm_page_max_wired = cnt.v_free_count / 3;
if (vm_pageout_stats_max == 0)
vm_pageout_stats_max = cnt.v_free_target;
/*
* Set interval in seconds for stats scan.
*/
if (vm_pageout_stats_interval == 0)
vm_pageout_stats_interval = 5;
if (vm_pageout_full_stats_interval == 0)
vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
/*
* Set maximum free per pass
*/
if (vm_pageout_stats_free_max == 0)
vm_pageout_stats_free_max = 5;
swap_pager_swap_init();
pass = 0;
/*
* The pageout daemon is never done, so loop forever.
*/
while (TRUE) {
int error;
int s = splvm();
/*
* If we have enough free memory, wakeup waiters. Do
* not clear vm_pages_needed until we reach our target,
* otherwise we may be woken up over and over again and
* waste a lot of cpu.
*/
if (vm_pages_needed && !vm_page_count_min()) {
if (vm_paging_needed() <= 0)
vm_pages_needed = 0;
wakeup(&cnt.v_free_count);
}
if (vm_pages_needed) {
/*
* Still not done, take a second pass without waiting
* (unlimited dirty cleaning), otherwise sleep a bit
* and try again.
*/
++pass;
if (pass > 1)
tsleep(&vm_pages_needed, PVM,
"psleep", hz/2);
} else {
/*
* Good enough, sleep & handle stats. Prime the pass
* for the next run.
*/
if (pass > 1)
pass = 1;
else
pass = 0;
error = tsleep(&vm_pages_needed, PVM,
"psleep", vm_pageout_stats_interval * hz);
if (error && !vm_pages_needed) {
splx(s);
pass = 0;
vm_pageout_page_stats();
continue;
}
}
if (vm_pages_needed)
cnt.v_pdwakeups++;
splx(s);
vm_pageout_scan(pass);
vm_pageout_deficit = 0;
}
}
void
pagedaemon_wakeup()
{
if (!vm_pages_needed && curthread->td_proc != pageproc) {
vm_pages_needed++;
wakeup(&vm_pages_needed);
}
}
#if !defined(NO_SWAPPING)
static void
vm_req_vmdaemon()
{
static int lastrun = 0;
if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
wakeup(&vm_daemon_needed);
lastrun = ticks;
}
}
static void
vm_daemon()
{
struct proc *p;
int breakout;
struct thread *td;
mtx_lock(&Giant);
while (TRUE) {
tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
if (vm_pageout_req_swapout) {
swapout_procs(vm_pageout_req_swapout);
vm_pageout_req_swapout = 0;
}
/*
* scan the processes for those exceeding their rlimits or that
* are swapped out -- deactivate pages
*/
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
vm_pindex_t limit, size;
/*
* if this is a system process or if we have already
* looked at this process, skip it.
*/
if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
continue;
}
/*
* if the process is in a non-running type state,
* don't touch it.
*/
mtx_lock_spin(&sched_lock);
breakout = 0;
FOREACH_THREAD_IN_PROC(p, td) {
if (!TD_ON_RUNQ(td) &&
!TD_IS_RUNNING(td) &&
!TD_IS_SLEEPING(td)) {
breakout = 1;
break;
}
}
if (breakout) {
mtx_unlock_spin(&sched_lock);
continue;
}
/*
* get a limit
*/
limit = OFF_TO_IDX(
qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
p->p_rlimit[RLIMIT_RSS].rlim_max));
/*
* let processes that are swapped out really be
* swapped out: set the limit to nothing (this will
* force a swap-out).
*/
if ((p->p_sflag & PS_INMEM) == 0)
limit = 0; /* XXX */
mtx_unlock_spin(&sched_lock);
size = vmspace_resident_count(p->p_vmspace);
if (limit >= 0 && size >= limit) {
vm_pageout_map_deactivate_pages(
&p->p_vmspace->vm_map, limit);
}
}
sx_sunlock(&allproc_lock);
}
}
#endif /* !defined(NO_SWAPPING) */
Index: head/sys/vm/vm_unix.c
===================================================================
--- head/sys/vm/vm_unix.c (revision 103766)
+++ head/sys/vm/vm_unix.c (revision 103767)
@@ -1,153 +1,153 @@
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_unix.c 1.1 89/11/07$
*
* @(#)vm_unix.c 8.1 (Berkeley) 6/11/93
* $FreeBSD$
*/
/*
* Traditional sbrk/grow interface to VM
*/
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#ifndef _SYS_SYSPROTO_H_
struct obreak_args {
char *nsize;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
obreak(td, uap)
struct thread *td;
struct obreak_args *uap;
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_offset_t new, old, base;
int rv;
int error = 0;
new = round_page((vm_offset_t)uap->nsize);
vm_map_lock(&vm->vm_map);
base = round_page((vm_offset_t) vm->vm_daddr);
old = base + ctob(vm->vm_dsize);
if (new > base) {
/*
* Check the resource limit, but allow a process to reduce
* its usage, even if it remains over the limit.
*/
if (new - base > td->td_proc->p_rlimit[RLIMIT_DATA].rlim_cur &&
new > old) {
error = ENOMEM;
goto done;
}
- if (new > VM_MAXUSER_ADDRESS) {
+ if (new > vm_map_max(&vm->vm_map)) {
error = ENOMEM;
goto done;
}
} else if (new < base) {
/*
* This is simply an invalid value. If someone wants to
* do fancy address space manipulations, mmap and munmap
* can do most of what the user would want.
*/
error = EINVAL;
goto done;
}
if (new > old) {
if (vm->vm_map.size + (new - old) >
td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
error = ENOMEM;
goto done;
}
rv = vm_map_insert(&vm->vm_map, NULL, 0, old, new,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS) {
error = ENOMEM;
goto done;
}
vm->vm_dsize += btoc(new - old);
} else if (new < old) {
rv = vm_map_delete(&vm->vm_map, new, old);
if (rv != KERN_SUCCESS) {
error = ENOMEM;
goto done;
}
vm->vm_dsize -= btoc(old - new);
}
done:
vm_map_unlock(&vm->vm_map);
return (error);
}
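/*
 * Editor's illustrative sketch (not part of the FreeBSD sources): a
 * userland view of the break interface that obreak() implements above.
 * sbrk() is the traditional wrapper; growing the data segment past
 * RLIMIT_DATA (or past the top of the map) makes the kernel return
 * ENOMEM, which sbrk() reports as (void *)-1.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	void *before = sbrk(0);			/* current break */
	void *grown = sbrk(64 * 1024);		/* ask obreak() for 64 KiB more */

	if (grown == (void *)-1) {
		printf("sbrk failed: %s\n", strerror(errno));
		return (1);
	}
	printf("break moved from %p to %p\n", before, sbrk(0));

	/* Shrinking the break releases the pages again via vm_map_delete(). */
	sbrk(-(64 * 1024));
	return (0);
}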
#ifndef _SYS_SYSPROTO_H_
struct ovadvise_args {
int anom;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
ovadvise(td, uap)
struct thread *td;
struct ovadvise_args *uap;
{
/* START_GIANT_OPTIONAL */
/* END_GIANT_OPTIONAL */
return (EINVAL);
}
