No OneTemporary
Actions

Size

100 KB

Referenced Files

None

Subscribers

None

View Options

	Index: stable/8/sys/amd64/include/xen
	===================================================================
	--- stable/8/sys/amd64/include/xen (revision 220261)
	+++ stable/8/sys/amd64/include/xen (revision 220262)

	Property changes on: stable/8/sys/amd64/include/xen
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /head/sys/amd64/include/xen:r219041-219042,219311-219312
	Index: stable/8/sys/cddl/contrib/opensolaris
	===================================================================
	--- stable/8/sys/cddl/contrib/opensolaris (revision 220261)
	+++ stable/8/sys/cddl/contrib/opensolaris (revision 220262)

	Property changes on: stable/8/sys/cddl/contrib/opensolaris
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /head/sys/cddl/contrib/opensolaris:r219041-219042,219311-219312
	Index: stable/8/sys/contrib/dev/acpica
	===================================================================
	--- stable/8/sys/contrib/dev/acpica (revision 220261)
	+++ stable/8/sys/contrib/dev/acpica (revision 220262)

	Property changes on: stable/8/sys/contrib/dev/acpica
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /head/sys/contrib/dev/acpica:r219041-219042,219311-219312
	Index: stable/8/sys/contrib/pf
	===================================================================
	--- stable/8/sys/contrib/pf (revision 220261)
	+++ stable/8/sys/contrib/pf (revision 220262)

	Property changes on: stable/8/sys/contrib/pf
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /head/sys/contrib/pf:r219041-219042,219311-219312
	Index: stable/8/sys/kern/kern_exec.c
	===================================================================
	--- stable/8/sys/kern/kern_exec.c (revision 220261)
	+++ stable/8/sys/kern/kern_exec.c (revision 220262)
	@@ -1,1422 +1,1428 @@
	/*-
	* Copyright (c) 1993, David Greenman
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/

	#include <sys/cdefs.h>
	__FBSDID("$FreeBSD$");

	#include "opt_hwpmc_hooks.h"
	#include "opt_kdtrace.h"
	#include "opt_ktrace.h"
	#include "opt_vm.h"

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/eventhandler.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/sysproto.h>
	#include <sys/signalvar.h>
	#include <sys/kernel.h>
	#include <sys/mount.h>
	#include <sys/filedesc.h>
	#include <sys/fcntl.h>
	#include <sys/acct.h>
	#include <sys/exec.h>
	#include <sys/imgact.h>
	#include <sys/imgact_elf.h>
	#include <sys/wait.h>
	#include <sys/malloc.h>
	#include <sys/priv.h>
	#include <sys/proc.h>
	#include <sys/pioctl.h>
	#include <sys/namei.h>
	#include <sys/resourcevar.h>
	#include <sys/sdt.h>
	#include <sys/sf_buf.h>
	#include <sys/syscallsubr.h>
	#include <sys/sysent.h>
	#include <sys/shm.h>
	#include <sys/sysctl.h>
	#include <sys/vnode.h>
	#include <sys/stat.h>
	#ifdef KTRACE
	#include <sys/ktrace.h>
	#endif

	#include <vm/vm.h>
	#include <vm/vm_param.h>
	#include <vm/pmap.h>
	#include <vm/vm_page.h>
	#include <vm/vm_map.h>
	#include <vm/vm_kern.h>
	#include <vm/vm_extern.h>
	#include <vm/vm_object.h>
	#include <vm/vm_pager.h>

	#ifdef HWPMC_HOOKS
	#include <sys/pmckern.h>
	#endif

	#include <machine/reg.h>

	#include <security/audit/audit.h>
	#include <security/mac/mac_framework.h>

	#ifdef KDTRACE_HOOKS
	#include <sys/dtrace_bsd.h>
	dtrace_execexit_func_t dtrace_fasttrap_exec;
	#endif

	/*
	 * Statically-defined tracing probes for exec in the "proc" provider,
	 * "kernel" module: exec, exec-failure and exec-success.  Argument 0 is
	 * declared as "char *" for exec and exec-success and as "int" for
	 * exec-failure.
	 */
	SDT_PROVIDER_DECLARE(proc);
	SDT_PROBE_DEFINE(proc, kernel, , exec, exec);
	SDT_PROBE_ARGTYPE(proc, kernel, , exec, 0, "char *");
	SDT_PROBE_DEFINE(proc, kernel, , exec_failure, exec-failure);
	SDT_PROBE_ARGTYPE(proc, kernel, , exec_failure, 0, "int");
	SDT_PROBE_DEFINE(proc, kernel, , exec_success, exec-success);
	SDT_PROBE_ARGTYPE(proc, kernel, , exec_success, 0, "char *");

	/* Malloc type M_PARGS, short name "proc-args" ("Process arguments"). */
	MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");

	/* Forward declarations for the file-local helpers defined below. */
	static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
	static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
	static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
	static int do_execve(struct thread *td, struct image_args *args,
	    struct mac *mac_p);
	static void exec_free_args(struct image_args *);

	/* XXX This should be vm_size_t. */
	SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
	    NULL, 0, sysctl_kern_ps_strings, "LU", "");

	/* XXX This should be vm_size_t. */
	SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
	    NULL, 0, sysctl_kern_usrstack, "LU", "");

	SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
	    NULL, 0, sysctl_kern_stackprot, "I", "");

	/*
	 * Upper bound, in bytes, on the argument strings plus struct pargs header
	 * that do_execve() will cache for a process.
	 */
	u_long ps_arg_cache_limit = PAGE_SIZE / 16;
	SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
	    &ps_arg_cache_limit, 0, "");

	/*
	 * When zero, exec_new_vmspace() raises the minimum user address to at
	 * least PAGE_SIZE.
	 */
	static int map_at_zero = 0;
	TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
	SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
	    "Permit processes to map an object at virtual address 0.");

	static int
	sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
	{
	struct proc *p;
	int error;

	p = curproc;
	#ifdef SCTL_MASK32
	if (req->flags & SCTL_MASK32) {
	unsigned int val;
	val = (unsigned int)p->p_sysent->sv_psstrings;
	error = SYSCTL_OUT(req, &val, sizeof(val));
	} else
	#endif
	error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
	sizeof(p->p_sysent->sv_psstrings));
	return error;
	}

	static int
	sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
	{
	struct proc *p;
	int error;

	p = curproc;
	#ifdef SCTL_MASK32
	if (req->flags & SCTL_MASK32) {
	unsigned int val;
	val = (unsigned int)p->p_sysent->sv_usrstack;
	error = SYSCTL_OUT(req, &val, sizeof(val));
	} else
	#endif
	error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
	sizeof(p->p_sysent->sv_usrstack));
	return error;
	}

	static int
	sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
	{
	struct proc *p;

	p = curproc;
	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
	sizeof(p->p_sysent->sv_stackprot)));
	}

	/*
	 * Table of image activators.  Each of the items is a pointer to a
	 * `const struct execsw', hence the double pointer here.  do_execve()
	 * walks the table until it reaches a NULL entry.
	 */
	static const struct execsw **execsw;

	/*
	 * Argument layout for execve(); only compiled when _SYS_SYSPROTO_H_ has
	 * not already been defined.
	 */
	#ifndef _SYS_SYSPROTO_H_
	struct execve_args {
	char *fname;	/* path of the image, passed to exec_copyin_args() */
	char **argv;	/* argument vector */
	char **envv;	/* environment vector */
	};
	#endif

	int
	execve(td, uap)
	struct thread *td;
	struct execve_args /* {
	char *fname;
	char **argv;
	char **envv;
	} / uap;
	{
	int error;
	struct image_args args;

	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
	uap->argv, uap->envv);
	if (error == 0)
	error = kern_execve(td, &args, NULL);
	return (error);
	}

	/*
	 * Argument layout for fexecve(); only compiled when _SYS_SYSPROTO_H_ has
	 * not already been defined.
	 */
	#ifndef _SYS_SYSPROTO_H_
	struct fexecve_args {
		int fd;		/* descriptor of the image to execute */
		char **argv;	/* argument vector */
		char **envv;	/* environment vector */
	};
	#endif
	int
	fexecve(struct thread td, struct fexecve_args uap)
	{
	int error;
	struct image_args args;

	error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
	uap->argv, uap->envv);
	if (error == 0) {
	args.fd = uap->fd;
	error = kern_execve(td, &args, NULL);
	}
	return (error);
	}

	/*
	 * Argument layout for __mac_execve(); only compiled when
	 * _SYS_SYSPROTO_H_ has not already been defined.
	 */
	#ifndef _SYS_SYSPROTO_H_
	struct __mac_execve_args {
	char *fname;	/* path of the image, passed to exec_copyin_args() */
	char **argv;	/* argument vector */
	char **envv;	/* environment vector */
	struct mac *mac_p;	/* MAC label handed on to kern_execve() */
	};
	#endif

	int
	__mac_execve(td, uap)
	struct thread *td;
	struct __mac_execve_args /* {
	char *fname;
	char **argv;
	char **envv;
	struct mac *mac_p;
	} / uap;
	{
	#ifdef MAC
	int error;
	struct image_args args;

	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
	uap->argv, uap->envv);
	if (error == 0)
	error = kern_execve(td, &args, uap->mac_p);
	return (error);
	#else
	return (ENOSYS);
	#endif
	}

	/*
	 * Common back end of the exec system calls.
	 *
	 * XXX: this function does not always return to its caller.  When
	 * do_execve() fails badly enough it ends in exit1(), so callers must not
	 * hold anything they would need to undo afterwards (e.g., allocated
	 * memory).
	 *
	 * For a process that has had threads, the other threads are first parked
	 * at a boundary; if that cannot be done the arguments are freed and
	 * ERESTART is returned so the call is retried.  After do_execve() the
	 * single-threading is either upgraded to SINGLE_EXIT (success) or ended
	 * (failure).
	 */
	int
	kern_execve(td, args, mac_p)
		struct thread *td;
		struct image_args *args;
		struct mac *mac_p;
	{
		struct proc *proc;
		int busy, rv;

		proc = td->td_proc;

		AUDIT_ARG_ARGV(args->begin_argv, args->argc,
		    args->begin_envv - args->begin_argv);
		AUDIT_ARG_ENVV(args->begin_envv, args->envc,
		    args->endp - args->begin_envv);

		if ((proc->p_flag & P_HADTHREADS) != 0) {
			PROC_LOCK(proc);
			busy = thread_single(SINGLE_BOUNDARY);
			PROC_UNLOCK(proc);
			if (busy) {
				exec_free_args(args);
				return (ERESTART);	/* Try again later. */
			}
		}

		rv = do_execve(td, args, mac_p);

		if ((proc->p_flag & P_HADTHREADS) == 0)
			return (rv);

		PROC_LOCK(proc);
		if (rv != 0) {
			thread_single_end();
		} else {
			/*
			 * On success, move to the SINGLE_EXIT state so that
			 * the remaining threads are forced to exit.
			 */
			thread_single(SINGLE_EXIT);
		}
		PROC_UNLOCK(proc);

		return (rv);
	}

	/*
	* In-kernel implementation of execve(). All arguments are assumed to be
	* userspace pointers from the passed thread.
	*/
	static int
	do_execve(td, args, mac_p)
	struct thread *td;
	struct image_args *args;
	struct mac *mac_p;
	{
	struct proc *p = td->td_proc;
	struct nameidata nd;
	struct ucred newcred = NULL, oldcred;
	struct uidinfo *euip;
	register_t *stack_base;
	int error, i;
	struct image_params image_params, *imgp;
	struct vattr attr;
	int (img_first)(struct image_params );
	struct pargs oldargs = NULL, newargs = NULL;
	struct sigacts oldsigacts, newsigacts;
	#ifdef KTRACE
	struct vnode *tracevp = NULL;
	struct ucred *tracecred = NULL;
	#endif
	struct vnode textvp = NULL, binvp = NULL;
	int credential_changing;
	int vfslocked;
	int textset;
	#ifdef MAC
	struct label *interpvplabel = NULL;
	int will_transition;
	#endif
	#ifdef HWPMC_HOOKS
	struct pmckern_procexec pe;
	#endif
	static const char fexecv_proc_title[] = "(fexecv)";

	vfslocked = 0;
	imgp = &image_params;

	/*
	* Lock the process and set the P_INEXEC flag to indicate that
	* it should be left alone until we're done here. This is
	* necessary to avoid race conditions - e.g. in ptrace() -
	* that might allow a local user to illicitly obtain elevated
	* privileges.
	*/
	PROC_LOCK(p);
	KASSERT((p->p_flag & P_INEXEC) == 0,
	("%s(): process already has P_INEXEC flag", __func__));
	p->p_flag \|= P_INEXEC;
	PROC_UNLOCK(p);

	/*
	* Initialize part of the common data
	*/
	imgp->proc = p;
	imgp->execlabel = NULL;
	imgp->attr = &attr;
	imgp->entry_addr = 0;
	imgp->vmspace_destroyed = 0;
	imgp->interpreted = 0;
	imgp->opened = 0;
	imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX;
	imgp->auxargs = NULL;
	imgp->vp = NULL;
	imgp->object = NULL;
	imgp->firstpage = NULL;
	imgp->ps_strings = 0;
	imgp->auxarg_size = 0;
	imgp->args = args;
	imgp->execpath = imgp->freepath = NULL;
	imgp->execpathp = 0;

	#ifdef MAC
	error = mac_execve_enter(imgp, mac_p);
	if (error)
	goto exec_fail;
	#endif

	imgp->image_header = NULL;

	/*
	* Translate the file name. namei() returns a vnode pointer
	* in ni_vp amoung other things.
	*
	* XXXAUDIT: It would be desirable to also audit the name of the
	* interpreter if this is an interpreted binary.
	*/
	if (args->fname != NULL) {
	NDINIT(&nd, LOOKUP, ISOPEN \| LOCKLEAF \| FOLLOW \| SAVENAME
	\| MPSAFE \| AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
	}

	SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );

	interpret:
	if (args->fname != NULL) {
	error = namei(&nd);
	if (error)
	goto exec_fail;

	vfslocked = NDHASGIANT(&nd);
	binvp = nd.ni_vp;
	imgp->vp = binvp;
	} else {
	AUDIT_ARG_FD(args->fd);
	error = fgetvp(td, args->fd, &binvp);
	if (error)
	goto exec_fail;
	vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
	vn_lock(binvp, LK_EXCLUSIVE \| LK_RETRY);
	AUDIT_ARG_VNODE1(binvp);
	imgp->vp = binvp;
	}

	/*
	* Check file permissions (also 'opens' file)
	*/
	error = exec_check_permissions(imgp);
	if (error)
	goto exec_fail_dealloc;

	imgp->object = imgp->vp->v_object;
	if (imgp->object != NULL)
	vm_object_reference(imgp->object);

	/*
	* Set VV_TEXT now so no one can write to the executable while we're
	* activating it.
	*
	* Remember if this was set before and unset it in case this is not
	* actually an executable image.
	*/
	textset = imgp->vp->v_vflag & VV_TEXT;
	imgp->vp->v_vflag \|= VV_TEXT;

	error = exec_map_first_page(imgp);
	if (error)
	goto exec_fail_dealloc;

	imgp->proc->p_osrel = 0;
	/*
	* If the current process has a special image activator it
	* wants to try first, call it. For example, emulating shell
	* scripts differently.
	*/
	error = -1;
	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
	error = img_first(imgp);

	/*
	* Loop through the list of image activators, calling each one.
	* An activator returns -1 if there is no match, 0 on success,
	* and an error otherwise.
	*/
	for (i = 0; error == -1 && execsw[i]; ++i) {
	if (execsw[i]->ex_imgact == NULL \|\|
	execsw[i]->ex_imgact == img_first) {
	continue;
	}
	error = (*execsw[i]->ex_imgact)(imgp);
	}

	if (error) {
	if (error == -1) {
	if (textset == 0)
	imgp->vp->v_vflag &= ~VV_TEXT;
	error = ENOEXEC;
	}
	goto exec_fail_dealloc;
	}

	/*
	* Special interpreter operation, cleanup and loop up to try to
	* activate the interpreter.
	*/
	if (imgp->interpreted) {
	exec_unmap_first_page(imgp);
	/*
	* VV_TEXT needs to be unset for scripts. There is a short
	* period before we determine that something is a script where
	* VV_TEXT will be set. The vnode lock is held over this
	* entire period so nothing should illegitimately be blocked.
	*/
	imgp->vp->v_vflag &= ~VV_TEXT;
	/* free name buffer and old vnode */
	if (args->fname != NULL)
	NDFREE(&nd, NDF_ONLY_PNBUF);
	#ifdef MAC
	mac_execve_interpreter_enter(binvp, &interpvplabel);
	#endif
	if (imgp->opened) {
	VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
	imgp->opened = 0;
	}
	vput(binvp);
	vm_object_deallocate(imgp->object);
	imgp->object = NULL;
	VFS_UNLOCK_GIANT(vfslocked);
	vfslocked = 0;
	/* set new name to that of the interpreter */
	NDINIT(&nd, LOOKUP, LOCKLEAF \| FOLLOW \| SAVENAME \| MPSAFE,
	UIO_SYSSPACE, imgp->interpreter_name, td);
	args->fname = imgp->interpreter_name;
	goto interpret;
	}

	/*
	* NB: We unlock the vnode here because it is believed that none
	* of the sv_copyout_strings/sv_fixup operations require the vnode.
	*/
	VOP_UNLOCK(imgp->vp, 0);

	/*
	* Do the best to calculate the full path to the image file.
	*/
	if (imgp->auxargs != NULL &&
	((args->fname != NULL && args->fname[0] == '/') \|\|
	vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
	imgp->execpath = args->fname;

	/*
	* Copy out strings (args and env) and initialize stack base
	*/
	if (p->p_sysent->sv_copyout_strings)
	stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
	else
	stack_base = exec_copyout_strings(imgp);

	/*
	* If custom stack fixup routine present for this process
	* let it do the stack setup.
	* Else stuff argument count as first item on stack
	*/
	if (p->p_sysent->sv_fixup != NULL)
	(*p->p_sysent->sv_fixup)(&stack_base, imgp);
	else
	suword(--stack_base, imgp->args->argc);

	/*
	* For security and other reasons, the file descriptor table cannot
	* be shared after an exec.
	*/
	fdunshare(p, td);

	/*
	* Malloc things before we need locks.
	*/
	newcred = crget();
	euip = uifind(attr.va_uid);
	i = imgp->args->begin_envv - imgp->args->begin_argv;
	/* Cache arguments if they fit inside our allowance */
	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
	newargs = pargs_alloc(i);
	bcopy(imgp->args->begin_argv, newargs->ar_args, i);
	}

	/* close files on exec */
	fdcloseexec(td);
	vn_lock(imgp->vp, LK_EXCLUSIVE \| LK_RETRY);

	/* Get a reference to the vnode prior to locking the proc */
	VREF(binvp);

	/*
	* For security and other reasons, signal handlers cannot
	* be shared after an exec. The new process gets a copy of the old
	* handlers. In execsigs(), the new process will have its signals
	* reset.
	*/
	PROC_LOCK(p);
	oldcred = crcopysafe(p, newcred);
	if (sigacts_shared(p->p_sigacts)) {
	oldsigacts = p->p_sigacts;
	PROC_UNLOCK(p);
	newsigacts = sigacts_alloc();
	sigacts_copy(newsigacts, oldsigacts);
	PROC_LOCK(p);
	p->p_sigacts = newsigacts;
	} else
	oldsigacts = NULL;

	/* Stop profiling */
	stopprofclock(p);

	/* reset caught signals */
	execsigs(p);

	/* name this process - nameiexec(p, ndp) */
	bzero(p->p_comm, sizeof(p->p_comm));
	if (args->fname)
	bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
	min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
	else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
	bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));

	/*
	* mark as execed, wakeup the process that vforked (if any) and tell
	* it that it now has its own resources back
	*/
	p->p_flag \|= P_EXEC;
	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
	p->p_flag &= ~P_PPWAIT;
	cv_broadcast(&p->p_pwait);
	}

	/*
	* Implement image setuid/setgid.
	*
	* Don't honor setuid/setgid if the filesystem prohibits it or if
	* the process is being traced.
	*
	* XXXMAC: For the time being, use NOSUID to also prohibit
	* transitions on the file system.
	*/
	credential_changing = 0;
	credential_changing \|= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
	attr.va_uid;
	credential_changing \|= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
	attr.va_gid;
	#ifdef MAC
	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
	interpvplabel, imgp);
	credential_changing \|= will_transition;
	#endif

	if (credential_changing &&
	(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
	(p->p_flag & P_TRACED) == 0) {
	/*
	* Turn off syscall tracing for set-id programs, except for
	* root. Record any set-id flags first to make sure that
	* we do not regain any tracing during a possible block.
	*/
	setsugid(p);

	#ifdef KTRACE
	if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0))
	ktrprocexec(p, &tracecred, &tracevp);
	#endif
	/*
	* Close any file descriptors 0..2 that reference procfs,
	* then make sure file descriptors 0..2 are in use.
	*
	* setugidsafety() may call closef() and then pfind()
	* which may grab the process lock.
	* fdcheckstd() may call falloc() which may block to
	* allocate memory, so temporarily drop the process lock.
	*/
	PROC_UNLOCK(p);
	VOP_UNLOCK(imgp->vp, 0);
	setugidsafety(td);
	error = fdcheckstd(td);
	vn_lock(imgp->vp, LK_EXCLUSIVE \| LK_RETRY);
	if (error != 0)
	goto done1;
	PROC_LOCK(p);
	/*
	* Set the new credentials.
	*/
	if (attr.va_mode & S_ISUID)
	change_euid(newcred, euip);
	if (attr.va_mode & S_ISGID)
	change_egid(newcred, attr.va_gid);
	#ifdef MAC
	if (will_transition) {
	mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
	interpvplabel, imgp);
	}
	#endif
	/*
	* Implement correct POSIX saved-id behavior.
	*
	* XXXMAC: Note that the current logic will save the
	* uid and gid if a MAC domain transition occurs, even
	* though maybe it shouldn't.
	*/
	change_svuid(newcred, newcred->cr_uid);
	change_svgid(newcred, newcred->cr_gid);
	p->p_ucred = newcred;
	newcred = NULL;
	} else {
	if (oldcred->cr_uid == oldcred->cr_ruid &&
	oldcred->cr_gid == oldcred->cr_rgid)
	p->p_flag &= ~P_SUGID;
	/*
	* Implement correct POSIX saved-id behavior.
	*
	* XXX: It's not clear that the existing behavior is
	* POSIX-compliant. A number of sources indicate that the
	* saved uid/gid should only be updated if the new ruid is
	* not equal to the old ruid, or the new euid is not equal
	* to the old euid and the new euid is not equal to the old
	* ruid. The FreeBSD code always updates the saved uid/gid.
	* Also, this code uses the new (replaced) euid and egid as
	* the source, which may or may not be the right ones to use.
	*/
	if (oldcred->cr_svuid != oldcred->cr_uid \|\|
	oldcred->cr_svgid != oldcred->cr_gid) {
	change_svuid(newcred, newcred->cr_uid);
	change_svgid(newcred, newcred->cr_gid);
	p->p_ucred = newcred;
	newcred = NULL;
	}
	}

	/*
	* Store the vp for use in procfs. This vnode was referenced prior
	* to locking the proc lock.
	*/
	textvp = p->p_textvp;
	p->p_textvp = binvp;

	#ifdef KDTRACE_HOOKS
	/*
	* Tell the DTrace fasttrap provider about the exec if it
	* has declared an interest.
	*/
	if (dtrace_fasttrap_exec)
	dtrace_fasttrap_exec(p);
	#endif

	/*
	* Notify others that we exec'd, and clear the P_INEXEC flag
	* as we're now a bona fide freshly-execed process.
	*/
	KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
	p->p_flag &= ~P_INEXEC;

	/*
	* If tracing the process, trap to debugger so breakpoints
	* can be set before the program executes.
	* Use tdsignal to deliver signal to current thread, use
	* psignal may cause the signal to be delivered to wrong thread
	* because that thread will exit, remember we are going to enter
	* single thread mode.
	*/
	if (p->p_flag & P_TRACED)
	tdsignal(p, td, SIGTRAP, NULL);

	/* clear "fork but no exec" flag, as we _are_ execing */
	p->p_acflag &= ~AFORK;

	/*
	* Free any previous argument cache and replace it with
	* the new argument cache, if any.
	*/
	oldargs = p->p_args;
	p->p_args = newargs;
	newargs = NULL;

	#ifdef HWPMC_HOOKS
	/*
	* Check if system-wide sampling is in effect or if the
	* current process is using PMCs. If so, do exec() time
	* processing. This processing needs to happen AFTER the
	* P_INEXEC flag is cleared.
	*
	* The proc lock needs to be released before taking the PMC
	* SX.
	*/
	if (PMC_SYSTEM_SAMPLING_ACTIVE() \|\| PMC_PROC_IS_USING_PMCS(p)) {
	PROC_UNLOCK(p);
	VOP_UNLOCK(imgp->vp, 0);
	pe.pm_credentialschanged = credential_changing;
	pe.pm_entryaddr = imgp->entry_addr;

	PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
	vn_lock(imgp->vp, LK_EXCLUSIVE \| LK_RETRY);
	} else
	PROC_UNLOCK(p);
	#else /* !HWPMC_HOOKS */
	PROC_UNLOCK(p);
	#endif

	/* Set values passed into the program in registers. */
	if (p->p_sysent->sv_setregs)
	(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
	(u_long)(uintptr_t)stack_base, imgp->ps_strings);
	else
	exec_setregs(td, imgp->entry_addr,
	(u_long)(uintptr_t)stack_base, imgp->ps_strings);

	vfs_mark_atime(imgp->vp, td->td_ucred);

	SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);

	done1:
	/*
	* Free any resources malloc'd earlier that we didn't use.
	*/
	uifree(euip);
	if (newcred == NULL)
	crfree(oldcred);
	else
	crfree(newcred);
	VOP_UNLOCK(imgp->vp, 0);

	/*
	* Handle deferred decrement of ref counts.
	*/
	if (textvp != NULL) {
	int tvfslocked;

	tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
	vrele(textvp);
	VFS_UNLOCK_GIANT(tvfslocked);
	}
	if (binvp && error != 0)
	vrele(binvp);
	#ifdef KTRACE
	if (tracevp != NULL) {
	int tvfslocked;

	tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
	vrele(tracevp);
	VFS_UNLOCK_GIANT(tvfslocked);
	}
	if (tracecred != NULL)
	crfree(tracecred);
	#endif
	vn_lock(imgp->vp, LK_EXCLUSIVE \| LK_RETRY);
	pargs_drop(oldargs);
	pargs_drop(newargs);
	if (oldsigacts != NULL)
	sigacts_free(oldsigacts);

	exec_fail_dealloc:

	/*
	* free various allocated resources
	*/
	if (imgp->firstpage != NULL)
	exec_unmap_first_page(imgp);

	if (imgp->vp != NULL) {
	if (args->fname)
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (imgp->opened)
	VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
	vput(imgp->vp);
	}

	if (imgp->object != NULL)
	vm_object_deallocate(imgp->object);

	free(imgp->freepath, M_TEMP);

	if (error == 0) {
	PROC_LOCK(p);
	td->td_dbgflags \|= TDB_EXEC;
	PROC_UNLOCK(p);

	/*
	* Stop the process here if its stop event mask has
	* the S_EXEC bit set.
	*/
	STOPEVENT(p, S_EXEC, 0);
	goto done2;
	}

	exec_fail:
	/* we're done here, clear P_INEXEC */
	PROC_LOCK(p);
	p->p_flag &= ~P_INEXEC;
	PROC_UNLOCK(p);

	SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);

	done2:
	#ifdef MAC
	mac_execve_exit(imgp);
	mac_execve_interpreter_exit(interpvplabel);
	#endif
	VFS_UNLOCK_GIANT(vfslocked);
	exec_free_args(args);

	if (error && imgp->vmspace_destroyed) {
	/* sorry, no more process anymore. exit gracefully */
	exit1(td, W_EXITCODE(0, SIGABRT));
	/* NOT REACHED */
	}
	+
	+#ifdef KTRACE
	+ if (error == 0)
	+ ktrprocctor(p);
	+#endif
	+
	return (error);
	}

	/*
	 * Map the first page of the image's VM object into kernel space so the
	 * image header can be inspected.
	 *
	 * Any previously mapped first page is released first.  Page 0 of
	 * imgp->vp's object is grabbed; if it is not fully valid, up to
	 * VM_INITIAL_PAGEIN consecutive pages (bounded by the object size) are
	 * gathered and requested from the pager in one call.  The resulting page
	 * is held and mapped through an sf_buf, and imgp->firstpage and
	 * imgp->image_header are set.
	 *
	 * Returns 0 on success, EACCES if the vnode has no VM object, or EIO if
	 * the page could not be read in.
	 */
	int
	exec_map_first_page(imgp)
		struct image_params *imgp;
	{
		int rv, i;
		int initial_pagein;
		vm_page_t ma[VM_INITIAL_PAGEIN];
		vm_object_t object;

		if (imgp->firstpage != NULL)
			exec_unmap_first_page(imgp);

		object = imgp->vp->v_object;
		if (object == NULL)
			return (EACCES);
		VM_OBJECT_LOCK(object);
	#if VM_NRESERVLEVEL > 0
		if ((object->flags & OBJ_COLORED) == 0) {
			object->flags |= OBJ_COLORED;
			object->pg_color = 0;
		}
	#endif
		ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
		if (ma[0]->valid != VM_PAGE_BITS_ALL) {
			initial_pagein = VM_INITIAL_PAGEIN;
			if (initial_pagein > object->size)
				initial_pagein = object->size;
			/*
			 * Collect the pages following page 0; stop at the first
			 * one that is already valid, busy, or cannot be allocated.
			 */
			for (i = 1; i < initial_pagein; i++) {
				if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
					if (ma[i]->valid)
						break;
					if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
						break;
					vm_page_busy(ma[i]);
				} else {
					ma[i] = vm_page_alloc(object, i,
					    VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
					if (ma[i] == NULL)
						break;
				}
			}
			initial_pagein = i;
			rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
			/* Look page 0 up again after the pager call. */
			ma[0] = vm_page_lookup(object, 0);
			if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) {
				if (ma[0]) {
					vm_page_lock_queues();
					vm_page_free(ma[0]);
					vm_page_unlock_queues();
				}
				VM_OBJECT_UNLOCK(object);
				return (EIO);
			}
		}
		vm_page_lock_queues();
		vm_page_hold(ma[0]);
		vm_page_unlock_queues();
		vm_page_wakeup(ma[0]);
		VM_OBJECT_UNLOCK(object);

		imgp->firstpage = sf_buf_alloc(ma[0], 0);
		imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);

		return (0);
	}

	/*
	 * Undo exec_map_first_page(): release the sf_buf mapping of the image's
	 * first page, clear imgp->firstpage and drop the hold on the page.
	 * Does nothing when no first page is mapped.
	 */
	void
	exec_unmap_first_page(imgp)
		struct image_params *imgp;
	{
		vm_page_t page;

		if (imgp->firstpage == NULL)
			return;

		page = sf_buf_page(imgp->firstpage);
		sf_buf_free(imgp->firstpage);
		imgp->firstpage = NULL;

		vm_page_lock_queues();
		vm_page_unhold(page);
		vm_page_unlock_queues();
	}

	/*
	* Destroy old address space, and allocate a new stack
	* The new stack is only SGROWSIZ large because it is grown
	* automatically in trap.c.
	*/
	int
	exec_new_vmspace(imgp, sv)
	struct image_params *imgp;
	struct sysentvec *sv;
	{
	int error;
	struct proc *p = imgp->proc;
	struct vmspace *vmspace = p->p_vmspace;
	vm_offset_t sv_minuser, stack_addr;
	vm_map_t map;
	u_long ssiz;

	imgp->vmspace_destroyed = 1;
	imgp->sysent = sv;

	/* May be called with Giant held */
	EVENTHANDLER_INVOKE(process_exec, p, imgp);

	/*
	* Blow away entire process VM, if address space not shared,
	* otherwise, create a new VM space so that other threads are
	* not disrupted
	*/
	map = &vmspace->vm_map;
	if (map_at_zero)
	sv_minuser = sv->sv_minuser;
	else
	sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
	vm_map_max(map) == sv->sv_maxuser) {
	shmexit(vmspace);
	pmap_remove_pages(vmspace_pmap(vmspace));
	vm_map_remove(map, vm_map_min(map), vm_map_max(map));
	} else {
	error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
	if (error)
	return (error);
	vmspace = p->p_vmspace;
	map = &vmspace->vm_map;
	}

	/* Allocate a new stack */
	if (sv->sv_maxssiz != NULL)
	ssiz = *sv->sv_maxssiz;
	else
	ssiz = maxssiz;
	stack_addr = sv->sv_usrstack - ssiz;
	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
	sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
	if (error)
	return (error);

	#ifdef __ia64__
	/* Allocate a new register stack */
	stack_addr = IA64_BACKINGSTORE;
	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
	sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
	if (error)
	return (error);
	#endif

	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
	* VM_STACK case, but they are still used to monitor the size of the
	* process stack so we can check the stack rlimit.
	*/
	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;

	return (0);
	}

	/*
	* Copy out argument and environment strings from the old process address
	* space into the temporary string buffer.
	*/
	int
	exec_copyin_args(struct image_args args, char fname,
	enum uio_seg segflg, char argv, char envv)
	{
	char argp, envp;
	int error;
	size_t length;

	bzero(args, sizeof(*args));
	if (argv == NULL)
	return (EFAULT);
	/*
	* Allocate temporary demand zeroed space for argument and
	* environment strings:
	*
	* o ARG_MAX for argument and environment;
	* o MAXSHELLCMDLEN for the name of interpreters.
	*/
	args->buf = (char *) kmem_alloc_wait(exec_map,
	PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
	if (args->buf == NULL)
	return (ENOMEM);
	args->begin_argv = args->buf;
	args->endp = args->begin_argv;
	args->stringspace = ARG_MAX;
	/*
	* Copy the file name.
	*/
	if (fname != NULL) {
	args->fname = args->buf + ARG_MAX;
	error = (segflg == UIO_SYSSPACE) ?
	copystr(fname, args->fname, PATH_MAX, &length) :
	copyinstr(fname, args->fname, PATH_MAX, &length);
	if (error != 0)
	goto err_exit;
	} else
	args->fname = NULL;

	/*
	* extract arguments first
	*/
	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
	if (argp == (caddr_t) -1) {
	error = EFAULT;
	goto err_exit;
	}
	if ((error = copyinstr(argp, args->endp,
	args->stringspace, &length))) {
	if (error == ENAMETOOLONG)
	error = E2BIG;
	goto err_exit;
	}
	args->stringspace -= length;
	args->endp += length;
	args->argc++;
	}

	args->begin_envv = args->endp;

	/*
	* extract environment strings
	*/
	if (envv) {
	while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
	if (envp == (caddr_t)-1) {
	error = EFAULT;
	goto err_exit;
	}
	if ((error = copyinstr(envp, args->endp,
	args->stringspace, &length))) {
	if (error == ENAMETOOLONG)
	error = E2BIG;
	goto err_exit;
	}
	args->stringspace -= length;
	args->endp += length;
	args->envc++;
	}
	}

	return (0);

	err_exit:
	exec_free_args(args);
	return (error);
	}

	/*
	 * Return the temporary string buffer of an image_args to exec_map and
	 * clear args->buf.  Does nothing when no buffer is attached, so calling
	 * it again afterwards is harmless.
	 */
	static void
	exec_free_args(struct image_args *args)
	{

		if (args->buf == NULL)
			return;

		kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
		    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
		args->buf = NULL;
	}

	/*
	 * Copy strings out to the new process address space, constructing new arg
	 * and env vector tables. Return a pointer to the base so that it can be used
	 * as the initial stack pointer.
	 *
	 * imgp supplies the target process (imgp->proc), the packed argument and
	 * environment strings (imgp->args) and, optionally, the executable path
	 * (imgp->execpath) and ELF auxargs information (imgp->auxargs).
	 *
	 * The returned pointer is the start of the argument vector table, which
	 * doubles as the initial stack base.  The results of copyout() and
	 * suword() are not checked here.
	 */
	register_t *
	exec_copyout_strings(imgp)
		struct image_params *imgp;
	{
		int argc, envc;
		char **vectp;
		char *stringp, *destp;
		register_t *stack_base;
		struct ps_strings *arginfo;
		struct proc *p;
		size_t execpath_len;
		int szsigcode;

		/*
		 * Calculate string base and vector table pointers.
		 * Also deal with signal trampoline code for this exec type.
		 */
		if (imgp->execpath != NULL && imgp->auxargs != NULL)
			execpath_len = strlen(imgp->execpath) + 1;
		else
			execpath_len = 0;
		p = imgp->proc;
		szsigcode = 0;
		arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
		if (p->p_sysent->sv_szsigcode != NULL)
			szsigcode = *(p->p_sysent->sv_szsigcode);
		/*
		 * The strings live below ps_strings, the signal trampoline, the
		 * spare user space and the (pointer-aligned) executable path.
		 */
		destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
		    roundup(execpath_len, sizeof(char *)) -
		    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));

		/*
		 * install sigcode
		 */
		if (szsigcode)
			copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
			    szsigcode), szsigcode);

		/*
		 * Copy the image path for the rtld.
		 */
		if (execpath_len != 0) {
			imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
			copyout(imgp->execpath, (void *)imgp->execpathp,
			    execpath_len);
		}

		/*
		 * If we have a valid auxargs ptr, prepare some room
		 * on the stack.
		 */
		if (imgp->auxargs) {
			/*
			 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
			 * lower compatibility.
			 */
			imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
			    (AT_COUNT * 2);
			/*
			 * The '+ 2' is for the null pointers at the end of each of
			 * the arg and env vector sets, and imgp->auxarg_size is room
			 * for argument of Runtime loader.
			 */
			vectp = (char **)(destp - (imgp->args->argc +
			    imgp->args->envc + 2 + imgp->auxarg_size + execpath_len) *
			    sizeof(char *));
		} else {
			/*
			 * The '+ 2' is for the null pointers at the end of each of
			 * the arg and env vector sets
			 */
			vectp = (char **)(destp - (imgp->args->argc +
			    imgp->args->envc + 2) * sizeof(char *));
		}

		/*
		 * vectp also becomes our initial stack base
		 */
		stack_base = (register_t *)vectp;

		stringp = imgp->args->begin_argv;
		argc = imgp->args->argc;
		envc = imgp->args->envc;

		/*
		 * Copy out strings - arguments and environment.
		 */
		copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);

		/*
		 * Fill in "ps_strings" struct for ps, w, etc.
		 */
		suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
		suword(&arginfo->ps_nargvstr, argc);

		/*
		 * Fill in argument portion of vector table.
		 * stringp walks the kernel copy of the strings while destp tracks
		 * the matching user-space address; both advance past each NUL.
		 */
		for (; argc > 0; --argc) {
			suword(vectp++, (long)(intptr_t)destp);
			while (*stringp++ != 0)
				destp++;
			destp++;
		}

		/* a null vector table pointer separates the argp's from the envp's */
		suword(vectp++, 0);

		suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
		suword(&arginfo->ps_nenvstr, envc);

		/*
		 * Fill in environment portion of vector table.
		 */
		for (; envc > 0; --envc) {
			suword(vectp++, (long)(intptr_t)destp);
			while (*stringp++ != 0)
				destp++;
			destp++;
		}

		/* end of vector table is a null pointer */
		suword(vectp, 0);

		return (stack_base);
	}

	/*
	* Check permissions of file to execute.
	* Called with imgp->vp locked.
	* Return 0 for success or error code on failure.
	*/
	int
	exec_check_permissions(imgp)
	struct image_params *imgp;
	{
	struct vnode *vp = imgp->vp;
	struct vattr *attr = imgp->attr;
	struct thread *td;
	int error;

	td = curthread;

	/* Get file attributes */
	error = VOP_GETATTR(vp, attr, td->td_ucred);
	if (error)
	return (error);

	#ifdef MAC
	error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
	if (error)
	return (error);
	#endif

	/*
	* 1) Check if file execution is disabled for the filesystem that this
	* file resides on.
	* 2) Insure that at least one execute bit is on - otherwise root
	* will always succeed, and we don't want to happen unless the
	* file really is executable.
	* 3) Insure that the file is a regular file.
	*/
	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) \|\|
	((attr->va_mode & 0111) == 0) \|\|
	(attr->va_type != VREG))
	return (EACCES);

	/*
	* Zero length files can't be exec'd
	*/
	if (attr->va_size == 0)
	return (ENOEXEC);

	/*
	* Check for execute permission to file based on current credentials.
	*/
	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
	if (error)
	return (error);

	/*
	* Check number of open-for-writes on the file and deny execution
	* if there are any.
	*/
	if (vp->v_writecount)
	return (ETXTBSY);

	/*
	* Call filesystem specific open routine (which does nothing in the
	* general case).
	*/
	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
	if (error == 0)
	imgp->opened = 1;
	return (error);
	}

	/*
	 * Exec handler registration
	 *
	 * Append execsw_arg to the NULL-terminated global execsw table.  A new
	 * table is allocated, the existing entries are copied over, the new entry
	 * and the terminating NULL are added, and the old table is freed.
	 * Returns 0 on success or ENOMEM if the allocation fails.
	 */
	int
	exec_register(execsw_arg)
		const struct execsw *execsw_arg;
	{
		const struct execsw **es, **xs, **newexecsw;
		int count = 2;	/* New slot and trailing NULL */

		if (execsw)
			for (es = execsw; *es; es++)
				count++;
		newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
		if (newexecsw == NULL)
			return (ENOMEM);
		xs = newexecsw;
		/* Carry over the existing handlers in their current order. */
		if (execsw)
			for (es = execsw; *es; es++)
				*xs++ = *es;
		*xs++ = execsw_arg;
		*xs = NULL;
		if (execsw)
			free(execsw, M_TEMP);
		execsw = newexecsw;
		return (0);
	}

	/*
	 * Remove execsw_arg from the NULL-terminated global execsw table.
	 *
	 * Panics if no table exists.  Returns ENOENT if the handler is not
	 * registered, ENOMEM if the replacement table cannot be allocated, and
	 * 0 on success.  The table is rebuilt without the matching entry and the
	 * old table is freed.
	 */
	int
	exec_unregister(execsw_arg)
		const struct execsw *execsw_arg;
	{
		const struct execsw **es, **xs, **newexecsw;
		int count = 1;	/* Trailing NULL */

		if (execsw == NULL)
			panic("unregister with no handlers left?\n");

		/* Make sure the handler is actually registered. */
		for (es = execsw; *es; es++) {
			if (*es == execsw_arg)
				break;
		}
		if (*es == NULL)
			return (ENOENT);
		/* Count the entries that survive the removal. */
		for (es = execsw; *es; es++)
			if (*es != execsw_arg)
				count++;
		newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
		if (newexecsw == NULL)
			return (ENOMEM);
		xs = newexecsw;
		for (es = execsw; *es; es++)
			if (*es != execsw_arg)
				*xs++ = *es;
		*xs = NULL;
		if (execsw)
			free(execsw, M_TEMP);
		execsw = newexecsw;
		return (0);
	}
	Index: stable/8/sys/kern/kern_fork.c
	===================================================================
	--- stable/8/sys/kern/kern_fork.c (revision 220261)
	+++ stable/8/sys/kern/kern_fork.c (revision 220262)
	@@ -1,928 +1,928 @@
	/*-
	* Copyright (c) 1982, 1986, 1989, 1991, 1993
	* The Regents of the University of California. All rights reserved.
	* (c) UNIX System Laboratories, Inc.
	* All or some portions of this file are derived from material licensed
	* to the University of California by American Telephone and Telegraph
	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	* the permission of UNIX System Laboratories, Inc.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* 4. Neither the name of the University nor the names of its contributors
	* may be used to endorse or promote products derived from this software
	* without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
	*/

	#include <sys/cdefs.h>
	__FBSDID("$FreeBSD$");

	#include "opt_kdtrace.h"
	#include "opt_ktrace.h"
	#include "opt_kstack_pages.h"

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/sysproto.h>
	#include <sys/eventhandler.h>
	#include <sys/filedesc.h>
	#include <sys/jail.h>
	#include <sys/kernel.h>
	#include <sys/kthread.h>
	#include <sys/sysctl.h>
	#include <sys/lock.h>
	#include <sys/malloc.h>
	#include <sys/mutex.h>
	#include <sys/priv.h>
	#include <sys/proc.h>
	#include <sys/pioctl.h>
	#include <sys/resourcevar.h>
	#include <sys/sched.h>
	#include <sys/syscall.h>
	#include <sys/vmmeter.h>
	#include <sys/vnode.h>
	#include <sys/acct.h>
	#include <sys/ktr.h>
	#include <sys/ktrace.h>
	#include <sys/unistd.h>
	#include <sys/sdt.h>
	#include <sys/sx.h>
	#include <sys/signalvar.h>

	#include <security/audit/audit.h>
	#include <security/mac/mac_framework.h>

	#include <vm/vm.h>
	#include <vm/pmap.h>
	#include <vm/vm_map.h>
	#include <vm/vm_extern.h>
	#include <vm/uma.h>

	#ifdef KDTRACE_HOOKS
	#include <sys/dtrace_bsd.h>
	dtrace_fork_func_t dtrace_fasttrap_fork;
	#endif

	SDT_PROVIDER_DECLARE(proc);
	SDT_PROBE_DEFINE(proc, kernel, , create, create);
	SDT_PROBE_ARGTYPE(proc, kernel, , create, 0, "struct proc *");
	SDT_PROBE_ARGTYPE(proc, kernel, , create, 1, "struct proc *");
	SDT_PROBE_ARGTYPE(proc, kernel, , create, 2, "int");

	/*
	 * Fallback argument structure for fork(), used only when
	 * _SYS_SYSPROTO_H_ is not defined.  It carries a single placeholder
	 * member.
	 */
	#ifndef _SYS_SYSPROTO_H_
	struct fork_args {
	int dummy;
	};
	#endif

	/* ARGSUSED */
	int
	fork(td, uap)
	struct thread *td;
	struct fork_args *uap;
	{
	int error;
	struct proc *p2;

	error = fork1(td, RFFDG \| RFPROC, 0, &p2);
	if (error == 0) {
	td->td_retval[0] = p2->p_pid;
	td->td_retval[1] = 0;
	}
	return (error);
	}

	/*
	 * vfork system call: like fork(), but normally also requests RFPPWAIT
	 * and RFMEM from fork1().  Under XEN only RFFDG | RFPROC is requested.
	 * On success the child's pid is stored in td_retval[0] and td_retval[1]
	 * is cleared.  Returns the error from fork1().
	 */
	/* ARGSUSED */
	int
	vfork(td, uap)
		struct thread *td;
		struct vfork_args *uap;
	{
		int error, flags;
		struct proc *p2;

	#ifdef XEN
		flags = RFFDG | RFPROC; /* validate that this is still an issue */
	#else
		flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
	#endif
		error = fork1(td, flags, 0, &p2);
		if (error == 0) {
			td->td_retval[0] = p2->p_pid;
			td->td_retval[1] = 0;
		}
		return (error);
	}

	int
	rfork(td, uap)
	struct thread *td;
	struct rfork_args *uap;
	{
	struct proc *p2;
	int error;

	/* Don't allow kernel-only flags. */
	if ((uap->flags & RFKERNELONLY) != 0)
	return (EINVAL);

	AUDIT_ARG_FFLAGS(uap->flags);
	error = fork1(td, uap->flags, 0, &p2);
	if (error == 0) {
	td->td_retval[0] = p2 ? p2->p_pid : 0;
	td->td_retval[1] = 0;
	}
	return (error);
	}

	int nprocs = 1; /* process 0 */
	int lastpid = 0;
	SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
	"Last used PID");

	/*
	* Random component to lastpid generation. We mix in a random factor to make
	* it a little harder to predict. We sanity check the modulus value to avoid
	* doing it in critical paths. Don't let it be too small or we pointlessly
	* waste randomness entropy, and don't let it be impossibly large. Using a
	* modulus that is too big causes a LOT more process table scans and slows
	* down fork processing as the pidchecked caching is defeated.
	*/
	static int randompid = 0;

	/*
	 * Sysctl handler for kern.randompid.
	 *
	 * Reports the current modulus and, when a new value is supplied, clamps
	 * it before storing it in randompid:
	 *   - negative or above PID_MAX - 100  -> PID_MAX - 100
	 *   - 0 or 1                           -> 0 (randomization off)
	 *   - 2 .. 99                          -> 100
	 * The update is done with allproc_lock held exclusively.  Returns the
	 * error from sysctl_wire_old_buffer() or sysctl_handle_int().
	 */
	static int
	sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
	{
		int error, pid;

		error = sysctl_wire_old_buffer(req, sizeof(int));
		if (error != 0)
			return(error);
		sx_xlock(&allproc_lock);
		pid = randompid;
		error = sysctl_handle_int(oidp, &pid, 0, req);
		if (error == 0 && req->newptr != NULL) {
			if (pid < 0 || pid > PID_MAX - 100)	/* out of range */
				pid = PID_MAX - 100;
			else if (pid < 2)			/* NOP */
				pid = 0;
			else if (pid < 100)			/* Make it reasonable */
				pid = 100;
			randompid = pid;
		}
		sx_xunlock(&allproc_lock);
		return (error);
	}

	SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
	    0, 0, sysctl_kern_randompid, "I", "Random PID modulus");

	/*
	 * fork1: common back end used by fork(), vfork() and rfork() above.
	 *
	 * td     - the calling thread; its process becomes p1 (the parent).
	 * flags  - RF* flags selecting what is shared, copied or cleared.
	 * pages  - kernel stack pages for the new thread; 0 selects KSTACK_PAGES.
	 * procp  - out parameter: set to the new process on success, or to NULL
	 *          when RFPROC is not set and no process is created.
	 *
	 * Returns 0 on success.  Errors visible in this function: EINVAL when
	 * both RFFDG and RFCFDG are requested, ERESTART when thread_single()
	 * fails in the no-RFPROC path, ENOMEM for thread/stack/vmspace/swap
	 * failures, and EAGAIN when a process limit is hit.  The fail/fail1
	 * paths free the new proc and pause for hz / 2 before returning.
	 *
	 * NOTE(review): this listing is a diff rendering.  Lines starting with
	 * '-' or '+' are diff hunk lines (the KTRACE ktrprocfork() call moves to
	 * after PROC_UNLOCK(p2)), and some '*' declarators and '|' operators
	 * look mangled by extraction (e.g. "struct proc p1, p2" and "\|").
	 * Confirm against the real source before relying on exact tokens.
	 */
	int
	fork1(td, flags, pages, procp)
	struct thread *td;
	int flags;
	int pages;
	struct proc **procp;
	{
	struct proc p1, p2, *pptr;
	struct proc *newproc;
	int ok, p2_held, trypid;
	static int curfail, pidchecked = 0;
	static struct timeval lastfail;
	struct filedesc *fd;
	struct filedesc_to_leader *fdtol;
	struct thread *td2;
	struct sigacts *newsigacts;
	struct vmspace *vm2;
	vm_ooffset_t mem_charged;
	int error;

	/* Can't copy and clear. */
	if ((flags & (RFFDG\|RFCFDG)) == (RFFDG\|RFCFDG))
	return (EINVAL);

	p2_held = 0;
	p1 = td->td_proc;

	/*
	* Here we don't create a new process, but we divorce
	* certain parts of a process from itself.
	*/
	if ((flags & RFPROC) == 0) {
	if (((p1->p_flag & (P_HADTHREADS\|P_SYSTEM)) == P_HADTHREADS) &&
	(flags & (RFCFDG \| RFFDG))) {
	PROC_LOCK(p1);
	if (thread_single(SINGLE_BOUNDARY)) {
	PROC_UNLOCK(p1);
	return (ERESTART);
	}
	PROC_UNLOCK(p1);
	}

	error = vm_forkproc(td, NULL, NULL, NULL, flags);
	if (error)
	goto norfproc_fail;

	/*
	* Close all file descriptors.
	*/
	if (flags & RFCFDG) {
	struct filedesc *fdtmp;
	fdtmp = fdinit(td->td_proc->p_fd);
	fdfree(td);
	p1->p_fd = fdtmp;
	}

	/*
	* Unshare file descriptors (from parent).
	*/
	if (flags & RFFDG)
	fdunshare(p1, td);

	norfproc_fail:
	if (((p1->p_flag & (P_HADTHREADS\|P_SYSTEM)) == P_HADTHREADS) &&
	(flags & (RFCFDG \| RFFDG))) {
	PROC_LOCK(p1);
	thread_single_end();
	PROC_UNLOCK(p1);
	}
	*procp = NULL;
	return (error);
	}

	/*
	* XXX
	* We did have single-threading code here
	* however it proved un-needed and caused problems
	*/

	mem_charged = 0;
	vm2 = NULL;
	if (pages == 0)
	pages = KSTACK_PAGES;
	/* Allocate new proc. */
	newproc = uma_zalloc(proc_zone, M_WAITOK);
	td2 = FIRST_THREAD_IN_PROC(newproc);
	if (td2 == NULL) {
	td2 = thread_alloc(pages);
	if (td2 == NULL) {
	error = ENOMEM;
	goto fail1;
	}
	proc_linkup(newproc, td2);
	} else {
	if (td2->td_kstack == 0 \|\| td2->td_kstack_pages != pages) {
	if (td2->td_kstack != 0)
	vm_thread_dispose(td2);
	if (!thread_alloc_stack(td2, pages)) {
	error = ENOMEM;
	goto fail1;
	}
	}
	}

	if ((flags & RFMEM) == 0) {
	vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
	if (vm2 == NULL) {
	error = ENOMEM;
	goto fail1;
	}
	if (!swap_reserve(mem_charged)) {
	/*
	* The swap reservation failed. The accounting
	* from the entries of the copied vm2 will be
	* subtracted in vmspace_free(), so force the
	* reservation there.
	*/
	swap_reserve_force(mem_charged);
	error = ENOMEM;
	goto fail1;
	}
	} else
	vm2 = NULL;
	#ifdef MAC
	mac_proc_init(newproc);
	#endif
	knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
	STAILQ_INIT(&newproc->p_ktr);

	/* We have to lock the process tree while we look for a pid. */
	sx_slock(&proctree_lock);

	/*
	* Although process entries are dynamically created, we still keep
	* a global limit on the maximum number we will create. Don't allow
	* a nonprivileged user to use the last ten processes; don't let root
	* exceed the limit. The variable nprocs is the current number of
	* processes, maxproc is the limit.
	*/
	sx_xlock(&allproc_lock);
	if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
	PRIV_MAXPROC, 0) != 0) \|\| nprocs >= maxproc) {
	error = EAGAIN;
	goto fail;
	}

	/*
	* Increment the count of procs running with this uid. Don't allow
	* a nonprivileged user to exceed their current limit.
	*
	* XXXRW: Can we avoid privilege here if it's not needed?
	*/
	error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
	if (error == 0)
	ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
	else {
	PROC_LOCK(p1);
	ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
	lim_cur(p1, RLIMIT_NPROC));
	PROC_UNLOCK(p1);
	}
	if (!ok) {
	error = EAGAIN;
	goto fail;
	}

	/*
	* Increment the nprocs resource before blocking can occur. There
	* are hard-limits as to the number of processes that can run.
	*/
	nprocs++;

	/*
	* Find an unused process ID. We remember a range of unused IDs
	* ready to use (from lastpid+1 through pidchecked-1).
	*
	* If RFHIGHPID is set (used during system boot), do not allocate
	* low-numbered pids.
	*/
	trypid = lastpid + 1;
	if (flags & RFHIGHPID) {
	if (trypid < 10)
	trypid = 10;
	} else {
	if (randompid)
	trypid += arc4random() % randompid;
	}
	retry:
	/*
	* If the process ID prototype has wrapped around,
	* restart somewhat above 0, as the low-numbered procs
	* tend to include daemons that don't exit.
	*/
	if (trypid >= PID_MAX) {
	trypid = trypid % PID_MAX;
	if (trypid < 100)
	trypid += 100;
	pidchecked = 0;
	}
	if (trypid >= pidchecked) {
	int doingzomb = 0;

	pidchecked = PID_MAX;
	/*
	* Scan the active and zombie procs to check whether this pid
	* is in use. Remember the lowest pid that's greater
	* than trypid, so we can avoid checking for a while.
	*/
	p2 = LIST_FIRST(&allproc);
	again:
	for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
	while (p2->p_pid == trypid \|\|
	(p2->p_pgrp != NULL &&
	(p2->p_pgrp->pg_id == trypid \|\|
	(p2->p_session != NULL &&
	p2->p_session->s_sid == trypid)))) {
	trypid++;
	if (trypid >= pidchecked)
	goto retry;
	}
	if (p2->p_pid > trypid && pidchecked > p2->p_pid)
	pidchecked = p2->p_pid;
	if (p2->p_pgrp != NULL) {
	if (p2->p_pgrp->pg_id > trypid &&
	pidchecked > p2->p_pgrp->pg_id)
	pidchecked = p2->p_pgrp->pg_id;
	if (p2->p_session != NULL &&
	p2->p_session->s_sid > trypid &&
	pidchecked > p2->p_session->s_sid)
	pidchecked = p2->p_session->s_sid;
	}
	}
	if (!doingzomb) {
	doingzomb = 1;
	p2 = LIST_FIRST(&zombproc);
	goto again;
	}
	}
	sx_sunlock(&proctree_lock);

	/*
	* RFHIGHPID does not mess with the lastpid counter during boot.
	*/
	if (flags & RFHIGHPID)
	pidchecked = 0;
	else
	lastpid = trypid;

	p2 = newproc;
	p2->p_state = PRS_NEW; /* protect against others */
	p2->p_pid = trypid;
	AUDIT_ARG_PID(p2->p_pid);
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);

	PROC_LOCK(p2);
	PROC_LOCK(p1);

	sx_xunlock(&allproc_lock);

	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	__rangeof(struct proc, p_startcopy, p_endcopy));
	pargs_hold(p2->p_args);
	PROC_UNLOCK(p1);

	bzero(&p2->p_startzero,
	__rangeof(struct proc, p_startzero, p_endzero));

	p2->p_ucred = crhold(td->td_ucred);

	/* Tell the prison that we exist. */
	prison_proc_hold(p2->p_ucred->cr_prison);

	PROC_UNLOCK(p2);

	/*
	* Malloc things while we don't hold any locks.
	*/
	if (flags & RFSIGSHARE)
	newsigacts = NULL;
	else
	newsigacts = sigacts_alloc();

	/*
	* Copy filedesc.
	*/
	if (flags & RFCFDG) {
	fd = fdinit(p1->p_fd);
	fdtol = NULL;
	} else if (flags & RFFDG) {
	fd = fdcopy(p1->p_fd);
	fdtol = NULL;
	} else {
	fd = fdshare(p1->p_fd);
	if (p1->p_fdtol == NULL)
	p1->p_fdtol =
	filedesc_to_leader_alloc(NULL,
	NULL,
	p1->p_leader);
	if ((flags & RFTHREAD) != 0) {
	/*
	* Shared file descriptor table and
	* shared process leaders.
	*/
	fdtol = p1->p_fdtol;
	FILEDESC_XLOCK(p1->p_fd);
	fdtol->fdl_refcount++;
	FILEDESC_XUNLOCK(p1->p_fd);
	} else {
	/*
	* Shared file descriptor table, and
	* different process leaders
	*/
	fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
	p1->p_fd,
	p2);
	}
	}
	/*
	* Make a proc table entry for the new process.
	* Start by zeroing the section of proc that is zero-initialized,
	* then copy the section that is copied directly from the parent.
	*/

	PROC_LOCK(p2);
	PROC_LOCK(p1);

	bzero(&td2->td_startzero,
	__rangeof(struct thread, td_startzero, td_endzero));
	bzero(&td2->td_rux, sizeof(td2->td_rux));
	td2->td_map_def_user = NULL;
	td2->td_dbg_forked = 0;

	bcopy(&td->td_startcopy, &td2->td_startcopy,
	__rangeof(struct thread, td_startcopy, td_endcopy));

	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
	td2->td_sigstk = td->td_sigstk;
	td2->td_sigmask = td->td_sigmask;
	td2->td_flags = TDF_INMEM;

	#ifdef VIMAGE
	td2->td_vnet = NULL;
	td2->td_vnet_lpush = NULL;
	#endif

	/*
	* Allow the scheduler to initialize the child.
	*/
	thread_lock(td);
	sched_fork(td, td2);
	thread_unlock(td);

	/*
	* Duplicate sub-structures as needed.
	* Increase reference counts on shared objects.
	*/
	p2->p_flag = P_INMEM;
	p2->p_swtick = ticks;
	if (p1->p_flag & P_PROFIL)
	startprofclock(p2);
	td2->td_ucred = crhold(p2->p_ucred);

	if (flags & RFSIGSHARE) {
	p2->p_sigacts = sigacts_hold(p1->p_sigacts);
	} else {
	sigacts_copy(newsigacts, p1->p_sigacts);
	p2->p_sigacts = newsigacts;
	}
	if (flags & RFLINUXTHPN)
	p2->p_sigparent = SIGUSR1;
	else
	p2->p_sigparent = SIGCHLD;

	p2->p_textvp = p1->p_textvp;
	p2->p_fd = fd;
	p2->p_fdtol = fdtol;

	/*
	* p_limit is copy-on-write. Bump its refcount.
	*/
	lim_fork(p1, p2);

	pstats_fork(p1->p_stats, p2->p_stats);

	PROC_UNLOCK(p1);
	PROC_UNLOCK(p2);

	/* Bump references to the text vnode (for procfs) */
	if (p2->p_textvp)
	vref(p2->p_textvp);

	/*
	* Set up linkage for kernel based threading.
	*/
	if ((flags & RFTHREAD) != 0) {
	mtx_lock(&ppeers_lock);
	p2->p_peers = p1->p_peers;
	p1->p_peers = p2;
	p2->p_leader = p1->p_leader;
	mtx_unlock(&ppeers_lock);
	PROC_LOCK(p1->p_leader);
	if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
	PROC_UNLOCK(p1->p_leader);
	/*
	* The task leader is exiting, so process p1 is
	* going to be killed shortly. Since p1 obviously
	* isn't dead yet, we know that the leader is either
	* sending SIGKILL's to all the processes in this
	* task or is sleeping waiting for all the peers to
	* exit. We let p1 complete the fork, but we need
	* to go ahead and kill the new process p2 since
	* the task leader may not get a chance to send
	* SIGKILL to it. We leave it on the list so that
	* the task leader will wait for this new process
	* to commit suicide.
	*/
	PROC_LOCK(p2);
	psignal(p2, SIGKILL);
	PROC_UNLOCK(p2);
	} else
	PROC_UNLOCK(p1->p_leader);
	} else {
	p2->p_peers = NULL;
	p2->p_leader = p2;
	}

	sx_xlock(&proctree_lock);
	PGRP_LOCK(p1->p_pgrp);
	PROC_LOCK(p2);
	PROC_LOCK(p1);

	/*
	* Preserve some more flags in subprocess. P_PROFIL has already
	* been preserved.
	*/
	p2->p_flag \|= p1->p_flag & P_SUGID;
	td2->td_pflags \|= td->td_pflags & TDP_ALTSTACK;
	SESS_LOCK(p1->p_session);
	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
	p2->p_flag \|= P_CONTROLT;
	SESS_UNLOCK(p1->p_session);
	if (flags & RFPPWAIT)
	p2->p_flag \|= P_PPWAIT;

	p2->p_pgrp = p1->p_pgrp;
	LIST_INSERT_AFTER(p1, p2, p_pglist);
	PGRP_UNLOCK(p1->p_pgrp);
	LIST_INIT(&p2->p_children);

	callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);

	-#ifdef KTRACE
	- ktrprocfork(p1, p2);
	-#endif
	-
	/*
	* If PF_FORK is set, the child process inherits the
	* procfs ioctl flags from its parent.
	*/
	if (p1->p_pfsflags & PF_FORK) {
	p2->p_stops = p1->p_stops;
	p2->p_pfsflags = p1->p_pfsflags;
	}

	/*
	* This begins the section where we must prevent the parent
	* from being swapped.
	*/
	_PHOLD(p1);
	PROC_UNLOCK(p1);

	/*
	* Attach the new process to its parent.
	*
	* If RFNOWAIT is set, the newly created process becomes a child
	* of init. This effectively disassociates the child from the
	* parent.
	*/
	if (flags & RFNOWAIT)
	pptr = initproc;
	else
	pptr = p1;
	p2->p_pptr = pptr;
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	sx_xunlock(&proctree_lock);

	/* Inform accounting that we have forked. */
	p2->p_acflag = AFORK;
	PROC_UNLOCK(p2);
	+
	+#ifdef KTRACE
	+ ktrprocfork(p1, p2);
	+#endif

	/*
	* Finish creating the child process. It will return via a different
	* execution path later. (ie: directly into user mode)
	*/
	vm_forkproc(td, p2, td2, vm2, flags);

	if (flags == (RFFDG \| RFPROC)) {
	PCPU_INC(cnt.v_forks);
	PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
	p2->p_vmspace->vm_ssize);
	} else if (flags == (RFFDG \| RFPROC \| RFPPWAIT \| RFMEM)) {
	PCPU_INC(cnt.v_vforks);
	PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
	p2->p_vmspace->vm_ssize);
	} else if (p1 == &proc0) {
	PCPU_INC(cnt.v_kthreads);
	PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
	p2->p_vmspace->vm_ssize);
	} else {
	PCPU_INC(cnt.v_rforks);
	PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
	p2->p_vmspace->vm_ssize);
	}

	/*
	* Both processes are set up, now check if any loadable modules want
	* to adjust anything.
	* What if they have an error? XXX
	*/
	EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);

	/*
	* Set the child start time and mark the process as being complete.
	*/
	microuptime(&p2->p_stats->p_start);
	PROC_SLOCK(p2);
	p2->p_state = PRS_NORMAL;
	PROC_SUNLOCK(p2);
	#ifdef KDTRACE_HOOKS
	/*
	* Tell the DTrace fasttrap provider about the new process
	* if it has registered an interest. We have to do this only after
	* p_state is PRS_NORMAL since the fasttrap module will use pfind()
	* later on.
	*/
	if (dtrace_fasttrap_fork) {
	PROC_LOCK(p1);
	PROC_LOCK(p2);
	dtrace_fasttrap_fork(p1, p2);
	PROC_UNLOCK(p2);
	PROC_UNLOCK(p1);
	}
	#endif

	PROC_LOCK(p1);
	if ((p1->p_flag & (P_TRACED \| P_FOLLOWFORK)) == (P_TRACED \|
	P_FOLLOWFORK)) {
	/*
	* Arrange for debugger to receive the fork event.
	*
	* We can report PL_FLAG_FORKED regardless of
	* P_FOLLOWFORK settings, but it does not make sense
	* for runaway child.
	*/
	td->td_dbgflags \|= TDB_FORK;
	td->td_dbg_forked = p2->p_pid;
	PROC_LOCK(p2);
	td2->td_dbgflags \|= TDB_STOPATFORK;
	_PHOLD(p2);
	p2_held = 1;
	PROC_UNLOCK(p2);
	}
	if ((flags & RFSTOPPED) == 0) {
	/*
	* If RFSTOPPED not requested, make child runnable and
	* add to run queue.
	*/
	thread_lock(td2);
	TD_SET_CAN_RUN(td2);
	sched_add(td2, SRQ_BORING);
	thread_unlock(td2);
	}

	/*
	* Now can be swapped.
	*/
	_PRELE(p1);
	PROC_UNLOCK(p1);

	/*
	* Tell any interested parties about the new process.
	*/
	knote_fork(&p1->p_klist, p2->p_pid);
	SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);

	/*
	* Wait until debugger is attached to child.
	*/
	PROC_LOCK(p2);
	while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
	cv_wait(&p2->p_dbgwait, &p2->p_mtx);
	if (p2_held)
	_PRELE(p2);

	/*
	* Preserve synchronization semantics of vfork. If waiting for
	* child to exec or exit, set P_PPWAIT on child, and sleep on our
	* proc (in case of exit).
	*/
	while (p2->p_flag & P_PPWAIT)
	cv_wait(&p2->p_pwait, &p2->p_mtx);
	PROC_UNLOCK(p2);

	/*
	* Return child proc pointer to parent.
	*/
	*procp = p2;
	return (0);
	/*
	* Failure after both proctree_lock (shared) and allproc_lock
	* (exclusive) were taken: drop them, rate-limit the console message,
	* then fall through to the common cleanup.
	*/
	fail:
	sx_sunlock(&proctree_lock);
	if (ppsratecheck(&lastfail, &curfail, 1))
	printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
	td->td_ucred->cr_ruid);
	sx_xunlock(&allproc_lock);
	#ifdef MAC
	mac_proc_destroy(newproc);
	#endif
	/* Common cleanup: release the forked vmspace (if any) and the proc. */
	fail1:
	if (vm2 != NULL)
	vmspace_free(vm2);
	uma_zfree(proc_zone, newproc);
	pause("fork", hz / 2);
	return (error);
	}

	/*
	 * Handle the return of a child process from fork1(). This function
	 * is called from the MD fork_trampoline() entry point.
	 *
	 * callout - function to run in the new thread; must not be NULL.
	 * arg     - first argument passed to callout.
	 * frame   - trap frame passed to callout as its second argument.
	 *
	 * If callout returns in a process flagged P_KTHREAD, a message is
	 * printed and kproc_exit() is called.
	 */
	void
	fork_exit(callout, arg, frame)
		void (*callout)(void *, struct trapframe *);
		void *arg;
		struct trapframe *frame;
	{
		struct proc *p;
		struct thread *td;
		struct thread *dtd;

		td = curthread;
		p = td->td_proc;
		KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));

		CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
		    td, td->td_sched, p->p_pid, td->td_name);

		sched_fork_exit(td);
		/*
		 * Processes normally resume in mi_switch() after being
		 * cpu_switch()'ed to, but when children start up they arrive here
		 * instead, so we must do much the same things as mi_switch() would.
		 */
		if ((dtd = PCPU_GET(deadthread))) {
			PCPU_SET(deadthread, NULL);
			thread_stash(dtd);
		}
		thread_unlock(td);

		/*
		 * cpu_set_fork_handler intercepts this function call to
		 * have this call a non-return function to stay in kernel mode.
		 * initproc has its own fork handler, but it does return.
		 */
		KASSERT(callout != NULL, ("NULL callout in fork_exit"));
		callout(arg, frame);

		/*
		 * Check if a kernel thread misbehaved and returned from its main
		 * function.
		 */
		if (p->p_flag & P_KTHREAD) {
			printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
			    td->td_name, p->p_pid);
			kproc_exit(0);
		}
		mtx_assert(&Giant, MA_NOTOWNED);

		EVENTHANDLER_INVOKE(schedtail, p);
	}

	/*
	* Simplified back end of syscall(), used when returning from fork()
	* directly into user mode. Giant is not held on entry, and must not
	* be held on return. This function is passed in to fork_exit() as the
	* first parameter and is called when returning to a new userland process.
	*/
	void
	fork_return(td, frame)
	struct thread *td;
	struct trapframe *frame;
	{
	struct proc p, dbg;

	if (td->td_dbgflags & TDB_STOPATFORK) {
	p = td->td_proc;
	sx_xlock(&proctree_lock);
	PROC_LOCK(p);
	if ((p->p_pptr->p_flag & (P_TRACED \| P_FOLLOWFORK)) ==
	(P_TRACED \| P_FOLLOWFORK)) {
	/*
	* If debugger still wants auto-attach for the
	* parent's children, do it now.
	*/
	dbg = p->p_pptr->p_pptr;
	p->p_flag \|= P_TRACED;
	p->p_oppid = p->p_pptr->p_pid;
	proc_reparent(p, dbg);
	sx_xunlock(&proctree_lock);
	ptracestop(td, SIGSTOP);
	} else {
	/*
	* ... otherwise clear the request.
	*/
	sx_xunlock(&proctree_lock);
	td->td_dbgflags &= ~TDB_STOPATFORK;
	cv_broadcast(&p->p_dbgwait);
	}
	PROC_UNLOCK(p);
	}

	userret(td, frame);

	#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET))
	ktrsysret(SYS_fork, 0, 0);
	#endif
	mtx_assert(&Giant, MA_NOTOWNED);
	}
	Index: stable/8/sys/kern/kern_ktrace.c
	===================================================================
	--- stable/8/sys/kern/kern_ktrace.c (revision 220261)
	+++ stable/8/sys/kern/kern_ktrace.c (revision 220262)
	@@ -1,1167 +1,1221 @@
	/*-
	* Copyright (c) 1989, 1993
	* The Regents of the University of California.
	* Copyright (c) 2005 Robert N. M. Watson
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* 4. Neither the name of the University nor the names of its contributors
	* may be used to endorse or promote products derived from this software
	* without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
	*/

	#include <sys/cdefs.h>
	__FBSDID("$FreeBSD$");

	#include "opt_ktrace.h"

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/fcntl.h>
	#include <sys/kernel.h>
	#include <sys/kthread.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/malloc.h>
	#include <sys/mount.h>
	#include <sys/namei.h>
	#include <sys/priv.h>
	#include <sys/proc.h>
	#include <sys/unistd.h>
	#include <sys/vnode.h>
	#include <sys/socket.h>
	#include <sys/stat.h>
	#include <sys/ktrace.h>
	#include <sys/sx.h>
	#include <sys/sysctl.h>
	+#include <sys/sysent.h>
	#include <sys/syslog.h>
	#include <sys/sysproto.h>

	#include <security/mac/mac_framework.h>

	/*
	* The ktrace facility allows the tracing of certain key events in user space
	* processes, such as system calls, signal delivery, context switches, and
	* user generated events using utrace(2). It works by streaming event
	* records and data to a vnode associated with the process using the
	* ktrace(2) system call. In general, records can be written directly from
	* the context that generates the event. One important exception to this is
	* during a context switch, where sleeping is not permitted. To handle this
	* case, trace events are generated using in-kernel ktr_request records, and
	* then delivered to disk at a convenient moment -- either immediately, the
	* next traceable event, at system call return, or at process exit.
	*
	* When dealing with multiple threads or processes writing to the same event
	* log, ordering guarantees are weak: specifically, if an event has multiple
	* records (i.e., system call enter and return), they may be interlaced with
	* records from another event. Process and thread ID information is provided
	* in the record, and user applications can de-interlace events if required.
	*/

	static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");

	#ifdef KTRACE

	#ifndef KTRACE_REQUEST_POOL
	#define KTRACE_REQUEST_POOL 100
	#endif

	/*
	 * In-kernel trace request record: a ktr_header, an optional data buffer
	 * pointer, a union holding the fixed payload for the record type, and
	 * STAILQ linkage (ktr_list).
	 * NOTE(review): the '+' line below is a diff hunk line adding the
	 * ktr_proc_ctor member in this revision.
	 */
	struct ktr_request {
	struct ktr_header ktr_header;
	void *ktr_buffer;
	union {
	+ struct ktr_proc_ctor ktr_proc_ctor;
	struct ktr_syscall ktr_syscall;
	struct ktr_sysret ktr_sysret;
	struct ktr_genio ktr_genio;
	struct ktr_psig ktr_psig;
	struct ktr_csw ktr_csw;
	} ktr_data;
	STAILQ_ENTRY(ktr_request) ktr_list;
	};

	/*
	 * Fixed payload size, in bytes, for each trace record type named in the
	 * per-entry comments; 0 means no fixed-size payload.
	 * Presumably indexed by the KTR_* record type number -- confirm against
	 * the users of this table.
	 * NOTE(review): the '+' lines are diff hunk lines adding the
	 * KTR_PROCCTOR and KTR_PROCDTOR entries in this revision.
	 */
	static int data_lengths[] = {
	0, /* none */
	offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */
	sizeof(struct ktr_sysret), /* KTR_SYSRET */
	0, /* KTR_NAMEI */
	sizeof(struct ktr_genio), /* KTR_GENIO */
	sizeof(struct ktr_psig), /* KTR_PSIG */
	sizeof(struct ktr_csw), /* KTR_CSW */
	0, /* KTR_USER */
	0, /* KTR_STRUCT */
	0, /* KTR_SYSCTL */
	+ sizeof(struct ktr_proc_ctor), /* KTR_PROCCTOR */
	+ 0, /* KTR_PROCDTOR */
	};

	static STAILQ_HEAD(, ktr_request) ktr_free;

	static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");

	static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
	TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);

	static u_int ktr_geniosize = PAGE_SIZE;
	TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
	SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
	0, "Maximum size of genio event payload");

	static int print_message = 1;
	static struct mtx ktrace_mtx;
	static struct sx ktrace_sx;

	/* Forward declarations of the file-local helpers defined below. */
	static void ktrace_init(void *dummy);
	static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
	-static u_int ktrace_resize_pool(u_int newsize);
	+static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
	+static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
	static struct ktr_request *ktr_getrequest(int type);
	static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
	static void ktr_freeproc(struct proc *p, struct ucred **uc,
	    struct vnode **vp);
	static void ktr_freerequest(struct ktr_request *req);
	static void ktr_freerequest_locked(struct ktr_request *req);
	static void ktr_writerequest(struct thread *td, struct ktr_request *req);
	static int ktrcanset(struct thread *,struct proc *);
	static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
	static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
	+static void ktrprocctor_entered(struct thread *, struct proc *);

	/*
	* ktrace itself generates events, such as context switches, which we do not
	* wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine
	* whether or not it is in a region where tracing of events should be
	* suppressed.
	*/
	/*
	 * Mark td as executing inside the ktrace code by setting TDP_INKTRACE.
	 * Asserts that the flag is not already set.
	 */
	static void
	ktrace_enter(struct thread *td)
	{

		KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
		td->td_pflags |= TDP_INKTRACE;
	}

	/*
	 * Clear TDP_INKTRACE on td.  Asserts that the flag is currently set.
	 */
	static void
	ktrace_exit(struct thread *td)
	{

		KASSERT((td->td_pflags & TDP_INKTRACE) != 0,
		    ("ktrace_exit: flag not set"));
		td->td_pflags = td->td_pflags & ~TDP_INKTRACE;
	}

	/*
	 * Assert that td currently has TDP_INKTRACE set.
	 */
	static void
	ktrace_assert(struct thread *td)
	{

		KASSERT((td->td_pflags & TDP_INKTRACE) != 0,
		    ("ktrace_assert: flag not set"));
	}

	static void
	ktrace_init(void *dummy)
	{
	struct ktr_request *req;
	int i;

	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF \| MTX_QUIET);
	sx_init(&ktrace_sx, "ktrace_sx");
	STAILQ_INIT(&ktr_free);
	for (i = 0; i < ktr_requestpool; i++) {
	req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
	}
	}
	SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);

	/*
	 * Sysctl handler for kern.ktrace.request_pool.  A read reports the
	 * current pool size.  A write asks ktrace_resize_pool() to move the pool
	 * to the requested size, reports the previous size, and returns ENOSPC
	 * when growth was requested but the resulting size is below the request.
	 */
	static int
	sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
	{
		struct thread *td;
		u_int newsize, oldsize, wantsize;
		int error;

		/* Handle easy read-only case first to avoid warnings from GCC. */
		if (!req->newptr) {
	-		mtx_lock(&ktrace_mtx);
			oldsize = ktr_requestpool;
	-		mtx_unlock(&ktrace_mtx);
			return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
		}

		error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
		if (error)
			return (error);
		td = curthread;
		ktrace_enter(td);
	-	mtx_lock(&ktrace_mtx);
		oldsize = ktr_requestpool;
	-	newsize = ktrace_resize_pool(wantsize);
	-	mtx_unlock(&ktrace_mtx);
	+	newsize = ktrace_resize_pool(oldsize, wantsize);
		ktrace_exit(td);
		error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
		if (error)
			return (error);
		if (wantsize > oldsize && newsize < wantsize)
			return (ENOSPC);
		return (0);
	}
	SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
	    &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", "");

	/*
	 * Resize the pool of free request objects from oldsize towards newsize
	 * and return the resulting value of ktr_requestpool.  Also re-arms the
	 * one-shot "out of request objects" message.
	 */
	static u_int
	-ktrace_resize_pool(u_int newsize)
	+ktrace_resize_pool(u_int oldsize, u_int newsize)
	{
	+ STAILQ_HEAD(, ktr_request) ktr_new;
	struct ktr_request *req;
	int bound;

	- mtx_assert(&ktrace_mtx, MA_OWNED);
	print_message = 1;
	- bound = newsize - ktr_requestpool;
	+ bound = newsize - oldsize;
	if (bound == 0)
	return (ktr_requestpool);
	- if (bound < 0)
	+ if (bound < 0) {
	+ mtx_lock(&ktrace_mtx);
	/* Shrink pool down to newsize if possible. */
	while (bound++ < 0) {
	req = STAILQ_FIRST(&ktr_free);
	/* Only requests currently on the free list can be released. */
	if (req == NULL)
	- return (ktr_requestpool);
	+ break;
	STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
	ktr_requestpool--;
	- mtx_unlock(&ktrace_mtx);
	free(req, M_KTRACE);
	- mtx_lock(&ktrace_mtx);
	}
	- else
	+ } else {
	/* Grow pool up to newsize. */
	/* New entries are collected on a local list before the mutex is taken. */
	+ STAILQ_INIT(&ktr_new);
	while (bound-- > 0) {
	- mtx_unlock(&ktrace_mtx);
	req = malloc(sizeof(struct ktr_request), M_KTRACE,
	M_WAITOK);
	- mtx_lock(&ktrace_mtx);
	- STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
	- ktr_requestpool++;
	+ STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list);
	}
	+ mtx_lock(&ktrace_mtx);
	+ STAILQ_CONCAT(&ktr_free, &ktr_new);
	+ ktr_requestpool += (newsize - oldsize);
	+ }
	+ mtx_unlock(&ktrace_mtx);
	return (ktr_requestpool);
	}

	/* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
	CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
	    sizeof(((struct thread *)NULL)->td_name));

	/*
	 * Take a request object from the free pool and fill in its header for a
	 * record of the given type on behalf of td.  Returns NULL when the type
	 * is not enabled for td's process or when the pool is empty.  In the
	 * latter case KTRFAC_DROP is set on the process, so that the next
	 * record obtained is marked KTR_DROP, and a message is printed once.
	 */
	static struct ktr_request *
	-ktr_getrequest(int type)
	+ktr_getrequest_entered(struct thread *td, int type)
	{
		struct ktr_request *req;
	-	struct thread *td = curthread;
		struct proc *p = td->td_proc;
		int pm;

	-	ktrace_enter(td);	/* XXX: In caller instead? */
		mtx_lock(&ktrace_mtx);
		if (!KTRCHECK(td, type)) {
			mtx_unlock(&ktrace_mtx);
	-		ktrace_exit(td);
			return (NULL);
		}
		req = STAILQ_FIRST(&ktr_free);
		if (req != NULL) {
			STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
			req->ktr_header.ktr_type = type;
			if (p->p_traceflag & KTRFAC_DROP) {
				req->ktr_header.ktr_type |= KTR_DROP;
				p->p_traceflag &= ~KTRFAC_DROP;
			}
			mtx_unlock(&ktrace_mtx);
			microtime(&req->ktr_header.ktr_time);
			req->ktr_header.ktr_pid = p->p_pid;
			req->ktr_header.ktr_tid = td->td_tid;
			bcopy(td->td_name, req->ktr_header.ktr_comm,
			    sizeof(req->ktr_header.ktr_comm));
			req->ktr_buffer = NULL;
			req->ktr_header.ktr_len = 0;
		} else {
			p->p_traceflag |= KTRFAC_DROP;
			pm = print_message;
			print_message = 0;
			mtx_unlock(&ktrace_mtx);
			if (pm)
				printf("Out of ktrace request objects.\n");
	-		ktrace_exit(td);
		}
		return (req);
	}

	/*
	 * Obtain a request of the given type for curthread.  The thread is
	 * marked as inside ktrace first; the mark is removed again only when
	 * no request could be obtained.  On success the thread stays marked,
	 * and the callers in this file clear it through ktr_submitrequest() or
	 * an explicit ktrace_exit().
	 */
	+static struct ktr_request *
	+ktr_getrequest(int type)
	+{
	+ struct thread *td = curthread;
	+ struct ktr_request *req;
	+
	+ ktrace_enter(td);
	+ req = ktr_getrequest_entered(td, type);
	+ if (req == NULL)
	+ ktrace_exit(td);
	+
	+ return (req);
	+}
	+
	/*
	* Some trace generation environments don't permit direct access to VFS,
	* such as during a context switch where sleeping is not allowed. Under these
	* circumstances, queue a request on the thread's process to be written
	* asynchronously later.
	*/
	static void
	ktr_enqueuerequest(struct thread td, struct ktr_request req)
	{

	mtx_lock(&ktrace_mtx);
	STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
	mtx_unlock(&ktrace_mtx);
	- ktrace_exit(td);
	}

	/*
	* Drain any pending ktrace records from the per-process queue to disk. This
	* is used both internally before committing other records, and also on
	* system call return. We drain all the ones we can find at the time when
	* drain is requested, but don't keep draining after that as those events
	* may be approximately "after" the current event.
	*/
	static void
	ktr_drain(struct thread *td)
	{
	struct ktr_request *queued_req;
	STAILQ_HEAD(, ktr_request) local_queue;

	ktrace_assert(td);
	sx_assert(&ktrace_sx, SX_XLOCKED);

	STAILQ_INIT(&local_queue); /* XXXRW: needed? */

	if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
	mtx_lock(&ktrace_mtx);
	STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
	mtx_unlock(&ktrace_mtx);

	while ((queued_req = STAILQ_FIRST(&local_queue))) {
	STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
	ktr_writerequest(td, queued_req);
	ktr_freerequest(queued_req);
	}
	}
	}

	/*
	* Submit a trace record for immediate commit to disk -- to be used only
	* where entering VFS is OK. First drain any pending records that may have
	* been cached in the thread.
	*/
	/*
	 * Write req immediately: under ktrace_sx, first drain the records
	 * already queued on td's process, then write and recycle req.  Leaves
	 * ktrace on behalf of the caller before returning.
	 */
	static void
	ktr_submitrequest(struct thread *td, struct ktr_request *req)
	{

		ktrace_assert(td);

		sx_xlock(&ktrace_sx);
		ktr_drain(td);
		ktr_writerequest(td, req);
		ktr_freerequest(req);
		sx_xunlock(&ktrace_sx);
	-
		ktrace_exit(td);
	}

	/*
	 * Return req to the free pool, taking ktrace_mtx around the locked
	 * variant.
	 */
	static void
	ktr_freerequest(struct ktr_request *req)
	{

	mtx_lock(&ktrace_mtx);
	ktr_freerequest_locked(req);
	mtx_unlock(&ktrace_mtx);
	}

	/*
	 * Release req's variable-length buffer, if any, and put the request
	 * back on the free list.  The caller must hold ktrace_mtx.
	 */
	static void
	ktr_freerequest_locked(struct ktr_request *req)
	{

	mtx_assert(&ktrace_mtx, MA_OWNED);
	if (req->ktr_buffer != NULL)
	free(req->ktr_buffer, M_KTRACE);
	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
	}

	/*
	* Disable tracing for a process and release all associated resources.
	* The caller is responsible for releasing a reference on the returned
	* vnode and credentials.
	*/
	static void
	ktr_freeproc(struct proc p, struct ucred uc, struct vnode *vp)
	{
	struct ktr_request *req;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&ktrace_mtx, MA_OWNED);
	*uc = p->p_tracecred;
	p->p_tracecred = NULL;
	if (vp != NULL)
	*vp = p->p_tracevp;
	p->p_tracevp = NULL;
	p->p_traceflag = 0;
	while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
	STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
	ktr_freerequest_locked(req);
	}
	}

	/*
	 * Emit a KTR_SYSCALL record for curthread carrying the syscall number,
	 * the argument count and a copy of the narg argument words.
	 */
	void
	ktrsyscall(int code, int narg, register_t args[])
	{
		struct ktr_syscall *ktp;
		struct ktr_request *req;
		size_t buflen;
		char *buf;

		/* Copy the arguments before a request object is obtained. */
		buf = NULL;
		buflen = sizeof(register_t) * narg;
		if (buflen > 0) {
			buf = malloc(buflen, M_KTRACE, M_WAITOK);
			bcopy(args, buf, buflen);
		}

		req = ktr_getrequest(KTR_SYSCALL);
		if (req == NULL) {
			/* No record will be written; discard the copy. */
			if (buf != NULL)
				free(buf, M_KTRACE);
			return;
		}

		ktp = &req->ktr_data.ktr_syscall;
		ktp->ktr_code = code;
		ktp->ktr_narg = narg;
		if (buflen > 0) {
			req->ktr_header.ktr_len = buflen;
			req->ktr_buffer = buf;
		}
		ktr_submitrequest(curthread, req);
	}

	void
	ktrsysret(code, error, retval)
	int code, error;
	register_t retval;
	{
	struct ktr_request *req;
	struct ktr_sysret *ktp;

	req = ktr_getrequest(KTR_SYSRET);
	if (req == NULL)
	return;
	ktp = &req->ktr_data.ktr_sysret;
	ktp->ktr_code = code;
	ktp->ktr_error = error;
	ktp->ktr_retval = retval; /* what about val2 ? */
	ktr_submitrequest(curthread, req);
	}

	/*
	* When a setuid process execs, disable tracing.
	*
	* XXX: We toss any pending asynchronous records.
	*/
	/*
	 * Disable tracing on p, handing the trace credential and vnode back to
	 * the caller through uc and vp.  The caller must hold the process lock.
	 */
	void
	ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
	{

		PROC_LOCK_ASSERT(p, MA_OWNED);
		mtx_lock(&ktrace_mtx);
		ktr_freeproc(p, uc, vp);
		mtx_unlock(&ktrace_mtx);
	}

	/*
	* When a process exits, drain per-process asynchronous trace records
	* and disable tracing.
	*/
	/*
	 * Called for an exiting thread's process: queue a KTR_PROCDTOR record
	 * if one can be obtained, write out all queued records, then detach the
	 * trace vnode and credential from the process and release them.
	 */
	void
	ktrprocexit(struct thread *td)
	{
	+ struct ktr_request *req;
	struct proc *p;
	struct ucred *cred;
	struct vnode *vp;
	int vfslocked;

	p = td->td_proc;
	/* Nothing to do when the process is not being traced. */
	if (p->p_traceflag == 0)
	return;

	ktrace_enter(td);
	+ req = ktr_getrequest_entered(td, KTR_PROCDTOR);
	+ if (req != NULL)
	+ ktr_enqueuerequest(td, req);
	sx_xlock(&ktrace_sx);
	ktr_drain(td);
	sx_xunlock(&ktrace_sx);
	PROC_LOCK(p);
	mtx_lock(&ktrace_mtx);
	ktr_freeproc(p, &cred, &vp);
	mtx_unlock(&ktrace_mtx);
	PROC_UNLOCK(p);
	/* Drop the references handed back by ktr_freeproc(). */
	if (vp != NULL) {
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vrele(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	}
	if (cred != NULL)
	crfree(cred);
	ktrace_exit(td);
	}

	/*
	 * Queue a KTR_PROCCTOR record on p, carrying a copy of the sv_flags of
	 * p's sysentvec.  The calling thread td must already be inside ktrace.
	 * The record is built on behalf of p's first thread.
	 */
	+static void
	+ktrprocctor_entered(struct thread *td, struct proc *p)
	+{
	+	struct ktr_proc_ctor *ktp;
	+	struct ktr_request *req;
	+	struct thread *td2;
	+
	+	ktrace_assert(td);
	+	td2 = FIRST_THREAD_IN_PROC(p);
	+	req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
	+	if (req == NULL)
	+		return;
	+	ktp = &req->ktr_data.ktr_proc_ctor;
	+	ktp->sv_flags = p->p_sysent->sv_flags;
	+	ktr_enqueuerequest(td2, req);
	+}
	+
	/*
	 * Queue a KTR_PROCCTOR record for p if any trace facility is enabled on
	 * it, entering and leaving ktrace on curthread around the call.
	 */
	+void
	+ktrprocctor(struct proc *p)
	+{
	+ struct thread *td = curthread;
	+
	+ if ((p->p_traceflag & KTRFAC_MASK) == 0)
	+ return;
	+
	+ ktrace_enter(td);
	+ ktrprocctor_entered(td, p);
	+ ktrace_exit(td);
	+}
	+
	/*
	* When a process forks, enable tracing in the new process if needed.
	*/
	void
	ktrprocfork(struct proc p1, struct proc p2)
	{

	- PROC_LOCK_ASSERT(p1, MA_OWNED);
	- PROC_LOCK_ASSERT(p2, MA_OWNED);
	+ PROC_LOCK(p1);
	mtx_lock(&ktrace_mtx);
	KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
	if (p1->p_traceflag & KTRFAC_INHERIT) {
	p2->p_traceflag = p1->p_traceflag;
	if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
	VREF(p2->p_tracevp);
	KASSERT(p1->p_tracecred != NULL,
	("ktrace vnode with no cred"));
	p2->p_tracecred = crhold(p1->p_tracecred);
	}
	}
	mtx_unlock(&ktrace_mtx);
	+ PROC_UNLOCK(p1);
	+
	+ ktrprocctor(p2);
	}

	/*
	* When a thread returns, drain any asynchronous records generated by the
	* system call.
	*/
	/*
	 * Write out the records queued on td's process, entering ktrace and
	 * taking ktrace_sx exclusively around the drain.
	 */
	void
	ktruserret(struct thread *td)
	{

	ktrace_enter(td);
	sx_xlock(&ktrace_sx);
	ktr_drain(td);
	sx_xunlock(&ktrace_sx);
	ktrace_exit(td);
	}

	/*
	 * Emit a KTR_NAMEI record for curthread whose payload is the bytes of
	 * path, without the terminating NUL.
	 */
	void
	ktrnamei(char *path)
	{
		struct ktr_request *req;
		char *copy;
		int len;

		copy = NULL;
		len = strlen(path);
		if (len > 0) {
			copy = malloc(len, M_KTRACE, M_WAITOK);
			bcopy(path, copy, len);
		}

		req = ktr_getrequest(KTR_NAMEI);
		if (req == NULL) {
			/* No record will be written; discard the copy. */
			if (copy != NULL)
				free(copy, M_KTRACE);
			return;
		}

		if (len > 0) {
			req->ktr_header.ktr_len = len;
			req->ktr_buffer = copy;
		}
		ktr_submitrequest(curthread, req);
	}

	/*
	 * Emit a KTR_SYSCTL record for curthread.  The numeric MIB in name is
	 * first translated through kernel_sysctl(); the result buffer becomes
	 * the record payload.  Nothing is emitted if the lookup fails.
	 */
	void
	ktrsysctl(int *name, u_int namelen)
	{
		struct ktr_request *req;
		u_int mib[CTL_MAXNAME + 2];
		size_t mibnamelen;
		char *mibname;
		int error;

		/* Lookup name of mib. */
		KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
		mib[0] = 0;
		mib[1] = 1;
		bcopy(name, mib + 2, namelen * sizeof(*name));
		mibnamelen = 128;
		mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
		error = kernel_sysctl(curthread, mib, namelen + 2, mibname,
		    &mibnamelen, NULL, 0, &mibnamelen, 0);

		/* Only ask for a request object when the lookup succeeded. */
		req = NULL;
		if (error == 0)
			req = ktr_getrequest(KTR_SYSCTL);
		if (req == NULL) {
			free(mibname, M_KTRACE);
			return;
		}

		req->ktr_header.ktr_len = mibnamelen;
		req->ktr_buffer = mibname;
		ktr_submitrequest(curthread, req);
	}

	void
	ktrgenio(fd, rw, uio, error)
	int fd;
	enum uio_rw rw;
	struct uio *uio;
	int error;
	{
	struct ktr_request *req;
	struct ktr_genio *ktg;
	int datalen;
	char *buf;

	if (error) {
	free(uio, M_IOV);
	return;
	}
	uio->uio_offset = 0;
	uio->uio_rw = UIO_WRITE;
	datalen = imin(uio->uio_resid, ktr_geniosize);
	buf = malloc(datalen, M_KTRACE, M_WAITOK);
	error = uiomove(buf, datalen, uio);
	free(uio, M_IOV);
	if (error) {
	free(buf, M_KTRACE);
	return;
	}
	req = ktr_getrequest(KTR_GENIO);
	if (req == NULL) {
	free(buf, M_KTRACE);
	return;
	}
	ktg = &req->ktr_data.ktr_genio;
	ktg->ktr_fd = fd;
	ktg->ktr_rw = rw;
	req->ktr_header.ktr_len = datalen;
	req->ktr_buffer = buf;
	ktr_submitrequest(curthread, req);
	}

	/*
	 * Queue a KTR_PSIG record for curthread describing a signal, its
	 * action, the signal mask and the code.  The record is enqueued rather
	 * than written immediately.
	 */
	void
	ktrpsig(sig, action, mask, code)
	int sig;
	sig_t action;
	sigset_t *mask;
	int code;
	{
	+ struct thread *td = curthread;
	struct ktr_request *req;
	struct ktr_psig *kp;

	req = ktr_getrequest(KTR_PSIG);
	if (req == NULL)
	return;
	kp = &req->ktr_data.ktr_psig;
	kp->signo = (char)sig;
	kp->action = action;
	kp->mask = *mask;
	kp->code = code;
	- ktr_enqueuerequest(curthread, req);
	+ ktr_enqueuerequest(td, req);
	+ ktrace_exit(td);
	}

	/*
	 * Queue a KTR_CSW record for curthread with the given out and user
	 * values.  The record is enqueued rather than written immediately.
	 */
	void
	ktrcsw(out, user)
	int out, user;
	{
	+ struct thread *td = curthread;
	struct ktr_request *req;
	struct ktr_csw *kc;

	req = ktr_getrequest(KTR_CSW);
	if (req == NULL)
	return;
	kc = &req->ktr_data.ktr_csw;
	kc->out = out;
	kc->user = user;
	- ktr_enqueuerequest(curthread, req);
	+ ktr_enqueuerequest(td, req);
	+ ktrace_exit(td);
	}

	/*
	 * Emit a KTR_STRUCT record for curthread.  The payload is the namelen
	 * bytes of name, a NUL byte, then datalen bytes of data.  A NULL data
	 * pointer is treated as zero-length data.
	 */
	void
	ktrstruct(const char *name, size_t namelen, void *data, size_t datalen)
	{
		struct ktr_request *req;
		size_t total;
		char *rec;

		if (data == NULL)
			datalen = 0;

		/* Build "name\0data" in a single buffer. */
		total = namelen + 1 + datalen;
		rec = malloc(total, M_KTRACE, M_WAITOK);
		bcopy(name, rec, namelen);
		rec[namelen] = '\0';
		bcopy(data, rec + namelen + 1, datalen);

		req = ktr_getrequest(KTR_STRUCT);
		if (req == NULL) {
			free(rec, M_KTRACE);
			return;
		}
		req->ktr_buffer = rec;
		req->ktr_header.ktr_len = total;
		ktr_submitrequest(curthread, req);
	}
	#endif /* KTRACE */

	/* Interface and common routines */

	/*
	 * Argument structure for ktrace(), defined here only when
	 * _SYS_SYSPROTO_H_ is not already defined.
	 */
	#ifndef _SYS_SYSPROTO_H_
	struct ktrace_args {
	char *fname;	/* user-space path of the trace file */
	int ops;	/* operation (KTROP) plus KTRFLAG_DESCEND */
	int facs;	/* trace facilities to set or clear */
	int pid;	/* target pid; negative selects a process group */
	};
	#endif
	/* ARGSUSED */
	/*
	 * ktrace system call.  Depending on the operation in uap->ops this
	 * sets or clears trace facilities on a process (uap->pid >= 0) or on a
	 * process group (uap->pid < 0), optionally descending to children, or
	 * stops all tracing to the named file (KTROP_CLEARFILE).  Returns
	 * EINVAL when no facilities are given, ESRCH when no target is found,
	 * EPERM when no target could be changed, and ENOSYS in kernels built
	 * without KTRACE.
	 */
	int
	ktrace(td, uap)
		struct thread *td;
		register struct ktrace_args *uap;
	{
	#ifdef KTRACE
		register struct vnode *vp = NULL;
		register struct proc *p;
		struct pgrp *pg;
		int facs = uap->facs & ~KTRFAC_ROOT;
		int ops = KTROP(uap->ops);
		int descend = uap->ops & KTRFLAG_DESCEND;
		int nfound, ret = 0;
		int flags, error = 0, vfslocked;
		struct nameidata nd;
		struct ucred *cred;

		/*
		 * Need something to (un)trace.
		 */
		if (ops != KTROP_CLEARFILE && facs == 0)
			return (EINVAL);

		ktrace_enter(td);
		if (ops != KTROP_CLEAR) {
			/*
			 * an operation which requires a file argument.
			 */
			NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
			    uap->fname, td);
			flags = FREAD | FWRITE | O_NOFOLLOW;
			error = vn_open(&nd, &flags, 0, NULL);
			if (error) {
				ktrace_exit(td);
				return (error);
			}
			vfslocked = NDHASGIANT(&nd);
			NDFREE(&nd, NDF_ONLY_PNBUF);
			vp = nd.ni_vp;
			VOP_UNLOCK(vp, 0);
			/* Only regular files are accepted as trace files. */
			if (vp->v_type != VREG) {
				(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
				VFS_UNLOCK_GIANT(vfslocked);
				ktrace_exit(td);
				return (EACCES);
			}
			VFS_UNLOCK_GIANT(vfslocked);
		}
		/*
		 * Clear all uses of the tracefile.
		 */
		if (ops == KTROP_CLEARFILE) {
			int vrele_count;

			vrele_count = 0;
			sx_slock(&allproc_lock);
			FOREACH_PROC_IN_SYSTEM(p) {
				PROC_LOCK(p);
				if (p->p_tracevp == vp) {
					if (ktrcanset(td, p)) {
						mtx_lock(&ktrace_mtx);
						ktr_freeproc(p, &cred, NULL);
						mtx_unlock(&ktrace_mtx);
						vrele_count++;
						crfree(cred);
					} else
						error = EPERM;
				}
				PROC_UNLOCK(p);
			}
			sx_sunlock(&allproc_lock);
			/* Drop one vnode reference per process detached above. */
			if (vrele_count > 0) {
				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
				while (vrele_count-- > 0)
					vrele(vp);
				VFS_UNLOCK_GIANT(vfslocked);
			}
			goto done;
		}
		/*
		 * do it
		 */
		sx_slock(&proctree_lock);
		if (uap->pid < 0) {
			/*
			 * by process group
			 */
			pg = pgfind(-uap->pid);
			if (pg == NULL) {
				sx_sunlock(&proctree_lock);
				error = ESRCH;
				goto done;
			}
			/*
			 * ktrops() may call vrele(). Lock pg_members
			 * by the proctree_lock rather than pg_mtx.
			 */
			PGRP_UNLOCK(pg);
			nfound = 0;
			LIST_FOREACH(p, &pg->pg_members, p_pglist) {
				PROC_LOCK(p);
				if (p_cansee(td, p) != 0) {
					PROC_UNLOCK(p);
					continue;
				}
				PROC_UNLOCK(p);
				nfound++;
				if (descend)
					ret |= ktrsetchildren(td, p, ops, facs, vp);
				else
					ret |= ktrops(td, p, ops, facs, vp);
			}
			if (nfound == 0) {
				sx_sunlock(&proctree_lock);
				error = ESRCH;
				goto done;
			}
		} else {
			/*
			 * by pid
			 */
			p = pfind(uap->pid);
			if (p == NULL) {
				sx_sunlock(&proctree_lock);
				error = ESRCH;
				goto done;
			}
			error = p_cansee(td, p);
			/*
			 * The slock of the proctree lock will keep this process
			 * from going away, so unlocking the proc here is ok.
			 */
			PROC_UNLOCK(p);
			if (error) {
				sx_sunlock(&proctree_lock);
				goto done;
			}
			if (descend)
				ret |= ktrsetchildren(td, p, ops, facs, vp);
			else
				ret |= ktrops(td, p, ops, facs, vp);
		}
		sx_sunlock(&proctree_lock);
		if (!ret)
			error = EPERM;
	done:
		if (vp != NULL) {
			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
			(void) vn_close(vp, FWRITE, td->td_ucred, td);
			VFS_UNLOCK_GIANT(vfslocked);
		}
		ktrace_exit(td);
		return (error);
	#else /* !KTRACE */
		return (ENOSYS);
	#endif /* KTRACE */
	}

	/* ARGSUSED */
	/*
	 * utrace system call: emit a KTR_USER record whose payload is uap->len
	 * bytes copied in from uap->addr.  Returns 0 without doing anything
	 * when KTR_USER tracing is not active for td, EINVAL when the length
	 * exceeds KTR_USER_MAXLEN, the copyin() error on a failed copy, ENOMEM
	 * when no request object is available, and ENOSYS in kernels built
	 * without KTRACE.
	 */
	int
	utrace(struct thread *td, struct utrace_args *uap)
	{

	#ifdef KTRACE
		struct ktr_request *req;
		void *payload;
		int error;

		if (!KTRPOINT(td, KTR_USER))
			return (0);
		if (uap->len > KTR_USER_MAXLEN)
			return (EINVAL);

		payload = malloc(uap->len, M_KTRACE, M_WAITOK);
		error = copyin(uap->addr, payload, uap->len);
		if (error != 0) {
			free(payload, M_KTRACE);
			return (error);
		}

		req = ktr_getrequest(KTR_USER);
		if (req == NULL) {
			free(payload, M_KTRACE);
			return (ENOMEM);
		}
		req->ktr_buffer = payload;
		req->ktr_header.ktr_len = uap->len;
		ktr_submitrequest(td, req);
		return (0);
	#else /* !KTRACE */
		return (ENOSYS);
	#endif /* KTRACE */
	}

	#ifdef KTRACE
	/*
	 * Apply a KTROP_SET or clear operation for facilities facs to process
	 * p on behalf of td, using vp as the trace vnode when setting.
	 * Returns 0 when td is not permitted to change p's tracing state and
	 * 1 otherwise.  A vnode or credential displaced from p is released
	 * after the locks are dropped.
	 */
	static int
	ktrops(td, p, ops, facs, vp)
		struct thread *td;
		struct proc *p;
		int ops, facs;
		struct vnode *vp;
	{
		struct vnode *tracevp = NULL;
		struct ucred *tracecred = NULL;

		PROC_LOCK(p);
		if (!ktrcanset(td, p)) {
			PROC_UNLOCK(p);
			return (0);
		}
		mtx_lock(&ktrace_mtx);
		if (ops == KTROP_SET) {
			if (p->p_tracevp != vp) {
				/*
				 * if trace file already in use, relinquish below
				 */
				tracevp = p->p_tracevp;
				VREF(vp);
				p->p_tracevp = vp;
			}
			if (p->p_tracecred != td->td_ucred) {
				tracecred = p->p_tracecred;
				p->p_tracecred = crhold(td->td_ucred);
			}
			p->p_traceflag |= facs;
			if (priv_check(td, PRIV_KTRACE) == 0)
				p->p_traceflag |= KTRFAC_ROOT;
		} else {
			/* KTROP_CLEAR */
			if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
				/* no more tracing */
				ktr_freeproc(p, &tracecred, &tracevp);
		}
		mtx_unlock(&ktrace_mtx);
	+	if ((p->p_traceflag & KTRFAC_MASK) != 0)
	+		ktrprocctor_entered(td, p);
		PROC_UNLOCK(p);
		if (tracevp != NULL) {
			int vfslocked;

			vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
			vrele(tracevp);
			VFS_UNLOCK_GIANT(vfslocked);
		}
		if (tracecred != NULL)
			crfree(tracecred);

		return (1);
	}

	/*
	 * Apply ktrops() to top and to every process below it in the process
	 * tree, walking the tree iteratively.  Returns the OR of the ktrops()
	 * results.  The caller must hold proctree_lock.
	 */
	static int
	ktrsetchildren(td, top, ops, facs, vp)
		struct thread *td;
		struct proc *top;
		int ops, facs;
		struct vnode *vp;
	{
		register struct proc *p;
		register int ret = 0;

		p = top;
		sx_assert(&proctree_lock, SX_LOCKED);
		for (;;) {
			ret |= ktrops(td, p, ops, facs, vp);
			/*
			 * If this process has children, descend to them next,
			 * otherwise do any siblings, and if done with this level,
			 * follow back up the tree (but not past top).
			 */
			if (!LIST_EMPTY(&p->p_children))
				p = LIST_FIRST(&p->p_children);
			else for (;;) {
				if (p == top)
					return (ret);
				if (LIST_NEXT(p, p_sibling)) {
					p = LIST_NEXT(p, p_sibling);
					break;
				}
				p = p->p_pptr;
			}
		}
		/*NOTREACHED*/
	}

	static void
	ktr_writerequest(struct thread td, struct ktr_request req)
	{
	struct ktr_header *kth;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct uio auio;
	struct iovec aiov[3];
	struct mount *mp;
	int datalen, buflen, vrele_count;
	int error, vfslocked;

	/*
	* We hold the vnode and credential for use in I/O in case ktrace is
	* disabled on the process as we write out the request.
	*
	* XXXRW: This is not ideal: we could end up performing a write after
	* the vnode has been closed.
	*/
	mtx_lock(&ktrace_mtx);
	vp = td->td_proc->p_tracevp;
	cred = td->td_proc->p_tracecred;

	/*
	* If vp is NULL, the vp has been cleared out from under this
	* request, so just drop it. Make sure the credential and vnode are
	* in sync: we should have both or neither.
	*/
	if (vp == NULL) {
	KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
	mtx_unlock(&ktrace_mtx);
	return;
	}
	VREF(vp);
	KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
	crhold(cred);
	mtx_unlock(&ktrace_mtx);

	kth = &req->ktr_header;
	KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
	sizeof(data_lengths) / sizeof(data_lengths[0]),
	("data_lengths array overflow"));
	datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
	buflen = kth->ktr_len;
	auio.uio_iov = &aiov[0];
	auio.uio_offset = 0;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	aiov[0].iov_base = (caddr_t)kth;
	aiov[0].iov_len = sizeof(struct ktr_header);
	auio.uio_resid = sizeof(struct ktr_header);
	auio.uio_iovcnt = 1;
	auio.uio_td = td;
	if (datalen != 0) {
	aiov[1].iov_base = (caddr_t)&req->ktr_data;
	aiov[1].iov_len = datalen;
	auio.uio_resid += datalen;
	auio.uio_iovcnt++;
	kth->ktr_len += datalen;
	}
	if (buflen != 0) {
	KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
	aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
	aiov[auio.uio_iovcnt].iov_len = buflen;
	auio.uio_resid += buflen;
	auio.uio_iovcnt++;
	}

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
	#ifdef MAC
	error = mac_vnode_check_write(cred, NOCRED, vp);
	if (error == 0)
	#endif
	error = VOP_WRITE(vp, &auio, IO_UNIT \| IO_APPEND, cred);
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	crfree(cred);
	if (!error) {
	vrele(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	return;
	}
	VFS_UNLOCK_GIANT(vfslocked);

	/*
	* If error encountered, give up tracing on this vnode. We defer
	* all the vrele()'s on the vnode until after we are finished walking
	* the various lists to avoid needlessly holding locks.
	* NB: at this point we still hold the vnode reference that must
	* not go away as we need the valid vnode to compare with. Thus let
	* vrele_count start at 1 and the reference will be freed
	* by the loop at the end after our last use of vp.
	*/
	log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
	error);
	vrele_count = 1;
	/*
	* First, clear this vnode from being used by any processes in the
	* system.
	* XXX - If one process gets an EPERM writing to the vnode, should
	* we really do this? Other processes might have suitable
	* credentials for the operation.
	*/
	cred = NULL;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
	PROC_LOCK(p);
	if (p->p_tracevp == vp) {
	mtx_lock(&ktrace_mtx);
	ktr_freeproc(p, &cred, NULL);
	mtx_unlock(&ktrace_mtx);
	vrele_count++;
	}
	PROC_UNLOCK(p);
	if (cred != NULL) {
	crfree(cred);
	cred = NULL;
	}
	}
	sx_sunlock(&allproc_lock);

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	while (vrele_count-- > 0)
	vrele(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	}

	/*
	* Return true if caller has permission to set the ktracing state
	* of target. Essentially, the target can't possess any
	* more permissions than the caller. KTRFAC_ROOT signifies that
	* root previously set the tracing status on the target process, and
	* so, only root may further change it.
	*/
	/*
	 * Return 1 when td may change the tracing state of targetp, else 0.
	 * Permission is refused when KTRFAC_ROOT is set on the target and td
	 * fails the PRIV_KTRACE check, or when p_candebug() refuses access.
	 * The caller must hold targetp's process lock.
	 */
	static int
	ktrcanset(struct thread *td, struct proc *targetp)
	{

		PROC_LOCK_ASSERT(targetp, MA_OWNED);
		if ((targetp->p_traceflag & KTRFAC_ROOT) != 0 &&
		    priv_check(td, PRIV_KTRACE) != 0)
			return (0);

		return (p_candebug(td, targetp) == 0);
	}

	#endif /* KTRACE */
	Index: stable/8/sys/sys/ktrace.h
	===================================================================
	--- stable/8/sys/sys/ktrace.h (revision 220261)
	+++ stable/8/sys/sys/ktrace.h (revision 220262)
	@@ -1,222 +1,239 @@
	/*-
	* Copyright (c) 1988, 1993
	* The Regents of the University of California. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	* 4. Neither the name of the University nor the names of its contributors
	* may be used to endorse or promote products derived from this software
	* without specific prior written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*
	* @(#)ktrace.h 8.1 (Berkeley) 6/2/93
	* $FreeBSD$
	*/

	#ifndef _SYS_KTRACE_H_
	#define _SYS_KTRACE_H_

	/*
	* operations to ktrace system call (KTROP(op))
	*/
	#define KTROP_SET 0 /* set trace points */
	#define KTROP_CLEAR 1 /* clear trace points */
	#define KTROP_CLEARFILE 2 /* stop all tracing to file */
	#define KTROP(o) ((o)&3) /* macro to extract operation */
	/*
	* flags (ORed in with operation)
	*/
	#define KTRFLAG_DESCEND 4 /* perform op on all children too */

	/*
	* ktrace record header
	*/
	struct ktr_header {
	int ktr_len; /* length of buf */
	short ktr_type; /* trace record type */
	pid_t ktr_pid; /* process id */
	char ktr_comm[MAXCOMLEN+1]; /* command name */
	struct timeval ktr_time; /* timestamp */
	intptr_t ktr_tid; /* was ktr_buffer */
	};

	/*
	* Test for kernel trace point (MP SAFE).
	*
	* KTRCHECK() just checks that the type is enabled and is only for
	* internal use in the ktrace subsystem. KTRPOINT() checks against
	* ktrace recursion as well as checking that the type is enabled and
	* is the public interface.
	*/
	#define KTRCHECK(td, type) ((td)->td_proc->p_traceflag & (1 << (type)))
	#define KTRPOINT(td, type) \
	(KTRCHECK((td), (type)) && !((td)->td_pflags & TDP_INKTRACE))
	#define KTRCHECKDRAIN(td) (!(STAILQ_EMPTY(&(td)->td_proc->p_ktr)))
	#define KTRUSERRET(td) do { \
	if (KTRCHECKDRAIN(td)) \
	ktruserret(td); \
	} while (0)
	#define KTRPROCEXIT(td) do { \
	if (KTRCHECKDRAIN(td)) \
	ktrprocexit(td); \
	} while (0)

	/*
	* ktrace record types
	*/

	/*
	* KTR_SYSCALL - system call record
	*/
	#define KTR_SYSCALL 1
	struct ktr_syscall {
	short ktr_code; /* syscall number */
	short ktr_narg; /* number of arguments */
	/*
	* followed by ktr_narg register_t
	*/
	register_t ktr_args[1];
	};

	/*
	* KTR_SYSRET - return from system call record
	*/
	#define KTR_SYSRET 2
	struct ktr_sysret {
	short ktr_code;
	short ktr_eosys;
	int ktr_error;
	register_t ktr_retval;
	};

	/*
	* KTR_NAMEI - namei record
	*/
	#define KTR_NAMEI 3
	/* record contains pathname */

	/*
	* KTR_GENIO - trace generic process i/o
	*/
	#define KTR_GENIO 4
	struct ktr_genio {
	int ktr_fd;
	enum uio_rw ktr_rw;
	/*
	* followed by data successfully read/written
	*/
	};

	/*
	* KTR_PSIG - trace processed signal
	*/
	#define KTR_PSIG 5
	struct ktr_psig {
	int signo;
	sig_t action;
	int code;
	sigset_t mask;
	};

	/*
	* KTR_CSW - trace context switches
	*/
	#define KTR_CSW 6
	struct ktr_csw {
	int out; /* 1 if switch out, 0 if switch in */
	int user; /* 1 if usermode (ivcsw), 0 if kernel (vcsw) */
	};

	/*
	* KTR_USER - data coming from userland
	*/
	#define KTR_USER_MAXLEN 2048 /* maximum length of passed data */
	#define KTR_USER 7

	/*
	* KTR_STRUCT - misc. structs
	*/
	#define KTR_STRUCT 8
	struct sockaddr;
	struct stat;
	+struct sysentvec;

	/*
	* KTR_SYSCTL - name of a sysctl MIB
	*/
	#define KTR_SYSCTL 9
	/* record contains null-terminated MIB name */

	/*
	+ * KTR_PROCCTOR - trace process creation (multiple ABI support)
	+ */
	+#define KTR_PROCCTOR 10
	+struct ktr_proc_ctor {
	+ u_int sv_flags; /* struct sysentvec sv_flags copy */
	+};
	+
	+/*
	+ * KTR_PROCDTOR - trace process destruction (multiple ABI support)
	+ */
	+#define KTR_PROCDTOR 11
	+
	+/*
	* KTR_DROP - If this bit is set in ktr_type, then at least one event
	* between the previous record and this record was dropped.
	*/
	#define KTR_DROP 0x8000

	/*
	* kernel trace points (in p_traceflag)
	*/
	#define KTRFAC_MASK 0x00ffffff
	#define KTRFAC_SYSCALL (1<<KTR_SYSCALL)
	#define KTRFAC_SYSRET (1<<KTR_SYSRET)
	#define KTRFAC_NAMEI (1<<KTR_NAMEI)
	#define KTRFAC_GENIO (1<<KTR_GENIO)
	#define KTRFAC_PSIG (1<<KTR_PSIG)
	#define KTRFAC_CSW (1<<KTR_CSW)
	#define KTRFAC_USER (1<<KTR_USER)
	#define KTRFAC_STRUCT (1<<KTR_STRUCT)
	#define KTRFAC_SYSCTL (1<<KTR_SYSCTL)
	+#define KTRFAC_PROCCTOR (1<<KTR_PROCCTOR)
	+#define KTRFAC_PROCDTOR (1<<KTR_PROCDTOR)

	/*
	* trace flags (also in p_traceflags)
	*/
	#define KTRFAC_ROOT 0x80000000 /* root set this trace */
	#define KTRFAC_INHERIT 0x40000000 /* pass trace flags to children */
	#define KTRFAC_DROP 0x20000000 /* last event was dropped */

	#ifdef _KERNEL
	void ktrnamei(char *);
	void ktrcsw(int, int);
	void ktrpsig(int, sig_t, sigset_t *, int);
	void ktrgenio(int, enum uio_rw, struct uio *, int);
	void ktrsyscall(int, int narg, register_t args[]);
	void ktrsysctl(int *name, u_int namelen);
	void ktrsysret(int, int, register_t);
	+void ktrprocctor(struct proc *);
	void ktrprocexec(struct proc *, struct ucred **, struct vnode **);
	void ktrprocexit(struct thread *);
	void ktrprocfork(struct proc *, struct proc *);
	void ktruserret(struct thread *);
	void ktrstruct(const char *, size_t, void *, size_t);
	#define ktrsockaddr(s) \
	ktrstruct("sockaddr", 8, (s), ((struct sockaddr *)(s))->sa_len)
	#define ktrstat(s) \
	ktrstruct("stat", 4, (s), sizeof(struct stat))

	#else

	#include <sys/cdefs.h>

	__BEGIN_DECLS
	int ktrace(const char *, int, int, pid_t);
	int utrace(const void *, size_t);
	__END_DECLS

	#endif

	#endif
	Index: stable/8/sys
	===================================================================
	--- stable/8/sys (revision 220261)
	+++ stable/8/sys (revision 220262)

	Property changes on: stable/8/sys
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /head/sys:r219041-219042,219311-219312

File Metadata

Mime Type: text/x-c
Expires: Tue, Oct 14, 10:33 PM (2 d)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 23695468
Default Alt Text: (100 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions