Page MenuHomeFreeBSD

No OneTemporary

Index: stable/8/sys/amd64/include/xen
===================================================================
--- stable/8/sys/amd64/include/xen (revision 220261)
+++ stable/8/sys/amd64/include/xen (revision 220262)
Property changes on: stable/8/sys/amd64/include/xen
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/amd64/include/xen:r219041-219042,219311-219312
Index: stable/8/sys/cddl/contrib/opensolaris
===================================================================
--- stable/8/sys/cddl/contrib/opensolaris (revision 220261)
+++ stable/8/sys/cddl/contrib/opensolaris (revision 220262)
Property changes on: stable/8/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/cddl/contrib/opensolaris:r219041-219042,219311-219312
Index: stable/8/sys/contrib/dev/acpica
===================================================================
--- stable/8/sys/contrib/dev/acpica (revision 220261)
+++ stable/8/sys/contrib/dev/acpica (revision 220262)
Property changes on: stable/8/sys/contrib/dev/acpica
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/contrib/dev/acpica:r219041-219042,219311-219312
Index: stable/8/sys/contrib/pf
===================================================================
--- stable/8/sys/contrib/pf (revision 220261)
+++ stable/8/sys/contrib/pf (revision 220262)
Property changes on: stable/8/sys/contrib/pf
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/contrib/pf:r219041-219042,219311-219312
Index: stable/8/sys/kern/kern_exec.c
===================================================================
--- stable/8/sys/kern/kern_exec.c (revision 220261)
+++ stable/8/sys/kern/kern_exec.c (revision 220262)
@@ -1,1422 +1,1428 @@
/*-
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_hwpmc_hooks.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/acct.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/wait.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/sf_buf.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#include <machine/reg.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
/*
 * Optional hook called by do_execve() with the exec'ing process when it is
 * non-NULL.
 */
dtrace_execexit_func_t dtrace_fasttrap_exec;
#endif
/*
 * Statically defined tracing probes fired from do_execve(): "exec" before
 * the image lookup, "exec_success" after the new image is set up and
 * "exec_failure" on the error path.
 */
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , exec, exec);
SDT_PROBE_ARGTYPE(proc, kernel, , exec, 0, "char *");
SDT_PROBE_DEFINE(proc, kernel, , exec_failure, exec-failure);
SDT_PROBE_ARGTYPE(proc, kernel, , exec_failure, 0, "int");
SDT_PROBE_DEFINE(proc, kernel, , exec_success, exec-success);
SDT_PROBE_ARGTYPE(proc, kernel, , exec_success, 0, "char *");
/* Malloc type labelled "proc-args" ("Process arguments"). */
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
/* Forward declarations for the file-local functions defined below. */
static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
static int do_execve(struct thread *td, struct image_args *args,
struct mac *mac_p);
static void exec_free_args(struct image_args *);
/*
 * Read-only sysctls reporting values taken from the calling process's
 * sysentvec; see the handlers below.
 */
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
NULL, 0, sysctl_kern_ps_strings, "LU", "");
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
NULL, 0, sysctl_kern_usrstack, "LU", "");
SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, sysctl_kern_stackprot, "I", "");
/*
 * do_execve() caches a process's argument strings only when their size
 * plus sizeof(struct pargs) does not exceed this limit.
 */
u_long ps_arg_cache_limit = PAGE_SIZE / 16;
SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
&ps_arg_cache_limit, 0, "");
/*
 * When zero, exec_new_vmspace() raises the minimum user address of the new
 * address space to at least PAGE_SIZE.
 */
static int map_at_zero = 0;
TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
"Permit processes to map an object at virtual address 0.");
/*
 * Sysctl handler for kern.ps_strings: copy out the ps_strings address
 * recorded in the calling process's sysentvec.  Requests flagged with
 * SCTL_MASK32 receive the value truncated to an unsigned int.
 */
static int
sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
{
	struct proc *self = curproc;

#ifdef SCTL_MASK32
	if ((req->flags & SCTL_MASK32) != 0) {
		unsigned int narrow;

		narrow = (unsigned int)self->p_sysent->sv_psstrings;
		return (SYSCTL_OUT(req, &narrow, sizeof(narrow)));
	}
#endif
	return (SYSCTL_OUT(req, &self->p_sysent->sv_psstrings,
	    sizeof(self->p_sysent->sv_psstrings)));
}
/*
 * Sysctl handler for kern.usrstack: copy out the user stack address
 * recorded in the calling process's sysentvec.  Requests flagged with
 * SCTL_MASK32 receive the value truncated to an unsigned int.
 */
static int
sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
{
	struct proc *self = curproc;

#ifdef SCTL_MASK32
	if ((req->flags & SCTL_MASK32) != 0) {
		unsigned int narrow;

		narrow = (unsigned int)self->p_sysent->sv_usrstack;
		return (SYSCTL_OUT(req, &narrow, sizeof(narrow)));
	}
#endif
	return (SYSCTL_OUT(req, &self->p_sysent->sv_usrstack,
	    sizeof(self->p_sysent->sv_usrstack)));
}
/*
 * Sysctl handler for kern.stackprot: copy out the stack protection value
 * recorded in the calling process's sysentvec.
 */
static int
sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
{
	struct proc *self = curproc;
	int rv;

	rv = SYSCTL_OUT(req, &self->p_sysent->sv_stackprot,
	    sizeof(self->p_sysent->sv_stackprot));
	return (rv);
}
/*
 * Table of registered image activators.  Each of the items is a pointer
 * to a `const struct execsw', hence the double pointer here.  The table is
 * NULL-terminated: do_execve() walks it until it reaches a NULL entry, and
 * exec_register() replaces it with a newly allocated, larger copy.
 */
static const struct execsw **execsw;
#ifndef _SYS_SYSPROTO_H_
struct execve_args {
char *fname;
char **argv;
char **envv;
};
#endif
int
execve(td, uap)
struct thread *td;
struct execve_args /* {
char *fname;
char **argv;
char **envv;
} */ *uap;
{
int error;
struct image_args args;
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, NULL);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct fexecve_args {
	int	fd;
	char	**argv;
	char	**envv;
};
#endif

/*
 * fexecve system call: execute the image referenced by an open file
 * descriptor instead of a path name.
 *
 * The argument and environment vectors are copied in from user space with
 * a NULL file name; the descriptor is then stored in the image_args so
 * that do_execve() resolves the vnode from it.  Returns 0 or an errno
 * value.
 */
int
fexecve(struct thread *td, struct fexecve_args *uap)
{
	struct image_args args;
	int error;

	/*
	 * The segment flag only selects how exec_copyin_args() copies the
	 * file name; with a NULL name it has no effect.
	 */
	error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
	    uap->argv, uap->envv);
	if (error == 0) {
		args.fd = uap->fd;
		error = kern_execve(td, &args, NULL);
	}
	return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct __mac_execve_args {
	char	*fname;
	char	**argv;
	char	**envv;
	struct mac	*mac_p;
};
#endif

/*
 * __mac_execve system call: like execve, but additionally passes the
 * caller-supplied MAC label pointer through to kern_execve().  Kernels
 * built without MAC return ENOSYS.
 */
int
__mac_execve(struct thread *td, struct __mac_execve_args *uap)
{
#ifdef MAC
	struct image_args iargs;
	int rc;

	rc = exec_copyin_args(&iargs, uap->fname, UIO_USERSPACE,
	    uap->argv, uap->envv);
	if (rc != 0)
		return (rc);
	return (kern_execve(td, &iargs, uap->mac_p));
#else
	return (ENOSYS);
#endif
}
/*
 * Common back end of the exec system calls.  Audits the argument and
 * environment strings, single-threads a process that has had threads,
 * runs do_execve() and then either finishes single-threading (success) or
 * backs out of it (failure).
 *
 * XXX: kern_execve has the astonishing property of not always returning to
 * the caller.  If sufficiently bad things happen during the call to
 * do_execve(), it can end up calling exit1(); as a result, callers must
 * avoid doing anything which they might need to undo (e.g., allocating
 * memory).
 */
int
kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
{
	struct proc *p;
	int rv;

	p = td->td_proc;

	AUDIT_ARG_ARGV(args->begin_argv, args->argc,
	    args->begin_envv - args->begin_argv);
	AUDIT_ARG_ENVV(args->begin_envv, args->envc,
	    args->endp - args->begin_envv);

	if ((p->p_flag & P_HADTHREADS) != 0) {
		PROC_LOCK(p);
		rv = thread_single(SINGLE_BOUNDARY);
		PROC_UNLOCK(p);
		if (rv != 0) {
			/* Could not single-thread; try again later. */
			exec_free_args(args);
			return (ERESTART);
		}
	}

	rv = do_execve(td, args, mac_p);

	if ((p->p_flag & P_HADTHREADS) == 0)
		return (rv);

	PROC_LOCK(p);
	if (rv != 0) {
		thread_single_end();
	} else {
		/*
		 * On success, upgrade to SINGLE_EXIT state to force the
		 * other threads to suicide.
		 */
		thread_single(SINGLE_EXIT);
	}
	PROC_UNLOCK(p);
	return (rv);
}
/*
 * In-kernel implementation of execve().
 *
 * td is the calling thread.  args describes the image to run together
 * with its argument and environment strings: args->fname is looked up as a
 * kernel-space path (UIO_SYSSPACE), and when it is NULL the image is taken
 * from the descriptor args->fd instead.  mac_p is handed to
 * mac_execve_enter() on MAC kernels.
 *
 * Returns 0 on success or an errno value.  args is always released with
 * exec_free_args() before returning.  If an error occurs after the old
 * address space has been destroyed the process is terminated with exit1()
 * and this function does not return.
 */
static int
do_execve(td, args, mac_p)
struct thread *td;
struct image_args *args;
struct mac *mac_p;
{
struct proc *p = td->td_proc;
struct nameidata nd;
struct ucred *newcred = NULL, *oldcred;
struct uidinfo *euip;
register_t *stack_base;
int error, i;
struct image_params image_params, *imgp;
struct vattr attr;
int (*img_first)(struct image_params *);
struct pargs *oldargs = NULL, *newargs = NULL;
struct sigacts *oldsigacts, *newsigacts;
#ifdef KTRACE
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
#endif
struct vnode *textvp = NULL, *binvp = NULL;
int credential_changing;
int vfslocked;
int textset;
#ifdef MAC
struct label *interpvplabel = NULL;
int will_transition;
#endif
#ifdef HWPMC_HOOKS
struct pmckern_procexec pe;
#endif
/* Fallback process name used when vn_commname() fails for fexecve. */
static const char fexecv_proc_title[] = "(fexecv)";
vfslocked = 0;
imgp = &image_params;
/*
* Lock the process and set the P_INEXEC flag to indicate that
* it should be left alone until we're done here. This is
* necessary to avoid race conditions - e.g. in ptrace() -
* that might allow a local user to illicitly obtain elevated
* privileges.
*/
PROC_LOCK(p);
KASSERT((p->p_flag & P_INEXEC) == 0,
("%s(): process already has P_INEXEC flag", __func__));
p->p_flag |= P_INEXEC;
PROC_UNLOCK(p);
/*
* Initialize part of the common data
*/
imgp->proc = p;
imgp->execlabel = NULL;
imgp->attr = &attr;
imgp->entry_addr = 0;
imgp->vmspace_destroyed = 0;
imgp->interpreted = 0;
imgp->opened = 0;
/* The interpreter name lives in the tail of the argument buffer. */
imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX;
imgp->auxargs = NULL;
imgp->vp = NULL;
imgp->object = NULL;
imgp->firstpage = NULL;
imgp->ps_strings = 0;
imgp->auxarg_size = 0;
imgp->args = args;
imgp->execpath = imgp->freepath = NULL;
imgp->execpathp = 0;
#ifdef MAC
error = mac_execve_enter(imgp, mac_p);
if (error)
goto exec_fail;
#endif
imgp->image_header = NULL;
/*
* Translate the file name. namei() returns a vnode pointer
* in ni_vp among other things.
*
* XXXAUDIT: It would be desirable to also audit the name of the
* interpreter if this is an interpreted binary.
*/
if (args->fname != NULL) {
NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
| MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
}
SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );
/*
* Re-entered via "goto interpret" when an activator reports that the
* image is interpreted and the interpreter must be loaded instead.
*/
interpret:
if (args->fname != NULL) {
error = namei(&nd);
if (error)
goto exec_fail;
vfslocked = NDHASGIANT(&nd);
binvp = nd.ni_vp;
imgp->vp = binvp;
} else {
/* No path: take the vnode from the descriptor and lock it. */
AUDIT_ARG_FD(args->fd);
error = fgetvp(td, args->fd, &binvp);
if (error)
goto exec_fail;
vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
AUDIT_ARG_VNODE1(binvp);
imgp->vp = binvp;
}
/*
* Check file permissions (also 'opens' file)
*/
error = exec_check_permissions(imgp);
if (error)
goto exec_fail_dealloc;
imgp->object = imgp->vp->v_object;
if (imgp->object != NULL)
vm_object_reference(imgp->object);
/*
* Set VV_TEXT now so no one can write to the executable while we're
* activating it.
*
* Remember if this was set before and unset it in case this is not
* actually an executable image.
*/
textset = imgp->vp->v_vflag & VV_TEXT;
imgp->vp->v_vflag |= VV_TEXT;
error = exec_map_first_page(imgp);
if (error)
goto exec_fail_dealloc;
imgp->proc->p_osrel = 0;
/*
* If the current process has a special image activator it
* wants to try first, call it. For example, emulating shell
* scripts differently.
*/
error = -1;
if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
error = img_first(imgp);
/*
* Loop through the list of image activators, calling each one.
* An activator returns -1 if there is no match, 0 on success,
* and an error otherwise.
*/
for (i = 0; error == -1 && execsw[i]; ++i) {
if (execsw[i]->ex_imgact == NULL ||
execsw[i]->ex_imgact == img_first) {
continue;
}
error = (*execsw[i]->ex_imgact)(imgp);
}
if (error) {
if (error == -1) {
/* No activator matched: restore VV_TEXT if we set it. */
if (textset == 0)
imgp->vp->v_vflag &= ~VV_TEXT;
error = ENOEXEC;
}
goto exec_fail_dealloc;
}
/*
* Special interpreter operation, cleanup and loop up to try to
* activate the interpreter.
*/
if (imgp->interpreted) {
exec_unmap_first_page(imgp);
/*
* VV_TEXT needs to be unset for scripts. There is a short
* period before we determine that something is a script where
* VV_TEXT will be set. The vnode lock is held over this
* entire period so nothing should illegitimately be blocked.
*/
imgp->vp->v_vflag &= ~VV_TEXT;
/* free name buffer and old vnode */
if (args->fname != NULL)
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
mac_execve_interpreter_enter(binvp, &interpvplabel);
#endif
if (imgp->opened) {
VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
imgp->opened = 0;
}
vput(binvp);
vm_object_deallocate(imgp->object);
imgp->object = NULL;
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
/* set new name to that of the interpreter */
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
UIO_SYSSPACE, imgp->interpreter_name, td);
args->fname = imgp->interpreter_name;
goto interpret;
}
/*
* NB: We unlock the vnode here because it is believed that none
* of the sv_copyout_strings/sv_fixup operations require the vnode.
*/
VOP_UNLOCK(imgp->vp, 0);
/*
* Do the best to calculate the full path to the image file.
* An absolute args->fname is used as is; otherwise it is the fallback
* when vn_fullpath() fails.
*/
if (imgp->auxargs != NULL &&
((args->fname != NULL && args->fname[0] == '/') ||
vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
imgp->execpath = args->fname;
/*
* Copy out strings (args and env) and initialize stack base
*/
if (p->p_sysent->sv_copyout_strings)
stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
else
stack_base = exec_copyout_strings(imgp);
/*
* If custom stack fixup routine present for this process
* let it do the stack setup.
* Else stuff argument count as first item on stack
*/
if (p->p_sysent->sv_fixup != NULL)
(*p->p_sysent->sv_fixup)(&stack_base, imgp);
else
suword(--stack_base, imgp->args->argc);
/*
* For security and other reasons, the file descriptor table cannot
* be shared after an exec.
*/
fdunshare(p, td);
/*
* Malloc things before we need locks.
*/
newcred = crget();
euip = uifind(attr.va_uid);
/* i is the size in bytes of the argument strings. */
i = imgp->args->begin_envv - imgp->args->begin_argv;
/* Cache arguments if they fit inside our allowance */
if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
newargs = pargs_alloc(i);
bcopy(imgp->args->begin_argv, newargs->ar_args, i);
}
/* close files on exec */
fdcloseexec(td);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
/* Get a reference to the vnode prior to locking the proc */
VREF(binvp);
/*
* For security and other reasons, signal handlers cannot
* be shared after an exec. The new process gets a copy of the old
* handlers. In execsigs(), the new process will have its signals
* reset.
*/
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
if (sigacts_shared(p->p_sigacts)) {
oldsigacts = p->p_sigacts;
PROC_UNLOCK(p);
newsigacts = sigacts_alloc();
sigacts_copy(newsigacts, oldsigacts);
PROC_LOCK(p);
p->p_sigacts = newsigacts;
} else
oldsigacts = NULL;
/* Stop profiling */
stopprofclock(p);
/* reset caught signals */
execsigs(p);
/* name this process - nameiexec(p, ndp) */
bzero(p->p_comm, sizeof(p->p_comm));
if (args->fname)
bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
/*
* mark as execed, wakeup the process that vforked (if any) and tell
* it that it now has its own resources back
*/
p->p_flag |= P_EXEC;
if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
p->p_flag &= ~P_PPWAIT;
cv_broadcast(&p->p_pwait);
}
/*
* Implement image setuid/setgid.
*
* Don't honor setuid/setgid if the filesystem prohibits it or if
* the process is being traced.
*
* XXXMAC: For the time being, use NOSUID to also prohibit
* transitions on the file system.
*/
credential_changing = 0;
credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
attr.va_uid;
credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
attr.va_gid;
#ifdef MAC
will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
interpvplabel, imgp);
credential_changing |= will_transition;
#endif
if (credential_changing &&
(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
(p->p_flag & P_TRACED) == 0) {
/*
* Turn off syscall tracing for set-id programs, except for
* root. Record any set-id flags first to make sure that
* we do not regain any tracing during a possible block.
*/
setsugid(p);
#ifdef KTRACE
if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0))
ktrprocexec(p, &tracecred, &tracevp);
#endif
/*
* Close any file descriptors 0..2 that reference procfs,
* then make sure file descriptors 0..2 are in use.
*
* setugidsafety() may call closef() and then pfind()
* which may grab the process lock.
* fdcheckstd() may call falloc() which may block to
* allocate memory, so temporarily drop the process lock.
*/
PROC_UNLOCK(p);
VOP_UNLOCK(imgp->vp, 0);
setugidsafety(td);
error = fdcheckstd(td);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
/*
* On failure jump to the cleanup code with the vnode locked and
* the process lock still dropped.
*/
if (error != 0)
goto done1;
PROC_LOCK(p);
/*
* Set the new credentials.
*/
if (attr.va_mode & S_ISUID)
change_euid(newcred, euip);
if (attr.va_mode & S_ISGID)
change_egid(newcred, attr.va_gid);
#ifdef MAC
if (will_transition) {
mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
interpvplabel, imgp);
}
#endif
/*
* Implement correct POSIX saved-id behavior.
*
* XXXMAC: Note that the current logic will save the
* uid and gid if a MAC domain transition occurs, even
* though maybe it shouldn't.
*/
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
/* newcred is now owned by the process; NULL marks it as used. */
p->p_ucred = newcred;
newcred = NULL;
} else {
if (oldcred->cr_uid == oldcred->cr_ruid &&
oldcred->cr_gid == oldcred->cr_rgid)
p->p_flag &= ~P_SUGID;
/*
* Implement correct POSIX saved-id behavior.
*
* XXX: It's not clear that the existing behavior is
* POSIX-compliant. A number of sources indicate that the
* saved uid/gid should only be updated if the new ruid is
* not equal to the old ruid, or the new euid is not equal
* to the old euid and the new euid is not equal to the old
* ruid. The FreeBSD code always updates the saved uid/gid.
* Also, this code uses the new (replaced) euid and egid as
* the source, which may or may not be the right ones to use.
*/
if (oldcred->cr_svuid != oldcred->cr_uid ||
oldcred->cr_svgid != oldcred->cr_gid) {
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
}
}
/*
* Store the vp for use in procfs. This vnode was referenced prior
* to locking the proc lock.
*/
textvp = p->p_textvp;
p->p_textvp = binvp;
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the exec if it
* has declared an interest.
*/
if (dtrace_fasttrap_exec)
dtrace_fasttrap_exec(p);
#endif
/*
* Notify others that we exec'd, and clear the P_INEXEC flag
* as we're now a bona fide freshly-execed process.
*/
KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
p->p_flag &= ~P_INEXEC;
/*
* If tracing the process, trap to debugger so breakpoints
* can be set before the program executes.
* Use tdsignal to deliver signal to current thread, use
* psignal may cause the signal to be delivered to wrong thread
* because that thread will exit, remember we are going to enter
* single thread mode.
*/
if (p->p_flag & P_TRACED)
tdsignal(p, td, SIGTRAP, NULL);
/* clear "fork but no exec" flag, as we _are_ execing */
p->p_acflag &= ~AFORK;
/*
* Free any previous argument cache and replace it with
* the new argument cache, if any.
*/
oldargs = p->p_args;
p->p_args = newargs;
newargs = NULL;
#ifdef HWPMC_HOOKS
/*
* Check if system-wide sampling is in effect or if the
* current process is using PMCs. If so, do exec() time
* processing. This processing needs to happen AFTER the
* P_INEXEC flag is cleared.
*
* The proc lock needs to be released before taking the PMC
* SX.
*/
if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
PROC_UNLOCK(p);
VOP_UNLOCK(imgp->vp, 0);
pe.pm_credentialschanged = credential_changing;
pe.pm_entryaddr = imgp->entry_addr;
PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
} else
PROC_UNLOCK(p);
#else /* !HWPMC_HOOKS */
PROC_UNLOCK(p);
#endif
/* Set values passed into the program in registers. */
if (p->p_sysent->sv_setregs)
(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
(u_long)(uintptr_t)stack_base, imgp->ps_strings);
else
exec_setregs(td, imgp->entry_addr,
(u_long)(uintptr_t)stack_base, imgp->ps_strings);
vfs_mark_atime(imgp->vp, td->td_ucred);
SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
/*
* Common cleanup for the success path and for a fdcheckstd() failure.
* Entered with the image vnode locked and the process lock not held.
*/
done1:
/*
* Free any resources malloc'd earlier that we didn't use.
*/
uifree(euip);
/* newcred == NULL means it was installed, so drop the old one. */
if (newcred == NULL)
crfree(oldcred);
else
crfree(newcred);
VOP_UNLOCK(imgp->vp, 0);
/*
* Handle deferred decrement of ref counts.
*/
if (textvp != NULL) {
int tvfslocked;
tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
vrele(textvp);
VFS_UNLOCK_GIANT(tvfslocked);
}
/* On failure, drop the reference taken with VREF() above. */
if (binvp && error != 0)
vrele(binvp);
#ifdef KTRACE
if (tracevp != NULL) {
int tvfslocked;
tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
VFS_UNLOCK_GIANT(tvfslocked);
}
if (tracecred != NULL)
crfree(tracecred);
#endif
/* Re-lock the vnode: the shared cleanup below releases it with vput(). */
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
pargs_drop(oldargs);
pargs_drop(newargs);
if (oldsigacts != NULL)
sigacts_free(oldsigacts);
/*
* Cleanup shared by success and by failures that occur once the image
* vnode has been obtained.
*/
exec_fail_dealloc:
/*
* free various allocated resources
*/
if (imgp->firstpage != NULL)
exec_unmap_first_page(imgp);
if (imgp->vp != NULL) {
if (args->fname)
NDFREE(&nd, NDF_ONLY_PNBUF);
if (imgp->opened)
VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
vput(imgp->vp);
}
if (imgp->object != NULL)
vm_object_deallocate(imgp->object);
free(imgp->freepath, M_TEMP);
if (error == 0) {
PROC_LOCK(p);
td->td_dbgflags |= TDB_EXEC;
PROC_UNLOCK(p);
/*
* Stop the process here if its stop event mask has
* the S_EXEC bit set.
*/
STOPEVENT(p, S_EXEC, 0);
goto done2;
}
/* Failure exit; also reached directly when no vnode was obtained. */
exec_fail:
/* we're done here, clear P_INEXEC */
PROC_LOCK(p);
p->p_flag &= ~P_INEXEC;
PROC_UNLOCK(p);
SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
/* Final teardown common to success and failure. */
done2:
#ifdef MAC
mac_execve_exit(imgp);
mac_execve_interpreter_exit(interpvplabel);
#endif
VFS_UNLOCK_GIANT(vfslocked);
exec_free_args(args);
if (error && imgp->vmspace_destroyed) {
/* sorry, no more process anymore. exit gracefully */
exit1(td, W_EXITCODE(0, SIGABRT));
/* NOT REACHED */
}
/*
* NOTE(review): the leading '+' on the following lines is a diff marker
* from the changeset view this text was taken from, not C syntax.
*/
+
+#ifdef KTRACE
+ if (error == 0)
+ ktrprocctor(p);
+#endif
+
return (error);
}
/*
 * Bring the first page of the image's VM object into memory and map it
 * into kernel address space so that the image header can be examined.
 *
 * Any previously mapped first page is unmapped first.  On success the page
 * is held, imgp->firstpage refers to the sf_buf that maps it and
 * imgp->image_header points at its kernel virtual address.
 *
 * Returns 0 on success, EACCES if the vnode has no VM object, or EIO if
 * the page could not be read.
 */
int
exec_map_first_page(imgp)
struct image_params *imgp;
{
int rv, i;
int initial_pagein;
vm_page_t ma[VM_INITIAL_PAGEIN];
vm_object_t object;
if (imgp->firstpage != NULL)
exec_unmap_first_page(imgp);
object = imgp->vp->v_object;
if (object == NULL)
return (EACCES);
VM_OBJECT_LOCK(object);
#if VM_NRESERVLEVEL > 0
if ((object->flags & OBJ_COLORED) == 0) {
object->flags |= OBJ_COLORED;
object->pg_color = 0;
}
#endif
ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
if (ma[0]->valid != VM_PAGE_BITS_ALL) {
/*
* Page 0 is not fully valid.  Collect up to VM_INITIAL_PAGEIN
* consecutive pages (capped at the object size) to read in one
* pager request, stopping at the first page that already has valid
* data, is busy, or cannot be allocated.
*/
initial_pagein = VM_INITIAL_PAGEIN;
if (initial_pagein > object->size)
initial_pagein = object->size;
for (i = 1; i < initial_pagein; i++) {
if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
if (ma[i]->valid)
break;
if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
break;
vm_page_busy(ma[i]);
} else {
ma[i] = vm_page_alloc(object, i,
VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
if (ma[i] == NULL)
break;
}
}
/* i is now the number of pages gathered in ma[]. */
initial_pagein = i;
rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
/* Look page 0 up again after the pager call. */
ma[0] = vm_page_lookup(object, 0);
if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) {
if (ma[0]) {
vm_page_lock_queues();
vm_page_free(ma[0]);
vm_page_unlock_queues();
}
VM_OBJECT_UNLOCK(object);
return (EIO);
}
}
/* Hold the page; exec_unmap_first_page() drops the hold. */
vm_page_lock_queues();
vm_page_hold(ma[0]);
vm_page_unlock_queues();
vm_page_wakeup(ma[0]);
VM_OBJECT_UNLOCK(object);
imgp->firstpage = sf_buf_alloc(ma[0], 0);
imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
return (0);
}
/*
 * Undo exec_map_first_page(): release the sf_buf mapping of the image's
 * first page, clear imgp->firstpage and drop the hold on the page.  Does
 * nothing when no first page is mapped.
 */
void
exec_unmap_first_page(struct image_params *imgp)
{
	vm_page_t pg;

	if (imgp->firstpage == NULL)
		return;

	pg = sf_buf_page(imgp->firstpage);
	sf_buf_free(imgp->firstpage);
	imgp->firstpage = NULL;

	vm_page_lock_queues();
	vm_page_unhold(pg);
	vm_page_unlock_queues();
}
/*
 * Destroy old address space, and allocate a new stack
 * The new stack is only SGROWSIZ large because it is grown
 * automatically in trap.c.
 *
 * imgp is the image being activated and sv the sysentvec describing the
 * new image's address space limits and stack.  Returns 0 on success or
 * the error from vmspace_exec() or vm_map_stack().
 */
int
exec_new_vmspace(imgp, sv)
struct image_params *imgp;
struct sysentvec *sv;
{
int error;
struct proc *p = imgp->proc;
struct vmspace *vmspace = p->p_vmspace;
vm_offset_t sv_minuser, stack_addr;
vm_map_t map;
u_long ssiz;
/*
* Record up front that the old address space is gone; do_execve()
* terminates the process if an error occurs once this is set.
*/
imgp->vmspace_destroyed = 1;
imgp->sysent = sv;
/* May be called with Giant held */
EVENTHANDLER_INVOKE(process_exec, p, imgp);
/*
* Blow away entire process VM, if address space not shared,
* otherwise, create a new VM space so that other threads are
* not disrupted
*/
map = &vmspace->vm_map;
/* Unless map_at_zero is set, keep the first page unmappable. */
if (map_at_zero)
sv_minuser = sv->sv_minuser;
else
sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
/*
* The existing vmspace is reused only when it is unshared and its
* bounds already match those required by the new image.
*/
if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
vm_map_max(map) == sv->sv_maxuser) {
shmexit(vmspace);
pmap_remove_pages(vmspace_pmap(vmspace));
vm_map_remove(map, vm_map_min(map), vm_map_max(map));
} else {
error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
if (error)
return (error);
/* vmspace_exec() installed a new vmspace; refresh the pointers. */
vmspace = p->p_vmspace;
map = &vmspace->vm_map;
}
/* Allocate a new stack */
if (sv->sv_maxssiz != NULL)
ssiz = *sv->sv_maxssiz;
else
ssiz = maxssiz;
stack_addr = sv->sv_usrstack - ssiz;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
if (error)
return (error);
#ifdef __ia64__
/* Allocate a new register stack */
stack_addr = IA64_BACKINGSTORE;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
if (error)
return (error);
#endif
/*
* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
* VM_STACK case, but they are still used to monitor the size of the
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
return (0);
}
/*
* Copy out argument and environment strings from the old process address
* space into the temporary string buffer.
*/
int
exec_copyin_args(struct image_args *args, char *fname,
enum uio_seg segflg, char **argv, char **envv)
{
char *argp, *envp;
int error;
size_t length;
bzero(args, sizeof(*args));
if (argv == NULL)
return (EFAULT);
/*
* Allocate temporary demand zeroed space for argument and
* environment strings:
*
* o ARG_MAX for argument and environment;
* o MAXSHELLCMDLEN for the name of interpreters.
*/
args->buf = (char *) kmem_alloc_wait(exec_map,
PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
if (args->buf == NULL)
return (ENOMEM);
args->begin_argv = args->buf;
args->endp = args->begin_argv;
args->stringspace = ARG_MAX;
/*
* Copy the file name.
*/
if (fname != NULL) {
args->fname = args->buf + ARG_MAX;
error = (segflg == UIO_SYSSPACE) ?
copystr(fname, args->fname, PATH_MAX, &length) :
copyinstr(fname, args->fname, PATH_MAX, &length);
if (error != 0)
goto err_exit;
} else
args->fname = NULL;
/*
* extract arguments first
*/
while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
if (argp == (caddr_t) -1) {
error = EFAULT;
goto err_exit;
}
if ((error = copyinstr(argp, args->endp,
args->stringspace, &length))) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->argc++;
}
args->begin_envv = args->endp;
/*
* extract environment strings
*/
if (envv) {
while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
if (envp == (caddr_t)-1) {
error = EFAULT;
goto err_exit;
}
if ((error = copyinstr(envp, args->endp,
args->stringspace, &length))) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->envc++;
}
}
return (0);
err_exit:
exec_free_args(args);
return (error);
}
/*
 * Release the temporary string buffer allocated by exec_copyin_args() and
 * clear args->buf so that a repeated call is harmless.
 */
static void
exec_free_args(struct image_args *args)
{
	if (args->buf == NULL)
		return;

	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
	args->buf = NULL;
}
/*
 * Copy strings out to the new process address space, constructing new arg
 * and env vector tables. Return a pointer to the base so that it can be used
 * as the initial stack pointer.
 *
 * Working downward from the ps_strings address, the code places the signal
 * trampoline (if any), the executable path (if known and auxargs are
 * present), the argument/environment strings and finally the pointer
 * vectors.  The ps_strings structure is filled in along the way.  The
 * results of copyout() and suword() are not checked here.
 */
register_t *
exec_copyout_strings(imgp)
struct image_params *imgp;
{
int argc, envc;
char **vectp;
char *stringp, *destp;
register_t *stack_base;
struct ps_strings *arginfo;
struct proc *p;
size_t execpath_len;
int szsigcode;
/*
* Calculate string base and vector table pointers.
* Also deal with signal trampoline code for this exec type.
*/
if (imgp->execpath != NULL && imgp->auxargs != NULL)
execpath_len = strlen(imgp->execpath) + 1;
else
execpath_len = 0;
p = imgp->proc;
szsigcode = 0;
arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
if (p->p_sysent->sv_szsigcode != NULL)
szsigcode = *(p->p_sysent->sv_szsigcode);
/*
* destp is where the strings will start.  ARG_MAX - stringspace is the
* number of string bytes gathered by exec_copyin_args().
*/
destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
roundup(execpath_len, sizeof(char *)) -
roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
/*
* install sigcode
*/
if (szsigcode)
copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
szsigcode), szsigcode);
/*
* Copy the image path for the rtld.
*/
if (execpath_len != 0) {
imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
copyout(imgp->execpath, (void *)imgp->execpathp,
execpath_len);
}
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
*/
if (imgp->auxargs) {
/*
* 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
* lower compatibility.
*/
imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
(AT_COUNT * 2);
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets,and imgp->auxarg_size is room
* for argument of Runtime loader.
*/
vectp = (char **)(destp - (imgp->args->argc +
imgp->args->envc + 2 + imgp->auxarg_size + execpath_len) *
sizeof(char *));
} else {
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
sizeof(char *));
}
/*
* vectp also becomes our initial stack base
*/
stack_base = (register_t *)vectp;
stringp = imgp->args->begin_argv;
argc = imgp->args->argc;
envc = imgp->args->envc;
/*
* Copy out strings - arguments and environment.
*/
copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
*/
suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
suword(&arginfo->ps_nargvstr, argc);
/*
* Fill in argument portion of vector table.
* stringp walks the kernel copy of the strings to find each terminating
* NUL while destp tracks the matching user-space address.
*/
for (; argc > 0; --argc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* a null vector table pointer separates the argp's from the envp's */
suword(vectp++, 0);
suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
suword(&arginfo->ps_nenvstr, envc);
/*
* Fill in environment portion of vector table.
*/
for (; envc > 0; --envc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* end of vector table is a null pointer */
suword(vectp, 0);
return (stack_base);
}
/*
 * Decide whether the current thread may execute the image in imgp->vp and,
 * if so, open it for reading.
 *
 * Called with imgp->vp locked.  The file attributes are fetched into
 * imgp->attr as a side effect, and imgp->opened is set once VOP_OPEN()
 * succeeds.  Returns 0 for success or an error code on failure.
 */
int
exec_check_permissions(struct image_params *imgp)
{
	struct vnode *execvp = imgp->vp;
	struct vattr *va = imgp->attr;
	struct thread *curtd = curthread;
	int rv;

	/* Get file attributes. */
	rv = VOP_GETATTR(execvp, va, curtd->td_ucred);
	if (rv != 0)
		return (rv);

#ifdef MAC
	rv = mac_vnode_check_exec(curtd->td_ucred, imgp->vp, imgp);
	if (rv != 0)
		return (rv);
#endif

	/* File execution must not be disabled for this filesystem. */
	if ((execvp->v_mount->mnt_flag & MNT_NOEXEC) != 0)
		return (EACCES);
	/*
	 * At least one execute bit must be on - otherwise root would
	 * always succeed, and we don't want that to happen unless the
	 * file really is executable.
	 */
	if ((va->va_mode & 0111) == 0)
		return (EACCES);
	/* Only regular files can be executed. */
	if (va->va_type != VREG)
		return (EACCES);

	/* Zero length files can't be exec'd. */
	if (va->va_size == 0)
		return (ENOEXEC);

	/* Execute permission is judged against the current credentials. */
	rv = VOP_ACCESS(execvp, VEXEC, curtd->td_ucred, curtd);
	if (rv != 0)
		return (rv);

	/* Deny execution while the file is open for writing. */
	if (execvp->v_writecount != 0)
		return (ETXTBSY);

	/*
	 * Call the filesystem specific open routine (which does nothing in
	 * the general case).
	 */
	rv = VOP_OPEN(execvp, FREAD, curtd->td_ucred, curtd, NULL);
	if (rv != 0)
		return (rv);
	imgp->opened = 1;
	return (0);
}
/*
* Exec handler registration
*/
/*
 * Add an image activator to the handler table.
 *
 * The table (execsw) is a NULL-terminated array of handler pointers.  A new
 * array with room for one more entry is allocated, the existing entries are
 * copied over, execsw_arg is appended, and the old array is released.
 *
 * Always returns 0: the allocation uses M_WAITOK, which sleeps until memory
 * is available instead of returning NULL, so there is no failure path.
 */
int
exec_register(const struct execsw *execsw_arg)
{
	const struct execsw **es, **xs, **newexecsw;
	int count = 2;	/* New slot and trailing NULL */

	if (execsw != NULL)
		for (es = execsw; *es != NULL; es++)
			count++;

	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);

	/* Copy the existing handlers, then append the new one. */
	xs = newexecsw;
	if (execsw != NULL)
		for (es = execsw; *es != NULL; es++)
			*xs++ = *es;
	*xs++ = execsw_arg;
	*xs = NULL;

	if (execsw != NULL)
		free(execsw, M_TEMP);
	execsw = newexecsw;
	return (0);
}
/*
 * Remove an image activator from the handler table.
 *
 * Panics if the table has never been allocated.  Returns ENOENT when
 * execsw_arg is not registered.  Otherwise a new NULL-terminated array
 * without execsw_arg is built, the old array is freed, and 0 is returned.
 *
 * The allocation uses M_WAITOK, which sleeps until memory is available
 * instead of returning NULL, so no ENOMEM path is needed.
 */
int
exec_unregister(const struct execsw *execsw_arg)
{
	const struct execsw **es, **xs, **newexecsw;
	int count = 1;	/* Trailing NULL */

	if (execsw == NULL)
		panic("unregister with no handlers left?\n");

	/* Make sure the handler is actually registered. */
	for (es = execsw; *es != NULL; es++) {
		if (*es == execsw_arg)
			break;
	}
	if (*es == NULL)
		return (ENOENT);

	/* Count the entries that survive the removal. */
	for (es = execsw; *es != NULL; es++)
		if (*es != execsw_arg)
			count++;

	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);

	xs = newexecsw;
	for (es = execsw; *es != NULL; es++)
		if (*es != execsw_arg)
			*xs++ = *es;
	*xs = NULL;

	/* execsw is known to be non-NULL here (checked on entry). */
	free(execsw, M_TEMP);
	execsw = newexecsw;
	return (0);
}
Index: stable/8/sys/kern/kern_fork.c
===================================================================
--- stable/8/sys/kern/kern_fork.c (revision 220261)
+++ stable/8/sys/kern/kern_fork.c (revision 220262)
@@ -1,928 +1,928 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_kstack_pages.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/sdt.h>
#include <sys/sx.h>
#include <sys/signalvar.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_fork_func_t dtrace_fasttrap_fork;
#endif
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , create, create);
SDT_PROBE_ARGTYPE(proc, kernel, , create, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , create, 1, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , create, 2, "int");
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument structure for fork(), declared here only when
 * _SYS_SYSPROTO_H_ is not defined.  fork() below does not read any
 * member of it; the single field is a placeholder.
 */
struct fork_args {
int dummy;
};
#endif
/* ARGSUSED */
int
fork(td, uap)
struct thread *td;
struct fork_args *uap;
{
int error;
struct proc *p2;
error = fork1(td, RFFDG | RFPROC, 0, &p2);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
}
return (error);
}
/* ARGSUSED */
int
vfork(td, uap)
struct thread *td;
struct vfork_args *uap;
{
int error, flags;
struct proc *p2;
#ifdef XEN
flags = RFFDG | RFPROC; /* validate that this is still an issue */
#else
flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
#endif
error = fork1(td, flags, 0, &p2);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
}
return (error);
}
int
rfork(td, uap)
struct thread *td;
struct rfork_args *uap;
{
struct proc *p2;
int error;
/* Don't allow kernel-only flags. */
if ((uap->flags & RFKERNELONLY) != 0)
return (EINVAL);
AUDIT_ARG_FFLAGS(uap->flags);
error = fork1(td, uap->flags, 0, &p2);
if (error == 0) {
td->td_retval[0] = p2 ? p2->p_pid : 0;
td->td_retval[1] = 0;
}
return (error);
}
int nprocs = 1; /* process 0 */
int lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
"Last used PID");
/*
* Random component to lastpid generation. We mix in a random factor to make
* it a little harder to predict. We sanity check the modulus value to avoid
* doing it in critical paths. Don't let it be too small or we pointlessly
* waste randomness entropy, and don't let it be impossibly large. Using a
* modulus that is too big causes a LOT more process table scans and slows
* down fork processing as the pidchecked caching is defeated.
*/
static int randompid = 0;
static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
int error, pid;
error = sysctl_wire_old_buffer(req, sizeof(int));
if (error != 0)
return(error);
sx_xlock(&allproc_lock);
pid = randompid;
error = sysctl_handle_int(oidp, &pid, 0, req);
if (error == 0 && req->newptr != NULL) {
if (pid < 0 || pid > PID_MAX - 100) /* out of range */
pid = PID_MAX - 100;
else if (pid < 2) /* NOP */
pid = 0;
else if (pid < 100) /* Make it reasonable */
pid = 100;
randompid = pid;
}
sx_xunlock(&allproc_lock);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
/*
 * Common back end for fork(), vfork() and rfork().
 *
 * td    - the calling (parent) thread.
 * flags - RF* flags selecting what is shared, copied or cleared.
 * pages - kernel stack size, in pages, for the child's first thread;
 *         0 selects KSTACK_PAGES.
 * procp - out: the new process on success, or NULL when RFPROC was not
 *         requested (no new process is created in that case).
 *
 * Returns 0 on success or an errno value.  EINVAL, ERESTART, ENOMEM and
 * EAGAIN are produced directly in this function; the non-RFPROC path also
 * returns whatever vm_forkproc() reports.
 */
int
fork1(td, flags, pages, procp)
struct thread *td;
int flags;
int pages;
struct proc **procp;
{
struct proc *p1, *p2, *pptr;
struct proc *newproc;
int ok, p2_held, trypid;
static int curfail, pidchecked = 0;
static struct timeval lastfail;
struct filedesc *fd;
struct filedesc_to_leader *fdtol;
struct thread *td2;
struct sigacts *newsigacts;
struct vmspace *vm2;
vm_ooffset_t mem_charged;
int error;
/* Can't copy and clear. */
if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
return (EINVAL);
p2_held = 0;
p1 = td->td_proc;
/*
* Here we don't create a new process, but we divorce
* certain parts of a process from itself.
*/
if ((flags & RFPROC) == 0) {
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
if (thread_single(SINGLE_BOUNDARY)) {
PROC_UNLOCK(p1);
return (ERESTART);
}
PROC_UNLOCK(p1);
}
error = vm_forkproc(td, NULL, NULL, NULL, flags);
if (error)
goto norfproc_fail;
/*
* Close all file descriptors.
*/
if (flags & RFCFDG) {
struct filedesc *fdtmp;
fdtmp = fdinit(td->td_proc->p_fd);
fdfree(td);
p1->p_fd = fdtmp;
}
/*
* Unshare file descriptors (from parent).
*/
if (flags & RFFDG)
fdunshare(p1, td);
/* Both the success and the failure case of this path exit through here. */
norfproc_fail:
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
thread_single_end();
PROC_UNLOCK(p1);
}
*procp = NULL;
return (error);
}
/*
* XXX
* We did have single-threading code here
* however it proved un-needed and caused problems
*/
mem_charged = 0;
vm2 = NULL;
if (pages == 0)
pages = KSTACK_PAGES;
/* Allocate new proc. */
newproc = uma_zalloc(proc_zone, M_WAITOK);
td2 = FIRST_THREAD_IN_PROC(newproc);
if (td2 == NULL) {
td2 = thread_alloc(pages);
if (td2 == NULL) {
error = ENOMEM;
goto fail1;
}
proc_linkup(newproc, td2);
} else {
/* Replace the existing kernel stack if it is missing or has the wrong size. */
if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
if (td2->td_kstack != 0)
vm_thread_dispose(td2);
if (!thread_alloc_stack(td2, pages)) {
error = ENOMEM;
goto fail1;
}
}
}
if ((flags & RFMEM) == 0) {
vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
if (vm2 == NULL) {
error = ENOMEM;
goto fail1;
}
if (!swap_reserve(mem_charged)) {
/*
* The swap reservation failed. The accounting
* from the entries of the copied vm2 will be
* subtracted in vmspace_free(), so force the
* reservation there.
*/
swap_reserve_force(mem_charged);
error = ENOMEM;
goto fail1;
}
} else
vm2 = NULL;
#ifdef MAC
mac_proc_init(newproc);
#endif
knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
STAILQ_INIT(&newproc->p_ktr);
/* We have to lock the process tree while we look for a pid. */
sx_slock(&proctree_lock);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create. Don't allow
* a nonprivileged user to use the last ten processes; don't let root
* exceed the limit. The variable nprocs is the current number of
* processes, maxproc is the limit.
*/
sx_xlock(&allproc_lock);
if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
error = EAGAIN;
goto fail;
}
/*
* Increment the count of procs running with this uid. Don't allow
* a nonprivileged user to exceed their current limit.
*
* XXXRW: Can we avoid privilege here if it's not needed?
*/
error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
if (error == 0)
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
else {
PROC_LOCK(p1);
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
lim_cur(p1, RLIMIT_NPROC));
PROC_UNLOCK(p1);
}
if (!ok) {
error = EAGAIN;
goto fail;
}
/*
* Increment the nprocs resource before blocking can occur. There
* are hard-limits as to the number of processes that can run.
*/
nprocs++;
/*
* Find an unused process ID. We remember a range of unused IDs
* ready to use (from lastpid+1 through pidchecked-1).
*
* If RFHIGHPID is set (used during system boot), do not allocate
* low-numbered pids.
*/
trypid = lastpid + 1;
if (flags & RFHIGHPID) {
if (trypid < 10)
trypid = 10;
} else {
if (randompid)
trypid += arc4random() % randompid;
}
retry:
/*
* If the process ID prototype has wrapped around,
* restart somewhat above 0, as the low-numbered procs
* tend to include daemons that don't exit.
*/
if (trypid >= PID_MAX) {
trypid = trypid % PID_MAX;
if (trypid < 100)
trypid += 100;
pidchecked = 0;
}
if (trypid >= pidchecked) {
int doingzomb = 0;
pidchecked = PID_MAX;
/*
* Scan the active and zombie procs to check whether this pid
* is in use. Remember the lowest pid that's greater
* than trypid, so we can avoid checking for a while.
*/
p2 = LIST_FIRST(&allproc);
again:
for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
/* A candidate also collides if it equals a process group or session id. */
while (p2->p_pid == trypid ||
(p2->p_pgrp != NULL &&
(p2->p_pgrp->pg_id == trypid ||
(p2->p_session != NULL &&
p2->p_session->s_sid == trypid)))) {
trypid++;
if (trypid >= pidchecked)
goto retry;
}
if (p2->p_pid > trypid && pidchecked > p2->p_pid)
pidchecked = p2->p_pid;
if (p2->p_pgrp != NULL) {
if (p2->p_pgrp->pg_id > trypid &&
pidchecked > p2->p_pgrp->pg_id)
pidchecked = p2->p_pgrp->pg_id;
if (p2->p_session != NULL &&
p2->p_session->s_sid > trypid &&
pidchecked > p2->p_session->s_sid)
pidchecked = p2->p_session->s_sid;
}
}
if (!doingzomb) {
doingzomb = 1;
p2 = LIST_FIRST(&zombproc);
goto again;
}
}
sx_sunlock(&proctree_lock);
/*
* RFHIGHPID does not mess with the lastpid counter during boot.
*/
if (flags & RFHIGHPID)
pidchecked = 0;
else
lastpid = trypid;
p2 = newproc;
p2->p_state = PRS_NEW; /* protect against others */
p2->p_pid = trypid;
AUDIT_ARG_PID(p2->p_pid);
LIST_INSERT_HEAD(&allproc, p2, p_list);
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
PROC_LOCK(p2);
PROC_LOCK(p1);
sx_xunlock(&allproc_lock);
bcopy(&p1->p_startcopy, &p2->p_startcopy,
__rangeof(struct proc, p_startcopy, p_endcopy));
pargs_hold(p2->p_args);
PROC_UNLOCK(p1);
bzero(&p2->p_startzero,
__rangeof(struct proc, p_startzero, p_endzero));
p2->p_ucred = crhold(td->td_ucred);
/* Tell the prison that we exist. */
prison_proc_hold(p2->p_ucred->cr_prison);
PROC_UNLOCK(p2);
/*
* Malloc things while we don't hold any locks.
*/
if (flags & RFSIGSHARE)
newsigacts = NULL;
else
newsigacts = sigacts_alloc();
/*
* Copy filedesc.
*/
if (flags & RFCFDG) {
fd = fdinit(p1->p_fd);
fdtol = NULL;
} else if (flags & RFFDG) {
fd = fdcopy(p1->p_fd);
fdtol = NULL;
} else {
fd = fdshare(p1->p_fd);
if (p1->p_fdtol == NULL)
p1->p_fdtol =
filedesc_to_leader_alloc(NULL,
NULL,
p1->p_leader);
if ((flags & RFTHREAD) != 0) {
/*
* Shared file descriptor table and
* shared process leaders.
*/
fdtol = p1->p_fdtol;
FILEDESC_XLOCK(p1->p_fd);
fdtol->fdl_refcount++;
FILEDESC_XUNLOCK(p1->p_fd);
} else {
/*
* Shared file descriptor table, and
* different process leaders
*/
fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
p1->p_fd,
p2);
}
}
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
PROC_LOCK(p2);
PROC_LOCK(p1);
bzero(&td2->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
bzero(&td2->td_rux, sizeof(td2->td_rux));
td2->td_map_def_user = NULL;
td2->td_dbg_forked = 0;
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
td2->td_sigstk = td->td_sigstk;
td2->td_sigmask = td->td_sigmask;
td2->td_flags = TDF_INMEM;
#ifdef VIMAGE
td2->td_vnet = NULL;
td2->td_vnet_lpush = NULL;
#endif
/*
* Allow the scheduler to initialize the child.
*/
thread_lock(td);
sched_fork(td, td2);
thread_unlock(td);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
*/
p2->p_flag = P_INMEM;
p2->p_swtick = ticks;
if (p1->p_flag & P_PROFIL)
startprofclock(p2);
td2->td_ucred = crhold(p2->p_ucred);
if (flags & RFSIGSHARE) {
p2->p_sigacts = sigacts_hold(p1->p_sigacts);
} else {
sigacts_copy(newsigacts, p1->p_sigacts);
p2->p_sigacts = newsigacts;
}
if (flags & RFLINUXTHPN)
p2->p_sigparent = SIGUSR1;
else
p2->p_sigparent = SIGCHLD;
p2->p_textvp = p1->p_textvp;
p2->p_fd = fd;
p2->p_fdtol = fdtol;
/*
* p_limit is copy-on-write. Bump its refcount.
*/
lim_fork(p1, p2);
pstats_fork(p1->p_stats, p2->p_stats);
PROC_UNLOCK(p1);
PROC_UNLOCK(p2);
/* Bump references to the text vnode (for procfs) */
if (p2->p_textvp)
vref(p2->p_textvp);
/*
* Set up linkage for kernel based threading.
*/
if ((flags & RFTHREAD) != 0) {
mtx_lock(&ppeers_lock);
p2->p_peers = p1->p_peers;
p1->p_peers = p2;
p2->p_leader = p1->p_leader;
mtx_unlock(&ppeers_lock);
PROC_LOCK(p1->p_leader);
if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
PROC_UNLOCK(p1->p_leader);
/*
* The task leader is exiting, so process p1 is
* going to be killed shortly. Since p1 obviously
* isn't dead yet, we know that the leader is either
* sending SIGKILL's to all the processes in this
* task or is sleeping waiting for all the peers to
* exit. We let p1 complete the fork, but we need
* to go ahead and kill the new process p2 since
* the task leader may not get a chance to send
* SIGKILL to it. We leave it on the list so that
* the task leader will wait for this new process
* to commit suicide.
*/
PROC_LOCK(p2);
psignal(p2, SIGKILL);
PROC_UNLOCK(p2);
} else
PROC_UNLOCK(p1->p_leader);
} else {
p2->p_peers = NULL;
p2->p_leader = p2;
}
sx_xlock(&proctree_lock);
PGRP_LOCK(p1->p_pgrp);
PROC_LOCK(p2);
PROC_LOCK(p1);
/*
* Preserve some more flags in subprocess. P_PROFIL has already
* been preserved.
*/
p2->p_flag |= p1->p_flag & P_SUGID;
td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
SESS_LOCK(p1->p_session);
if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
p2->p_flag |= P_CONTROLT;
SESS_UNLOCK(p1->p_session);
if (flags & RFPPWAIT)
p2->p_flag |= P_PPWAIT;
p2->p_pgrp = p1->p_pgrp;
LIST_INSERT_AFTER(p1, p2, p_pglist);
PGRP_UNLOCK(p1->p_pgrp);
LIST_INIT(&p2->p_children);
callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
-#ifdef KTRACE
- ktrprocfork(p1, p2);
-#endif
-
/*
* If PF_FORK is set, the child process inherits the
* procfs ioctl flags from its parent.
*/
if (p1->p_pfsflags & PF_FORK) {
p2->p_stops = p1->p_stops;
p2->p_pfsflags = p1->p_pfsflags;
}
/*
* This begins the section where we must prevent the parent
* from being swapped.
*/
_PHOLD(p1);
PROC_UNLOCK(p1);
/*
* Attach the new process to its parent.
*
* If RFNOWAIT is set, the newly created process becomes a child
* of init. This effectively disassociates the child from the
* parent.
*/
if (flags & RFNOWAIT)
pptr = initproc;
else
pptr = p1;
p2->p_pptr = pptr;
LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
sx_xunlock(&proctree_lock);
/* Inform accounting that we have forked. */
p2->p_acflag = AFORK;
PROC_UNLOCK(p2);
+
+#ifdef KTRACE
+ ktrprocfork(p1, p2);
+#endif
/*
* Finish creating the child process. It will return via a different
* execution path later. (ie: directly into user mode)
*/
vm_forkproc(td, p2, td2, vm2, flags);
/* Account the fork in the statistics bucket matching the flag combination. */
if (flags == (RFFDG | RFPROC)) {
PCPU_INC(cnt.v_forks);
PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
PCPU_INC(cnt.v_vforks);
PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (p1 == &proc0) {
PCPU_INC(cnt.v_kthreads);
PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else {
PCPU_INC(cnt.v_rforks);
PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
}
/*
* Both processes are set up, now check if any loadable modules want
* to adjust anything.
* What if they have an error? XXX
*/
EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
/*
* Set the child start time and mark the process as being complete.
*/
microuptime(&p2->p_stats->p_start);
PROC_SLOCK(p2);
p2->p_state = PRS_NORMAL;
PROC_SUNLOCK(p2);
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the new process
* if it has registered an interest. We have to do this only after
* p_state is PRS_NORMAL since the fasttrap module will use pfind()
* later on.
*/
if (dtrace_fasttrap_fork) {
PROC_LOCK(p1);
PROC_LOCK(p2);
dtrace_fasttrap_fork(p1, p2);
PROC_UNLOCK(p2);
PROC_UNLOCK(p1);
}
#endif
PROC_LOCK(p1);
if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
P_FOLLOWFORK)) {
/*
* Arrange for debugger to receive the fork event.
*
* We can report PL_FLAG_FORKED regardless of
* P_FOLLOWFORK settings, but it does not make a sense
* for runaway child.
*/
td->td_dbgflags |= TDB_FORK;
td->td_dbg_forked = p2->p_pid;
PROC_LOCK(p2);
td2->td_dbgflags |= TDB_STOPATFORK;
_PHOLD(p2);
p2_held = 1;
PROC_UNLOCK(p2);
}
if ((flags & RFSTOPPED) == 0) {
/*
* If RFSTOPPED not requested, make child runnable and
* add to run queue.
*/
thread_lock(td2);
TD_SET_CAN_RUN(td2);
sched_add(td2, SRQ_BORING);
thread_unlock(td2);
}
/*
* Now can be swapped.
*/
_PRELE(p1);
PROC_UNLOCK(p1);
/*
* Tell any interested parties about the new process.
*/
knote_fork(&p1->p_klist, p2->p_pid);
SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
/*
* Wait until debugger is attached to child.
*/
PROC_LOCK(p2);
while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
cv_wait(&p2->p_dbgwait, &p2->p_mtx);
if (p2_held)
_PRELE(p2);
/*
* Preserve synchronization semantics of vfork. If waiting for
* child to exec or exit, set P_PPWAIT on child, and sleep on our
* proc (in case of exit).
*/
while (p2->p_flag & P_PPWAIT)
cv_wait(&p2->p_pwait, &p2->p_mtx);
PROC_UNLOCK(p2);
/*
* Return child proc pointer to parent.
*/
*procp = p2;
return (0);
/*
* Reached with proctree_lock held shared and allproc_lock held
* exclusive; both are released here.  The message is rate limited
* through ppsratecheck().
*/
fail:
sx_sunlock(&proctree_lock);
if (ppsratecheck(&lastfail, &curfail, 1))
printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
td->td_ucred->cr_ruid);
sx_xunlock(&allproc_lock);
#ifdef MAC
mac_proc_destroy(newproc);
#endif
/*
* Also entered directly by the allocation failures above, which
* happen before either lock is taken.  Release the copied vmspace
* (if any) and the proc, then sleep for hz / 2 ticks before
* returning the error.
*/
fail1:
if (vm2 != NULL)
vmspace_free(vm2);
uma_zfree(proc_zone, newproc);
pause("fork", hz / 2);
return (error);
}
/*
 * First code run by a newly forked thread; entered from the MD
 * fork_trampoline() entry point after fork1() has set the child up.
 *
 * callout - function to run on behalf of the new thread; must not be NULL.
 * arg     - first argument passed to callout.
 * frame   - trap frame passed to callout.
 */
void
fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
    struct trapframe *frame)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct thread *dead;

	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));

	CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
	    td, td->td_sched, p->p_pid, td->td_name);

	sched_fork_exit(td);

	/*
	 * A thread normally resumes inside mi_switch() after cpu_switch();
	 * a new child lands here instead, so the same housekeeping that
	 * mi_switch() would do has to be done by hand.
	 */
	dead = PCPU_GET(deadthread);
	if (dead != NULL) {
		PCPU_SET(deadthread, NULL);
		thread_stash(dead);
	}
	thread_unlock(td);

	/*
	 * cpu_set_fork_handler intercepts this call so that kernel threads
	 * run a function that never returns and thus stay in kernel mode.
	 * initproc has its own fork handler, but that one does return.
	 */
	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
	callout(arg, frame);

	/* A kernel thread must never return from its main function. */
	if ((p->p_flag & P_KTHREAD) != 0) {
		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
		    td->td_name, p->p_pid);
		kproc_exit(0);
	}

	mtx_assert(&Giant, MA_NOTOWNED);
	EVENTHANDLER_INVOKE(schedtail, p);
}
/*
 * Simplified back end of syscall() for a child returning from fork()
 * straight into user mode.  Giant is not held on entry and must not be held
 * on return.  This function is handed to fork_exit() as its first parameter
 * and runs when a new userland process returns.
 *
 * If the thread was flagged TDB_STOPATFORK, the pending debugger request is
 * either honoured (the child is traced, reparented to the debugger and
 * stopped) or cleared, depending on whether the parent is still traced with
 * follow-fork enabled.
 */
void
fork_return(struct thread *td, struct trapframe *frame)
{
	struct proc *p, *dbg;

	if ((td->td_dbgflags & TDB_STOPATFORK) != 0) {
		p = td->td_proc;
		sx_xlock(&proctree_lock);
		PROC_LOCK(p);
		if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) !=
		    (P_TRACED | P_FOLLOWFORK)) {
			/*
			 * The debugger no longer wants the children:
			 * clear the request and wake up any waiter.
			 */
			sx_xunlock(&proctree_lock);
			td->td_dbgflags &= ~TDB_STOPATFORK;
			cv_broadcast(&p->p_dbgwait);
		} else {
			/*
			 * The debugger still wants auto-attach for the
			 * parent's children, so attach now.
			 */
			dbg = p->p_pptr->p_pptr;
			p->p_flag |= P_TRACED;
			p->p_oppid = p->p_pptr->p_pid;
			proc_reparent(p, dbg);
			sx_xunlock(&proctree_lock);
			ptracestop(td, SIGSTOP);
		}
		PROC_UNLOCK(p);
	}

	userret(td, frame);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET))
		ktrsysret(SYS_fork, 0, 0);
#endif
	mtx_assert(&Giant, MA_NOTOWNED);
}
Index: stable/8/sys/kern/kern_ktrace.c
===================================================================
--- stable/8/sys/kern/kern_ktrace.c (revision 220261)
+++ stable/8/sys/kern/kern_ktrace.c (revision 220262)
@@ -1,1167 +1,1221 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California.
* Copyright (c) 2005 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/ktrace.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
+#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <security/mac/mac_framework.h>
/*
* The ktrace facility allows the tracing of certain key events in user space
* processes, such as system calls, signal delivery, context switches, and
* user generated events using utrace(2). It works by streaming event
* records and data to a vnode associated with the process using the
* ktrace(2) system call. In general, records can be written directly from
* the context that generates the event. One important exception to this is
* during a context switch, where sleeping is not permitted. To handle this
* case, trace events are generated using in-kernel ktr_request records, and
* then delivered to disk at a convenient moment -- either immediately, the
* next traceable event, at system call return, or at process exit.
*
* When dealing with multiple threads or processes writing to the same event
* log, ordering guarantees are weak: specifically, if an event has multiple
* records (i.e., system call enter and return), they may be interlaced with
* records from another event. Process and thread ID information is provided
* in the record, and user applications can de-interlace events if required.
*/
static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
#ifdef KTRACE
#ifndef KTRACE_REQUEST_POOL
#define KTRACE_REQUEST_POOL 100
#endif
/*
 * In-kernel trace request.  Holds the record header, a data buffer pointer,
 * a union with one member per fixed-size record payload, and the linkage
 * (ktr_list) used to chain requests on STAILQ lists such as ktr_free.
 * This revision adds the ktr_proc_ctor payload to the union.
 */
struct ktr_request {
struct ktr_header ktr_header;
void *ktr_buffer;
union {
+ struct ktr_proc_ctor ktr_proc_ctor;
struct ktr_syscall ktr_syscall;
struct ktr_sysret ktr_sysret;
struct ktr_genio ktr_genio;
struct ktr_psig ktr_psig;
struct ktr_csw ktr_csw;
} ktr_data;
STAILQ_ENTRY(ktr_request) ktr_list;
};
/*
 * Payload length for each record type; the trailing comment on every entry
 * names the KTR_* type that slot is for.  A length of 0 marks types with no
 * fixed-size payload in struct ktr_request.  This revision appends entries
 * for KTR_PROCCTOR and KTR_PROCDTOR.
 * NOTE(review): presumably indexed by the KTR_* type value; the indexing
 * code is not in this part of the file -- confirm.
 */
static int data_lengths[] = {
0, /* none */
offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */
sizeof(struct ktr_sysret), /* KTR_SYSRET */
0, /* KTR_NAMEI */
sizeof(struct ktr_genio), /* KTR_GENIO */
sizeof(struct ktr_psig), /* KTR_PSIG */
sizeof(struct ktr_csw), /* KTR_CSW */
0, /* KTR_USER */
0, /* KTR_STRUCT */
0, /* KTR_SYSCTL */
+ sizeof(struct ktr_proc_ctor), /* KTR_PROCCTOR */
+ 0, /* KTR_PROCDTOR */
};
static STAILQ_HEAD(, ktr_request) ktr_free;
static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
static u_int ktr_geniosize = PAGE_SIZE;
TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
0, "Maximum size of genio event payload");
static int print_message = 1;
static struct mtx ktrace_mtx;
static struct sx ktrace_sx;
static void ktrace_init(void *dummy);
static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
-static u_int ktrace_resize_pool(u_int newsize);
+static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
+static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
static struct ktr_request *ktr_getrequest(int type);
static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
static void ktr_freeproc(struct proc *p, struct ucred **uc,
struct vnode **vp);
static void ktr_freerequest(struct ktr_request *req);
static void ktr_freerequest_locked(struct ktr_request *req);
static void ktr_writerequest(struct thread *td, struct ktr_request *req);
static int ktrcanset(struct thread *,struct proc *);
static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
+static void ktrprocctor_entered(struct thread *, struct proc *);
/*
* ktrace itself generates events, such as context switches, which we do not
* wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine
* whether or not it is in a region where tracing of events should be
* suppressed.
*/
/*
 * Mark td as running inside ktrace by setting TDP_INKTRACE.  Asserts that
 * the flag was not already set.
 */
static void
ktrace_enter(struct thread *td)
{

	KASSERT((td->td_pflags & TDP_INKTRACE) == 0, ("ktrace_enter: flag set"));
	td->td_pflags = td->td_pflags | TDP_INKTRACE;
}
/*
 * Mark td as having left ktrace by clearing TDP_INKTRACE.  Asserts that the
 * flag was set.
 */
static void
ktrace_exit(struct thread *td)
{

	KASSERT((td->td_pflags & TDP_INKTRACE) != 0, ("ktrace_exit: flag not set"));
	td->td_pflags = td->td_pflags & ~TDP_INKTRACE;
}
/*
 * Assert that td is currently marked as running inside ktrace
 * (TDP_INKTRACE set).  Does not modify the thread.
 */
static void
ktrace_assert(struct thread *td)
{

	KASSERT((td->td_pflags & TDP_INKTRACE) != 0, ("ktrace_assert: flag not set"));
}
/*
 * One-time ktrace setup: initialise ktrace_mtx and ktrace_sx, then fill the
 * ktr_free list with ktr_requestpool preallocated requests.  The dummy
 * argument is unused.
 */
static void
ktrace_init(void *dummy)
{
	struct ktr_request *req;
	int n;

	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
	sx_init(&ktrace_sx, "ktrace_sx");
	STAILQ_INIT(&ktr_free);

	n = 0;
	while (n < ktr_requestpool) {
		req = malloc(sizeof(*req), M_KTRACE, M_WAITOK);
		STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
		n++;
	}
}
SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
/*
 * Sysctl handler for the request pool size.
 *
 * Read-only requests (no new value supplied) just copy out the current
 * ktr_requestpool.  Otherwise the wanted size is copied in, the pool is
 * resized inside a ktrace_enter()/ktrace_exit() pair, and the previous size
 * is copied out.  Returns ENOSPC when growth was requested but the pool
 * ended up smaller than asked for; otherwise 0 or the copy-in/out error.
 *
 * After this revision the handler no longer takes ktrace_mtx itself; it
 * passes the old size to ktrace_resize_pool() instead.
 */
static int
sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
{
struct thread *td;
u_int newsize, oldsize, wantsize;
int error;
/* Handle easy read-only case first to avoid warnings from GCC. */
if (!req->newptr) {
- mtx_lock(&ktrace_mtx);
oldsize = ktr_requestpool;
- mtx_unlock(&ktrace_mtx);
return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
}
error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
if (error)
return (error);
td = curthread;
ktrace_enter(td);
- mtx_lock(&ktrace_mtx);
oldsize = ktr_requestpool;
- newsize = ktrace_resize_pool(wantsize);
- mtx_unlock(&ktrace_mtx);
+ newsize = ktrace_resize_pool(oldsize, wantsize);
ktrace_exit(td);
error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
if (error)
return (error);
if (wantsize > oldsize && newsize < wantsize)
return (ENOSPC);
return (0);
}
SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
&ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", "");
/*
 * Resize the pool of free request objects by (newsize - oldsize) entries
 * and return ktr_requestpool.
 *
 * In the "+" revision the function handles ktrace_mtx itself:
 *  - shrinking removes entries from ktr_free under the mutex and stops
 *    early when the free list is empty;
 *  - growing allocates every new entry onto a private list before taking
 *    the mutex, then splices that list onto ktr_free in one step.
 * It also sets print_message back to 1.
 */
static u_int
-ktrace_resize_pool(u_int newsize)
+ktrace_resize_pool(u_int oldsize, u_int newsize)
{
+ STAILQ_HEAD(, ktr_request) ktr_new;
struct ktr_request *req;
int bound;
- mtx_assert(&ktrace_mtx, MA_OWNED);
print_message = 1;
- bound = newsize - ktr_requestpool;
+ bound = newsize - oldsize;
/* Nothing to do; this path returns without touching the mutex. */
if (bound == 0)
return (ktr_requestpool);
- if (bound < 0)
+ if (bound < 0) {
+ mtx_lock(&ktrace_mtx);
/* Shrink pool down to newsize if possible. */
while (bound++ < 0) {
req = STAILQ_FIRST(&ktr_free);
if (req == NULL)
- return (ktr_requestpool);
+ break;
STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
ktr_requestpool--;
- mtx_unlock(&ktrace_mtx);
free(req, M_KTRACE);
- mtx_lock(&ktrace_mtx);
}
- else
+ } else {
/* Grow pool up to newsize. */
+ STAILQ_INIT(&ktr_new);
while (bound-- > 0) {
- mtx_unlock(&ktrace_mtx);
req = malloc(sizeof(struct ktr_request), M_KTRACE,
M_WAITOK);
- mtx_lock(&ktrace_mtx);
- STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
- ktr_requestpool++;
+ STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list);
}
+ mtx_lock(&ktrace_mtx);
+ STAILQ_CONCAT(&ktr_free, &ktr_new);
+ ktr_requestpool += (newsize - oldsize);
+ }
/* Both the shrink and the grow branch arrive here holding the mutex. */
+ mtx_unlock(&ktrace_mtx);
return (ktr_requestpool);
}
/* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
(sizeof((struct thread *)NULL)->td_name));
/*
 * Take a request object from the free pool and fill in its header for a
 * record of the given type on behalf of td.
 *
 * In the "+" revision this function no longer calls ktrace_enter() or
 * ktrace_exit(); the "_entered" suffix presumably means the caller has
 * already done ktrace_enter() -- confirm against callers.
 *
 * Returns NULL when the type is not enabled for td's process (KTRCHECK)
 * or when the free pool is empty.  In the empty case KTRFAC_DROP is set on
 * the process so that the next record obtained is tagged KTR_DROP, and a
 * console message is printed if print_message was set.
 */
static struct ktr_request *
-ktr_getrequest(int type)
+ktr_getrequest_entered(struct thread *td, int type)
{
struct ktr_request *req;
- struct thread *td = curthread;
struct proc *p = td->td_proc;
int pm;
- ktrace_enter(td); /* XXX: In caller instead? */
mtx_lock(&ktrace_mtx);
if (!KTRCHECK(td, type)) {
mtx_unlock(&ktrace_mtx);
- ktrace_exit(td);
return (NULL);
}
req = STAILQ_FIRST(&ktr_free);
if (req != NULL) {
STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
req->ktr_header.ktr_type = type;
/* Record that an earlier event was dropped, then clear the marker. */
if (p->p_traceflag & KTRFAC_DROP) {
req->ktr_header.ktr_type |= KTR_DROP;
p->p_traceflag &= ~KTRFAC_DROP;
}
mtx_unlock(&ktrace_mtx);
/* The rest of the header is filled in without the mutex held. */
microtime(&req->ktr_header.ktr_time);
req->ktr_header.ktr_pid = p->p_pid;
req->ktr_header.ktr_tid = td->td_tid;
bcopy(td->td_name, req->ktr_header.ktr_comm,
sizeof(req->ktr_header.ktr_comm));
req->ktr_buffer = NULL;
req->ktr_header.ktr_len = 0;
} else {
p->p_traceflag |= KTRFAC_DROP;
/* Sample and clear print_message under the mutex; print after unlock. */
pm = print_message;
print_message = 0;
mtx_unlock(&ktrace_mtx);
if (pm)
printf("Out of ktrace request objects.\n");
- ktrace_exit(td);
}
return (req);
}
/*
 * Wrapper around ktr_getrequest_entered() for curthread: enters ktrace,
 * obtains a request, and leaves ktrace again only when no request could be
 * obtained.  On success the thread is still marked TDP_INKTRACE when this
 * returns, so the caller is left to call ktrace_exit() itself or to hand
 * the request to a function that does.
 */
+static struct ktr_request *
+ktr_getrequest(int type)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+
+ ktrace_enter(td);
+ req = ktr_getrequest_entered(td, type);
+ if (req == NULL)
+ ktrace_exit(td);
+
+ return (req);
+}
+
/*
* Some trace generation environments don't permit direct access to VFS,
* such as during a context switch where sleeping is not allowed. Under these
* circumstances, queue a request to the thread to be written asynchronously
* later.
*/
static void
ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
{
/* The per-process queue p_ktr is modified under ktrace_mtx. */
mtx_lock(&ktrace_mtx);
STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
mtx_unlock(&ktrace_mtx);
/* The "+" revision no longer calls ktrace_exit() here. */
- ktrace_exit(td);
}
/*
* Drain any pending ktrace records from the per-thread queue to disk. This
* is used both internally before committing other records, and also on
* system call return. We drain all the ones we can find at the time when
* drain is requested, but don't keep draining after that as those events
* may be approximately "after" the current event.
*/
static void
ktr_drain(struct thread *td)
{
struct ktr_request *queued_req;
STAILQ_HEAD(, ktr_request) local_queue;
ktrace_assert(td);
sx_assert(&ktrace_sx, SX_XLOCKED);
STAILQ_INIT(&local_queue); /* XXXRW: needed? */
if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
mtx_lock(&ktrace_mtx);
STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
mtx_unlock(&ktrace_mtx);
while ((queued_req = STAILQ_FIRST(&local_queue))) {
STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
ktr_writerequest(td, queued_req);
ktr_freerequest(queued_req);
}
}
}
/*
* Submit a trace record for immediate commit to disk -- to be used only
* where entering VFS is OK. First drain any pending records that may have
* been cached in the thread.
*/
static void
ktr_submitrequest(struct thread *td, struct ktr_request *req)
{
ktrace_assert(td);
/* ktrace_sx is held exclusively across the drain and this write. */
sx_xlock(&ktrace_sx);
/* Earlier queued records are written first, then this one. */
ktr_drain(td);
ktr_writerequest(td, req);
ktr_freerequest(req);
sx_xunlock(&ktrace_sx);
-
/* Submitting a request also leaves ktrace on behalf of the caller. */
ktrace_exit(td);
}
/*
 * Return a request to the free pool, taking ktrace_mtx around the
 * locked variant.
 */
static void
ktr_freerequest(struct ktr_request *req)
{

	mtx_lock(&ktrace_mtx);
	ktr_freerequest_locked(req);
	mtx_unlock(&ktrace_mtx);
}
/*
 * Return a request to the free pool; ktrace_mtx must be held.  Any
 * payload buffer attached to the request is freed first.
 */
static void
ktr_freerequest_locked(struct ktr_request *req)
{

	mtx_assert(&ktrace_mtx, MA_OWNED);

	if (req->ktr_buffer != NULL) {
		free(req->ktr_buffer, M_KTRACE);
	}
	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
}
/*
 * Turn tracing off for process p and discard its queued records.
 *
 * The trace credential is handed back through *uc and, when vp is not
 * NULL, the trace vnode through *vp; the caller must drop the references
 * on both.  Requires the process lock and ktrace_mtx.
 */
static void
ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
{
	struct ktr_request *queued;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&ktrace_mtx, MA_OWNED);

	/* Hand the credential and, if asked for, the vnode to the caller. */
	*uc = p->p_tracecred;
	if (vp != NULL)
		*vp = p->p_tracevp;

	/* Clear the process's tracing state. */
	p->p_tracecred = NULL;
	p->p_tracevp = NULL;
	p->p_traceflag = 0;

	/* Put every still-queued record back on the free pool. */
	while (!STAILQ_EMPTY(&p->p_ktr)) {
		queued = STAILQ_FIRST(&p->p_ktr);
		STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
		ktr_freerequest_locked(queued);
	}
}
/*
 * Emit a KTR_SYSCALL record for curthread: syscall number, argument count
 * and a copy of the narg argument words.
 */
void
ktrsyscall(code, narg, args)
	int code, narg;
	register_t args[];
{
	struct ktr_syscall *rec;
	struct ktr_request *req;
	char *argcopy;
	size_t argbytes;

	/* Copy the arguments before asking for a request object. */
	argcopy = NULL;
	argbytes = sizeof(register_t) * narg;
	if (argbytes > 0) {
		argcopy = malloc(argbytes, M_KTRACE, M_WAITOK);
		bcopy(args, argcopy, argbytes);
	}

	req = ktr_getrequest(KTR_SYSCALL);
	if (req == NULL) {
		/* No record will be written; drop the copy. */
		if (argcopy != NULL)
			free(argcopy, M_KTRACE);
		return;
	}

	rec = &req->ktr_data.ktr_syscall;
	rec->ktr_code = code;
	rec->ktr_narg = narg;
	if (argbytes > 0) {
		/* The request now owns the argument copy. */
		req->ktr_header.ktr_len = argbytes;
		req->ktr_buffer = argcopy;
	}
	ktr_submitrequest(curthread, req);
}
void
ktrsysret(code, error, retval)
int code, error;
register_t retval;
{
struct ktr_request *req;
struct ktr_sysret *ktp;
req = ktr_getrequest(KTR_SYSRET);
if (req == NULL)
return;
ktp = &req->ktr_data.ktr_sysret;
ktp->ktr_code = code;
ktp->ktr_error = error;
ktp->ktr_retval = retval; /* what about val2 ? */
ktr_submitrequest(curthread, req);
}
/*
 * Disable tracing when a setuid process execs.  The caller holds the
 * process lock and receives the trace credential and vnode through
 * uc and vp.
 *
 * XXX: We toss any pending asynchronous records.
 */
void
ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
{

	PROC_LOCK_ASSERT(p, MA_OWNED);

	mtx_lock(&ktrace_mtx);
	ktr_freeproc(p, uc, vp);
	mtx_unlock(&ktrace_mtx);
}
/*
* When a process exits, drain per-process asynchronous trace records
* and disable tracing.
*/
void
ktrprocexit(struct thread *td)
{
+ struct ktr_request *req;
struct proc *p;
struct ucred *cred;
struct vnode *vp;
int vfslocked;
p = td->td_proc;
/* Nothing to do when no trace flags are set on the process. */
if (p->p_traceflag == 0)
return;
ktrace_enter(td);
/*
 * "+" revision: queue a KTR_PROCDTOR record (if that type is enabled and
 * a request object is available) so it is written by the drain below.
 */
+ req = ktr_getrequest_entered(td, KTR_PROCDTOR);
+ if (req != NULL)
+ ktr_enqueuerequest(td, req);
sx_xlock(&ktrace_sx);
ktr_drain(td);
sx_xunlock(&ktrace_sx);
/* Tear down the tracing state; ktr_freeproc() needs both locks. */
PROC_LOCK(p);
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, &vp);
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p);
/* Drop the references handed back by ktr_freeproc(), outside the locks. */
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (cred != NULL)
crfree(cred);
ktrace_exit(td);
}
/*
 * Queue a KTR_PROCCTOR record, carrying a copy of p's sysentvec sv_flags,
 * on process p.  td is the calling thread and must already be inside
 * ktrace; the request itself is obtained and queued on behalf of the
 * first thread of p.  Does nothing when no request can be obtained.
 */
+static void
+ktrprocctor_entered(struct thread *td, struct proc *p)
+{
+ struct ktr_proc_ctor *ktp;
+ struct ktr_request *req;
+ struct thread *td2;
+
+ ktrace_assert(td);
+ td2 = FIRST_THREAD_IN_PROC(p);
+ req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
+ if (req == NULL)
+ return;
+ ktp = &req->ktr_data.ktr_proc_ctor;
+ ktp->sv_flags = p->p_sysent->sv_flags;
+ ktr_enqueuerequest(td2, req);
+}
+
/*
 * Public entry point: if any trace facility bit is set on p, queue a
 * KTR_PROCCTOR record for it, with curthread inside ktrace for the
 * duration of the call.
 */
+void
+ktrprocctor(struct proc *p)
+{
+ struct thread *td = curthread;
+
+ if ((p->p_traceflag & KTRFAC_MASK) == 0)
+ return;
+
+ ktrace_enter(td);
+ ktrprocctor_entered(td, p);
+ ktrace_exit(td);
+}
+
/*
* When a process forks, enable tracing in the new process if needed.
*/
void
ktrprocfork(struct proc *p1, struct proc *p2)
{
/*
 * "+" revision: the function locks p1 itself instead of asserting that
 * the caller holds the locks of both processes.
 */
- PROC_LOCK_ASSERT(p1, MA_OWNED);
- PROC_LOCK_ASSERT(p2, MA_OWNED);
+ PROC_LOCK(p1);
mtx_lock(&ktrace_mtx);
KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
/* Copy the tracing state only when p1 asked for inheritance. */
if (p1->p_traceflag & KTRFAC_INHERIT) {
p2->p_traceflag = p1->p_traceflag;
/* p2 takes its own references on the vnode and the credential. */
if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
VREF(p2->p_tracevp);
KASSERT(p1->p_tracecred != NULL,
("ktrace vnode with no cred"));
p2->p_tracecred = crhold(p1->p_tracecred);
}
}
mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p1);
+
/* Queue a KTR_PROCCTOR record for p2; a no-op if p2 is not traced. */
+ ktrprocctor(p2);
}
/*
* When a thread returns, drain any asynchronous records generated by the
* system call.
*/
void
ktruserret(struct thread *td)
{

	/* Run the drain inside ktrace, with ktrace_sx held exclusively. */
	ktrace_enter(td);

	sx_xlock(&ktrace_sx);
	ktr_drain(td);
	sx_xunlock(&ktrace_sx);

	ktrace_exit(td);
}
/*
 * Emit a KTR_NAMEI record for curthread.  The payload is the bytes of
 * path without the terminating NUL; an empty path gives no payload.
 */
void
ktrnamei(path)
	char *path;
{
	struct ktr_request *req;
	char *pathcopy;
	int pathlen;

	/* Copy the path before asking for a request object. */
	pathcopy = NULL;
	pathlen = strlen(path);
	if (pathlen > 0) {
		pathcopy = malloc(pathlen, M_KTRACE, M_WAITOK);
		bcopy(path, pathcopy, pathlen);
	}

	req = ktr_getrequest(KTR_NAMEI);
	if (req == NULL) {
		/* No record will be written; drop the copy. */
		if (pathcopy != NULL)
			free(pathcopy, M_KTRACE);
		return;
	}

	if (pathlen > 0) {
		/* The request now owns the copy. */
		req->ktr_header.ktr_len = pathlen;
		req->ktr_buffer = pathcopy;
	}
	ktr_submitrequest(curthread, req);
}
void
ktrsysctl(name, namelen)
int *name;
u_int namelen;
{
struct ktr_request *req;
u_int mib[CTL_MAXNAME + 2];
char *mibname;
size_t mibnamelen;
int error;
/* Lookup name of mib. */
KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
mib[0] = 0;
mib[1] = 1;
bcopy(name, mib + 2, namelen * sizeof(*name));
mibnamelen = 128;
mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
NULL, 0, &mibnamelen, 0);
if (error) {
free(mibname, M_KTRACE);
return;
}
req = ktr_getrequest(KTR_SYSCTL);
if (req == NULL) {
free(mibname, M_KTRACE);
return;
}
req->ktr_header.ktr_len = mibnamelen;
req->ktr_buffer = mibname;
ktr_submitrequest(curthread, req);
}
void
ktrgenio(fd, rw, uio, error)
int fd;
enum uio_rw rw;
struct uio *uio;
int error;
{
struct ktr_request *req;
struct ktr_genio *ktg;
int datalen;
char *buf;
if (error) {
free(uio, M_IOV);
return;
}
uio->uio_offset = 0;
uio->uio_rw = UIO_WRITE;
datalen = imin(uio->uio_resid, ktr_geniosize);
buf = malloc(datalen, M_KTRACE, M_WAITOK);
error = uiomove(buf, datalen, uio);
free(uio, M_IOV);
if (error) {
free(buf, M_KTRACE);
return;
}
req = ktr_getrequest(KTR_GENIO);
if (req == NULL) {
free(buf, M_KTRACE);
return;
}
ktg = &req->ktr_data.ktr_genio;
ktg->ktr_fd = fd;
ktg->ktr_rw = rw;
req->ktr_header.ktr_len = datalen;
req->ktr_buffer = buf;
ktr_submitrequest(curthread, req);
}
/*
 * Queue a KTR_PSIG record (signal number, action, mask, code) on the
 * current thread's process.  The record is queued rather than written
 * immediately; queued records are written later by ktr_drain().
 */
void
ktrpsig(sig, action, mask, code)
int sig;
sig_t action;
sigset_t *mask;
int code;
{
+ struct thread *td = curthread;
struct ktr_request *req;
struct ktr_psig *kp;
req = ktr_getrequest(KTR_PSIG);
if (req == NULL)
return;
kp = &req->ktr_data.ktr_psig;
kp->signo = (char)sig;
kp->action = action;
kp->mask = *mask;
kp->code = code;
- ktr_enqueuerequest(curthread, req);
+ ktr_enqueuerequest(td, req);
/* "+" revision: the caller, not ktr_enqueuerequest(), leaves ktrace. */
+ ktrace_exit(td);
}
/*
 * Queue a KTR_CSW (context switch) record with the out and user values on
 * the current thread's process.  The record is queued rather than written
 * immediately; queued records are written later by ktr_drain().
 */
void
ktrcsw(out, user)
int out, user;
{
+ struct thread *td = curthread;
struct ktr_request *req;
struct ktr_csw *kc;
req = ktr_getrequest(KTR_CSW);
if (req == NULL)
return;
kc = &req->ktr_data.ktr_csw;
kc->out = out;
kc->user = user;
- ktr_enqueuerequest(curthread, req);
+ ktr_enqueuerequest(td, req);
/* "+" revision: the caller, not ktr_enqueuerequest(), leaves ktrace. */
+ ktrace_exit(td);
}
/*
 * Emit a KTR_STRUCT record for curthread.  The payload is the structure
 * name (namelen bytes) followed by a NUL and then datalen bytes of data.
 * A NULL data pointer is treated as zero-length data.
 */
void
ktrstruct(name, namelen, data, datalen)
	const char *name;
	size_t namelen;
	void *data;
	size_t datalen;
{
	struct ktr_request *req;
	size_t total;
	char *payload;

	if (data == NULL)
		datalen = 0;

	/* Build "name\0data" in one buffer. */
	total = namelen + 1 + datalen;
	payload = malloc(total, M_KTRACE, M_WAITOK);
	bcopy(name, payload, namelen);
	payload[namelen] = '\0';
	bcopy(data, &payload[namelen + 1], datalen);

	req = ktr_getrequest(KTR_STRUCT);
	if (req == NULL) {
		free(payload, M_KTRACE);
		return;
	}

	/* The request now owns the payload. */
	req->ktr_buffer = payload;
	req->ktr_header.ktr_len = total;
	ktr_submitrequest(curthread, req);
}
#endif /* KTRACE */
/* Interface and common routines */
#ifndef _SYS_SYSPROTO_H_
struct ktrace_args {
char *fname;
int ops;
int facs;
int pid;
};
#endif
/*
 * ktrace system call handler.
 *
 * uap->ops selects the operation (KTROP_SET, KTROP_CLEAR or
 * KTROP_CLEARFILE) and may carry KTRFLAG_DESCEND; uap->facs is the set of
 * trace facilities (KTRFAC_ROOT is masked off); uap->pid names a process,
 * or a process group when negative; uap->fname is the trace file.
 *
 * Returns 0 or an errno value: EINVAL when no facilities are given for an
 * operation other than KTROP_CLEARFILE, the vn_open() error, EACCES when
 * the file is not a regular file, ESRCH when no target is found, the
 * p_cansee() error, or EPERM when permission was denied.  Returns ENOSYS
 * when the kernel is built without KTRACE.
 */
/* ARGSUSED */
int
ktrace(td, uap)
struct thread *td;
register struct ktrace_args *uap;
{
#ifdef KTRACE
register struct vnode *vp = NULL;
register struct proc *p;
struct pgrp *pg;
int facs = uap->facs & ~KTRFAC_ROOT;
int ops = KTROP(uap->ops);
int descend = uap->ops & KTRFLAG_DESCEND;
int nfound, ret = 0;
int flags, error = 0, vfslocked;
struct nameidata nd;
struct ucred *cred;
/*
* Need something to (un)trace.
*/
if (ops != KTROP_CLEARFILE && facs == 0)
return (EINVAL);
/* The rest of the call runs inside ktrace; every exit path leaves it. */
ktrace_enter(td);
if (ops != KTROP_CLEAR) {
/*
* an operation which requires a file argument.
*/
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
uap->fname, td);
flags = FREAD | FWRITE | O_NOFOLLOW;
error = vn_open(&nd, &flags, 0, NULL);
if (error) {
ktrace_exit(td);
return (error);
}
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
VOP_UNLOCK(vp, 0);
/* Only regular files are accepted as trace files. */
if (vp->v_type != VREG) {
(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
ktrace_exit(td);
return (EACCES);
}
VFS_UNLOCK_GIANT(vfslocked);
}
/*
* Clear all uses of the tracefile.
*/
if (ops == KTROP_CLEARFILE) {
int vrele_count;
vrele_count = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
if (ktrcanset(td, p)) {
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
/* The vnode reference is released after the walk, below. */
vrele_count++;
crfree(cred);
} else
error = EPERM;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
/* Release one vnode reference per process that was detached. */
if (vrele_count > 0) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
goto done;
}
/*
* do it
*/
sx_slock(&proctree_lock);
if (uap->pid < 0) {
/*
* by process group
*/
pg = pgfind(-uap->pid);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
/*
* ktrops() may call vrele(). Lock pg_members
* by the proctree_lock rather than pg_mtx.
*/
PGRP_UNLOCK(pg);
nfound = 0;
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
/* Processes the caller cannot see are skipped, not reported. */
if (p_cansee(td, p) != 0) {
PROC_UNLOCK(p);
continue;
}
PROC_UNLOCK(p);
nfound++;
if (descend)
ret |= ktrsetchildren(td, p, ops, facs, vp);
else
ret |= ktrops(td, p, ops, facs, vp);
}
if (nfound == 0) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
} else {
/*
* by pid
*/
p = pfind(uap->pid);
if (p == NULL) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
error = p_cansee(td, p);
/*
* The slock of the proctree lock will keep this process
* from going away, so unlocking the proc here is ok.
*/
PROC_UNLOCK(p);
if (error) {
sx_sunlock(&proctree_lock);
goto done;
}
if (descend)
ret |= ktrsetchildren(td, p, ops, facs, vp);
else
ret |= ktrops(td, p, ops, facs, vp);
}
sx_sunlock(&proctree_lock);
/* ret stays 0 when ktrops() succeeded on no process at all. */
if (!ret)
error = EPERM;
done:
/* Close the trace file opened above, if any. */
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) vn_close(vp, FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
}
ktrace_exit(td);
return (error);
#else /* !KTRACE */
return (ENOSYS);
#endif /* KTRACE */
}
/*
 * utrace system call handler: record uap->len bytes of caller-supplied
 * data from uap->addr as a KTR_USER record.
 *
 * Returns 0 when KTR_USER tracing is not active for the thread or when the
 * record was submitted, EINVAL when the data exceeds KTR_USER_MAXLEN, the
 * copyin() error, or ENOMEM when no request object could be obtained.
 * Returns ENOSYS when the kernel is built without KTRACE.
 */
/* ARGSUSED */
int
utrace(td, uap)
	struct thread *td;
	register struct utrace_args *uap;
{
#ifdef KTRACE
	struct ktr_request *req;
	void *userdata;
	int error;

	if (!KTRPOINT(td, KTR_USER))
		return (0);
	if (uap->len > KTR_USER_MAXLEN)
		return (EINVAL);

	/* Copy the user data in before asking for a request object. */
	userdata = malloc(uap->len, M_KTRACE, M_WAITOK);
	error = copyin(uap->addr, userdata, uap->len);
	if (error) {
		free(userdata, M_KTRACE);
		return (error);
	}

	req = ktr_getrequest(KTR_USER);
	if (req == NULL) {
		free(userdata, M_KTRACE);
		return (ENOMEM);
	}

	/* The request now owns the copied data. */
	req->ktr_buffer = userdata;
	req->ktr_header.ktr_len = uap->len;
	ktr_submitrequest(td, req);
	return (0);
#else /* !KTRACE */
	return (ENOSYS);
#endif /* KTRACE */
}
#ifdef KTRACE
/*
 * Apply a KTROP_SET or KTROP_CLEAR operation with facilities facs and
 * trace vnode vp to process p on behalf of td.
 *
 * Returns 0 when td lacks permission (ktrcanset()), 1 otherwise.  Any
 * vnode or credential that was replaced or released is dropped after the
 * process lock has been released.
 */
static int
ktrops(td, p, ops, facs, vp)
struct thread *td;
struct proc *p;
int ops, facs;
struct vnode *vp;
{
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
PROC_LOCK(p);
if (!ktrcanset(td, p)) {
PROC_UNLOCK(p);
return (0);
}
mtx_lock(&ktrace_mtx);
if (ops == KTROP_SET) {
if (p->p_tracevp != vp) {
/*
* if trace file already in use, relinquish below
*/
tracevp = p->p_tracevp;
VREF(vp);
p->p_tracevp = vp;
}
/* Likewise swap in the caller's credential, releasing the old one below. */
if (p->p_tracecred != td->td_ucred) {
tracecred = p->p_tracecred;
p->p_tracecred = crhold(td->td_ucred);
}
p->p_traceflag |= facs;
/* Remember that a privileged caller set this trace (see ktrcanset()). */
if (priv_check(td, PRIV_KTRACE) == 0)
p->p_traceflag |= KTRFAC_ROOT;
} else {
/* KTROP_CLEAR */
if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
/* no more tracing */
ktr_freeproc(p, &tracecred, &tracevp);
}
mtx_unlock(&ktrace_mtx);
/*
 * "+" revision: if the process is still traced, queue a KTR_PROCCTOR
 * record for it.  This runs with the process lock still held.
 */
+ if ((p->p_traceflag & KTRFAC_MASK) != 0)
+ ktrprocctor_entered(td, p);
PROC_UNLOCK(p);
if (tracevp != NULL) {
int vfslocked;
vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (tracecred != NULL)
crfree(tracecred);
return (1);
}
/*
 * Apply ktrops() to process top and to every process below it in the
 * process tree.  The walk is iterative, using the p_children, p_sibling
 * and p_pptr links, and requires proctree_lock to be held.
 *
 * Returns the OR of the ktrops() results, so it is non-zero when the
 * operation was applied to at least one process.
 */
static int
ktrsetchildren(td, top, ops, facs, vp)
struct thread *td;
struct proc *top;
int ops, facs;
struct vnode *vp;
{
register struct proc *p;
register int ret = 0;
p = top;
sx_assert(&proctree_lock, SX_LOCKED);
for (;;) {
ret |= ktrops(td, p, ops, facs, vp);
/*
* If this process has children, descend to them next,
* otherwise do any siblings, and if done with this level,
* follow back up the tree (but not past top).
*/
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
/* Back at the starting process: the whole subtree is done. */
if (p == top)
return (ret);
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
/* No more siblings: climb to the parent and look again. */
p = p->p_pptr;
}
}
/*NOTREACHED*/
}
/*
 * Write one trace record to the trace vnode of td's process.
 *
 * The record is written as up to three iovecs: the ktr_header, the
 * fixed-size per-type data (size taken from data_lengths[]), and the
 * variable-length buffer attached to the request.  The request is not
 * freed here.
 *
 * If the process has no trace vnode the record is silently dropped.  If
 * the write fails, tracing is turned off for every process in the system
 * that uses the same vnode.
 */
static void
ktr_writerequest(struct thread *td, struct ktr_request *req)
{
struct ktr_header *kth;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct uio auio;
struct iovec aiov[3];
struct mount *mp;
int datalen, buflen, vrele_count;
int error, vfslocked;
/*
* We hold the vnode and credential for use in I/O in case ktrace is
* disabled on the process as we write out the request.
*
* XXXRW: This is not ideal: we could end up performing a write after
* the vnode has been closed.
*/
mtx_lock(&ktrace_mtx);
vp = td->td_proc->p_tracevp;
cred = td->td_proc->p_tracecred;
/*
* If vp is NULL, the vp has been cleared out from under this
* request, so just drop it. Make sure the credential and vnode are
* in sync: we should have both or neither.
*/
if (vp == NULL) {
KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
mtx_unlock(&ktrace_mtx);
return;
}
VREF(vp);
KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
crhold(cred);
mtx_unlock(&ktrace_mtx);
kth = &req->ktr_header;
KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
sizeof(data_lengths) / sizeof(data_lengths[0]),
("data_lengths array overflow"));
/* Size of the fixed per-type data; the KTR_DROP bit is not part of the type. */
datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
/* buflen is the payload length as set by whoever built the request. */
buflen = kth->ktr_len;
/* First iovec: the header itself. */
auio.uio_iov = &aiov[0];
auio.uio_offset = 0;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_WRITE;
aiov[0].iov_base = (caddr_t)kth;
aiov[0].iov_len = sizeof(struct ktr_header);
auio.uio_resid = sizeof(struct ktr_header);
auio.uio_iovcnt = 1;
auio.uio_td = td;
/* Second iovec, when present: the fixed per-type data. */
if (datalen != 0) {
aiov[1].iov_base = (caddr_t)&req->ktr_data;
aiov[1].iov_len = datalen;
auio.uio_resid += datalen;
auio.uio_iovcnt++;
/* The length stored in the header covers everything after the header. */
kth->ktr_len += datalen;
}
/* Last iovec, when present: the variable-length buffer. */
if (buflen != 0) {
KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
aiov[auio.uio_iovcnt].iov_len = buflen;
auio.uio_resid += buflen;
auio.uio_iovcnt++;
}
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_write(cred, NOCRED, vp);
if (error == 0)
#endif
error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
crfree(cred);
/* Success: drop the vnode reference taken above and return. */
if (!error) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return;
}
VFS_UNLOCK_GIANT(vfslocked);
/*
* If error encountered, give up tracing on this vnode. We defer
* all the vrele()'s on the vnode until after we are finished walking
* the various lists to avoid needlessly holding locks.
* NB: at this point we still hold the vnode reference that must
* not go away as we need the valid vnode to compare with. Thus let
* vrele_count start at 1 and the reference will be freed
* by the loop at the end after our last use of vp.
*/
log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
error);
vrele_count = 1;
/*
* First, clear this vnode from being used by any processes in the
* system.
* XXX - If one process gets an EPERM writing to the vnode, should
* we really do this? Other processes might have suitable
* credentials for the operation.
*/
cred = NULL;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
}
PROC_UNLOCK(p);
/* The credential is released only after the process lock is dropped. */
if (cred != NULL) {
crfree(cred);
cred = NULL;
}
}
sx_sunlock(&allproc_lock);
/* Drop our own reference plus one for each process that was detached. */
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
/*
 * Decide whether td may change the ktrace state of targetp; returns 1 if
 * so and 0 otherwise.  The target must not possess more permissions than
 * the caller.  KTRFAC_ROOT on the target means root set its tracing
 * status, in which case only a caller passing the PRIV_KTRACE check may
 * change it.  The target's process lock must be held.
 */
static int
ktrcanset(td, targetp)
	struct thread *td;
	struct proc *targetp;
{

	PROC_LOCK_ASSERT(targetp, MA_OWNED);

	/* Root-set traces may only be changed by a privileged caller. */
	if ((targetp->p_traceflag & KTRFAC_ROOT) != 0 &&
	    priv_check(td, PRIV_KTRACE) != 0)
		return (0);

	/* Otherwise the caller must be allowed to debug the target. */
	return (p_candebug(td, targetp) == 0 ? 1 : 0);
}
#endif /* KTRACE */
Index: stable/8/sys/sys/ktrace.h
===================================================================
--- stable/8/sys/sys/ktrace.h (revision 220261)
+++ stable/8/sys/sys/ktrace.h (revision 220262)
@@ -1,222 +1,239 @@
/*-
* Copyright (c) 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ktrace.h 8.1 (Berkeley) 6/2/93
* $FreeBSD$
*/
#ifndef _SYS_KTRACE_H_
#define _SYS_KTRACE_H_
/*
* operations to ktrace system call (KTROP(op))
*/
#define KTROP_SET 0 /* set trace points */
#define KTROP_CLEAR 1 /* clear trace points */
#define KTROP_CLEARFILE 2 /* stop all tracing to file */
#define KTROP(o) ((o)&3) /* macro to extract operation */
/*
* flags (ORed in with operation)
*/
#define KTRFLAG_DESCEND 4 /* perform op on all children too */
/*
* ktrace record header
*/
struct ktr_header {
int ktr_len; /* length of buf */
short ktr_type; /* trace record type */
pid_t ktr_pid; /* process id */
char ktr_comm[MAXCOMLEN+1]; /* command name */
struct timeval ktr_time; /* timestamp */
intptr_t ktr_tid; /* was ktr_buffer */
};
/*
 * Test for kernel trace point (MP SAFE).
 *
 * KTRCHECK() just checks that the type is enabled and is only for
 * internal use in the ktrace subsystem.  KTRPOINT() checks against
 * ktrace recursion as well as checking that the type is enabled and
 * is the public interface.
 *
 * Both macros evaluate to non-zero when the trace point is active.  The
 * type argument is parenthesised in the expansion so that an expression
 * argument (for example a conditional) yields the intended bit mask.
 */
#define	KTRCHECK(td, type)	((td)->td_proc->p_traceflag & (1 << (type)))
#define	KTRPOINT(td, type)						\
	(KTRCHECK((td), (type)) && !((td)->td_pflags & TDP_INKTRACE))
#define KTRCHECKDRAIN(td) (!(STAILQ_EMPTY(&(td)->td_proc->p_ktr)))
#define KTRUSERRET(td) do { \
if (KTRCHECKDRAIN(td)) \
ktruserret(td); \
} while (0)
#define KTRPROCEXIT(td) do { \
if (KTRCHECKDRAIN(td)) \
ktrprocexit(td); \
} while (0)
/*
* ktrace record types
*/
/*
* KTR_SYSCALL - system call record
*/
#define KTR_SYSCALL 1
struct ktr_syscall {
short ktr_code; /* syscall number */
short ktr_narg; /* number of arguments */
/*
* followed by ktr_narg register_t
*/
register_t ktr_args[1];
};
/*
* KTR_SYSRET - return from system call record
*/
#define KTR_SYSRET 2
struct ktr_sysret {
short ktr_code;
short ktr_eosys;
int ktr_error;
register_t ktr_retval;
};
/*
* KTR_NAMEI - namei record
*/
#define KTR_NAMEI 3
/* record contains pathname */
/*
* KTR_GENIO - trace generic process i/o
*/
#define KTR_GENIO 4
struct ktr_genio {
int ktr_fd;
enum uio_rw ktr_rw;
/*
* followed by data successfully read/written
*/
};
/*
* KTR_PSIG - trace processed signal
*/
#define KTR_PSIG 5
struct ktr_psig {
int signo;
sig_t action;
int code;
sigset_t mask;
};
/*
* KTR_CSW - trace context switches
*/
#define KTR_CSW 6
struct ktr_csw {
int out; /* 1 if switch out, 0 if switch in */
int user; /* 1 if usermode (ivcsw), 0 if kernel (vcsw) */
};
/*
* KTR_USER - data coming from userland
*/
#define KTR_USER_MAXLEN 2048 /* maximum length of passed data */
#define KTR_USER 7
/*
* KTR_STRUCT - misc. structs
*/
#define KTR_STRUCT 8
struct sockaddr;
struct stat;
+struct sysentvec;
/*
* KTR_SYSCTL - name of a sysctl MIB
*/
#define KTR_SYSCTL 9
/* record contains null-terminated MIB name */
/*
+ * KTR_PROCCTOR - trace process creation (multiple ABI support)
+ */
+#define KTR_PROCCTOR 10
+struct ktr_proc_ctor {
+ u_int sv_flags; /* struct sysentvec sv_flags copy */
+};
+
+/*
+ * KTR_PROCDTOR - trace process destruction (multiple ABI support)
+ */
+#define KTR_PROCDTOR 11
+
+/*
* KTR_DROP - If this bit is set in ktr_type, then at least one event
* between the previous record and this record was dropped.
*/
#define KTR_DROP 0x8000
/*
* kernel trace points (in p_traceflag)
*/
#define KTRFAC_MASK 0x00ffffff
#define KTRFAC_SYSCALL (1<<KTR_SYSCALL)
#define KTRFAC_SYSRET (1<<KTR_SYSRET)
#define KTRFAC_NAMEI (1<<KTR_NAMEI)
#define KTRFAC_GENIO (1<<KTR_GENIO)
#define KTRFAC_PSIG (1<<KTR_PSIG)
#define KTRFAC_CSW (1<<KTR_CSW)
#define KTRFAC_USER (1<<KTR_USER)
#define KTRFAC_STRUCT (1<<KTR_STRUCT)
#define KTRFAC_SYSCTL (1<<KTR_SYSCTL)
+#define KTRFAC_PROCCTOR (1<<KTR_PROCCTOR)
+#define KTRFAC_PROCDTOR (1<<KTR_PROCDTOR)
/*
* trace flags (also in p_traceflags)
*/
#define KTRFAC_ROOT 0x80000000 /* root set this trace */
#define KTRFAC_INHERIT 0x40000000 /* pass trace flags to children */
#define KTRFAC_DROP 0x20000000 /* last event was dropped */
#ifdef _KERNEL
void ktrnamei(char *);
void ktrcsw(int, int);
void ktrpsig(int, sig_t, sigset_t *, int);
void ktrgenio(int, enum uio_rw, struct uio *, int);
void ktrsyscall(int, int narg, register_t args[]);
void ktrsysctl(int *name, u_int namelen);
void ktrsysret(int, int, register_t);
+void ktrprocctor(struct proc *);
void ktrprocexec(struct proc *, struct ucred **, struct vnode **);
void ktrprocexit(struct thread *);
void ktrprocfork(struct proc *, struct proc *);
void ktruserret(struct thread *);
void ktrstruct(const char *, size_t, void *, size_t);
#define ktrsockaddr(s) \
ktrstruct("sockaddr", 8, (s), ((struct sockaddr *)(s))->sa_len)
#define ktrstat(s) \
ktrstruct("stat", 4, (s), sizeof(struct stat))
#else
#include <sys/cdefs.h>
__BEGIN_DECLS
int ktrace(const char *, int, int, pid_t);
int utrace(const void *, size_t);
__END_DECLS
#endif
#endif
Index: stable/8/sys
===================================================================
--- stable/8/sys (revision 220261)
+++ stable/8/sys (revision 220262)
Property changes on: stable/8/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys:r219041-219042,219311-219312

File Metadata

Mime Type
text/x-c
Expires
Tue, Oct 14, 10:33 PM (2 d)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
23695468
Default Alt Text
(100 KB)

Event Timeline