Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F132010964
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
100 KB
Referenced Files
None
Subscribers
None
View Options
Index: stable/8/sys/amd64/include/xen
===================================================================
--- stable/8/sys/amd64/include/xen (revision 220261)
+++ stable/8/sys/amd64/include/xen (revision 220262)
Property changes on: stable/8/sys/amd64/include/xen
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/amd64/include/xen:r219041-219042,219311-219312
Index: stable/8/sys/cddl/contrib/opensolaris
===================================================================
--- stable/8/sys/cddl/contrib/opensolaris (revision 220261)
+++ stable/8/sys/cddl/contrib/opensolaris (revision 220262)
Property changes on: stable/8/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/cddl/contrib/opensolaris:r219041-219042,219311-219312
Index: stable/8/sys/contrib/dev/acpica
===================================================================
--- stable/8/sys/contrib/dev/acpica (revision 220261)
+++ stable/8/sys/contrib/dev/acpica (revision 220262)
Property changes on: stable/8/sys/contrib/dev/acpica
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/contrib/dev/acpica:r219041-219042,219311-219312
Index: stable/8/sys/contrib/pf
===================================================================
--- stable/8/sys/contrib/pf (revision 220261)
+++ stable/8/sys/contrib/pf (revision 220262)
Property changes on: stable/8/sys/contrib/pf
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/contrib/pf:r219041-219042,219311-219312
Index: stable/8/sys/kern/kern_exec.c
===================================================================
--- stable/8/sys/kern/kern_exec.c (revision 220261)
+++ stable/8/sys/kern/kern_exec.c (revision 220262)
@@ -1,1422 +1,1428 @@
/*-
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_hwpmc_hooks.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/acct.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/wait.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/sf_buf.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#include <machine/reg.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_execexit_func_t dtrace_fasttrap_exec;
#endif
/*
 * DTrace SDT probes in the "proc" provider that do_execve() fires:
 *   exec          - fired with the path being executed (args->fname)
 *                   before the image is looked up;
 *   exec-failure  - fired with the error number when the exec fails;
 *   exec-success  - fired with the path once the new image has been set up.
 */
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , exec, exec);
SDT_PROBE_ARGTYPE(proc, kernel, , exec, 0, "char *");
SDT_PROBE_DEFINE(proc, kernel, , exec_failure, exec-failure);
SDT_PROBE_ARGTYPE(proc, kernel, , exec_failure, 0, "int");
SDT_PROBE_DEFINE(proc, kernel, , exec_success, exec-success);
SDT_PROBE_ARGTYPE(proc, kernel, , exec_success, 0, "char *");
/* Kernel malloc type tag "proc-args", described as process arguments. */
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
/* Forward declarations of the file-local handlers and helpers. */
static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
static int do_execve(struct thread *td, struct image_args *args,
struct mac *mac_p);
static void exec_free_args(struct image_args *);
/* kern.ps_strings: read-only, served by sysctl_kern_ps_strings(). */
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
NULL, 0, sysctl_kern_ps_strings, "LU", "");
/* kern.usrstack: read-only, served by sysctl_kern_usrstack(). */
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
NULL, 0, sysctl_kern_usrstack, "LU", "");
/* kern.stackprot: read-only, served by sysctl_kern_stackprot(). */
SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, sysctl_kern_stackprot, "I", "");
/*
 * Upper bound, in bytes, on the size of an argument cache entry: do_execve()
 * only caches the new argument strings when their length plus
 * sizeof(struct pargs) does not exceed this value.
 */
u_long ps_arg_cache_limit = PAGE_SIZE / 16;
SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
&ps_arg_cache_limit, 0, "");
/*
 * When zero (the default), exec_new_vmspace() raises the minimum user
 * address of the new address space to at least PAGE_SIZE.
 */
static int map_at_zero = 0;
TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
"Permit processes to map an object at virtual address 0.");
static int
sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
int error;
p = curproc;
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
unsigned int val;
val = (unsigned int)p->p_sysent->sv_psstrings;
error = SYSCTL_OUT(req, &val, sizeof(val));
} else
#endif
error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
sizeof(p->p_sysent->sv_psstrings));
return error;
}
static int
sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
int error;
p = curproc;
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
unsigned int val;
val = (unsigned int)p->p_sysent->sv_usrstack;
error = SYSCTL_OUT(req, &val, sizeof(val));
} else
#endif
error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
sizeof(p->p_sysent->sv_usrstack));
return error;
}
static int
sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
p = curproc;
return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
sizeof(p->p_sysent->sv_stackprot)));
}
/*
 * Table of registered image activators, walked by do_execve() and rebuilt
 * by exec_register().  Each of the items is a pointer to a
 * `const struct execsw', hence the double pointer here.  The array is
 * terminated by a NULL entry, and the table pointer itself is NULL until
 * the first handler has been registered.
 */
static const struct execsw **execsw;
#ifndef _SYS_SYSPROTO_H_
struct execve_args {
char *fname;
char **argv;
char **envv;
};
#endif
int
execve(td, uap)
struct thread *td;
struct execve_args /* {
char *fname;
char **argv;
char **envv;
} */ *uap;
{
int error;
struct image_args args;
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, NULL);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct fexecve_args {
	int	fd;
	char	**argv;
	char	**envv;
};
#endif

/*
 * fexecve system call entry point: execute the image referred to by an
 * open file descriptor instead of a path.
 *
 * No file name is copied in (fname is NULL, so the segment flag passed to
 * exec_copyin_args() is not used for a path copy); the descriptor is stored
 * in args.fd for do_execve() to resolve.
 *
 * Returns the error from exec_copyin_args() if the copy-in fails, otherwise
 * whatever kern_execve() returns.
 */
int
fexecve(struct thread *td, struct fexecve_args *uap)
{
	int error;
	struct image_args args;

	error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
	    uap->argv, uap->envv);
	if (error == 0) {
		args.fd = uap->fd;
		error = kern_execve(td, &args, NULL);
	}
	return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct __mac_execve_args {
	char	*fname;
	char	**argv;
	char	**envv;
	struct mac	*mac_p;
};
#endif

/*
 * __mac_execve system call entry point: like execve(), but additionally
 * passes the caller-supplied MAC label pointer (uap->mac_p) through to
 * kern_execve().
 *
 * In kernels built without MAC the call is not implemented and ENOSYS is
 * returned.
 */
int
__mac_execve(struct thread *td, struct __mac_execve_args *uap)
{
#ifdef MAC
	struct image_args iargs;
	int rv;

	rv = exec_copyin_args(&iargs, uap->fname, UIO_USERSPACE,
	    uap->argv, uap->envv);
	if (rv != 0)
		return (rv);
	return (kern_execve(td, &iargs, uap->mac_p));
#else
	return (ENOSYS);
#endif
}
/*
 * Common exec entry used by the system call wrappers.
 *
 * For a process flagged P_HADTHREADS, the other threads are first brought
 * to single-threading at a boundary; if that fails, the argument buffer is
 * released and ERESTART is returned so the call is retried.  After
 * do_execve() the single-threading state is either upgraded to SINGLE_EXIT
 * (success, forcing the other threads to exit) or ended (failure).
 *
 * XXX: kern_execve has the astonishing property of not always returning to
 * the caller.  If sufficiently bad things happen during the call to
 * do_execve(), it can end up calling exit1(); as a result, callers must
 * avoid doing anything which they might need to undo (e.g., allocating
 * memory).
 */
int
kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
{
	struct proc *proc = td->td_proc;
	int rv, busy;

	AUDIT_ARG_ARGV(args->begin_argv, args->argc,
	    args->begin_envv - args->begin_argv);
	AUDIT_ARG_ENVV(args->begin_envv, args->envc,
	    args->endp - args->begin_envv);

	if ((proc->p_flag & P_HADTHREADS) != 0) {
		PROC_LOCK(proc);
		busy = thread_single(SINGLE_BOUNDARY);
		PROC_UNLOCK(proc);
		if (busy != 0) {
			exec_free_args(args);
			return (ERESTART);	/* Try again later. */
		}
	}

	rv = do_execve(td, args, mac_p);

	/* The flag is deliberately re-read after do_execve(). */
	if ((proc->p_flag & P_HADTHREADS) != 0) {
		PROC_LOCK(proc);
		if (rv != 0)
			thread_single_end();
		else
			thread_single(SINGLE_EXIT);
		PROC_UNLOCK(proc);
	}
	return (rv);
}
/*
* In-kernel implementation of execve(). All arguments are assumed to be
* userspace pointers from the passed thread.
*/
static int
do_execve(td, args, mac_p)
struct thread *td;
struct image_args *args;
struct mac *mac_p;
{
struct proc *p = td->td_proc;
struct nameidata nd;
struct ucred *newcred = NULL, *oldcred;
struct uidinfo *euip;
register_t *stack_base;
int error, i;
struct image_params image_params, *imgp;
struct vattr attr;
int (*img_first)(struct image_params *);
struct pargs *oldargs = NULL, *newargs = NULL;
struct sigacts *oldsigacts, *newsigacts;
#ifdef KTRACE
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
#endif
struct vnode *textvp = NULL, *binvp = NULL;
int credential_changing;
int vfslocked;
int textset;
#ifdef MAC
struct label *interpvplabel = NULL;
int will_transition;
#endif
#ifdef HWPMC_HOOKS
struct pmckern_procexec pe;
#endif
static const char fexecv_proc_title[] = "(fexecv)";
vfslocked = 0;
imgp = &image_params;
/*
* Lock the process and set the P_INEXEC flag to indicate that
* it should be left alone until we're done here. This is
* necessary to avoid race conditions - e.g. in ptrace() -
* that might allow a local user to illicitly obtain elevated
* privileges.
*/
PROC_LOCK(p);
KASSERT((p->p_flag & P_INEXEC) == 0,
("%s(): process already has P_INEXEC flag", __func__));
p->p_flag |= P_INEXEC;
PROC_UNLOCK(p);
/*
* Initialize part of the common data
*/
imgp->proc = p;
imgp->execlabel = NULL;
imgp->attr = &attr;
imgp->entry_addr = 0;
imgp->vmspace_destroyed = 0;
imgp->interpreted = 0;
imgp->opened = 0;
imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX;
imgp->auxargs = NULL;
imgp->vp = NULL;
imgp->object = NULL;
imgp->firstpage = NULL;
imgp->ps_strings = 0;
imgp->auxarg_size = 0;
imgp->args = args;
imgp->execpath = imgp->freepath = NULL;
imgp->execpathp = 0;
#ifdef MAC
error = mac_execve_enter(imgp, mac_p);
if (error)
goto exec_fail;
#endif
imgp->image_header = NULL;
/*
* Translate the file name. namei() returns a vnode pointer
* in ni_vp amoung other things.
*
* XXXAUDIT: It would be desirable to also audit the name of the
* interpreter if this is an interpreted binary.
*/
if (args->fname != NULL) {
NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
| MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
}
SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );
interpret:
if (args->fname != NULL) {
error = namei(&nd);
if (error)
goto exec_fail;
vfslocked = NDHASGIANT(&nd);
binvp = nd.ni_vp;
imgp->vp = binvp;
} else {
AUDIT_ARG_FD(args->fd);
error = fgetvp(td, args->fd, &binvp);
if (error)
goto exec_fail;
vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
AUDIT_ARG_VNODE1(binvp);
imgp->vp = binvp;
}
/*
* Check file permissions (also 'opens' file)
*/
error = exec_check_permissions(imgp);
if (error)
goto exec_fail_dealloc;
imgp->object = imgp->vp->v_object;
if (imgp->object != NULL)
vm_object_reference(imgp->object);
/*
* Set VV_TEXT now so no one can write to the executable while we're
* activating it.
*
* Remember if this was set before and unset it in case this is not
* actually an executable image.
*/
textset = imgp->vp->v_vflag & VV_TEXT;
imgp->vp->v_vflag |= VV_TEXT;
error = exec_map_first_page(imgp);
if (error)
goto exec_fail_dealloc;
imgp->proc->p_osrel = 0;
/*
* If the current process has a special image activator it
* wants to try first, call it. For example, emulating shell
* scripts differently.
*/
error = -1;
if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
error = img_first(imgp);
/*
* Loop through the list of image activators, calling each one.
* An activator returns -1 if there is no match, 0 on success,
* and an error otherwise.
*/
for (i = 0; error == -1 && execsw[i]; ++i) {
if (execsw[i]->ex_imgact == NULL ||
execsw[i]->ex_imgact == img_first) {
continue;
}
error = (*execsw[i]->ex_imgact)(imgp);
}
if (error) {
if (error == -1) {
if (textset == 0)
imgp->vp->v_vflag &= ~VV_TEXT;
error = ENOEXEC;
}
goto exec_fail_dealloc;
}
/*
* Special interpreter operation, cleanup and loop up to try to
* activate the interpreter.
*/
if (imgp->interpreted) {
exec_unmap_first_page(imgp);
/*
* VV_TEXT needs to be unset for scripts. There is a short
* period before we determine that something is a script where
* VV_TEXT will be set. The vnode lock is held over this
* entire period so nothing should illegitimately be blocked.
*/
imgp->vp->v_vflag &= ~VV_TEXT;
/* free name buffer and old vnode */
if (args->fname != NULL)
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
mac_execve_interpreter_enter(binvp, &interpvplabel);
#endif
if (imgp->opened) {
VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
imgp->opened = 0;
}
vput(binvp);
vm_object_deallocate(imgp->object);
imgp->object = NULL;
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
/* set new name to that of the interpreter */
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
UIO_SYSSPACE, imgp->interpreter_name, td);
args->fname = imgp->interpreter_name;
goto interpret;
}
/*
* NB: We unlock the vnode here because it is believed that none
* of the sv_copyout_strings/sv_fixup operations require the vnode.
*/
VOP_UNLOCK(imgp->vp, 0);
/*
* Do the best to calculate the full path to the image file.
*/
if (imgp->auxargs != NULL &&
((args->fname != NULL && args->fname[0] == '/') ||
vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
imgp->execpath = args->fname;
/*
* Copy out strings (args and env) and initialize stack base
*/
if (p->p_sysent->sv_copyout_strings)
stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
else
stack_base = exec_copyout_strings(imgp);
/*
* If custom stack fixup routine present for this process
* let it do the stack setup.
* Else stuff argument count as first item on stack
*/
if (p->p_sysent->sv_fixup != NULL)
(*p->p_sysent->sv_fixup)(&stack_base, imgp);
else
suword(--stack_base, imgp->args->argc);
/*
* For security and other reasons, the file descriptor table cannot
* be shared after an exec.
*/
fdunshare(p, td);
/*
* Malloc things before we need locks.
*/
newcred = crget();
euip = uifind(attr.va_uid);
i = imgp->args->begin_envv - imgp->args->begin_argv;
/* Cache arguments if they fit inside our allowance */
if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
newargs = pargs_alloc(i);
bcopy(imgp->args->begin_argv, newargs->ar_args, i);
}
/* close files on exec */
fdcloseexec(td);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
/* Get a reference to the vnode prior to locking the proc */
VREF(binvp);
/*
* For security and other reasons, signal handlers cannot
* be shared after an exec. The new process gets a copy of the old
* handlers. In execsigs(), the new process will have its signals
* reset.
*/
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
if (sigacts_shared(p->p_sigacts)) {
oldsigacts = p->p_sigacts;
PROC_UNLOCK(p);
newsigacts = sigacts_alloc();
sigacts_copy(newsigacts, oldsigacts);
PROC_LOCK(p);
p->p_sigacts = newsigacts;
} else
oldsigacts = NULL;
/* Stop profiling */
stopprofclock(p);
/* reset caught signals */
execsigs(p);
/* name this process - nameiexec(p, ndp) */
bzero(p->p_comm, sizeof(p->p_comm));
if (args->fname)
bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
/*
* mark as execed, wakeup the process that vforked (if any) and tell
* it that it now has its own resources back
*/
p->p_flag |= P_EXEC;
if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
p->p_flag &= ~P_PPWAIT;
cv_broadcast(&p->p_pwait);
}
/*
* Implement image setuid/setgid.
*
* Don't honor setuid/setgid if the filesystem prohibits it or if
* the process is being traced.
*
* XXXMAC: For the time being, use NOSUID to also prohibit
* transitions on the file system.
*/
credential_changing = 0;
credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
attr.va_uid;
credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
attr.va_gid;
#ifdef MAC
will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
interpvplabel, imgp);
credential_changing |= will_transition;
#endif
if (credential_changing &&
(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
(p->p_flag & P_TRACED) == 0) {
/*
* Turn off syscall tracing for set-id programs, except for
* root. Record any set-id flags first to make sure that
* we do not regain any tracing during a possible block.
*/
setsugid(p);
#ifdef KTRACE
if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0))
ktrprocexec(p, &tracecred, &tracevp);
#endif
/*
* Close any file descriptors 0..2 that reference procfs,
* then make sure file descriptors 0..2 are in use.
*
* setugidsafety() may call closef() and then pfind()
* which may grab the process lock.
* fdcheckstd() may call falloc() which may block to
* allocate memory, so temporarily drop the process lock.
*/
PROC_UNLOCK(p);
VOP_UNLOCK(imgp->vp, 0);
setugidsafety(td);
error = fdcheckstd(td);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
if (error != 0)
goto done1;
PROC_LOCK(p);
/*
* Set the new credentials.
*/
if (attr.va_mode & S_ISUID)
change_euid(newcred, euip);
if (attr.va_mode & S_ISGID)
change_egid(newcred, attr.va_gid);
#ifdef MAC
if (will_transition) {
mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
interpvplabel, imgp);
}
#endif
/*
* Implement correct POSIX saved-id behavior.
*
* XXXMAC: Note that the current logic will save the
* uid and gid if a MAC domain transition occurs, even
* though maybe it shouldn't.
*/
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
} else {
if (oldcred->cr_uid == oldcred->cr_ruid &&
oldcred->cr_gid == oldcred->cr_rgid)
p->p_flag &= ~P_SUGID;
/*
* Implement correct POSIX saved-id behavior.
*
* XXX: It's not clear that the existing behavior is
* POSIX-compliant. A number of sources indicate that the
* saved uid/gid should only be updated if the new ruid is
* not equal to the old ruid, or the new euid is not equal
* to the old euid and the new euid is not equal to the old
* ruid. The FreeBSD code always updates the saved uid/gid.
* Also, this code uses the new (replaced) euid and egid as
* the source, which may or may not be the right ones to use.
*/
if (oldcred->cr_svuid != oldcred->cr_uid ||
oldcred->cr_svgid != oldcred->cr_gid) {
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
}
}
/*
* Store the vp for use in procfs. This vnode was referenced prior
* to locking the proc lock.
*/
textvp = p->p_textvp;
p->p_textvp = binvp;
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the exec if it
* has declared an interest.
*/
if (dtrace_fasttrap_exec)
dtrace_fasttrap_exec(p);
#endif
/*
* Notify others that we exec'd, and clear the P_INEXEC flag
* as we're now a bona fide freshly-execed process.
*/
KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
p->p_flag &= ~P_INEXEC;
/*
* If tracing the process, trap to debugger so breakpoints
* can be set before the program executes.
* Use tdsignal to deliver signal to current thread, use
* psignal may cause the signal to be delivered to wrong thread
* because that thread will exit, remember we are going to enter
* single thread mode.
*/
if (p->p_flag & P_TRACED)
tdsignal(p, td, SIGTRAP, NULL);
/* clear "fork but no exec" flag, as we _are_ execing */
p->p_acflag &= ~AFORK;
/*
* Free any previous argument cache and replace it with
* the new argument cache, if any.
*/
oldargs = p->p_args;
p->p_args = newargs;
newargs = NULL;
#ifdef HWPMC_HOOKS
/*
* Check if system-wide sampling is in effect or if the
* current process is using PMCs. If so, do exec() time
* processing. This processing needs to happen AFTER the
* P_INEXEC flag is cleared.
*
* The proc lock needs to be released before taking the PMC
* SX.
*/
if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
PROC_UNLOCK(p);
VOP_UNLOCK(imgp->vp, 0);
pe.pm_credentialschanged = credential_changing;
pe.pm_entryaddr = imgp->entry_addr;
PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
} else
PROC_UNLOCK(p);
#else /* !HWPMC_HOOKS */
PROC_UNLOCK(p);
#endif
/* Set values passed into the program in registers. */
if (p->p_sysent->sv_setregs)
(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
(u_long)(uintptr_t)stack_base, imgp->ps_strings);
else
exec_setregs(td, imgp->entry_addr,
(u_long)(uintptr_t)stack_base, imgp->ps_strings);
vfs_mark_atime(imgp->vp, td->td_ucred);
SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
done1:
/*
* Free any resources malloc'd earlier that we didn't use.
*/
uifree(euip);
if (newcred == NULL)
crfree(oldcred);
else
crfree(newcred);
VOP_UNLOCK(imgp->vp, 0);
/*
* Handle deferred decrement of ref counts.
*/
if (textvp != NULL) {
int tvfslocked;
tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
vrele(textvp);
VFS_UNLOCK_GIANT(tvfslocked);
}
if (binvp && error != 0)
vrele(binvp);
#ifdef KTRACE
if (tracevp != NULL) {
int tvfslocked;
tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
VFS_UNLOCK_GIANT(tvfslocked);
}
if (tracecred != NULL)
crfree(tracecred);
#endif
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
pargs_drop(oldargs);
pargs_drop(newargs);
if (oldsigacts != NULL)
sigacts_free(oldsigacts);
exec_fail_dealloc:
/*
* free various allocated resources
*/
if (imgp->firstpage != NULL)
exec_unmap_first_page(imgp);
if (imgp->vp != NULL) {
if (args->fname)
NDFREE(&nd, NDF_ONLY_PNBUF);
if (imgp->opened)
VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
vput(imgp->vp);
}
if (imgp->object != NULL)
vm_object_deallocate(imgp->object);
free(imgp->freepath, M_TEMP);
if (error == 0) {
PROC_LOCK(p);
td->td_dbgflags |= TDB_EXEC;
PROC_UNLOCK(p);
/*
* Stop the process here if its stop event mask has
* the S_EXEC bit set.
*/
STOPEVENT(p, S_EXEC, 0);
goto done2;
}
exec_fail:
/* we're done here, clear P_INEXEC */
PROC_LOCK(p);
p->p_flag &= ~P_INEXEC;
PROC_UNLOCK(p);
SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
done2:
#ifdef MAC
mac_execve_exit(imgp);
mac_execve_interpreter_exit(interpvplabel);
#endif
VFS_UNLOCK_GIANT(vfslocked);
exec_free_args(args);
if (error && imgp->vmspace_destroyed) {
/* sorry, no more process anymore. exit gracefully */
exit1(td, W_EXITCODE(0, SIGABRT));
/* NOT REACHED */
}
+
+#ifdef KTRACE
+ if (error == 0)
+ ktrprocctor(p);
+#endif
+
return (error);
}
/*
 * Map the first page of the image file into kernel virtual memory.
 *
 * Any previously mapped first page is released first.  Page 0 of the
 * vnode's VM object is grabbed; if it is not fully valid, up to
 * VM_INITIAL_PAGEIN consecutive pages (bounded by the object size) are
 * gathered and read in through the pager.  On success the page is held,
 * mapped through an sf_buf, and imgp->firstpage / imgp->image_header are
 * set.
 *
 * Returns 0 on success, EACCES if the vnode has no VM object, or EIO if the
 * pager could not supply page 0.
 */
int
exec_map_first_page(struct image_params *imgp)
{
	vm_page_t pages[VM_INITIAL_PAGEIN];
	vm_object_t obj;
	int npages, idx, pager_rv;

	if (imgp->firstpage != NULL)
		exec_unmap_first_page(imgp);

	obj = imgp->vp->v_object;
	if (obj == NULL)
		return (EACCES);

	VM_OBJECT_LOCK(obj);
#if VM_NRESERVLEVEL > 0
	if ((obj->flags & OBJ_COLORED) == 0) {
		obj->flags |= OBJ_COLORED;
		obj->pg_color = 0;
	}
#endif
	pages[0] = vm_page_grab(obj, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
	if (pages[0]->valid != VM_PAGE_BITS_ALL) {
		npages = VM_INITIAL_PAGEIN;
		if (npages > obj->size)
			npages = obj->size;

		/* Collect following pages until one is unusable. */
		for (idx = 1; idx < npages; idx++) {
			pages[idx] = vm_page_next(pages[idx - 1]);
			if (pages[idx] == NULL) {
				pages[idx] = vm_page_alloc(obj, idx,
				    VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
				if (pages[idx] == NULL)
					break;
				continue;
			}
			if (pages[idx]->valid ||
			    (pages[idx]->oflags & VPO_BUSY) ||
			    pages[idx]->busy)
				break;
			vm_page_busy(pages[idx]);
		}

		/* idx is now the number of pages handed to the pager. */
		pager_rv = vm_pager_get_pages(obj, pages, idx, 0);
		pages[0] = vm_page_lookup(obj, 0);
		if (pager_rv != VM_PAGER_OK || pages[0] == NULL) {
			if (pages[0] != NULL) {
				vm_page_lock_queues();
				vm_page_free(pages[0]);
				vm_page_unlock_queues();
			}
			VM_OBJECT_UNLOCK(obj);
			return (EIO);
		}
	}
	vm_page_lock_queues();
	vm_page_hold(pages[0]);
	vm_page_unlock_queues();
	vm_page_wakeup(pages[0]);
	VM_OBJECT_UNLOCK(obj);

	imgp->firstpage = sf_buf_alloc(pages[0], 0);
	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);

	return (0);
}
/*
 * Undo exec_map_first_page(): free the sf_buf mapping recorded in
 * imgp->firstpage, clear that field, and drop the hold on the underlying
 * page.  Does nothing when no first page is currently mapped.
 */
void
exec_unmap_first_page(struct image_params *imgp)
{
	vm_page_t pg;

	if (imgp->firstpage == NULL)
		return;

	pg = sf_buf_page(imgp->firstpage);
	sf_buf_free(imgp->firstpage);
	imgp->firstpage = NULL;

	vm_page_lock_queues();
	vm_page_unhold(pg);
	vm_page_unlock_queues();
}
/*
* Destroy old address space, and allocate a new stack
* The new stack is only SGROWSIZ large because it is grown
* automatically in trap.c.
*/
int
exec_new_vmspace(imgp, sv)
struct image_params *imgp;
struct sysentvec *sv;
{
int error;
struct proc *p = imgp->proc;
struct vmspace *vmspace = p->p_vmspace;
vm_offset_t sv_minuser, stack_addr;
vm_map_t map;
u_long ssiz;
imgp->vmspace_destroyed = 1;
imgp->sysent = sv;
/* May be called with Giant held */
EVENTHANDLER_INVOKE(process_exec, p, imgp);
/*
* Blow away entire process VM, if address space not shared,
* otherwise, create a new VM space so that other threads are
* not disrupted
*/
map = &vmspace->vm_map;
if (map_at_zero)
sv_minuser = sv->sv_minuser;
else
sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
vm_map_max(map) == sv->sv_maxuser) {
shmexit(vmspace);
pmap_remove_pages(vmspace_pmap(vmspace));
vm_map_remove(map, vm_map_min(map), vm_map_max(map));
} else {
error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
if (error)
return (error);
vmspace = p->p_vmspace;
map = &vmspace->vm_map;
}
/* Allocate a new stack */
if (sv->sv_maxssiz != NULL)
ssiz = *sv->sv_maxssiz;
else
ssiz = maxssiz;
stack_addr = sv->sv_usrstack - ssiz;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
if (error)
return (error);
#ifdef __ia64__
/* Allocate a new register stack */
stack_addr = IA64_BACKINGSTORE;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
if (error)
return (error);
#endif
/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
* VM_STACK case, but they are still used to monitor the size of the
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
return (0);
}
/*
 * Append one user-space string to the argument string area of args.
 *
 * ustr is a value fetched from a user pointer vector; the fetch-failure
 * marker (caddr_t)-1 yields EFAULT.  A string that does not fit in the
 * remaining space is reported as E2BIG rather than ENAMETOOLONG.  On
 * success endp and stringspace are advanced past the copied string
 * (including its terminator).
 */
static int
exec_args_add_ustring(struct image_args *args, char *ustr)
{
	size_t copied;
	int rv;

	if (ustr == (caddr_t)-1)
		return (EFAULT);
	rv = copyinstr(ustr, args->endp, args->stringspace, &copied);
	if (rv != 0)
		return (rv == ENAMETOOLONG ? E2BIG : rv);
	args->stringspace -= copied;
	args->endp += copied;
	return (0);
}

/*
 * Copy out argument and environment strings from the old process address
 * space into the temporary string buffer.
 *
 * fname may be NULL (no path is recorded); otherwise it is copied from the
 * address space selected by segflg.  argv must be non-NULL; envv may be
 * NULL.  On failure the buffer is released again and an errno value is
 * returned; on success the caller owns args->buf.
 */
int
exec_copyin_args(struct image_args *args, char *fname,
    enum uio_seg segflg, char **argv, char **envv)
{
	char *ustr;
	size_t pathlen;
	int rv;

	bzero(args, sizeof(*args));
	if (argv == NULL)
		return (EFAULT);

	/*
	 * Allocate temporary demand zeroed space for argument and
	 *	environment strings, laid out as:
	 *
	 * o ARG_MAX for argument and environment;
	 * o PATH_MAX for the file name;
	 * o MAXSHELLCMDLEN for the name of interpreters.
	 */
	args->buf = (char *) kmem_alloc_wait(exec_map,
	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
	if (args->buf == NULL)
		return (ENOMEM);
	args->begin_argv = args->buf;
	args->endp = args->begin_argv;
	args->stringspace = ARG_MAX;

	/*
	 * Copy the file name.
	 */
	if (fname == NULL) {
		args->fname = NULL;
	} else {
		args->fname = args->buf + ARG_MAX;
		if (segflg == UIO_SYSSPACE)
			rv = copystr(fname, args->fname, PATH_MAX, &pathlen);
		else
			rv = copyinstr(fname, args->fname, PATH_MAX,
			    &pathlen);
		if (rv != 0)
			goto err_exit;
	}

	/*
	 * extract arguments first
	 */
	while ((ustr = (caddr_t) (intptr_t) fuword(argv++)) != NULL) {
		rv = exec_args_add_ustring(args, ustr);
		if (rv != 0)
			goto err_exit;
		args->argc++;
	}

	args->begin_envv = args->endp;

	/*
	 * extract environment strings
	 */
	if (envv != NULL) {
		while ((ustr = (caddr_t)(intptr_t)fuword(envv++)) != NULL) {
			rv = exec_args_add_ustring(args, ustr);
			if (rv != 0)
				goto err_exit;
			args->envc++;
		}
	}

	return (0);

err_exit:
	exec_free_args(args);
	return (rv);
}
/*
 * Release the temporary string buffer obtained by exec_copyin_args() and
 * clear args->buf so that a repeated call is harmless.
 */
static void
exec_free_args(struct image_args *args)
{

	if (args->buf == NULL)
		return;

	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
	args->buf = NULL;
}
/*
 * Copy strings out to the new process address space, constructing new arg
 * and env vector tables. Return a pointer to the base so that it can be used
 * as the initial stack pointer.
 *
 * Layout written below the ps_strings structure (arginfo), from higher to
 * lower addresses: the signal trampoline code (if any), the executable path
 * (only when both execpath and auxargs are set), then, after
 * SPARE_USRSPACE bytes, the packed argument/environment strings at destp,
 * and finally the pointer vector table at vectp.
 *
 * NOTE(review): the return values of copyout() and suword() are not checked
 * anywhere in this function.
 */
register_t *
exec_copyout_strings(imgp)
struct image_params *imgp;
{
int argc, envc;
char **vectp;
char *stringp, *destp;
register_t *stack_base;
struct ps_strings *arginfo;
struct proc *p;
size_t execpath_len;
int szsigcode;
/*
* Calculate string base and vector table pointers.
* Also deal with signal trampoline code for this exec type.
*/
if (imgp->execpath != NULL && imgp->auxargs != NULL)
execpath_len = strlen(imgp->execpath) + 1;
else
execpath_len = 0;
p = imgp->proc;
szsigcode = 0;
arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
if (p->p_sysent->sv_szsigcode != NULL)
szsigcode = *(p->p_sysent->sv_szsigcode);
/* ARG_MAX - stringspace is the number of string bytes actually used. */
destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
roundup(execpath_len, sizeof(char *)) -
roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
/*
* install sigcode
*/
if (szsigcode)
copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
szsigcode), szsigcode);
/*
* Copy the image path for the rtld.
*/
if (execpath_len != 0) {
imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
copyout(imgp->execpath, (void *)imgp->execpathp,
execpath_len);
}
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
*/
if (imgp->auxargs) {
/*
* 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
* lower compatibility.
*/
imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
(AT_COUNT * 2);
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets,and imgp->auxarg_size is room
* for argument of Runtime loader.
*/
/*
 * NOTE(review): execpath_len (a byte count) is added to the slot count
 * here, so it is also scaled by sizeof(char *) -- confirm this
 * over-reservation is intended.
 */
vectp = (char **)(destp - (imgp->args->argc +
imgp->args->envc + 2 + imgp->auxarg_size + execpath_len) *
sizeof(char *));
} else {
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
sizeof(char *));
}
/*
* vectp also becomes our initial stack base
*/
stack_base = (register_t *)vectp;
stringp = imgp->args->begin_argv;
argc = imgp->args->argc;
envc = imgp->args->envc;
/*
* Copy out strings - arguments and environment.
*/
copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
*/
suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
suword(&arginfo->ps_nargvstr, argc);
/*
* Fill in argument portion of vector table.
*/
/*
 * The kernel copy (stringp) is scanned to find each string's end while
 * destp is advanced in step, so destp tracks the matching user address.
 */
for (; argc > 0; --argc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* a null vector table pointer separates the argp's from the envp's */
suword(vectp++, 0);
suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
suword(&arginfo->ps_nenvstr, envc);
/*
* Fill in environment portion of vector table.
*/
for (; envc > 0; --envc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* end of vector table is a null pointer */
suword(vectp, 0);
return (stack_base);
}
/*
 * Check permissions of file to execute.
 *	Called with imgp->vp locked.
 *	Return 0 for success or error code on failure.
 *
 * As a side effect the file attributes are fetched into imgp->attr and, on
 * success, the file is opened for reading and imgp->opened is set.
 */
int
exec_check_permissions(struct image_params *imgp)
{
	struct vnode *vnp = imgp->vp;
	struct vattr *vap = imgp->attr;
	struct thread *curtd = curthread;
	int rv;

	/* Get file attributes */
	rv = VOP_GETATTR(vnp, vap, curtd->td_ucred);
	if (rv != 0)
		return (rv);

#ifdef MAC
	rv = mac_vnode_check_exec(curtd->td_ucred, imgp->vp, imgp);
	if (rv != 0)
		return (rv);
#endif

	/* File execution must not be disabled for this filesystem. */
	if (vnp->v_mount->mnt_flag & MNT_NOEXEC)
		return (EACCES);

	/*
	 * Ensure that at least one execute bit is on - otherwise root
	 * will always succeed, and we don't want that to happen unless the
	 * file really is executable.
	 */
	if ((vap->va_mode & 0111) == 0)
		return (EACCES);

	/* Ensure that the file is a regular file. */
	if (vap->va_type != VREG)
		return (EACCES);

	/*
	 * Zero length files can't be exec'd
	 */
	if (vap->va_size == 0)
		return (ENOEXEC);

	/*
	 *  Check for execute permission to file based on current credentials.
	 */
	rv = VOP_ACCESS(vnp, VEXEC, curtd->td_ucred, curtd);
	if (rv != 0)
		return (rv);

	/*
	 * Check number of open-for-writes on the file and deny execution
	 * if there are any.
	 */
	if (vnp->v_writecount != 0)
		return (ETXTBSY);

	/*
	 * Call filesystem specific open routine (which does nothing in the
	 * general case).
	 */
	rv = VOP_OPEN(vnp, FREAD, curtd->td_ucred, curtd, NULL);
	if (rv == 0)
		imgp->opened = 1;
	return (rv);
}
/*
 * Exec handler registration
 *
 * Append execsw_arg to the global NULL-terminated execsw[] table.  A new
 * table one slot larger is allocated, the existing entries are copied in
 * their current order, the new handler is placed last, and the old table
 * is released.
 *
 * Always returns 0: the allocation uses M_WAITOK, which sleeps until
 * memory is available instead of returning NULL, so there is no ENOMEM
 * path to report.
 */
int
exec_register(const struct execsw *execsw_arg)
{
	const struct execsw **es, **xs, **newexecsw;
	int count = 2;	/* New slot and trailing NULL */

	if (execsw)
		for (es = execsw; *es; es++)
			count++;
	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	xs = newexecsw;
	if (execsw)
		for (es = execsw; *es; es++)
			*xs++ = *es;
	*xs++ = execsw_arg;
	*xs = NULL;
	if (execsw)
		free(execsw, M_TEMP);
	execsw = newexecsw;
	return (0);
}
/*
 * Exec handler removal
 *
 * Remove every occurrence of execsw_arg from the global NULL-terminated
 * execsw[] table by building a replacement table without it.
 *
 * Returns 0 on success or ENOENT when execsw_arg is not registered.
 * Panics if no table has ever been installed.  The allocation uses
 * M_WAITOK and therefore cannot fail.
 */
int
exec_unregister(const struct execsw *execsw_arg)
{
	const struct execsw **es, **xs, **newexecsw;
	int count = 1;	/* Trailing NULL */
	int found = 0;

	if (execsw == NULL)
		panic("unregister with no handlers left?\n");

	/*
	 * Single pass: count the entries that survive and remember whether
	 * the handler being removed is present at all.
	 */
	for (es = execsw; *es; es++) {
		if (*es == execsw_arg)
			found = 1;
		else
			count++;
	}
	if (!found)
		return (ENOENT);

	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	xs = newexecsw;
	for (es = execsw; *es; es++)
		if (*es != execsw_arg)
			*xs++ = *es;
	*xs = NULL;
	/* execsw is known to be non-NULL here (checked above). */
	free(execsw, M_TEMP);
	execsw = newexecsw;
	return (0);
}
Index: stable/8/sys/kern/kern_fork.c
===================================================================
--- stable/8/sys/kern/kern_fork.c (revision 220261)
+++ stable/8/sys/kern/kern_fork.c (revision 220262)
@@ -1,928 +1,928 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_kstack_pages.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/sdt.h>
#include <sys/sx.h>
#include <sys/signalvar.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_fork_func_t dtrace_fasttrap_fork;
#endif
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , create, create);
SDT_PROBE_ARGTYPE(proc, kernel, , create, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , create, 1, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , create, 2, "int");
#ifndef _SYS_SYSPROTO_H_
struct fork_args {
int dummy;
};
#endif
/*
 * fork() system call: create a new process with a private copy of the
 * descriptor table (RFFDG | RFPROC).
 *
 * On success the child's pid is stored in td->td_retval[0] and
 * td->td_retval[1] is cleared.  Returns whatever fork1() returns.
 * uap is unused.
 */
/* ARGSUSED */
int
fork(struct thread *td, struct fork_args *uap)
{
	struct proc *p2;
	int error;

	error = fork1(td, RFFDG | RFPROC, 0, &p2);
	if (error == 0) {
		td->td_retval[0] = p2->p_pid;
		td->td_retval[1] = 0;
	}
	return (error);
}
/*
 * vfork() system call.
 *
 * Normally requests RFFDG | RFPROC | RFPPWAIT | RFMEM from fork1(); when
 * the kernel is built with XEN the RFPPWAIT and RFMEM bits are left out,
 * so the request is the same one fork() makes.
 *
 * On success the child's pid is stored in td->td_retval[0] and
 * td->td_retval[1] is cleared.  Returns whatever fork1() returns.
 * uap is unused.
 */
/* ARGSUSED */
int
vfork(struct thread *td, struct vfork_args *uap)
{
	struct proc *p2;
	int error, flags;

#ifdef XEN
	flags = RFFDG | RFPROC; /* validate that this is still an issue */
#else
	flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
#endif
	error = fork1(td, flags, 0, &p2);
	if (error == 0) {
		td->td_retval[0] = p2->p_pid;
		td->td_retval[1] = 0;
	}
	return (error);
}
/*
 * rfork() system call: fork1() with caller-supplied RF* flags.
 *
 * Requests containing any RFKERNELONLY bit are rejected with EINVAL.
 * fork1() hands back a NULL proc pointer when RFPROC is not set, in
 * which case 0 is reported as the pid in td->td_retval[0].
 */
int
rfork(struct thread *td, struct rfork_args *uap)
{
	struct proc *p2;
	int error;

	/* Don't allow kernel-only flags. */
	if ((uap->flags & RFKERNELONLY) != 0)
		return (EINVAL);

	AUDIT_ARG_FFLAGS(uap->flags);
	error = fork1(td, uap->flags, 0, &p2);
	if (error == 0) {
		td->td_retval[0] = p2 ? p2->p_pid : 0;
		td->td_retval[1] = 0;
	}
	return (error);
}
int nprocs = 1; /* process 0 */
int lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
"Last used PID");
/*
* Random component to lastpid generation. We mix in a random factor to make
* it a little harder to predict. We sanity check the modulus value to avoid
* doing it in critical paths. Don't let it be too small or we pointlessly
* waste randomness entropy, and don't let it be impossibly large. Using a
* modulus that is too big causes a LOT more process table scans and slows
* down fork processing as the pidchecked caching is defeated.
*/
static int randompid = 0;
/*
 * Handler for the kern.randompid sysctl.
 *
 * Reads report the current modulus.  Writes are clamped before being
 * stored: negative or too-large values become PID_MAX - 100, values
 * below 2 disable randomization (0), and values below 100 are raised
 * to 100.  allproc_lock is held exclusively around the access.
 */
static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
	int modulus, rv;

	rv = sysctl_wire_old_buffer(req, sizeof(int));
	if (rv != 0)
		return(rv);

	sx_xlock(&allproc_lock);
	modulus = randompid;
	rv = sysctl_handle_int(oidp, &modulus, 0, req);
	if (rv != 0 || req->newptr == NULL) {
		/* Failed, or a pure read: nothing to store. */
		sx_xunlock(&allproc_lock);
		return (rv);
	}

	if (modulus < 0 || modulus > PID_MAX - 100) {
		/* out of range */
		modulus = PID_MAX - 100;
	} else if (modulus < 2) {
		/* NOP */
		modulus = 0;
	} else if (modulus < 100) {
		/* Make it reasonable */
		modulus = 100;
	}
	randompid = modulus;
	sx_xunlock(&allproc_lock);
	return (rv);
}
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
/*
 * fork1(): common back end used by fork(), vfork() and rfork().
 *
 * td     - the forking thread; its process becomes p1 (the parent).
 * flags  - mask of RF* bits selecting what is shared, copied or cleared.
 * pages  - kernel stack size in pages for the new thread; 0 selects
 *          KSTACK_PAGES.
 * procp  - receives the new proc pointer on success.  When RFPROC is not
 *          set no process is created and *procp is set to NULL.
 *
 * Returns 0 on success.  Errors produced here are EINVAL (RFFDG and
 * RFCFDG both set), ERESTART (thread_single() failed), ENOMEM and EAGAIN;
 * in the no-RFPROC case an error from vm_forkproc() is passed through.
 *
 * NOTE(review): the EINVAL and ERESTART returns leave *procp untouched.
 */
int
fork1(td, flags, pages, procp)
struct thread *td;
int flags;
int pages;
struct proc **procp;
{
struct proc *p1, *p2, *pptr;
struct proc *newproc;
int ok, p2_held, trypid;
static int curfail, pidchecked = 0;
static struct timeval lastfail;
struct filedesc *fd;
struct filedesc_to_leader *fdtol;
struct thread *td2;
struct sigacts *newsigacts;
struct vmspace *vm2;
vm_ooffset_t mem_charged;
int error;
/* Can't copy and clear. */
if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
return (EINVAL);
p2_held = 0;
p1 = td->td_proc;
/*
* Here we don't create a new process, but we divorce
* certain parts of a process from itself.
*/
if ((flags & RFPROC) == 0) {
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
if (thread_single(SINGLE_BOUNDARY)) {
PROC_UNLOCK(p1);
return (ERESTART);
}
PROC_UNLOCK(p1);
}
error = vm_forkproc(td, NULL, NULL, NULL, flags);
if (error)
goto norfproc_fail;
/*
* Close all file descriptors.
*/
if (flags & RFCFDG) {
struct filedesc *fdtmp;
fdtmp = fdinit(td->td_proc->p_fd);
fdfree(td);
p1->p_fd = fdtmp;
}
/*
* Unshare file descriptors (from parent).
*/
if (flags & RFFDG)
fdunshare(p1, td);
/* Reached on success as well; undoes the thread_single() taken above. */
norfproc_fail:
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
thread_single_end();
PROC_UNLOCK(p1);
}
*procp = NULL;
return (error);
}
/*
* XXX
* We did have single-threading code here
* however it proved un-needed and caused problems
*/
mem_charged = 0;
vm2 = NULL;
if (pages == 0)
pages = KSTACK_PAGES;
/* Allocate new proc. */
newproc = uma_zalloc(proc_zone, M_WAITOK);
/*
* newproc may come back from the zone with a thread already linked; in
* that case only its kernel stack is (re)allocated, when it is missing
* or has a different size than requested.
*/
td2 = FIRST_THREAD_IN_PROC(newproc);
if (td2 == NULL) {
td2 = thread_alloc(pages);
if (td2 == NULL) {
error = ENOMEM;
goto fail1;
}
proc_linkup(newproc, td2);
} else {
if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
if (td2->td_kstack != 0)
vm_thread_dispose(td2);
if (!thread_alloc_stack(td2, pages)) {
error = ENOMEM;
goto fail1;
}
}
}
/* Without RFMEM the child gets its own copy of the parent's vmspace. */
if ((flags & RFMEM) == 0) {
vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
if (vm2 == NULL) {
error = ENOMEM;
goto fail1;
}
if (!swap_reserve(mem_charged)) {
/*
* The swap reservation failed. The accounting
* from the entries of the copied vm2 will be
* subtracted in vmspace_free(), so force the
* reservation there.
*/
swap_reserve_force(mem_charged);
error = ENOMEM;
goto fail1;
}
} else
vm2 = NULL;
#ifdef MAC
mac_proc_init(newproc);
#endif
knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
STAILQ_INIT(&newproc->p_ktr);
/* We have to lock the process tree while we look for a pid. */
sx_slock(&proctree_lock);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create. Don't allow
* a nonprivileged user to use the last ten processes; don't let root
* exceed the limit. The variable nprocs is the current number of
* processes, maxproc is the limit.
*/
sx_xlock(&allproc_lock);
if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
error = EAGAIN;
goto fail;
}
/*
* Increment the count of procs running with this uid. Don't allow
* a nonprivileged user to exceed their current limit.
*
* XXXRW: Can we avoid privilege here if it's not needed?
*/
error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
if (error == 0)
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
else {
PROC_LOCK(p1);
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
lim_cur(p1, RLIMIT_NPROC));
PROC_UNLOCK(p1);
}
if (!ok) {
error = EAGAIN;
goto fail;
}
/*
* Increment the nprocs resource before blocking can occur. There
* are hard-limits as to the number of processes that can run.
*/
nprocs++;
/*
* Find an unused process ID. We remember a range of unused IDs
* ready to use (from lastpid+1 through pidchecked-1).
*
* If RFHIGHPID is set (used during system boot), do not allocate
* low-numbered pids.
*/
trypid = lastpid + 1;
if (flags & RFHIGHPID) {
if (trypid < 10)
trypid = 10;
} else {
if (randompid)
trypid += arc4random() % randompid;
}
retry:
/*
* If the process ID prototype has wrapped around,
* restart somewhat above 0, as the low-numbered procs
* tend to include daemons that don't exit.
*/
if (trypid >= PID_MAX) {
trypid = trypid % PID_MAX;
if (trypid < 100)
trypid += 100;
pidchecked = 0;
}
if (trypid >= pidchecked) {
int doingzomb = 0;
pidchecked = PID_MAX;
/*
* Scan the active and zombie procs to check whether this pid
* is in use. Remember the lowest pid that's greater
* than trypid, so we can avoid checking for a while.
*/
p2 = LIST_FIRST(&allproc);
again:
for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
/* A candidate collides if it matches a pid, pgrp id or session id. */
while (p2->p_pid == trypid ||
(p2->p_pgrp != NULL &&
(p2->p_pgrp->pg_id == trypid ||
(p2->p_session != NULL &&
p2->p_session->s_sid == trypid)))) {
trypid++;
if (trypid >= pidchecked)
goto retry;
}
if (p2->p_pid > trypid && pidchecked > p2->p_pid)
pidchecked = p2->p_pid;
if (p2->p_pgrp != NULL) {
if (p2->p_pgrp->pg_id > trypid &&
pidchecked > p2->p_pgrp->pg_id)
pidchecked = p2->p_pgrp->pg_id;
if (p2->p_session != NULL &&
p2->p_session->s_sid > trypid &&
pidchecked > p2->p_session->s_sid)
pidchecked = p2->p_session->s_sid;
}
}
if (!doingzomb) {
doingzomb = 1;
p2 = LIST_FIRST(&zombproc);
goto again;
}
}
sx_sunlock(&proctree_lock);
/*
* RFHIGHPID does not mess with the lastpid counter during boot.
*/
if (flags & RFHIGHPID)
pidchecked = 0;
else
lastpid = trypid;
p2 = newproc;
p2->p_state = PRS_NEW; /* protect against others */
p2->p_pid = trypid;
AUDIT_ARG_PID(p2->p_pid);
LIST_INSERT_HEAD(&allproc, p2, p_list);
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
PROC_LOCK(p2);
PROC_LOCK(p1);
sx_xunlock(&allproc_lock);
bcopy(&p1->p_startcopy, &p2->p_startcopy,
__rangeof(struct proc, p_startcopy, p_endcopy));
pargs_hold(p2->p_args);
PROC_UNLOCK(p1);
bzero(&p2->p_startzero,
__rangeof(struct proc, p_startzero, p_endzero));
p2->p_ucred = crhold(td->td_ucred);
/* Tell the prison that we exist. */
prison_proc_hold(p2->p_ucred->cr_prison);
PROC_UNLOCK(p2);
/*
* Malloc things while we don't hold any locks.
*/
if (flags & RFSIGSHARE)
newsigacts = NULL;
else
newsigacts = sigacts_alloc();
/*
* Copy filedesc.
*/
if (flags & RFCFDG) {
fd = fdinit(p1->p_fd);
fdtol = NULL;
} else if (flags & RFFDG) {
fd = fdcopy(p1->p_fd);
fdtol = NULL;
} else {
fd = fdshare(p1->p_fd);
if (p1->p_fdtol == NULL)
p1->p_fdtol =
filedesc_to_leader_alloc(NULL,
NULL,
p1->p_leader);
if ((flags & RFTHREAD) != 0) {
/*
* Shared file descriptor table and
* shared process leaders.
*/
fdtol = p1->p_fdtol;
FILEDESC_XLOCK(p1->p_fd);
fdtol->fdl_refcount++;
FILEDESC_XUNLOCK(p1->p_fd);
} else {
/*
* Shared file descriptor table, and
* different process leaders
*/
fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
p1->p_fd,
p2);
}
}
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
PROC_LOCK(p2);
PROC_LOCK(p1);
bzero(&td2->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
bzero(&td2->td_rux, sizeof(td2->td_rux));
td2->td_map_def_user = NULL;
td2->td_dbg_forked = 0;
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
td2->td_sigstk = td->td_sigstk;
td2->td_sigmask = td->td_sigmask;
td2->td_flags = TDF_INMEM;
#ifdef VIMAGE
td2->td_vnet = NULL;
td2->td_vnet_lpush = NULL;
#endif
/*
* Allow the scheduler to initialize the child.
*/
thread_lock(td);
sched_fork(td, td2);
thread_unlock(td);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
*/
p2->p_flag = P_INMEM;
p2->p_swtick = ticks;
if (p1->p_flag & P_PROFIL)
startprofclock(p2);
td2->td_ucred = crhold(p2->p_ucred);
if (flags & RFSIGSHARE) {
p2->p_sigacts = sigacts_hold(p1->p_sigacts);
} else {
sigacts_copy(newsigacts, p1->p_sigacts);
p2->p_sigacts = newsigacts;
}
if (flags & RFLINUXTHPN)
p2->p_sigparent = SIGUSR1;
else
p2->p_sigparent = SIGCHLD;
p2->p_textvp = p1->p_textvp;
p2->p_fd = fd;
p2->p_fdtol = fdtol;
/*
* p_limit is copy-on-write. Bump its refcount.
*/
lim_fork(p1, p2);
pstats_fork(p1->p_stats, p2->p_stats);
PROC_UNLOCK(p1);
PROC_UNLOCK(p2);
/* Bump references to the text vnode (for procfs) */
if (p2->p_textvp)
vref(p2->p_textvp);
/*
* Set up linkage for kernel based threading.
*/
if ((flags & RFTHREAD) != 0) {
mtx_lock(&ppeers_lock);
p2->p_peers = p1->p_peers;
p1->p_peers = p2;
p2->p_leader = p1->p_leader;
mtx_unlock(&ppeers_lock);
PROC_LOCK(p1->p_leader);
if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
PROC_UNLOCK(p1->p_leader);
/*
* The task leader is exiting, so process p1 is
* going to be killed shortly. Since p1 obviously
* isn't dead yet, we know that the leader is either
* sending SIGKILL's to all the processes in this
* task or is sleeping waiting for all the peers to
* exit. We let p1 complete the fork, but we need
* to go ahead and kill the new process p2 since
* the task leader may not get a chance to send
* SIGKILL to it. We leave it on the list so that
* the task leader will wait for this new process
* to commit suicide.
*/
PROC_LOCK(p2);
psignal(p2, SIGKILL);
PROC_UNLOCK(p2);
} else
PROC_UNLOCK(p1->p_leader);
} else {
p2->p_peers = NULL;
p2->p_leader = p2;
}
sx_xlock(&proctree_lock);
PGRP_LOCK(p1->p_pgrp);
PROC_LOCK(p2);
PROC_LOCK(p1);
/*
* Preserve some more flags in subprocess. P_PROFIL has already
* been preserved.
*/
p2->p_flag |= p1->p_flag & P_SUGID;
td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
SESS_LOCK(p1->p_session);
if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
p2->p_flag |= P_CONTROLT;
SESS_UNLOCK(p1->p_session);
if (flags & RFPPWAIT)
p2->p_flag |= P_PPWAIT;
p2->p_pgrp = p1->p_pgrp;
LIST_INSERT_AFTER(p1, p2, p_pglist);
PGRP_UNLOCK(p1->p_pgrp);
LIST_INIT(&p2->p_children);
callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
/*
* (diff) This revision removes the ktrprocfork() call from this point,
* where proctree_lock and both proc locks are held, and re-adds it
* further down after PROC_UNLOCK(p2).
*/
-#ifdef KTRACE
- ktrprocfork(p1, p2);
-#endif
-
/*
* If PF_FORK is set, the child process inherits the
* procfs ioctl flags from its parent.
*/
if (p1->p_pfsflags & PF_FORK) {
p2->p_stops = p1->p_stops;
p2->p_pfsflags = p1->p_pfsflags;
}
/*
* This begins the section where we must prevent the parent
* from being swapped.
*/
_PHOLD(p1);
PROC_UNLOCK(p1);
/*
* Attach the new process to its parent.
*
* If RFNOWAIT is set, the newly created process becomes a child
* of init. This effectively disassociates the child from the
* parent.
*/
if (flags & RFNOWAIT)
pptr = initproc;
else
pptr = p1;
p2->p_pptr = pptr;
LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
sx_xunlock(&proctree_lock);
/* Inform accounting that we have forked. */
p2->p_acflag = AFORK;
PROC_UNLOCK(p2);
+
+#ifdef KTRACE
+ ktrprocfork(p1, p2);
+#endif
/*
* Finish creating the child process. It will return via a different
* execution path later. (ie: directly into user mode)
*/
vm_forkproc(td, p2, td2, vm2, flags);
/* Account the fork under the statistic matching the exact flag set. */
if (flags == (RFFDG | RFPROC)) {
PCPU_INC(cnt.v_forks);
PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
PCPU_INC(cnt.v_vforks);
PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (p1 == &proc0) {
PCPU_INC(cnt.v_kthreads);
PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else {
PCPU_INC(cnt.v_rforks);
PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
}
/*
* Both processes are set up, now check if any loadable modules want
* to adjust anything.
* What if they have an error? XXX
*/
EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
/*
* Set the child start time and mark the process as being complete.
*/
microuptime(&p2->p_stats->p_start);
PROC_SLOCK(p2);
p2->p_state = PRS_NORMAL;
PROC_SUNLOCK(p2);
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the new process
* if it has registered an interest. We have to do this only after
* p_state is PRS_NORMAL since the fasttrap module will use pfind()
* later on.
*/
if (dtrace_fasttrap_fork) {
PROC_LOCK(p1);
PROC_LOCK(p2);
dtrace_fasttrap_fork(p1, p2);
PROC_UNLOCK(p2);
PROC_UNLOCK(p1);
}
#endif
PROC_LOCK(p1);
if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
P_FOLLOWFORK)) {
/*
* Arrange for debugger to receive the fork event.
*
* We can report PL_FLAG_FORKED regardless of
* P_FOLLOWFORK settings, but it does not make a sense
* for runaway child.
*/
td->td_dbgflags |= TDB_FORK;
td->td_dbg_forked = p2->p_pid;
PROC_LOCK(p2);
td2->td_dbgflags |= TDB_STOPATFORK;
_PHOLD(p2);
p2_held = 1;
PROC_UNLOCK(p2);
}
if ((flags & RFSTOPPED) == 0) {
/*
* If RFSTOPPED not requested, make child runnable and
* add to run queue.
*/
thread_lock(td2);
TD_SET_CAN_RUN(td2);
sched_add(td2, SRQ_BORING);
thread_unlock(td2);
}
/*
* Now can be swapped.
*/
_PRELE(p1);
PROC_UNLOCK(p1);
/*
* Tell any interested parties about the new process.
*/
knote_fork(&p1->p_klist, p2->p_pid);
SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
/*
* Wait until debugger is attached to child.
*/
PROC_LOCK(p2);
while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
cv_wait(&p2->p_dbgwait, &p2->p_mtx);
if (p2_held)
_PRELE(p2);
/*
* Preserve synchronization semantics of vfork. If waiting for
* child to exec or exit, set P_PPWAIT on child, and sleep on our
* proc (in case of exit).
*/
while (p2->p_flag & P_PPWAIT)
cv_wait(&p2->p_pwait, &p2->p_mtx);
PROC_UNLOCK(p2);
/*
* Return child proc pointer to parent.
*/
*procp = p2;
return (0);
/*
* fail: reached with proctree_lock held shared and allproc_lock held
* exclusively; both are dropped here before falling into fail1.
*/
fail:
sx_sunlock(&proctree_lock);
if (ppsratecheck(&lastfail, &curfail, 1))
printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
td->td_ucred->cr_ruid);
sx_xunlock(&allproc_lock);
#ifdef MAC
mac_proc_destroy(newproc);
#endif
/*
* fail1: release the copied vmspace (if any) and the proc, then pause()
* for hz / 2 ticks before returning the error to the caller.
*/
fail1:
if (vm2 != NULL)
vmspace_free(vm2);
uma_zfree(proc_zone, newproc);
pause("fork", hz / 2);
return (error);
}
/*
 * Handle the return of a child process from fork1().  This function
 * is called from the MD fork_trampoline() entry point.
 *
 * callout - function to run in the new thread; must not be NULL.
 * arg     - first argument handed to callout.
 * frame   - trap frame handed to callout as its second argument.
 *
 * If callout returns in a process flagged P_KTHREAD, the event is logged
 * and kproc_exit(0) is called.  Otherwise the schedtail event handlers
 * are invoked before returning to the caller.
 */
void
fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
    struct trapframe *frame)
{
	struct proc *p;
	struct thread *td;
	struct thread *dtd;

	td = curthread;
	p = td->td_proc;
	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));

	CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
	    td, td->td_sched, p->p_pid, td->td_name);

	sched_fork_exit(td);
	/*
	 * Processes normally resume in mi_switch() after being
	 * cpu_switch()'ed to, but when children start up they arrive here
	 * instead, so we must do much the same things as mi_switch() would.
	 */
	dtd = PCPU_GET(deadthread);
	if (dtd != NULL) {
		PCPU_SET(deadthread, NULL);
		thread_stash(dtd);
	}
	thread_unlock(td);

	/*
	 * cpu_set_fork_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 * initproc has its own fork handler, but it does return.
	 */
	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
	callout(arg, frame);

	/*
	 * Check if a kernel thread misbehaved and returned from its main
	 * function.
	 */
	if (p->p_flag & P_KTHREAD) {
		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
		    td->td_name, p->p_pid);
		kproc_exit(0);
	}
	mtx_assert(&Giant, MA_NOTOWNED);

	EVENTHANDLER_INVOKE(schedtail, p);
}
/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  Giant is not held on entry, and must not
 * be held on return.  This function is passed in to fork_exit() as the
 * first parameter and is called when returning to a new userland process.
 *
 * When the thread carries TDB_STOPATFORK (set by fork1() for a traced
 * parent with P_FOLLOWFORK), the child either attaches itself to the
 * parent's debugger and stops, or clears the request and wakes anyone
 * waiting on p_dbgwait.
 */
void
fork_return(struct thread *td, struct trapframe *frame)
{
	struct proc *p, *dbg;

	if (td->td_dbgflags & TDB_STOPATFORK) {
		p = td->td_proc;
		sx_xlock(&proctree_lock);
		PROC_LOCK(p);
		if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
		    (P_TRACED | P_FOLLOWFORK)) {
			/*
			 * If debugger still wants auto-attach for the
			 * parent's children, do it now.
			 */
			dbg = p->p_pptr->p_pptr;
			p->p_flag |= P_TRACED;
			p->p_oppid = p->p_pptr->p_pid;
			proc_reparent(p, dbg);
			sx_xunlock(&proctree_lock);
			ptracestop(td, SIGSTOP);
		} else {
			/*
			 * ... otherwise clear the request.
			 */
			sx_xunlock(&proctree_lock);
			td->td_dbgflags &= ~TDB_STOPATFORK;
			cv_broadcast(&p->p_dbgwait);
		}
		PROC_UNLOCK(p);
	}

	userret(td, frame);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET))
		ktrsysret(SYS_fork, 0, 0);
#endif
	mtx_assert(&Giant, MA_NOTOWNED);
}
Index: stable/8/sys/kern/kern_ktrace.c
===================================================================
--- stable/8/sys/kern/kern_ktrace.c (revision 220261)
+++ stable/8/sys/kern/kern_ktrace.c (revision 220262)
@@ -1,1167 +1,1221 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California.
* Copyright (c) 2005 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/ktrace.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
+#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <security/mac/mac_framework.h>
/*
* The ktrace facility allows the tracing of certain key events in user space
* processes, such as system calls, signal delivery, context switches, and
* user generated events using utrace(2). It works by streaming event
* records and data to a vnode associated with the process using the
* ktrace(2) system call. In general, records can be written directly from
* the context that generates the event. One important exception to this is
* during a context switch, where sleeping is not permitted. To handle this
* case, trace events are generated using in-kernel ktr_request records, and
* then delivered to disk at a convenient moment -- either immediately, the
* next traceable event, at system call return, or at process exit.
*
* When dealing with multiple threads or processes writing to the same event
* log, ordering guarantees are weak: specifically, if an event has multiple
* records (i.e., system call enter and return), they may be interlaced with
* records from another event. Process and thread ID information is provided
* in the record, and user applications can de-interlace events if required.
*/
static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
#ifdef KTRACE
#ifndef KTRACE_REQUEST_POOL
#define KTRACE_REQUEST_POOL 100
#endif
/*
* In-kernel trace request record.  Records of this type are kept on the
* ktr_free list (linked through ktr_list) until they are handed out.
* The '+' line below is added by this revision of the diff.
*/
struct ktr_request {
/* Record header; ktr_type and ktr_time are filled in when a request is taken. */
struct ktr_header ktr_header;
/* NOTE(review): presumably an optional out-of-line payload; its use is not visible in this chunk. */
void *ktr_buffer;
/* Fixed-size payload, one member per record type that has one. */
union {
+ struct ktr_proc_ctor ktr_proc_ctor;
struct ktr_syscall ktr_syscall;
struct ktr_sysret ktr_sysret;
struct ktr_genio ktr_genio;
struct ktr_psig ktr_psig;
struct ktr_csw ktr_csw;
} ktr_data;
/* Singly-linked tail queue linkage (see ktr_free). */
STAILQ_ENTRY(ktr_request) ktr_list;
};
/*
* Size of the fixed payload for each record type, listed in the order
* given by the trailing comments; types with no fixed payload have 0.
* NOTE(review): the consumers are outside this chunk - presumably the
* table is indexed by the record type value; confirm that the two entries
* added by this revision ('+' lines) match the KTR_PROCCTOR/KTR_PROCDTOR
* numbering.
*/
static int data_lengths[] = {
0, /* none */
offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */
sizeof(struct ktr_sysret), /* KTR_SYSRET */
0, /* KTR_NAMEI */
sizeof(struct ktr_genio), /* KTR_GENIO */
sizeof(struct ktr_psig), /* KTR_PSIG */
sizeof(struct ktr_csw), /* KTR_CSW */
0, /* KTR_USER */
0, /* KTR_STRUCT */
0, /* KTR_SYSCTL */
+ sizeof(struct ktr_proc_ctor), /* KTR_PROCCTOR */
+ 0, /* KTR_PROCDTOR */
};
static STAILQ_HEAD(, ktr_request) ktr_free;
static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
static u_int ktr_geniosize = PAGE_SIZE;
TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
0, "Maximum size of genio event payload");
static int print_message = 1;
static struct mtx ktrace_mtx;
static struct sx ktrace_sx;
static void ktrace_init(void *dummy);
static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
-static u_int ktrace_resize_pool(u_int newsize);
+static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
+static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
static struct ktr_request *ktr_getrequest(int type);
static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
static void ktr_freeproc(struct proc *p, struct ucred **uc,
struct vnode **vp);
static void ktr_freerequest(struct ktr_request *req);
static void ktr_freerequest_locked(struct ktr_request *req);
static void ktr_writerequest(struct thread *td, struct ktr_request *req);
static int ktrcanset(struct thread *,struct proc *);
static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
+static void ktrprocctor_entered(struct thread *, struct proc *);
/*
 * ktrace itself generates events, such as context switches, which we do not
 * wish to trace.  Maintain a flag, TDP_INKTRACE, on each thread to determine
 * whether or not it is in a region where tracing of events should be
 * suppressed.
 *
 * ktrace_enter() marks the start of such a region; the flag must not
 * already be set.
 */
static void
ktrace_enter(struct thread *td)
{

	KASSERT((td->td_pflags & TDP_INKTRACE) == 0, ("ktrace_enter: flag set"));
	td->td_pflags = td->td_pflags | TDP_INKTRACE;
}
/*
 * Mark the end of a region in which tracing is suppressed for td by
 * clearing TDP_INKTRACE; the flag must currently be set.
 */
static void
ktrace_exit(struct thread *td)
{

	KASSERT((td->td_pflags & TDP_INKTRACE) != 0, ("ktrace_exit: flag not set"));
	td->td_pflags = td->td_pflags & ~TDP_INKTRACE;
}
/*
 * Assert that td currently has TDP_INKTRACE set.
 */
static void
ktrace_assert(struct thread *td)
{

	KASSERT((td->td_pflags & TDP_INKTRACE) != 0, ("ktrace_assert: flag not set"));
}
/*
 * Boot-time setup for ktrace: initialize ktrace_mtx and ktrace_sx, then
 * fill the ktr_free list with ktr_requestpool preallocated request
 * records.  The dummy argument is unused.
 */
static void
ktrace_init(void *dummy)
{
	struct ktr_request *entry;
	int made;

	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
	sx_init(&ktrace_sx, "ktrace_sx");
	STAILQ_INIT(&ktr_free);

	made = 0;
	while (made < ktr_requestpool) {
		entry = malloc(sizeof(*entry), M_KTRACE, M_WAITOK);
		STAILQ_INSERT_HEAD(&ktr_free, entry, ktr_list);
		made++;
	}
}
SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
/*
* Sysctl handler for the request_pool node under kern.ktrace.
*
* A read reports the current value of ktr_requestpool.  A write asks
* ktrace_resize_pool() to resize the pool to the supplied value and
* reports the size the pool had before the resize.  Returns ENOSPC when
* growth was requested and the resulting size is still below the request;
* otherwise 0 or the error from SYSCTL_IN()/SYSCTL_OUT().
*
* (diff) '-' lines are the previous code, which sampled and resized the
* pool under ktrace_mtx; the '+' line passes the sampled size to
* ktrace_resize_pool(), which in the new code takes the mutex itself.
*/
static int
sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
{
struct thread *td;
u_int newsize, oldsize, wantsize;
int error;
/* Handle easy read-only case first to avoid warnings from GCC. */
if (!req->newptr) {
- mtx_lock(&ktrace_mtx);
oldsize = ktr_requestpool;
- mtx_unlock(&ktrace_mtx);
return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
}
error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
if (error)
return (error);
td = curthread;
/* Suppress tracing of events generated while the pool is resized. */
ktrace_enter(td);
- mtx_lock(&ktrace_mtx);
oldsize = ktr_requestpool;
- newsize = ktrace_resize_pool(wantsize);
- mtx_unlock(&ktrace_mtx);
+ newsize = ktrace_resize_pool(oldsize, wantsize);
ktrace_exit(td);
error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
if (error)
return (error);
if (wantsize > oldsize && newsize < wantsize)
return (ENOSPC);
return (0);
}
SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
&ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", "");
/*
 * Resize the pool of free request objects and return the resulting value of
 * ktr_requestpool.
 *
 * In the revised (+) version the caller passes the size it observed
 * (oldsize) and the function takes ktrace_mtx itself: when shrinking, it
 * frees entries from ktr_free until the target is reached or the free list
 * is empty; when growing, it allocates the new entries on a private list
 * before taking the mutex and then splices them onto ktr_free.
 */
static u_int
-ktrace_resize_pool(u_int newsize)
+ktrace_resize_pool(u_int oldsize, u_int newsize)
{
+ STAILQ_HEAD(, ktr_request) ktr_new;
struct ktr_request *req;
int bound;
- mtx_assert(&ktrace_mtx, MA_OWNED);
/* Re-arm the "Out of ktrace request objects." message. */
print_message = 1;
- bound = newsize - ktr_requestpool;
+ bound = newsize - oldsize;
if (bound == 0)
return (ktr_requestpool);
- if (bound < 0)
+ if (bound < 0) {
+ mtx_lock(&ktrace_mtx);
/* Shrink pool down to newsize if possible. */
while (bound++ < 0) {
req = STAILQ_FIRST(&ktr_free);
if (req == NULL)
- return (ktr_requestpool);
+ break;
STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
ktr_requestpool--;
- mtx_unlock(&ktrace_mtx);
free(req, M_KTRACE);
- mtx_lock(&ktrace_mtx);
}
- else
+ } else {
/* Grow pool up to newsize. */
+ STAILQ_INIT(&ktr_new);
while (bound-- > 0) {
- mtx_unlock(&ktrace_mtx);
req = malloc(sizeof(struct ktr_request), M_KTRACE,
M_WAITOK);
- mtx_lock(&ktrace_mtx);
- STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
- ktr_requestpool++;
+ STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list);
}
+ mtx_lock(&ktrace_mtx);
+ STAILQ_CONCAT(&ktr_free, &ktr_new);
+ ktr_requestpool += (newsize - oldsize);
+ }
/* Both the shrink and grow branches reach here holding ktrace_mtx. */
+ mtx_unlock(&ktrace_mtx);
return (ktr_requestpool);
}
/* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
(sizeof((struct thread *)NULL)->td_name));
/*
 * Take a request object from the free pool and fill in its header for a
 * record of the given type on behalf of td.
 *
 * Returns NULL if the type is not enabled for td's process (KTRCHECK) or if
 * the pool is empty.  In the latter case KTRFAC_DROP is set on the process
 * so that the next request successfully obtained is marked with KTR_DROP,
 * and a console message is printed once until print_message is re-armed.
 *
 * In the revised (+) version this function no longer touches TDP_INKTRACE;
 * as the name suggests, the caller is expected to have entered already.
 */
static struct ktr_request *
-ktr_getrequest(int type)
+ktr_getrequest_entered(struct thread *td, int type)
{
struct ktr_request *req;
- struct thread *td = curthread;
struct proc *p = td->td_proc;
int pm;
- ktrace_enter(td); /* XXX: In caller instead? */
mtx_lock(&ktrace_mtx);
if (!KTRCHECK(td, type)) {
mtx_unlock(&ktrace_mtx);
- ktrace_exit(td);
return (NULL);
}
req = STAILQ_FIRST(&ktr_free);
if (req != NULL) {
STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
req->ktr_header.ktr_type = type;
/* Report an earlier dropped event on this record, then clear the mark. */
if (p->p_traceflag & KTRFAC_DROP) {
req->ktr_header.ktr_type |= KTR_DROP;
p->p_traceflag &= ~KTRFAC_DROP;
}
mtx_unlock(&ktrace_mtx);
microtime(&req->ktr_header.ktr_time);
req->ktr_header.ktr_pid = p->p_pid;
req->ktr_header.ktr_tid = td->td_tid;
bcopy(td->td_name, req->ktr_header.ktr_comm,
sizeof(req->ktr_header.ktr_comm));
req->ktr_buffer = NULL;
req->ktr_header.ktr_len = 0;
} else {
p->p_traceflag |= KTRFAC_DROP;
/* Sample and clear the flag under the mutex; print after dropping it. */
pm = print_message;
print_message = 0;
mtx_unlock(&ktrace_mtx);
if (pm)
printf("Out of ktrace request objects.\n");
- ktrace_exit(td);
}
return (req);
}
/*
 * Convenience wrapper for the current thread: enter the ktrace region and
 * obtain a request of the given type.  On failure (NULL) the region is left
 * again before returning.  On success the thread stays marked TDP_INKTRACE
 * and the caller must see to it that ktrace_exit() is eventually called
 * (ktr_submitrequest() does so itself).
 */
+static struct ktr_request *
+ktr_getrequest(int type)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+
+ ktrace_enter(td);
+ req = ktr_getrequest_entered(td, type);
+ if (req == NULL)
+ ktrace_exit(td);
+
+ return (req);
+}
+
/*
 * Some trace generation environments don't permit direct access to VFS,
 * such as during a context switch where sleeping is not allowed. Under these
 * circumstances, queue a request to the thread to be written asynchronously
 * later.
 *
 * The request is appended to the p_ktr queue of td's process under
 * ktrace_mtx.  In the revised version (the removed line below) this
 * function no longer calls ktrace_exit(); leaving the ktrace region is left
 * to the caller.
 */
static void
ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
{
mtx_lock(&ktrace_mtx);
STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
mtx_unlock(&ktrace_mtx);
- ktrace_exit(td);
}
/*
* Drain any pending ktrace records from the per-thread queue to disk. This
* is used both internally before committing other records, and also on
* system call return. We drain all the ones we can find at the time when
* drain is requested, but don't keep draining after that as those events
* may be approximately "after" the current event.
*/
static void
ktr_drain(struct thread *td)
{
struct ktr_request *queued_req;
STAILQ_HEAD(, ktr_request) local_queue;
ktrace_assert(td);
sx_assert(&ktrace_sx, SX_XLOCKED);
STAILQ_INIT(&local_queue); /* XXXRW: needed? */
if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
mtx_lock(&ktrace_mtx);
STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
mtx_unlock(&ktrace_mtx);
while ((queued_req = STAILQ_FIRST(&local_queue))) {
STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
ktr_writerequest(td, queued_req);
ktr_freerequest(queued_req);
}
}
}
/*
 * Submit a trace record for immediate commit to disk -- to be used only
 * where entering VFS is OK. First drain any pending records that may have
 * been cached in the thread.
 *
 * The caller must already be inside the ktrace region; this function
 * consumes the request (it is returned to the free pool) and leaves the
 * region via ktrace_exit() before returning.
 */
static void
ktr_submitrequest(struct thread *td, struct ktr_request *req)
{
ktrace_assert(td);
/* ktrace_sx is held exclusively across the drain and the write. */
sx_xlock(&ktrace_sx);
ktr_drain(td);
ktr_writerequest(td, req);
ktr_freerequest(req);
sx_xunlock(&ktrace_sx);
-
ktrace_exit(td);
}
/*
 * Return a request object to the free pool, taking ktrace_mtx around the
 * locked variant.
 */
static void
ktr_freerequest(struct ktr_request *req)
{

	mtx_lock(&ktrace_mtx);
	ktr_freerequest_locked(req);
	mtx_unlock(&ktrace_mtx);
}
/*
 * Return a request object to the free pool with ktrace_mtx already held.
 * Any variable-length buffer still attached to the request is released
 * first.
 */
static void
ktr_freerequest_locked(struct ktr_request *req)
{

	mtx_assert(&ktrace_mtx, MA_OWNED);

	if (req->ktr_buffer != NULL) {
		free(req->ktr_buffer, M_KTRACE);
	}
	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
}
/*
 * Turn tracing off for process p and hand its tracing resources back.
 *
 * The process's trace credential is stored through uc and, when vp is not
 * NULL, its trace vnode is stored through vp; dropping the references on
 * both is the caller's job.  Every request still queued on the process is
 * returned to the free pool.  Both the process lock and ktrace_mtx must be
 * held.
 */
static void
ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
{
	struct ktr_request *pending;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&ktrace_mtx, MA_OWNED);

	/* Detach the credential and vnode, passing them out to the caller. */
	*uc = p->p_tracecred;
	p->p_tracecred = NULL;
	if (vp != NULL)
		*vp = p->p_tracevp;
	p->p_tracevp = NULL;
	p->p_traceflag = 0;

	/* Discard whatever asynchronous records are still queued. */
	while (!STAILQ_EMPTY(&p->p_ktr)) {
		pending = STAILQ_FIRST(&p->p_ktr);
		STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
		ktr_freerequest_locked(pending);
	}
}
/*
 * Record a system call entry (KTR_SYSCALL) for the current thread.
 *
 * code: value stored in the record's ktr_code field.
 * narg: number of entries in args.
 * args: the arguments; they are copied, so the caller keeps ownership.
 *
 * The argument copy is made before a request object is obtained and is
 * released again if no request is available (type not enabled, or pool
 * exhausted).  Otherwise the record is committed immediately through
 * ktr_submitrequest().
 */
void
ktrsyscall(int code, int narg, register_t args[])
{
	struct ktr_request *req;
	struct ktr_syscall *ktp;
	size_t buflen;
	char *buf = NULL;

	buflen = sizeof(register_t) * narg;
	if (buflen > 0) {
		buf = malloc(buflen, M_KTRACE, M_WAITOK);
		bcopy(args, buf, buflen);
	}
	req = ktr_getrequest(KTR_SYSCALL);
	if (req == NULL) {
		if (buf != NULL)
			free(buf, M_KTRACE);
		return;
	}
	ktp = &req->ktr_data.ktr_syscall;
	ktp->ktr_code = code;
	ktp->ktr_narg = narg;
	if (buflen > 0) {
		/* Ownership of buf passes to the request. */
		req->ktr_header.ktr_len = buflen;
		req->ktr_buffer = buf;
	}
	ktr_submitrequest(curthread, req);
}
void
ktrsysret(code, error, retval)
int code, error;
register_t retval;
{
struct ktr_request *req;
struct ktr_sysret *ktp;
req = ktr_getrequest(KTR_SYSRET);
if (req == NULL)
return;
ktp = &req->ktr_data.ktr_sysret;
ktp->ktr_code = code;
ktp->ktr_error = error;
ktp->ktr_retval = retval; /* what about val2 ? */
ktr_submitrequest(curthread, req);
}
/*
 * Disable tracing when a setuid process execs.  The process lock must be
 * held; the trace credential and vnode are handed back through uc and vp.
 *
 * XXX: Pending asynchronous records are thrown away.
 */
void
ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
{

	PROC_LOCK_ASSERT(p, MA_OWNED);

	mtx_lock(&ktrace_mtx);
	ktr_freeproc(p, uc, vp);
	mtx_unlock(&ktrace_mtx);
}
/*
 * When a process exits, drain per-process asynchronous trace records
 * and disable tracing.
 *
 * Does nothing if the process has no trace flags set.  The trace vnode and
 * credential handed back by ktr_freeproc() are released here, after the
 * process lock and ktrace_mtx have been dropped.
 */
void
ktrprocexit(struct thread *td)
{
+ struct ktr_request *req;
struct proc *p;
struct ucred *cred;
struct vnode *vp;
int vfslocked;
p = td->td_proc;
if (p->p_traceflag == 0)
return;
ktrace_enter(td);
/*
 * Queue a KTR_PROCDTOR record (when that type is enabled) so that the
 * drain just below writes it out together with any pending records.
 */
+ req = ktr_getrequest_entered(td, KTR_PROCDTOR);
+ if (req != NULL)
+ ktr_enqueuerequest(td, req);
sx_xlock(&ktrace_sx);
ktr_drain(td);
sx_xunlock(&ktrace_sx);
PROC_LOCK(p);
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, &vp);
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p);
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (cred != NULL)
crfree(cred);
ktrace_exit(td);
}
/*
 * Queue a KTR_PROCCTOR record carrying p's sysentvec sv_flags.  The calling
 * thread td must already be inside the ktrace region.  The record is
 * obtained for, and queued through, the first thread of p rather than td.
 * Nothing is queued when no request object can be obtained.
 */
+static void
+ktrprocctor_entered(struct thread *td, struct proc *p)
+{
+ struct ktr_proc_ctor *ktp;
+ struct ktr_request *req;
+ struct thread *td2;
+
+ ktrace_assert(td);
+ td2 = FIRST_THREAD_IN_PROC(p);
+ req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
+ if (req == NULL)
+ return;
+ ktp = &req->ktr_data.ktr_proc_ctor;
+ ktp->sv_flags = p->p_sysent->sv_flags;
+ ktr_enqueuerequest(td2, req);
+}
+
/*
 * Public entry point: queue a KTR_PROCCTOR record for process p if any
 * trace facility (KTRFAC_MASK) is enabled on it.  The current thread enters
 * and leaves the ktrace region around the call.
 */
+void
+ktrprocctor(struct proc *p)
+{
+ struct thread *td = curthread;
+
+ if ((p->p_traceflag & KTRFAC_MASK) == 0)
+ return;
+
+ ktrace_enter(td);
+ ktrprocctor_entered(td, p);
+ ktrace_exit(td);
+}
+
/*
 * When a process forks, enable tracing in the new process if needed.
 *
 * p1 is the parent and p2 the child.  The parent's trace flags, vnode and
 * credential are copied to the child (taking a reference on each of the
 * latter two) only when the parent has KTRFAC_INHERIT set.  In the revised
 * (+) version the function locks p1 itself instead of asserting that both
 * process locks are held, and finishes by calling ktrprocctor() on p2.
 */
void
ktrprocfork(struct proc *p1, struct proc *p2)
{
- PROC_LOCK_ASSERT(p1, MA_OWNED);
- PROC_LOCK_ASSERT(p2, MA_OWNED);
+ PROC_LOCK(p1);
mtx_lock(&ktrace_mtx);
KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
if (p1->p_traceflag & KTRFAC_INHERIT) {
p2->p_traceflag = p1->p_traceflag;
if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
VREF(p2->p_tracevp);
KASSERT(p1->p_tracecred != NULL,
("ktrace vnode with no cred"));
p2->p_tracecred = crhold(p1->p_tracecred);
}
}
mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p1);
+
+ ktrprocctor(p2);
}
/*
 * On return from a system call, write out whatever asynchronous records
 * were queued while the call was running.
 */
void
ktruserret(struct thread *td)
{

	ktrace_enter(td);

	/* ktr_drain() requires ktrace_sx to be held exclusively. */
	sx_xlock(&ktrace_sx);
	ktr_drain(td);
	sx_xunlock(&ktrace_sx);

	ktrace_exit(td);
}
/*
 * Record a pathname (KTR_NAMEI) for the current thread.
 *
 * path: NUL-terminated string; its bytes, without the terminator, are
 * copied into the record, so the caller keeps ownership.
 *
 * The copy is released again if no request object can be obtained.  An
 * empty path yields a record with no variable-length data.
 */
void
ktrnamei(char *path)
{
	struct ktr_request *req;
	int namelen;
	char *buf = NULL;

	namelen = strlen(path);
	if (namelen > 0) {
		buf = malloc(namelen, M_KTRACE, M_WAITOK);
		bcopy(path, buf, namelen);
	}
	req = ktr_getrequest(KTR_NAMEI);
	if (req == NULL) {
		if (buf != NULL)
			free(buf, M_KTRACE);
		return;
	}
	if (namelen > 0) {
		/* Ownership of buf passes to the request. */
		req->ktr_header.ktr_len = namelen;
		req->ktr_buffer = buf;
	}
	ktr_submitrequest(curthread, req);
}
/*
 * Record the name of a sysctl MIB (KTR_SYSCTL) for the current thread.
 *
 * name:    the numeric MIB.
 * namelen: number of elements in name; asserted not to exceed CTL_MAXNAME.
 *
 * The MIB is prefixed with {0, 1} and passed to kernel_sysctl(); the output
 * of that query is used as the record's data.  Nothing is recorded if the
 * lookup fails or no request object can be obtained.
 */
void
ktrsysctl(int *name, u_int namelen)
{
	struct ktr_request *req;
	u_int mib[CTL_MAXNAME + 2];
	char *mibname;
	size_t mibnamelen;
	int error;

	/* Lookup name of mib. */
	KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
	mib[0] = 0;
	mib[1] = 1;
	bcopy(name, mib + 2, namelen * sizeof(*name));
	mibnamelen = 128;
	mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
	/* mibnamelen is both the buffer size passed in and the length out. */
	error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
	    NULL, 0, &mibnamelen, 0);
	if (error) {
		free(mibname, M_KTRACE);
		return;
	}
	req = ktr_getrequest(KTR_SYSCTL);
	if (req == NULL) {
		free(mibname, M_KTRACE);
		return;
	}
	/* Ownership of mibname passes to the request. */
	req->ktr_header.ktr_len = mibnamelen;
	req->ktr_buffer = mibname;
	ktr_submitrequest(curthread, req);
}
void
ktrgenio(fd, rw, uio, error)
int fd;
enum uio_rw rw;
struct uio *uio;
int error;
{
struct ktr_request *req;
struct ktr_genio *ktg;
int datalen;
char *buf;
if (error) {
free(uio, M_IOV);
return;
}
uio->uio_offset = 0;
uio->uio_rw = UIO_WRITE;
datalen = imin(uio->uio_resid, ktr_geniosize);
buf = malloc(datalen, M_KTRACE, M_WAITOK);
error = uiomove(buf, datalen, uio);
free(uio, M_IOV);
if (error) {
free(buf, M_KTRACE);
return;
}
req = ktr_getrequest(KTR_GENIO);
if (req == NULL) {
free(buf, M_KTRACE);
return;
}
ktg = &req->ktr_data.ktr_genio;
ktg->ktr_fd = fd;
ktg->ktr_rw = rw;
req->ktr_header.ktr_len = datalen;
req->ktr_buffer = buf;
ktr_submitrequest(curthread, req);
}
/*
 * Record a processed signal (KTR_PSIG) for the current thread.  The record
 * is queued for asynchronous writing rather than committed immediately.
 * In the revised (+) version this function leaves the ktrace region itself
 * after queueing, since ktr_enqueuerequest() no longer does so.
 */
void
ktrpsig(sig, action, mask, code)
int sig;
sig_t action;
sigset_t *mask;
int code;
{
+ struct thread *td = curthread;
struct ktr_request *req;
struct ktr_psig *kp;
req = ktr_getrequest(KTR_PSIG);
if (req == NULL)
return;
kp = &req->ktr_data.ktr_psig;
/* NOTE(review): the cast narrows sig through char before storing it. */
kp->signo = (char)sig;
kp->action = action;
kp->mask = *mask;
kp->code = code;
- ktr_enqueuerequest(curthread, req);
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
}
/*
 * Record a context switch (KTR_CSW) for the current thread.  The record is
 * queued for asynchronous writing rather than committed immediately.  In
 * the revised (+) version this function leaves the ktrace region itself
 * after queueing, since ktr_enqueuerequest() no longer does so.
 */
void
ktrcsw(out, user)
int out, user;
{
+ struct thread *td = curthread;
struct ktr_request *req;
struct ktr_csw *kc;
req = ktr_getrequest(KTR_CSW);
if (req == NULL)
return;
kc = &req->ktr_data.ktr_csw;
kc->out = out;
kc->user = user;
- ktr_enqueuerequest(curthread, req);
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
}
/*
 * Record a named structure (KTR_STRUCT) for the current thread.
 *
 * name, namelen: the structure's name and its length without terminator.
 * data, datalen: the structure's bytes; a NULL data pointer is treated as
 *                zero-length data.
 *
 * The record's variable-length data is the name, a NUL byte, and then the
 * structure bytes.  Both inputs are copied, so the caller keeps ownership.
 */
void
ktrstruct(const char *name, size_t namelen, void *data, size_t datalen)
{
	struct ktr_request *req;
	char *buf;
	size_t buflen;

	if (data == NULL)
		datalen = 0;
	buflen = namelen + 1 + datalen;
	buf = malloc(buflen, M_KTRACE, M_WAITOK);
	bcopy(name, buf, namelen);
	buf[namelen] = '\0';
	/* Never hand a NULL source pointer to bcopy(), even for zero bytes. */
	if (datalen > 0)
		bcopy(data, buf + namelen + 1, datalen);
	req = ktr_getrequest(KTR_STRUCT);
	if (req == NULL) {
		free(buf, M_KTRACE);
		return;
	}
	/* Ownership of buf passes to the request. */
	req->ktr_buffer = buf;
	req->ktr_header.ktr_len = buflen;
	ktr_submitrequest(curthread, req);
}
#endif /* KTRACE */
/* Interface and common routines */
/*
 * Argument structure for the ktrace system call, declared here only when
 * <sys/sysproto.h> has not already provided it.
 */
#ifndef _SYS_SYSPROTO_H_
struct ktrace_args {
char *fname;	/* path of the trace file (user address) */
int ops;	/* operation (see KTROP()) ORed with KTRFLAG_DESCEND */
int facs;	/* trace facilities to set or clear */
int pid;	/* target process, or negated process group id */
};
#endif
/* ARGSUSED */
/*
 * ktrace system call.
 *
 * Sets or clears the trace facilities in uap->facs on a single process
 * (uap->pid non-negative) or on every visible member of a process group
 * (uap->pid negative), optionally descending to children when
 * KTRFLAG_DESCEND is given; or, for KTROP_CLEARFILE, stops all tracing to
 * the named file.
 *
 * Returns 0 on success or an errno value: EINVAL when no facilities are
 * given for an operation other than KTROP_CLEARFILE, EACCES when the trace
 * file is not a regular file, ESRCH when no target is found, EPERM when the
 * caller may not change any target, or the error from opening the file.
 * Without KTRACE compiled in, returns ENOSYS.
 */
int
ktrace(td, uap)
struct thread *td;
register struct ktrace_args *uap;
{
#ifdef KTRACE
register struct vnode *vp = NULL;
register struct proc *p;
struct pgrp *pg;
int facs = uap->facs & ~KTRFAC_ROOT;
int ops = KTROP(uap->ops);
int descend = uap->ops & KTRFLAG_DESCEND;
int nfound, ret = 0;
int flags, error = 0, vfslocked;
struct nameidata nd;
struct ucred *cred;
/*
* Need something to (un)trace.
*/
if (ops != KTROP_CLEARFILE && facs == 0)
return (EINVAL);
ktrace_enter(td);
if (ops != KTROP_CLEAR) {
/*
* an operation which requires a file argument.
*/
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
uap->fname, td);
flags = FREAD | FWRITE | O_NOFOLLOW;
error = vn_open(&nd, &flags, 0, NULL);
if (error) {
ktrace_exit(td);
return (error);
}
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
VOP_UNLOCK(vp, 0);
/* Only regular files are accepted as trace files. */
if (vp->v_type != VREG) {
(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
ktrace_exit(td);
return (EACCES);
}
VFS_UNLOCK_GIANT(vfslocked);
}
/*
* Clear all uses of the tracefile.
*/
if (ops == KTROP_CLEARFILE) {
int vrele_count;
vrele_count = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
if (ktrcanset(td, p)) {
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
crfree(cred);
} else
error = EPERM;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
/* Drop the vnode references only after the process walk is done. */
if (vrele_count > 0) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
goto done;
}
/*
* do it
*/
sx_slock(&proctree_lock);
if (uap->pid < 0) {
/*
* by process group
*/
pg = pgfind(-uap->pid);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
/*
* ktrops() may call vrele(). Lock pg_members
* by the proctree_lock rather than pg_mtx.
*/
PGRP_UNLOCK(pg);
nfound = 0;
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
/* Members the caller cannot see are skipped and not counted. */
if (p_cansee(td, p) != 0) {
PROC_UNLOCK(p);
continue;
}
PROC_UNLOCK(p);
nfound++;
if (descend)
ret |= ktrsetchildren(td, p, ops, facs, vp);
else
ret |= ktrops(td, p, ops, facs, vp);
}
if (nfound == 0) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
} else {
/*
* by pid
*/
p = pfind(uap->pid);
if (p == NULL) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
error = p_cansee(td, p);
/*
* The slock of the proctree lock will keep this process
* from going away, so unlocking the proc here is ok.
*/
PROC_UNLOCK(p);
if (error) {
sx_sunlock(&proctree_lock);
goto done;
}
if (descend)
ret |= ktrsetchildren(td, p, ops, facs, vp);
else
ret |= ktrops(td, p, ops, facs, vp);
}
sx_sunlock(&proctree_lock);
/* ret stays 0 only if the operation was refused for every target. */
if (!ret)
error = EPERM;
done:
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) vn_close(vp, FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
}
ktrace_exit(td);
return (error);
#else /* !KTRACE */
return (ENOSYS);
#endif /* KTRACE */
}
/* ARGSUSED */
/*
 * utrace system call: record caller-supplied data as a KTR_USER record.
 *
 * Returns 0 without doing anything when KTR_USER tracing is not active for
 * the thread, EINVAL when uap->len exceeds KTR_USER_MAXLEN, the copyin()
 * error when the data cannot be fetched, and ENOMEM when no request object
 * can be obtained.  Without KTRACE compiled in, returns ENOSYS.
 */
int
utrace(struct thread *td, struct utrace_args *uap)
{
#ifdef KTRACE
	struct ktr_request *req;
	void *cp;
	int error;

	if (!KTRPOINT(td, KTR_USER))
		return (0);
	if (uap->len > KTR_USER_MAXLEN)
		return (EINVAL);
	cp = malloc(uap->len, M_KTRACE, M_WAITOK);
	error = copyin(uap->addr, cp, uap->len);
	if (error) {
		free(cp, M_KTRACE);
		return (error);
	}
	req = ktr_getrequest(KTR_USER);
	if (req == NULL) {
		free(cp, M_KTRACE);
		return (ENOMEM);
	}
	/* Ownership of cp passes to the request. */
	req->ktr_buffer = cp;
	req->ktr_header.ktr_len = uap->len;
	ktr_submitrequest(td, req);
	return (0);
#else /* !KTRACE */
	return (ENOSYS);
#endif /* KTRACE */
}
#ifdef KTRACE
/*
 * Apply a KTROP_SET or KTROP_CLEAR of the facilities in facs to process p
 * on behalf of td, using vp as the trace file for a set.
 *
 * Returns 0 if td is not permitted to change p's tracing state
 * (ktrcanset()), 1 otherwise.  A vnode or credential displaced from the
 * process is released after the process lock has been dropped.
 */
static int
ktrops(td, p, ops, facs, vp)
struct thread *td;
struct proc *p;
int ops, facs;
struct vnode *vp;
{
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
PROC_LOCK(p);
if (!ktrcanset(td, p)) {
PROC_UNLOCK(p);
return (0);
}
mtx_lock(&ktrace_mtx);
if (ops == KTROP_SET) {
if (p->p_tracevp != vp) {
/*
* if trace file already in use, relinquish below
*/
tracevp = p->p_tracevp;
VREF(vp);
p->p_tracevp = vp;
}
if (p->p_tracecred != td->td_ucred) {
tracecred = p->p_tracecred;
p->p_tracecred = crhold(td->td_ucred);
}
p->p_traceflag |= facs;
/* Remember that a privileged caller set up this trace. */
if (priv_check(td, PRIV_KTRACE) == 0)
p->p_traceflag |= KTRFAC_ROOT;
} else {
/* KTROP_CLEAR */
if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
/* no more tracing */
ktr_freeproc(p, &tracecred, &tracevp);
}
mtx_unlock(&ktrace_mtx);
/* If the process is still traced, queue a KTR_PROCCTOR record for it. */
+ if ((p->p_traceflag & KTRFAC_MASK) != 0)
+ ktrprocctor_entered(td, p);
PROC_UNLOCK(p);
if (tracevp != NULL) {
int vfslocked;
vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (tracecred != NULL)
crfree(tracecred);
return (1);
}
/*
 * Apply ktrops() to process top and to every process below it in the
 * process tree.
 *
 * The caller must hold proctree_lock.  Returns the OR of the individual
 * ktrops() results, i.e. non-zero if the operation was permitted on at
 * least one process.
 */
static int
ktrsetchildren(struct thread *td, struct proc *top, int ops, int facs,
    struct vnode *vp)
{
	struct proc *p;
	int ret = 0;

	p = top;
	sx_assert(&proctree_lock, SX_LOCKED);
	for (;;) {
		ret |= ktrops(td, p, ops, facs, vp);
		/*
		 * If this process has children, descend to them next,
		 * otherwise do any siblings, and if done with this level,
		 * follow back up the tree (but not past top).
		 */
		if (!LIST_EMPTY(&p->p_children))
			p = LIST_FIRST(&p->p_children);
		else for (;;) {
			if (p == top)
				return (ret);
			if (LIST_NEXT(p, p_sibling)) {
				p = LIST_NEXT(p, p_sibling);
				break;
			}
			p = p->p_pptr;
		}
	}
	/*NOTREACHED*/
}
/*
 * Write one request to the trace vnode of td's process: the record header,
 * then the fixed-size per-type data (if the type has any), then the
 * variable-length buffer (if any).
 *
 * The request is silently dropped when the process no longer has a trace
 * vnode.  If the write fails, a notice is logged and tracing is disabled
 * for every process in the system that is using the same vnode.
 */
static void
ktr_writerequest(struct thread *td, struct ktr_request *req)
{
struct ktr_header *kth;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct uio auio;
struct iovec aiov[3];
struct mount *mp;
int datalen, buflen, vrele_count;
int error, vfslocked;
/*
* We hold the vnode and credential for use in I/O in case ktrace is
* disabled on the process as we write out the request.
*
* XXXRW: This is not ideal: we could end up performing a write after
* the vnode has been closed.
*/
mtx_lock(&ktrace_mtx);
vp = td->td_proc->p_tracevp;
cred = td->td_proc->p_tracecred;
/*
* If vp is NULL, the vp has been cleared out from under this
* request, so just drop it. Make sure the credential and vnode are
* in sync: we should have both or neither.
*/
if (vp == NULL) {
KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
mtx_unlock(&ktrace_mtx);
return;
}
VREF(vp);
KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
crhold(cred);
mtx_unlock(&ktrace_mtx);
kth = &req->ktr_header;
KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
sizeof(data_lengths) / sizeof(data_lengths[0]),
("data_lengths array overflow"));
/* Size of the fixed per-type payload, looked up with KTR_DROP masked off. */
datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
buflen = kth->ktr_len;
/* Build the scatter list: header first, then the optional parts. */
auio.uio_iov = &aiov[0];
auio.uio_offset = 0;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_WRITE;
aiov[0].iov_base = (caddr_t)kth;
aiov[0].iov_len = sizeof(struct ktr_header);
auio.uio_resid = sizeof(struct ktr_header);
auio.uio_iovcnt = 1;
auio.uio_td = td;
if (datalen != 0) {
aiov[1].iov_base = (caddr_t)&req->ktr_data;
aiov[1].iov_len = datalen;
auio.uio_resid += datalen;
auio.uio_iovcnt++;
/* The length stored in the header covers everything after the header. */
kth->ktr_len += datalen;
}
if (buflen != 0) {
KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
aiov[auio.uio_iovcnt].iov_len = buflen;
auio.uio_resid += buflen;
auio.uio_iovcnt++;
}
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_write(cred, NOCRED, vp);
if (error == 0)
#endif
error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
crfree(cred);
if (!error) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return;
}
VFS_UNLOCK_GIANT(vfslocked);
/*
* If error encountered, give up tracing on this vnode. We defer
* all the vrele()'s on the vnode until after we are finished walking
* the various lists to avoid needlessly holding locks.
* NB: at this point we still hold the vnode reference that must
* not go away as we need the valid vnode to compare with. Thus let
* vrele_count start at 1 and the reference will be freed
* by the loop at the end after our last use of vp.
*/
log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
error);
vrele_count = 1;
/*
* First, clear this vnode from being used by any processes in the
* system.
* XXX - If one process gets an EPERM writing to the vnode, should
* we really do this? Other processes might have suitable
* credentials for the operation.
*/
cred = NULL;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
}
PROC_UNLOCK(p);
/* Release the credential only once the process lock is dropped. */
if (cred != NULL) {
crfree(cred);
cred = NULL;
}
}
sx_sunlock(&allproc_lock);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
/*
 * Return true if caller has permission to set the ktracing state
 * of target.  Essentially, the target can't possess any
 * more permissions than the caller.  KTRFAC_ROOT signifies that
 * root previously set the tracing status on the target process, and
 * so, only root may further change it.
 *
 * The target's process lock must be held.  Returns 1 when the change is
 * permitted and 0 otherwise.
 */
static int
ktrcanset(struct thread *td, struct proc *targetp)
{

	PROC_LOCK_ASSERT(targetp, MA_OWNED);
	/* A trace set up with privilege may only be changed with privilege. */
	if (targetp->p_traceflag & KTRFAC_ROOT &&
	    priv_check(td, PRIV_KTRACE))
		return (0);

	if (p_candebug(td, targetp) != 0)
		return (0);

	return (1);
}
#endif /* KTRACE */
Index: stable/8/sys/sys/ktrace.h
===================================================================
--- stable/8/sys/sys/ktrace.h (revision 220261)
+++ stable/8/sys/sys/ktrace.h (revision 220262)
@@ -1,222 +1,239 @@
/*-
* Copyright (c) 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ktrace.h 8.1 (Berkeley) 6/2/93
* $FreeBSD$
*/
#ifndef _SYS_KTRACE_H_
#define _SYS_KTRACE_H_
/*
* operations to ktrace system call (KTROP(op))
*/
#define KTROP_SET 0 /* set trace points */
#define KTROP_CLEAR 1 /* clear trace points */
#define KTROP_CLEARFILE 2 /* stop all tracing to file */
#define KTROP(o) ((o)&3) /* macro to extract operation */
/*
* flags (ORed in with operation)
*/
#define KTRFLAG_DESCEND 4 /* perform op on all children too */
/*
* ktrace record header
*
* Written at the start of every trace record, ahead of the per-type data
* and any variable-length buffer.
*/
struct ktr_header {
int ktr_len; /* length of buf */
short ktr_type; /* trace record type */
pid_t ktr_pid; /* process id */
char ktr_comm[MAXCOMLEN+1]; /* command name */
struct timeval ktr_time; /* timestamp */
intptr_t ktr_tid; /* was ktr_buffer */
};
/*
 * Test for kernel trace point (MP SAFE).
 *
 * KTRCHECK() just checks that the type is enabled and is only for
 * internal use in the ktrace subsystem.  KTRPOINT() checks against
 * ktrace recursion as well as checking that the type is enabled and
 * is the public interface.
 *
 * Both macros evaluate td more than once; type is parenthesized so that
 * any expression may be passed without changing the shift's meaning.
 */
#define	KTRCHECK(td, type)	((td)->td_proc->p_traceflag & (1 << (type)))
#define	KTRPOINT(td, type)						\
	(KTRCHECK((td), (type)) && !((td)->td_pflags & TDP_INKTRACE))
/*
 * KTRCHECKDRAIN() is true when the process of td has queued trace records.
 * KTRUSERRET() and KTRPROCEXIT() call ktruserret() and ktrprocexit()
 * respectively, but only when such records exist.
 */
#define KTRCHECKDRAIN(td) (!(STAILQ_EMPTY(&(td)->td_proc->p_ktr)))
#define KTRUSERRET(td) do { \
if (KTRCHECKDRAIN(td)) \
ktruserret(td); \
} while (0)
#define KTRPROCEXIT(td) do { \
if (KTRCHECKDRAIN(td)) \
ktrprocexit(td); \
} while (0)
/*
* ktrace record types
*/
/*
* KTR_SYSCALL - system call record
*/
#define KTR_SYSCALL 1
struct ktr_syscall {
short ktr_code; /* syscall number */
short ktr_narg; /* number of arguments */
/*
* followed by ktr_narg register_t
*/
register_t ktr_args[1];
};
/*
* KTR_SYSRET - return from system call record
*/
#define KTR_SYSRET 2
struct ktr_sysret {
short ktr_code; /* code passed to ktrsysret(); presumably the syscall number */
short ktr_eosys; /* NOTE(review): not set by ktrsysret() in this revision */
int ktr_error; /* error passed to ktrsysret() */
register_t ktr_retval; /* return value passed to ktrsysret() */
};
/*
* KTR_NAMEI - namei record
*/
#define KTR_NAMEI 3
/* record contains pathname */
/*
* KTR_GENIO - trace generic process i/o
*/
#define KTR_GENIO 4
struct ktr_genio {
int ktr_fd;
enum uio_rw ktr_rw;
/*
* followed by data successfully read/written
*/
};
/*
* KTR_PSIG - trace processed signal
*/
#define KTR_PSIG 5
struct ktr_psig {
int signo; /* signal number, as passed to ktrpsig() */
sig_t action; /* action passed to ktrpsig() */
int code; /* code passed to ktrpsig() */
sigset_t mask; /* copy of the signal mask passed to ktrpsig() */
};
/*
* KTR_CSW - trace context switches
*/
#define KTR_CSW 6
struct ktr_csw {
int out; /* 1 if switch out, 0 if switch in */
int user; /* 1 if usermode (ivcsw), 0 if kernel (vcsw) */
};
/*
* KTR_USER - data coming from userland
*/
#define KTR_USER_MAXLEN 2048 /* maximum length of passed data */
#define KTR_USER 7
/*
* KTR_STRUCT - misc. structs
*/
#define KTR_STRUCT 8
struct sockaddr;
struct stat;
+struct sysentvec;
/*
* KTR_SYSCTL - name of a sysctl MIB
*/
#define KTR_SYSCTL 9
/* record contains null-terminated MIB name */
/*
+ * KTR_PROCCTOR - trace process creation (multiple ABI support)
+ */
+#define KTR_PROCCTOR 10
+struct ktr_proc_ctor {
+ u_int sv_flags; /* struct sysentvec sv_flags copy */
+};
+
+/*
+ * KTR_PROCDTOR - trace process destruction (multiple ABI support)
+ */
+#define KTR_PROCDTOR 11
+
+/*
* KTR_DROP - If this bit is set in ktr_type, then at least one event
* between the previous record and this record was dropped.
*/
#define KTR_DROP 0x8000
/*
* kernel trace points (in p_traceflag)
*/
#define KTRFAC_MASK 0x00ffffff
#define KTRFAC_SYSCALL (1<<KTR_SYSCALL)
#define KTRFAC_SYSRET (1<<KTR_SYSRET)
#define KTRFAC_NAMEI (1<<KTR_NAMEI)
#define KTRFAC_GENIO (1<<KTR_GENIO)
#define KTRFAC_PSIG (1<<KTR_PSIG)
#define KTRFAC_CSW (1<<KTR_CSW)
#define KTRFAC_USER (1<<KTR_USER)
#define KTRFAC_STRUCT (1<<KTR_STRUCT)
#define KTRFAC_SYSCTL (1<<KTR_SYSCTL)
+#define KTRFAC_PROCCTOR (1<<KTR_PROCCTOR)
+#define KTRFAC_PROCDTOR (1<<KTR_PROCDTOR)
/*
* trace flags (also in p_traceflags)
*/
#define KTRFAC_ROOT 0x80000000 /* root set this trace */
#define KTRFAC_INHERIT 0x40000000 /* pass trace flags to children */
#define KTRFAC_DROP 0x20000000 /* last event was dropped */
#ifdef _KERNEL
void ktrnamei(char *);
void ktrcsw(int, int);
void ktrpsig(int, sig_t, sigset_t *, int);
void ktrgenio(int, enum uio_rw, struct uio *, int);
void ktrsyscall(int, int narg, register_t args[]);
void ktrsysctl(int *name, u_int namelen);
void ktrsysret(int, int, register_t);
+void ktrprocctor(struct proc *);
void ktrprocexec(struct proc *, struct ucred **, struct vnode **);
void ktrprocexit(struct thread *);
void ktrprocfork(struct proc *, struct proc *);
void ktruserret(struct thread *);
void ktrstruct(const char *, size_t, void *, size_t);
#define ktrsockaddr(s) \
ktrstruct("sockaddr", 8, (s), ((struct sockaddr *)(s))->sa_len)
#define ktrstat(s) \
ktrstruct("stat", 4, (s), sizeof(struct stat))
#else
#include <sys/cdefs.h>
__BEGIN_DECLS
int ktrace(const char *, int, int, pid_t);
int utrace(const void *, size_t);
__END_DECLS
#endif
#endif
Index: stable/8/sys
===================================================================
--- stable/8/sys (revision 220261)
+++ stable/8/sys (revision 220262)
Property changes on: stable/8/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys:r219041-219042,219311-219312
File Metadata
Details
Attached
Mime Type
text/x-c
Expires
Tue, Oct 14, 10:33 PM (2 d)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
23695468
Default Alt Text
(100 KB)
Attached To
Mode
rS FreeBSD src repository - subversion
Attached
Detach File
Event Timeline
Log In to Comment