diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index a943ec339e75..cf067527237e 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1,2102 +1,2103 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", "Location of process' ps_strings structure"); /* XXX This should be vm_size_t. 
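*/
/*
 * [Illustration only, not part of this change.] A userland sketch of how the
 * kern.ps_strings node above and the kern.usrstack node that follows can be
 * read; both are "LU" (u_long) nodes, so sysctlbyname(3) suffices.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    unsigned long ps_strings, usrstack;
    size_t len;

    len = sizeof(ps_strings);
    if (sysctlbyname("kern.ps_strings", &ps_strings, &len, NULL, 0) == -1)
        return (1);
    len = sizeof(usrstack);
    if (sysctlbyname("kern.usrstack", &usrstack, &len, NULL, 0) == -1)
        return (1);
    printf("ps_strings %#lx usrstack %#lx\n", ps_strings, usrstack);
    return (0);
}
#endif
/*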
*/ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", "Top of process stack"); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", "Stack memory permissions"); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, "Process' command line characters cache limit"); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int core_dump_can_intr = 1; SYSCTL_INT(_kern, OID_AUTO, core_dump_can_intr, CTLFLAG_RWTUN, &core_dump_can_intr, 0, "Core dumping interruptible with SIGKILL"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; vm_offset_t ps_strings; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)PROC_PS_STRINGS(p); return (SYSCTL_OUT(req, &val, sizeof(val))); } #endif ps_strings = PROC_PS_STRINGS(p); return (SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings))); } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; vm_offset_t val; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val32; val32 = round_page((unsigned int)p->p_vmspace->vm_stacktop); return (SYSCTL_OUT(req, &val32, sizeof(val32))); } #endif val = round_page(p->p_vmspace->vm_stacktop); return (SYSCTL_OUT(req, &val, sizeof(val))); } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL, oldvmspace); post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; }; #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL, oldvmspace); } post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 
0 : error, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p, oldvmspace); post_execve(td, error, oldvmspace); AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } exec_cleanup(td, oldvmspace); } /* * kern_execve() has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace) { TSEXEC(td->td_proc->p_pid, args->begin_argv); AUDIT_ARG_ARGV(args->begin_argv, args->argc, exec_args_get_begin_envv(args) - args->begin_argv); AUDIT_ARG_ENVV(exec_args_get_begin_envv(args), args->envc, args->endp - exec_args_get_begin_envv(args)); #ifdef KTRACE if (KTRPOINT(td, KTR_ARGS)) { ktrdata(KTR_ARGS, args->begin_argv, exec_args_get_begin_envv(args) - args->begin_argv); } if (KTRPOINT(td, KTR_ENVS)) { ktrdata(KTR_ENVS, exec_args_get_begin_envv(args), args->endp - exec_args_get_begin_envv(args)); } #endif /* Must have at least one argument. */ if (args->argc == 0) { exec_free_args(args); return (EINVAL); } return (do_execve(td, args, mac_p, oldvmspace)); } static void execve_nosetid(struct image_params *imgp) { imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. 
*/ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p, struct vmspace *oldvmspace) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; uintptr_t stack_base; struct image_params image_params, *imgp; struct vattr attr; struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct ktr_io_params *kiop; #endif struct vnode *oldtextvp, *newtextvp; struct vnode *oldtextdvp, *newtextdvp; char *oldbinname, *newbinname; bool credential_changing; #ifdef MAC struct label *interpvplabel = NULL; bool will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif int error, i, orig_osrel; uint32_t orig_fctl0; Elf_Brandinfo *orig_brandinfo; size_t freepath_size; static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; oldtextvp = oldtextdvp = NULL; newtextvp = newtextdvp = NULL; newbinname = oldbinname = NULL; #ifdef KTRACE kiop = NULL; #endif /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; orig_osrel = p->p_osrel; orig_fctl0 = p->p_fctl0; orig_brandinfo = p->p_elf_brandinfo; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_NAMEI, args->fname); /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif /* * Translate the file name. namei() returns a vnode * pointer in ni_vp among other things. */ NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW | AUDITVNODE1 | WANTPARENT, UIO_SYSSPACE, args->fname); error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; newtextdvp = nd.ni_dvp; nd.ni_dvp = NULL; newbinname = malloc(nd.ni_cnd.cn_namelen + 1, M_PARGS, M_WAITOK); memcpy(newbinname, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen); newbinname[nd.ni_cnd.cn_namelen] = '\0'; imgp->vp = newtextvp; /* * Do the best to calculate the full path to the image file. 
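*/
/*
 * [Illustration only, not part of this change.] As noted earlier in this
 * function, path-based execve() fails with ECAPMODE in capability mode and
 * interpreters are refused, but an already-open O_EXEC descriptor can still
 * be handed to fexecve(2). A hedged userland sketch; a dynamically linked
 * target may still fail later if its runtime linker needs filesystem access.
 */
#if 0
#include <sys/capsicum.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
    char *argv[] = { "echo", "hello", NULL };
    char *envv[] = { NULL };
    int fd;

    fd = open("/bin/echo", O_EXEC);
    if (fd == -1 || cap_enter() == -1)
        return (1);
    /* execve("/bin/echo", ...) here would return ECAPMODE instead. */
    fexecve(fd, argv, envv);
    return (1);
}
#endif
/*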
*/ if (args->fname[0] == '/') { imgp->execpath = args->fname; } else { VOP_UNLOCK(imgp->vp); freepath_size = MAXPATHLEN; if (vn_fullpath_hardlink(newtextvp, newtextdvp, newbinname, nd.ni_cnd.cn_namelen, &imgp->execpath, &imgp->freepath, &freepath_size) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } } else if (imgp->interpreter_vp) { /* * An image activator has already provided an open vnode */ newtextvp = imgp->interpreter_vp; imgp->interpreter_vp = NULL; if (vn_fullpath(newtextvp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(newtextvp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * If the descriptors was not opened with O_PATH, then * we require that it was opened with O_EXEC or * O_RDONLY. In either case, exec_check_permissions() * below checks _current_ file access mode regardless * of the permissions additionally checked at the * open(2). */ error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); if (error != 0) goto exec_fail; if (vn_fullpath(newtextvp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(newtextvp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions. Also 'opens' file and sets its vnode to * text mode. */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; imgp->proc->p_fctl0 = 0; imgp->proc->p_elf_brandinfo = NULL; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = false; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp) != 0; credential_changing |= will_transition; #endif /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */ if (credential_changing) imgp->proc->p_pdeathsig = 0; if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. 
* * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ error = -1; for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL) continue; error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) error = ENOEXEC; goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * The text reference needs to be removed for scripts. * There is a short period before we determine that * something is a script where text reference is active. * The vnode lock is held over this entire period * so nothing should illegitimately be blocked. */ MPASS(imgp->textset); VOP_UNSET_TEXT_CHECKED(newtextvp); imgp->textset = false; /* free name buffer and old vnode */ #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = false; } vput(newtextvp); imgp->vp = newtextvp = NULL; if (args->fname != NULL) { if (newtextdvp != NULL) { vrele(newtextdvp); newtextdvp = NULL; } NDFREE_PNBUF(&nd); free(newbinname, M_PARGS); newbinname = NULL; } vm_object_deallocate(imgp->object); imgp->object = NULL; execve_nosetid(imgp); imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ if (imgp->interpreter_vp) { args->fname = NULL; } else { args->fname = imgp->interpreter_name; } goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * Copy out strings (args and env) and initialize stack base. */ error = (*p->p_sysent->sv_copyout_strings)(imgp, &stack_base); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * Stack setup. 
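*/
/*
 * [Illustration only, not part of this change.] For the execsw activator
 * loop earlier in this function: a minimal, hypothetical image activator
 * following the convention (-1: not our format, 0: activated, else errno).
 * Real activators also set up the entry point, auxargs, interpreter, etc.
 */
#if 0
static int
example_imgact(struct image_params *imgp)
{
    if (memcmp(imgp->image_header, "XMPL", 4) != 0)
        return (-1);        /* no match; let the next activator try */
    /* ... parse the header, map the image, fill in imgp fields ... */
    return (0);
}

static const struct execsw example_execsw = {
    .ex_imgact = example_imgact,
    .ex_name = "example"
};
/* registered at load time via exec_register(&example_execsw) */
#endif
/*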
*/ error = (*p->p_sysent->sv_fixup)(&stack_base, imgp); if (error != 0) { vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* * For security and other reasons, the file descriptor table cannot be * shared after an exec. */ fdunshare(td); pdunshare(td); /* close files on exec */ fdcloseexec(td); /* * Malloc things before we need locks. */ i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; + td->td_pflags2 &= ~TDP2_UEXTERR; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0) p->p_flag2 &= ~P2_STKGAP_DISABLE; p->p_flag2 &= ~(P2_MEMBAR_PRIVE | P2_MEMBAR_PRIVE_SYNCORE | P2_MEMBAR_GLOBE); if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } if ((imgp->sysent->sv_setid_allowed != NULL && !(*imgp->sysent->sv_setid_allowed)(td, imgp)) || (p->p_flag2 & P2_NO_NEW_PRIVS) != 0) execve_nosetid(imgp); /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE kiop = ktrprocexec(p); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in kern.proc.pathname. This vnode was * referenced by namei() or by fexecve variant of fname handling. 
*/ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; oldtextdvp = p->p_textdvp; p->p_textdvp = newtextdvp; newtextdvp = NULL; oldbinname = p->p_binname; p->p_binname = newbinname; newbinname = NULL; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp); pe.pm_credentialschanged = credential_changing; pe.pm_baseaddr = imgp->reloc_base; pe.pm_dynaddr = imgp->et_dyn_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ (*p->p_sysent->sv_setregs)(td, imgp, stack_base); VOP_MMAPPED(imgp->vp); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (error != 0) { p->p_osrel = orig_osrel; p->p_fctl0 = orig_fctl0; p->p_elf_brandinfo = orig_brandinfo; } if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (imgp->textset) VOP_UNSET_TEXT_CHECKED(imgp->vp); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp); if (args->fname != NULL) NDFREE_PNBUF(&nd); if (newtextdvp != NULL) vrele(newtextdvp); free(newbinname, M_PARGS); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); if (oldtextdvp != NULL) vrele(oldtextdvp); free(oldbinname, M_PARGS); #ifdef KTRACE ktr_io_params_free(kiop); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exec_cleanup(td, oldvmspace); exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? 
EJUSTRETURN : error); } void exec_cleanup(struct thread *td, struct vmspace *oldvmspace) { if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(td->td_proc->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } int exec_map_first_page(struct image_params *imgp) { vm_object_t object; vm_page_t m; int error; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(object); vm_object_color(object, 0); VM_OBJECT_WUNLOCK(object); } #endif error = vm_page_grab_valid_unlocked(&m, object, 0, VM_ALLOC_COUNT(VM_INITIAL_PAGEIN) | VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); if (error != VM_PAGER_OK) return (EIO); imgp->firstpage = sf_buf_alloc(m, 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_unwire(m, PQ_ACTIVE); } } void exec_onexec_old(struct thread *td) { sigfastblock_clear(td); umtx_exec(td->td_proc); } /* * This is an optimization which removes the unmanaged shared page * mapping. In combination with pmap_remove_pages(), which cleans all * managed mappings in the process' vmspace pmap, no work will be left * for pmap_remove(min, max). */ void exec_free_abi_mappings(struct proc *p) { struct vmspace *vmspace; vmspace = p->p_vmspace; if (refcount_load(&vmspace->vm_refcnt) != 1) return; if (!PROC_HAS_SHP(p)) return; pmap_remove(vmspace_pmap(vmspace), vmspace->vm_shp_base, vmspace->vm_shp_base + p->p_sysent->sv_shared_page_len); } /* * Run down the current address space and install a new one. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; struct thread *td = curthread; vm_offset_t sv_minuser; vm_map_t map; imgp->vmspace_destroyed = true; imgp->sysent = sv; if (p->p_sysent->sv_onexec_old != NULL) p->p_sysent->sv_onexec_old(td); itimers_exec(p); EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (refcount_load(&vmspace->vm_refcnt) == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser && cpu_exec_vmspace_reuse(p, map)) { exec_free_abi_mappings(p); shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* * An exec terminates mlockall(MCL_FUTURE). * ASLR and W^X states must be re-evaluated. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR | MAP_ASLR_IGNSTART | MAP_ASLR_STACK | MAP_WXORX); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } map->flags |= imgp->map_flags; return (sv->sv_onexec != NULL ? sv->sv_onexec(p, imgp) : 0); } /* * Compute the stack size limit and map the main process stack. * Map the shared page. 
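*/
/*
 * [Illustration only, not part of this change.] The stack size computed in
 * exec_map_stack() below is bounded by RLIMIT_STACK (or sv_maxssiz/maxssiz);
 * the limit a process will be mapped under is visible from userland:
 */
#if 0
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    struct rlimit rl;

    if (getrlimit(RLIMIT_STACK, &rl) == -1)
        return (1);
    printf("stack soft %ju hard %ju\n",
        (uintmax_t)rl.rlim_cur, (uintmax_t)rl.rlim_max);
    return (0);
}
#endif
/*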
*/ int exec_map_stack(struct image_params *imgp) { struct rlimit rlim_stack; struct sysentvec *sv; struct proc *p; vm_map_t map; struct vmspace *vmspace; vm_offset_t stack_addr, stack_top; vm_offset_t sharedpage_addr; u_long ssiz; int error, find_space, stack_off; vm_prot_t stack_prot; vm_object_t obj; p = imgp->proc; sv = p->p_sysent; if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } vmspace = p->p_vmspace; map = &vmspace->vm_map; stack_prot = sv->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot; if ((map->flags & MAP_ASLR_STACK) != 0) { stack_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(curthread, RLIMIT_DATA)); find_space = VMFS_ANY_SPACE; } else { stack_addr = sv->sv_usrstack - ssiz; find_space = VMFS_NO_SPACE; } error = vm_map_find(map, NULL, 0, &stack_addr, (vm_size_t)ssiz, sv->sv_usrstack, find_space, stack_prot, VM_PROT_ALL, MAP_STACK_AREA); if (error != KERN_SUCCESS) { uprintf("exec_new_vmspace: mapping stack size %#jx prot %#x " "failed, mach error %d errno %d\n", (uintmax_t)ssiz, stack_prot, error, vm_mmap_to_errno(error)); return (vm_mmap_to_errno(error)); } stack_top = stack_addr + ssiz; if ((map->flags & MAP_ASLR_STACK) != 0) { /* Randomize within the first page of the stack. */ arc4rand(&stack_off, sizeof(stack_off), 0); stack_top -= rounddown2(stack_off & PAGE_MASK, sizeof(void *)); } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj == NULL) { sharedpage_addr = 0; goto out; } /* * If randomization is disabled then the shared page will * be mapped at address specified in sysentvec. * Otherwise any address above .data section can be selected. * Same logic is used for stack address randomization. * If the address randomization is applied map a guard page * at the top of UVA. */ vm_object_reference(obj); if ((imgp->imgp_flags & IMGP_ASLR_SHARED_PAGE) != 0) { sharedpage_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(curthread, RLIMIT_DATA)); error = vm_map_fixed(map, NULL, 0, sv->sv_maxuser - PAGE_SIZE, PAGE_SIZE, VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD); if (error != KERN_SUCCESS) { /* * This is not fatal, so let's just print a warning * and continue. */ uprintf("%s: Mapping guard page at the top of UVA failed" " mach error %d errno %d", __func__, error, vm_mmap_to_errno(error)); } error = vm_map_find(map, obj, 0, &sharedpage_addr, sv->sv_shared_page_len, sv->sv_maxuser, VMFS_ANY_SPACE, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } else { sharedpage_addr = sv->sv_shared_page_base; vm_map_fixed(map, obj, 0, sharedpage_addr, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); } if (error != KERN_SUCCESS) { uprintf("%s: mapping shared page at addr: %p" "failed, mach error %d errno %d\n", __func__, (void *)sharedpage_addr, error, vm_mmap_to_errno(error)); vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } out: /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. 
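*/
/*
 * [Illustration only, not part of this change.] When MAP_ASLR_STACK is in
 * effect, both the stack placement and the sub-page offset applied above
 * vary between runs; an easy way to observe it from userland:
 */
#if 0
#include <stdio.h>

int
main(void)
{
    int probe;

    /* Prints a different address per run with stack ASLR enabled. */
    printf("%p\n", (void *)&probe);
    return (0);
}
#endif
/*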
*/ vmspace->vm_maxsaddr = (char *)stack_addr; vmspace->vm_stacktop = stack_top; vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_shp_base = sharedpage_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, const char *fname, char **argv, char **envv) { u_long arg, env; int error; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ error = exec_args_add_fname(args, fname, UIO_USERSPACE); if (error != 0) goto err_exit; /* * extract arguments first */ for (;;) { error = fueword(argv++, &arg); if (error == -1) { error = EFAULT; goto err_exit; } if (arg == 0) break; error = exec_args_add_arg(args, (char *)(uintptr_t)arg, UIO_USERSPACE); if (error != 0) goto err_exit; } /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &env); if (error == -1) { error = EFAULT; goto err_exit; } if (env == 0) break; error = exec_args_add_env(args, (char *)(uintptr_t)env, UIO_USERSPACE); if (error != 0) goto err_exit; } } return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; DPCPU_DEFINE_STATIC(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } kasan_mark((void *)argkva->addr, exec_map_entry_size, exec_map_entry_size, 0); *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; kasan_mark((void *)argkva->addr, 0, exec_map_entry_size, KASAN_EXEC_ARGS_FREED); if (argkva->gen != gen) { (void)vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void exec_args_kva_lowmem(void *arg __unused, int flags __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int 
gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } } /* * A set to functions to fill struct image args. * * NOTE: exec_args_add_fname() must be called (possibly with a NULL * fname) before the other functions. All exec_args_add_arg() calls must * be made before any exec_args_add_env() calls. exec_args_adjust_args() * may be called any time after exec_args_add_fname(). * * exec_args_add_fname() - install path to be executed * exec_args_add_arg() - append an argument string * exec_args_add_env() - append an env string * exec_args_adjust_args() - adjust location of the argument list to * allow new arguments to be prepended */ int exec_args_add_fname(struct image_args *args, const char *fname, enum uio_seg segflg) { int error; size_t length; KASSERT(args->fname == NULL, ("fname already appended")); KASSERT(args->endp == NULL, ("already appending to args")); if (fname != NULL) { args->fname = args->buf; error = segflg == UIO_SYSSPACE ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) return (error == ENAMETOOLONG ? E2BIG : error); } else length = 0; /* Set up for _arg_*()/_env_*() */ args->endp = args->buf + length; /* begin_argv must be set and kept updated */ args->begin_argv = args->endp; KASSERT(exec_map_entry_size - length >= ARG_MAX, ("too little space remaining for arguments %zu < %zu", exec_map_entry_size - length, (size_t)ARG_MAX)); args->stringspace = ARG_MAX; return (0); } static int exec_args_add_str(struct image_args *args, const char *str, enum uio_seg segflg, int *countp) { int error; size_t length; KASSERT(args->endp != NULL, ("endp not initialized")); KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); error = (segflg == UIO_SYSSPACE) ? copystr(str, args->endp, args->stringspace, &length) : copyinstr(str, args->endp, args->stringspace, &length); if (error != 0) return (error == ENAMETOOLONG ? 
E2BIG : error); args->stringspace -= length; args->endp += length; (*countp)++; return (0); } int exec_args_add_arg(struct image_args *args, const char *argp, enum uio_seg segflg) { KASSERT(args->envc == 0, ("appending args after env")); return (exec_args_add_str(args, argp, segflg, &args->argc)); } int exec_args_add_env(struct image_args *args, const char *envp, enum uio_seg segflg) { if (args->envc == 0) args->begin_envv = args->endp; return (exec_args_add_str(args, envp, segflg, &args->envc)); } int exec_args_adjust_args(struct image_args *args, size_t consume, ssize_t extend) { ssize_t offset; KASSERT(args->endp != NULL, ("endp not initialized")); KASSERT(args->begin_argv != NULL, ("begin_argp not initialized")); offset = extend - consume; if (args->stringspace < offset) return (E2BIG); memmove(args->begin_argv + extend, args->begin_argv + consume, args->endp - args->begin_argv + consume); if (args->envc > 0) args->begin_envv += offset; args->endp += offset; args->stringspace -= offset; return (0); } char * exec_args_get_begin_envv(struct image_args *args) { KASSERT(args->endp != NULL, ("endp not initialized")); if (args->envc > 0) return (args->begin_envv); return (args->endp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ int exec_copyout_strings(struct image_params *imgp, uintptr_t *stack_base) { int argc, envc; char **vectp; char *stringp; uintptr_t destp, ustringp; struct ps_strings *arginfo; struct proc *p; struct sysentvec *sysent; size_t execpath_len; int error, szsigcode; char canary[sizeof(long) * 8]; p = imgp->proc; sysent = p->p_sysent; destp = PROC_PS_STRINGS(p); arginfo = imgp->ps_strings = (void *)destp; /* * Install sigcode. */ if (sysent->sv_shared_page_base == 0 && sysent->sv_szsigcode != NULL) { szsigcode = *(sysent->sv_szsigcode); destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); error = copyout(sysent->sv_sigcode, (void *)destp, szsigcode); if (error != 0) return (error); } /* * Copy the image path for the rtld. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) { execpath_len = strlen(imgp->execpath) + 1; destp -= execpath_len; destp = rounddown2(destp, sizeof(void *)); imgp->execpathp = (void *)destp; error = copyout(imgp->execpath, imgp->execpathp, execpath_len); if (error != 0) return (error); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = (void *)destp; error = copyout(canary, imgp->canary, sizeof(canary)); if (error != 0) return (error); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ imgp->pagesizeslen = sizeof(pagesizes[0]) * MAXPAGESIZES; destp -= imgp->pagesizeslen; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = (void *)destp; error = copyout(pagesizes, imgp->pagesizes, imgp->pagesizeslen); if (error != 0) return (error); /* * Allocate room for the argument and environment strings. */ destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); ustringp = destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has up to AT_COUNT entries. */ destp -= AT_COUNT * sizeof(Elf_Auxinfo); destp = rounddown2(destp, sizeof(void *)); } vectp = (char **)destp; /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. 
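*/
/*
 * [Illustration only, not part of this change.] An aside on the exec_args
 * API documented earlier in this file: the required call order is fname
 * first, then all arguments, then all environment strings. A hedged
 * kernel-side sketch, error handling trimmed:
 */
#if 0
    struct image_args args;
    int error;

    bzero(&args, sizeof(args));
    error = exec_alloc_args(&args);
    error = exec_args_add_fname(&args, "/bin/sh", UIO_SYSSPACE);
    error = exec_args_add_arg(&args, "sh", UIO_SYSSPACE);
    error = exec_args_add_arg(&args, "-c", UIO_SYSSPACE);
    error = exec_args_add_env(&args, "PATH=/bin", UIO_SYSSPACE);
    /* ... on any failure: exec_free_args(&args) ... */
#endif
/*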
*/ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * vectp also becomes our initial stack base */ *stack_base = (uintptr_t)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ error = copyout(stringp, (void *)ustringp, ARG_MAX - imgp->args->stringspace); if (error != 0) return (error); /* * Fill in "ps_strings" struct for ps, w, etc. */ imgp->argv = vectp; if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nargvstr, argc) != 0) return (EFAULT); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* a null vector table pointer separates the argp's from the envp's */ if (suword(vectp++, 0) != 0) return (EFAULT); imgp->envv = vectp; if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 || suword32(&arginfo->ps_nenvstr, envc) != 0) return (EFAULT); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { if (suword(vectp++, ustringp) != 0) return (EFAULT); while (*stringp++ != 0) ustringp++; ustringp++; } /* end of vector table is a null pointer */ if (suword(vectp, 0) != 0) return (EFAULT); if (imgp->auxargs) { vectp++; error = imgp->sysent->sv_copyout_auxargs(imgp, (uintptr_t)vectp); if (error != 0) return (error); } return (0); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. * * Add a text reference now so no one can write to the * executable while we're activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ error = VOP_SET_TEXT(vp); if (error != 0) return (error); imgp->textset = true; /* * Call filesystem specific open routine (which does nothing in the * general case). 
*/ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = true; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; u_int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } /* * Write out a core segment to the compression stream. */ static int compress_chunk(struct coredump_params *cp, char *base, char *buf, size_t len) { size_t chunk_len; int error; error = 0; while (len > 0) { chunk_len = MIN(len, CORE_BUF_SIZE); /* * We can get EFAULT error here. * In that case zero out the current chunk of the segment. */ error = copyin(base, buf, chunk_len); if (error != 0) bzero(buf, chunk_len); error = compressor_write(cp->comp, buf, chunk_len); if (error != 0) break; base += chunk_len; len -= chunk_len; } return (error); } int core_write(struct coredump_params *cp, const void *base, size_t len, off_t offset, enum uio_seg seg, size_t *resid) { return (vn_rdwr_inchunks(UIO_WRITE, cp->vp, __DECONST(void *, base), len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED, cp->active_cred, cp->file_cred, resid, cp->td)); } int core_output(char *base, size_t len, off_t offset, struct coredump_params *cp, void *tmpbuf) { vm_map_t map; struct mount *mp; size_t resid, runlen; int error; bool success; KASSERT((uintptr_t)base % PAGE_SIZE == 0, ("%s: user address %p is not page-aligned", __func__, base)); if (cp->comp != NULL) return (compress_chunk(cp, base, tmpbuf, len)); error = 0; map = &cp->td->td_proc->p_vmspace->vm_map; for (; len > 0; base += runlen, offset += runlen, len -= runlen) { /* * Attempt to page in all virtual pages in the range. If a * virtual page is not backed by the pager, it is represented as * a hole in the file. This can occur with zero-filled * anonymous memory or truncated files, for example. */ for (runlen = 0; runlen < len; runlen += PAGE_SIZE) { if (core_dump_can_intr && curproc_sigkilled()) return (EINTR); error = vm_fault(map, (uintptr_t)base + runlen, VM_PROT_READ, VM_FAULT_NOFILL, NULL); if (runlen == 0) success = error == KERN_SUCCESS; else if ((error == KERN_SUCCESS) != success) break; } if (success) { error = core_write(cp, base, runlen, offset, UIO_USERSPACE, &resid); if (error != 0) { if (error != EFAULT) break; /* * EFAULT may be returned if the user mapping * could not be accessed, e.g., because a mapped * file has been truncated. Skip the page if no * progress was made, to protect against a * hypothetical scenario where vm_fault() was * successful but core_write() returns EFAULT * anyway. 
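*/
/*
 * [Illustration only, not part of this change.] The EINTR check in the dump
 * loop above is gated by kern.core_dump_can_intr, declared at the top of
 * this file; reading the knob from userland:
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    int v;
    size_t len = sizeof(v);

    if (sysctlbyname("kern.core_dump_can_intr", &v, &len, NULL, 0) == -1)
        return (1);
    printf("kern.core_dump_can_intr=%d\n", v);
    return (0);
}
#endif
/*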
*/ runlen -= resid; if (runlen == 0) { success = false; runlen = PAGE_SIZE; } } } if (!success) { error = vn_start_write(cp->vp, &mp, V_WAIT); if (error != 0) break; vn_lock(cp->vp, LK_EXCLUSIVE | LK_RETRY); error = vn_truncate_locked(cp->vp, offset + runlen, false, cp->td->td_ucred); VOP_UNLOCK(cp->vp); vn_finished_write(mp); if (error != 0) break; } } return (error); } /* * Drain into a core file. */ int sbuf_drain_core_output(void *arg, const char *data, int len) { struct coredump_params *cp; struct proc *p; int error, locked; cp = arg; p = cp->td->td_proc; /* * Some kern_proc out routines that print to this sbuf may * call us with the process lock held. Draining with the * non-sleepable lock held is unsafe. The lock is needed for * those routines when dumping a live process. In our case we * can safely release the lock before draining and acquire * again after. */ locked = PROC_LOCKED(p); if (locked) PROC_UNLOCK(p); if (cp->comp != NULL) error = compressor_write(cp->comp, __DECONST(char *, data), len); else error = core_write(cp, __DECONST(void *, data), len, cp->offset, UIO_SYSSPACE, NULL); if (locked) PROC_LOCK(p); if (error != 0) return (-error); cp->offset += len; return (len); } diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 494f06cc0621..2ab9b363f8b5 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -1,1235 +1,1236 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_ktrace.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_fork_func_t dtrace_fasttrap_fork; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int"); #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int sys_fork(struct thread *td, struct fork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } /* ARGUSED */ int sys_pdfork(struct thread *td, struct pdfork_args *uap) { struct fork_req fr; int error, fd, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPROCDESC; fr.fr_pidp = &pid; fr.fr_pd_fd = &fd; fr.fr_pd_flags = uap->flags; AUDIT_ARG_FFLAGS(uap->flags); /* * It is necessary to return fd by reference because 0 is a valid file * descriptor number, and the child needs to be able to distinguish * itself from the parent using the return value. */ error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; error = copyout(&fd, uap->fdp, sizeof(fd)); } return (error); } /* ARGSUSED */ int sys_vfork(struct thread *td, struct vfork_args *uap) { struct fork_req fr; int error, pid; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int sys_rfork(struct thread *td, struct rfork_args *uap) { struct fork_req fr; int error, pid; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); /* RFSPAWN must not appear with others */ if ((uap->flags & RFSPAWN) != 0 && uap->flags != RFSPAWN) return (EINVAL); AUDIT_ARG_FFLAGS(uap->flags); bzero(&fr, sizeof(fr)); if ((uap->flags & RFSPAWN) != 0) { fr.fr_flags = RFFDG | RFPROC | RFPPWAIT | RFMEM; fr.fr_flags2 = FR2_DROPSIG_CAUGHT; } else { fr.fr_flags = uap->flags; } fr.fr_pidp = &pid; error = fork1(td, &fr); if (error == 0) { td->td_retval[0] = pid; td->td_retval[1] = 0; } return (error); } int __exclusive_cache_line nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. 
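*/
/*
 * [Illustration only, not part of this change.] The modulus lives behind the
 * kern.randompid sysctl handled below: writing 1 asks the kernel to pick a
 * random modulus in [100, 1123], 0 disables randomization. A userland probe
 * (writing the node requires privilege):
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    int v;
    size_t len = sizeof(v);

    if (sysctlbyname("kern.randompid", &v, &len, NULL, 0) == -1)
        return (1);
    printf("kern.randompid=%d\n", v);
    return (0);
}
#endif
/*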
*/ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid == 0) randompid = 0; else if (pid == 1) /* generate a random PID modulus between 100 and 1123 */ randompid = 100 + arc4random() % 1024; else if (pid < 0 || pid > pid_max - 100) /* out of range */ randompid = pid_max - 100; else if (pid < 100) /* Make it reasonable */ randompid = 100; else randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus. Special values: 0: disable, 1: choose random value"); extern bitstr_t proc_id_pidmap; extern bitstr_t proc_id_grpidmap; extern bitstr_t proc_id_sessidmap; extern bitstr_t proc_id_reapmap; /* * Find an unused process ID * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ static int fork_findpid(int flags) { pid_t result; int trypid, random; /* * Avoid calling arc4random with procid_lock held. */ random = 0; if (__predict_false(randompid)) random = arc4random() % randompid; mtx_lock(&procid_lock); trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { trypid += random; } retry: if (trypid >= pid_max) trypid = 2; bit_ffc_at(&proc_id_pidmap, trypid, pid_max, &result); if (result == -1) { KASSERT(trypid != 2, ("unexpectedly ran out of IDs")); trypid = 2; goto retry; } if (bit_test(&proc_id_grpidmap, result) || bit_test(&proc_id_sessidmap, result) || bit_test(&proc_id_reapmap, result)) { trypid = result + 1; goto retry; } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if ((flags & RFHIGHPID) == 0) lastpid = result; bit_set(&proc_id_pidmap, result); mtx_unlock(&procid_lock); return (result); } static int fork_norfproc(struct thread *td, int flags) { struct proc *p1; int error; KASSERT((flags & RFPROC) == 0, ("fork_norfproc called with RFPROC set")); p1 = td->td_proc; /* * Quiesce other threads if necessary. If RFMEM is not specified we * must ensure that other threads do not concurrently create a second * process sharing the vmspace, see vmspace_unshare(). */ if ((p1->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS && ((flags & (RFCFDG | RFFDG)) != 0 || (flags & RFMEM) == 0)) { PROC_LOCK(p1); if (thread_single(p1, SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } error = vm_forkproc(td, NULL, NULL, NULL, flags); if (error != 0) goto fail; /* * Close all file descriptors. */ if ((flags & RFCFDG) != 0) { struct filedesc *fdtmp; struct pwddesc *pdtmp; pdtmp = pdinit(td->td_proc->p_pd, false); fdtmp = fdinit(); pdescfree(td); fdescfree(td); p1->p_fd = fdtmp; p1->p_pd = pdtmp; } /* * Unshare file descriptors (from parent). 
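*/
/*
 * [Illustration only, not part of this change.] This is the path taken by
 * rfork(2) without RFPROC: the flags act on the calling process itself.
 * A hedged userland sketch:
 */
#if 0
#include <unistd.h>

int
main(void)
{
    /*
     * No RFPROC: no child is created; the caller's descriptor table
     * is unshared (replaced by a private copy) in place.
     */
    if (rfork(RFFDG) == -1)
        return (1);
    return (0);
}
#endif
/*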
*/ if ((flags & RFFDG) != 0) { fdunshare(td); pdunshare(td); } fail: if ((p1->p_flag & (P_HADTHREADS | P_SYSTEM)) == P_HADTHREADS && ((flags & (RFCFDG | RFFDG)) != 0 || (flags & RFMEM) == 0)) { PROC_LOCK(p1); thread_single_end(p1, SINGLE_BOUNDARY); PROC_UNLOCK(p1); } return (error); } static void do_fork(struct thread *td, struct fork_req *fr, struct proc *p2, struct thread *td2, struct vmspace *vm2, struct file *fp_procdesc) { struct proc *p1, *pptr; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct pwddesc *pd; struct sigacts *newsigacts; p1 = td->td_proc; PROC_LOCK(p1); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); pargs_hold(p2->p_args); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); /* Tell the prison that we exist. */ prison_proc_hold(p2->p_ucred->cr_prison); p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = fork_findpid(fr->fr_flags); AUDIT_ARG_PID(p2->p_pid); TSFORK(p2->p_pid, p1->p_pid); sx_xlock(&allproc_lock); LIST_INSERT_HEAD(&allproc, p2, p_list); allproc_gen++; prison_proc_link(p2->p_ucred->cr_prison, p2); sx_xunlock(&allproc_lock); sx_xlock(PIDHASHLOCK(p2->p_pid)); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); sx_xunlock(PIDHASHLOCK(p2->p_pid)); tidhash_add(td2); /* * Malloc things while we don't hold any locks. */ if (fr->fr_flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (fr->fr_flags & RFCFDG) { pd = pdinit(p1->p_pd, false); fd = fdinit(); fdtol = NULL; } else if (fr->fr_flags & RFFDG) { if (fr->fr_flags2 & FR2_SHARE_PATHS) pd = pdshare(p1->p_pd); else pd = pdcopy(p1->p_pd); fd = fdcopy(p1->p_fd); fdtol = NULL; } else { if (fr->fr_flags2 & FR2_SHARE_PATHS) pd = pdcopy(p1->p_pd); else pd = pdshare(p1->p_pd); fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((fr->fr_flags & RFTHREAD) != 0) { /* * Shared file descriptor table, and shared * process leaders. */ fdtol = filedesc_to_leader_share(p1->p_fdtol, p1->p_fd); } else { /* * Shared file descriptor table, and different * process leaders. */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name)); td2->td_sigstk = td->td_sigstk; td2->td_flags = TDF_INMEM; td2->td_lend_user_pri = PRI_MAX; #ifdef VIMAGE td2->td_vnet = NULL; td2->td_vnet_lpush = NULL; #endif /* * Allow the scheduler to initialize the child. */ thread_lock(td); sched_fork(td, td2); /* * Request AST to check for TDP_RFPPWAIT. Do it here * to avoid calling thread_lock() again. */ if ((fr->fr_flags & RFPPWAIT) != 0) ast_sched_locked(td, TDA_VFORK); thread_unlock(td); /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. 
*/ p2->p_flag = P_INMEM; p2->p_flag2 = p1->p_flag2 & (P2_ASLR_DISABLE | P2_ASLR_ENABLE | P2_ASLR_IGNSTART | P2_NOTRACE | P2_NOTRACE_EXEC | P2_PROTMAX_ENABLE | P2_PROTMAX_DISABLE | P2_TRAPCAP | P2_STKGAP_DISABLE | P2_STKGAP_DISABLE_EXEC | P2_NO_NEW_PRIVS | P2_WXORX_DISABLE | P2_WXORX_ENABLE_EXEC | P2_LOGSIGEXIT_CTL | P2_LOGSIGEXIT_ENABLE); p2->p_swtick = ticks; if (p1->p_flag & P_PROFIL) startprofclock(p2); if (fr->fr_flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; if ((fr->fr_flags2 & (FR2_DROPSIG_CAUGHT | FR2_KPROC)) != 0) { mtx_lock(&p2->p_sigacts->ps_mtx); if ((fr->fr_flags2 & FR2_DROPSIG_CAUGHT) != 0) sig_drop_caught(p2); if ((fr->fr_flags2 & FR2_KPROC) != 0) p2->p_sigacts->ps_flag |= PS_NOCLDWAIT; mtx_unlock(&p2->p_sigacts->ps_mtx); } } if (fr->fr_flags & RFTSIGZMB) p2->p_sigparent = RFTSIGNUM(fr->fr_flags); else if (fr->fr_flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; if ((fr->fr_flags2 & FR2_KPROC) != 0) { p2->p_flag |= P_SYSTEM | P_KPROC; td2->td_pflags |= TDP_KTHREAD; } p2->p_textvp = p1->p_textvp; p2->p_textdvp = p1->p_textdvp; p2->p_fd = fd; p2->p_fdtol = fdtol; p2->p_pd = pd; if (p1->p_flag2 & P2_INHERIT_PROTECTED) { p2->p_flag |= P_PROTECTED; p2->p_flag2 |= P2_INHERIT_PROTECTED; } /* * p_limit is copy-on-write. Bump its refcount. */ lim_fork(p1, p2); thread_cow_get_proc(td2, p2); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * Bump references to the text vnode and directory, and copy * the hardlink name. */ if (p2->p_textvp != NULL) vrefact(p2->p_textvp); if (p2->p_textdvp != NULL) vrefact(p2->p_textdvp); p2->p_binname = p1->p_binname == NULL ? NULL : strdup(p1->p_binname, M_PARGS); /* * Set up linkage for kernel based threading. */ if ((fr->fr_flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); kern_psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= td->td_pflags & (TDP_ALTSTACK | TDP_SIGFASTBLOCK); + td2->td_pflags2 |= td->td_pflags2 & TDP2_UEXTERR; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (fr->fr_flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_orphans); callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0); /* * This begins the section where we must prevent the parent * from being swapped. 
*/ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if ((fr->fr_flags & RFNOWAIT) != 0) { pptr = p1->p_reaper; p2->p_reaper = pptr; } else { p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ? p1 : p1->p_reaper; pptr = p1; } p2->p_pptr = pptr; p2->p_oppid = pptr->p_pid; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_reaplist); LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling); if (p2->p_reaper == p1 && p1 != initproc) { p2->p_reapsubtree = p2->p_pid; proc_id_set_cond(PROC_ID_REAP, p2->p_pid); } sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); #ifdef KTRACE ktrprocfork(p1, p2); #endif /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, vm2, fr->fr_flags); if (fr->fr_flags == (RFFDG | RFPROC)) { VM_CNT_INC(v_forks); VM_CNT_ADD(v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (fr->fr_flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { VM_CNT_INC(v_vforks); VM_CNT_ADD(v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { VM_CNT_INC(v_kthreads); VM_CNT_ADD(v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { VM_CNT_INC(v_rforks); VM_CNT_ADD(v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Associate the process descriptor with the process before anything * can happen that might cause that process to need the descriptor. * However, don't do this until after fork(2) can no longer fail. */ if (fr->fr_flags & RFPROCDESC) procdesc_new(p2, fr->fr_pd_flags); /* * Both processes are set up, now check if any loadable modules want * to adjust anything. */ EVENTHANDLER_DIRECT_INVOKE(process_fork, p1, p2, fr->fr_flags); /* * Set the child start time and mark the process as being complete. */ PROC_LOCK(p2); PROC_LOCK(p1); microuptime(&p2->p_stats->p_start); PROC_SLOCK(p2); p2->p_state = PRS_NORMAL; PROC_SUNLOCK(p2); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the new process so that any * tracepoints inherited from the parent can be removed. We have to do * this only after p_state is PRS_NORMAL since the fasttrap module will * use pfind() later on. */ if ((fr->fr_flags & RFMEM) == 0 && dtrace_fasttrap_fork) dtrace_fasttrap_fork(p1, p2); #endif if (fr->fr_flags & RFPPWAIT) { td->td_pflags |= TDP_RFPPWAIT; td->td_rfppwait_p = p2; td->td_dbgflags |= TDB_VFORK; } PROC_UNLOCK(p2); /* * Tell any interested parties about the new process. */ knote_fork(p1->p_klist, p2->p_pid); /* * Now can be swapped. */ _PRELE(p1); PROC_UNLOCK(p1); SDT_PROBE3(proc, , , create, p2, p1, fr->fr_flags); if (fr->fr_flags & RFPROCDESC) { procdesc_finit(p2->p_procdesc, fp_procdesc); fdrop(fp_procdesc, td); } /* * Speculative check for PTRACE_FORK. PTRACE_FORK is not * synced with forks in progress so it is OK if we miss it * if being set atm. */ if ((p1->p_ptevents & PTRACE_FORK) != 0) { sx_xlock(&proctree_lock); PROC_LOCK(p2); /* * p1->p_ptevents & p1->p_pptr are protected by both * process and proctree locks for modifications, * so owning proctree_lock allows the race-free read. */ if ((p1->p_ptevents & PTRACE_FORK) != 0) { /* * Arrange for debugger to receive the fork event. 
* * We can report PL_FLAG_FORKED regardless of * P_FOLLOWFORK settings, but it does not make sense * for a runaway child. */ td->td_dbgflags |= TDB_FORK; td->td_dbg_forked = p2->p_pid; td2->td_dbgflags |= TDB_STOPATFORK; proc_set_traced(p2, true); CTR2(KTR_PTRACE, "do_fork: attaching to new child pid %d: oppid %d", p2->p_pid, p2->p_oppid); proc_reparent(p2, p1->p_pptr, false); } PROC_UNLOCK(p2); sx_xunlock(&proctree_lock); } racct_proc_fork_done(p2); if ((fr->fr_flags & RFSTOPPED) == 0) { if (fr->fr_pidp != NULL) *fr->fr_pidp = p2->p_pid; /* * If RFSTOPPED not requested, make child runnable and * add to run queue. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); } else { *fr->fr_procp = p2; } } static void ast_vfork(struct thread *td, int tda __unused) { struct proc *p, *p2; MPASS(td->td_pflags & TDP_RFPPWAIT); p = td->td_proc; /* * Preserve synchronization semantics of vfork. If * waiting for child to exec or exit, fork sets * P_PPWAIT on child, and there we sleep on our proc * (in case of exit). * * Do it after the ptracestop() above is finished, to * not block our debugger until child execs or exits * to finish vfork wait. */ td->td_pflags &= ~TDP_RFPPWAIT; p2 = td->td_rfppwait_p; again: PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) { PROC_LOCK(p); if (thread_suspend_check_needed()) { PROC_UNLOCK(p2); thread_suspend_check(0); PROC_UNLOCK(p); goto again; } else { PROC_UNLOCK(p); } cv_timedwait(&p2->p_pwait, &p2->p_mtx, hz); } PROC_UNLOCK(p2); if (td->td_dbgflags & TDB_VFORK) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_VFORK) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~TDB_VFORK; PROC_UNLOCK(p); } } int fork1(struct thread *td, struct fork_req *fr) { struct proc *p1, *newproc; struct thread *td2; struct vmspace *vm2; struct ucred *cred; struct file *fp_procdesc; struct pgrp *pg; vm_ooffset_t mem_charged; int error, nprocs_new; static int curfail; static struct timeval lastfail; int flags, pages; bool killsx_locked, singlethreaded; flags = fr->fr_flags; pages = fr->fr_pages; if ((flags & RFSTOPPED) != 0) MPASS(fr->fr_procp != NULL && fr->fr_pidp == NULL); else MPASS(fr->fr_procp == NULL); /* Check for the undefined or unimplemented flags. */ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0) return (EINVAL); /* Signal value requires RFTSIGZMB. */ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0) return (EINVAL); /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* Check the validity of the signal number. */ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG) return (EINVAL); if ((flags & RFPROCDESC) != 0) { /* Can't get a process descriptor without also creating a process. */ if ((flags & RFPROC) == 0) return (EINVAL); /* Must provide a place to put a procdesc if creating one. */ if (fr->fr_pd_fd == NULL) return (EINVAL); /* Check if we are using supported flags. */ if ((fr->fr_pd_flags & ~PD_ALLOWED_AT_FORK) != 0) return (EINVAL); } p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { if (fr->fr_procp != NULL) *fr->fr_procp = NULL; else if (fr->fr_pidp != NULL) *fr->fr_pidp = 0; return (fork_norfproc(td, flags)); } fp_procdesc = NULL; newproc = NULL; vm2 = NULL; killsx_locked = false; singlethreaded = false; /* * Increment the nprocs resource before allocations occur. * Although process entries are dynamically created, we still * keep a global limit on the maximum number we will * create.
There are hard-limits as to the number of processes * that can run, established by the KVA and memory usage for * the process data. * * Don't allow a nonprivileged user to use the last ten * processes; don't let root exceed the limit. */ nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1; if (nprocs_new >= maxproc - 10) { if (priv_check_cred(td->td_ucred, PRIV_MAXPROC) != 0 || nprocs_new >= maxproc) { error = EAGAIN; sx_xlock(&allproc_lock); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("maxproc limit exceeded by uid %u " "(pid %d); see tuning(7) and " "login.conf(5)\n", td->td_ucred->cr_ruid, p1->p_pid); } sx_xunlock(&allproc_lock); goto fail2; } } /* * If we are possibly multi-threaded, and there is a process * sending a signal to our group right now, ensure that our * other threads cannot be chosen for the signal queueing. * Otherwise, this might delay signal action, and make the new * child escape the signaling. */ pg = p1->p_pgrp; if (p1->p_numthreads > 1) { if (sx_try_slock(&pg->pg_killsx) != 0) { killsx_locked = true; } else { PROC_LOCK(p1); if (thread_single(p1, SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); error = ERESTART; goto fail2; } PROC_UNLOCK(p1); singlethreaded = true; } } /* * Atomically check for signals and block processes from sending * a signal to our process group until the child is visible. */ if (!killsx_locked && sx_slock_sig(&pg->pg_killsx) != 0) { error = ERESTART; goto fail2; } if (__predict_false(p1->p_pgrp != pg || sig_intr() != 0)) { /* * Either the process was moved to other process * group, or there is pending signal. sx_slock_sig() * does not check for signals if not sleeping for the * lock. */ sx_sunlock(&pg->pg_killsx); killsx_locked = false; error = ERESTART; goto fail2; } else { killsx_locked = true; } /* * If required, create a process descriptor in the parent first; we * will abandon it if something goes wrong. We don't finit() until * later. */ if (flags & RFPROCDESC) { error = procdesc_falloc(td, &fp_procdesc, fr->fr_pd_fd, fr->fr_pd_flags, fr->fr_pd_fcaps); if (error != 0) goto fail2; AUDIT_ARG_FD(*fr->fr_pd_fd); } mem_charged = 0; if (pages == 0) pages = kstack_pages; /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); td2 = FIRST_THREAD_IN_PROC(newproc); if (td2 == NULL) { td2 = thread_alloc(pages); if (td2 == NULL) { error = ENOMEM; goto fail2; } proc_linkup(newproc, td2); } else { error = thread_recycle(td2, pages); if (error != 0) goto fail2; } if ((flags & RFMEM) == 0) { vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); if (vm2 == NULL) { error = ENOMEM; goto fail2; } if (!swap_reserve(mem_charged)) { /* * The swap reservation failed. The accounting * from the entries of the copied vm2 will be * subtracted in vmspace_free(), so force the * reservation there. */ swap_reserve_force(mem_charged); error = ENOMEM; goto fail2; } } else vm2 = NULL; /* * XXX: This is ugly; when we copy resource usage, we need to bump * per-cred resource counters. */ newproc->p_ucred = crcowget(td->td_ucred); /* * Initialize resource accounting for the child process. */ error = racct_proc_fork(p1, newproc); if (error != 0) { error = EAGAIN; goto fail1; } #ifdef MAC mac_proc_init(newproc); #endif newproc->p_klist = knlist_alloc(&newproc->p_mtx); STAILQ_INIT(&newproc->p_ktr); /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. 
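 *
 * The limit consulted below is the caller's RLIMIT_NPROC; once a uid
 * holds that many processes, an unprivileged fork() fails with
 * EAGAIN. A userland peek at the threshold (hypothetical sketch;
 * rl.rlim_cur is the per-uid process budget):
 *
 *	#include <sys/resource.h>
 *
 *	struct rlimit rl;
 *	(void)getrlimit(RLIMIT_NPROC, &rl);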
*/ cred = td->td_ucred; if (!chgproccnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPROC))) { if (priv_check_cred(cred, PRIV_PROC_LIMIT) != 0) goto fail0; chgproccnt(cred->cr_ruidinfo, 1, 0); } do_fork(td, fr, newproc, td2, vm2, fp_procdesc); error = 0; goto cleanup; fail0: error = EAGAIN; #ifdef MAC mac_proc_destroy(newproc); #endif racct_proc_exit(newproc); fail1: proc_unset_cred(newproc, false); fail2: if (vm2 != NULL) vmspace_free(vm2); uma_zfree(proc_zone, newproc); if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) { fdclose(td, fp_procdesc, *fr->fr_pd_fd); fdrop(fp_procdesc, td); } atomic_add_int(&nprocs, -1); cleanup: if (killsx_locked) sx_sunlock(&pg->pg_killsx); if (singlethreaded) { PROC_LOCK(p1); thread_single_end(p1, SINGLE_BOUNDARY); PROC_UNLOCK(p1); } if (error != 0) pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(void (*callout)(void *, struct trapframe *), void *arg, struct trapframe *frame) { struct proc *p; struct thread *td; struct thread *dtd; kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED); td = curthread; p = td->td_proc; KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)", td, td_get_sched(td), p->p_pid, td->td_name); sched_fork_exit(td); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((dtd = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(dtd); } thread_unlock(td); /* * cpu_fork_kthread_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KPROC) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", td->td_name, p->p_pid); kthread_exit(); } mtx_assert(&Giant, MA_NOTOWNED); /* * Now going to return to userland. */ if (p->p_sysent->sv_schedtail != NULL) (p->p_sysent->sv_schedtail)(td); userret(td, frame); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. This function is passed in to fork_exit() * as the first parameter and is called when returning to a new * userland process. */ void fork_return(struct thread *td, struct trapframe *frame) { struct proc *p; p = td->td_proc; if (td->td_dbgflags & TDB_STOPATFORK) { PROC_LOCK(p); if ((p->p_flag & P_TRACED) != 0) { /* * Inform the debugger if one is still present. */ td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP; ptracestop(td, SIGSTOP, NULL); td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX); } else { /* * ... otherwise clear the request. */ td->td_dbgflags &= ~TDB_STOPATFORK; } PROC_UNLOCK(p); } else if (p->p_flag & P_TRACED) { /* * This is the start of a new thread in a traced * process. Report a system call exit event. */ PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; if ((p->p_ptevents & PTRACE_SCX) != 0 || (td->td_dbgflags & TDB_BORN) != 0) ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~(TDB_SCX | TDB_BORN); PROC_UNLOCK(p); } /* * If the prison was killed mid-fork, die along with it. 
*/ if (!prison_isalive(td->td_ucred->cr_prison)) exit1(td, 0, SIGKILL); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(td->td_sa.code, 0, 0); #endif } static void fork_init(void *arg __unused) { ast_register(TDA_VFORK, ASTR_ASTF_REQUIRED | ASTR_TDP, TDP_RFPPWAIT, ast_vfork); } SYSINIT(fork, SI_SUB_INTRINSIC, SI_ORDER_ANY, fork_init, NULL); diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c index 16fa47c5605a..d5b3b62f0821 100644 --- a/sys/kern/subr_syscall.c +++ b/sys/kern/subr_syscall.c @@ -1,294 +1,298 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2010 Konstantin Belousov * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #ifdef KTRACE #include #include #endif #include static inline void syscallenter(struct thread *td) { struct proc *p; struct syscall_args *sa; struct sysent *se; int error, traced; bool sy_thr_static; VM_CNT_INC(v_syscall); p = td->td_proc; sa = &td->td_sa; td->td_pticks = 0; if (__predict_false(td->td_cowgen != atomic_load_int(&p->p_cowgen))) thread_cow_update(td); traced = (p->p_flag & P_TRACED) != 0; if (__predict_false(traced || td->td_dbgflags & TDB_USERWR)) { PROC_LOCK(p); MPASS((td->td_dbgflags & TDB_BOUNDARY) == 0); td->td_dbgflags &= ~TDB_USERWR; if (traced) td->td_dbgflags |= TDB_SCE; PROC_UNLOCK(p); } + if ((td->td_pflags2 & TDP2_UEXTERR) != 0) + td->td_pflags2 &= ~TDP2_EXTERR; error = (p->p_sysent->sv_fetch_syscall_args)(td); se = sa->callp; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, se->sy_narg, sa->args); #endif KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0], "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]); if (__predict_false(error != 0)) { td->td_errno = error; goto retval; } if (__predict_false(traced)) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_SCE) ptracestop((td), SIGTRAP, NULL); PROC_UNLOCK(p); if ((td->td_dbgflags & TDB_USERWR) != 0) { /* * Reread syscall number and arguments if debugger * modified registers or memory. */ error = (p->p_sysent->sv_fetch_syscall_args)(td); se = sa->callp; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(sa->code, se->sy_narg, sa->args); #endif if (error != 0) { td->td_errno = error; goto retval; } } } #ifdef CAPABILITY_MODE /* * In capability mode, we only allow access to system calls * flagged with SYF_CAPENABLED. */ if ((se->sy_flags & SYF_CAPENABLED) == 0) { if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_SYSCALL, NULL); if (IN_CAPABILITY_MODE(td)) { td->td_errno = error = ECAPMODE; goto retval; } } #endif /* * Fetch fast sigblock value at the time of syscall entry to * handle sleepqueue primitives which might call cursig(). */ if (__predict_false(sigfastblock_fetch_always)) (void)sigfastblock_fetch(td); /* Let system calls set td_errno directly. */ KASSERT((td->td_pflags & TDP_NERRNO) == 0, ("%s: TDP_NERRNO set", __func__)); sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; if (__predict_false(AUDIT_SYSCALL_ENABLED() || SYSTRACE_ENABLED() || !sy_thr_static)) { if (!sy_thr_static) { error = syscall_thread_enter(td, &se); sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; if (error != 0) { td->td_errno = error; goto retval; } } #ifdef KDTRACE_HOOKS /* Give the syscall:::entry DTrace probe a chance to fire. */ if (__predict_false(se->sy_entry != 0)) (*systrace_probe_func)(sa, SYSTRACE_ENTRY, 0); #endif AUDIT_SYSCALL_ENTER(sa->code, td); error = (se->sy_call)(td, sa->args); /* Save the latest error return value. */ if (__predict_false((td->td_pflags & TDP_NERRNO) != 0)) td->td_pflags &= ~TDP_NERRNO; else td->td_errno = error; /* * Note that some syscall implementations (e.g., sys_execve) * will commit the audit record just before their final return. * These were done under the assumption that nothing of interest * would happen between their return and here, where we would * normally commit the audit record. These assumptions will * need to be revisited should any substantial logic be added * above. */ AUDIT_SYSCALL_EXIT(error, td); #ifdef KDTRACE_HOOKS /* Give the syscall:::return DTrace probe a chance to fire. 
*/ if (__predict_false(se->sy_return != 0)) (*systrace_probe_func)(sa, SYSTRACE_RETURN, error ? -1 : td->td_retval[0]); #endif if (!sy_thr_static) syscall_thread_exit(td, se); } else { error = (se->sy_call)(td, sa->args); /* Save the latest error return value. */ if (__predict_false((td->td_pflags & TDP_NERRNO) != 0)) td->td_pflags &= ~TDP_NERRNO; else td->td_errno = error; } retval: KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code), (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error, "retval0:%#lx", td->td_retval[0], "retval1:%#lx", td->td_retval[1]); if (__predict_false(traced)) { PROC_LOCK(p); td->td_dbgflags &= ~(TDB_SCE | TDB_BOUNDARY); PROC_UNLOCK(p); } (p->p_sysent->sv_set_syscall_retval)(td, error); + if (error != 0 && (td->td_pflags2 & TDP2_UEXTERR) != 0) + exterr_copyout(td); } static inline void syscallret(struct thread *td) { struct proc *p; struct syscall_args *sa; ksiginfo_t ksi; int traced; KASSERT(td->td_errno != ERELOOKUP, ("ERELOOKUP not consumed syscall %d", td->td_sa.code)); p = td->td_proc; sa = &td->td_sa; if (__predict_false(td->td_errno == ENOTCAPABLE || td->td_errno == ECAPMODE)) { if ((trap_enotcap || (p->p_flag2 & P2_TRAPCAP) != 0) && IN_CAPABILITY_MODE(td)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_errno = td->td_errno; ksi.ksi_code = TRAP_CAP; ksi.ksi_info.si_syscall = sa->original_code; trapsignal(td, &ksi); } } /* * Handle reschedule and other end-of-syscall issues */ userret(td, td->td_frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) { ktrsysret(sa->code, td->td_errno, td->td_retval[0]); } #endif traced = 0; if (__predict_false(p->p_flag & P_TRACED)) { traced = 1; PROC_LOCK(p); td->td_dbgflags |= TDB_SCX; PROC_UNLOCK(p); } if (__predict_false(traced || (td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0)) { PROC_LOCK(p); /* * Linux debuggers expect an additional stop for exec, * between the usual syscall entry and exit. Raise * the exec event now and then clear TDB_EXEC so that * the next stop is reported as a syscall exit by * linux_ptrace_status(). * * We are accessing p->p_pptr without any additional * locks here: it cannot change while p is kept locked; * while the debugger could in theory change its ABI * while tracing another process, the outcome of such * a race wouldn't be deterministic anyway. */ if (traced && (td->td_dbgflags & TDB_EXEC) != 0 && SV_PROC_ABI(p->p_pptr) == SV_ABI_LINUX) { ptracestop(td, SIGTRAP, NULL); td->td_dbgflags &= ~TDB_EXEC; } /* * If tracing the execed process, trap to the debugger * so that breakpoints can be set before the program * executes. If debugger requested tracing of syscall * returns, do it now too. */ if (traced && ((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 || (p->p_ptevents & PTRACE_SCX) != 0)) { MPASS((td->td_dbgflags & TDB_BOUNDARY) == 0); td->td_dbgflags |= TDB_BOUNDARY; ptracestop(td, SIGTRAP, NULL); } td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK | TDB_BOUNDARY); PROC_UNLOCK(p); } } diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index dd9c28e81388..91bf3e93fa7c 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -1,2202 +1,2273 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc.
and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include #include "opt_capsicum.h" #include "opt_ktrace.h" +#define EXTERR_CATEGORY EXTERR_CAT_FILEDESC #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include /* * The following macro defines how many bytes will be allocated from * the stack instead of memory allocated when passing the IOCTL data * structures from userspace and to the kernel. Some IOCTLs having * small data structures are used very frequently and this small * buffer on the stack gives a significant speedup improvement for * those requests. The value of this define should be greater or equal * to 64 bytes and should also be power of two. The data structure is * currently hard-aligned to a 8-byte boundary on the stack. This * should currently be sufficient for all supported platforms. */ #define SYS_IOCTL_SMALL_SIZE 128 /* bytes */ #define SYS_IOCTL_SMALL_ALIGN 8 /* bytes */ #ifdef __LP64__ static int iosize_max_clamp = 0; SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX"); static int devfs_iosize_max_clamp = 1; SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW, &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices"); #endif /* * Assert that the return value of read(2) and write(2) syscalls fits * into a register. If not, an architecture will need to provide the * usermode wrappers to reconstruct the result. 
*/ CTASSERT(sizeof(register_t) >= sizeof(size_t)); static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollout(struct thread *, struct pollfd *, struct pollfd *, u_int); static int pollscan(struct thread *, struct pollfd *, u_int); static int pollrescan(struct thread *); static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int selrescan(struct thread *, fd_mask **, fd_mask **); static void selfdalloc(struct thread *, void *); static void selfdfree(struct seltd *, struct selfd *); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); static void seltdinit(struct thread *); static int seltdwait(struct thread *, sbintime_t, sbintime_t); static void seltdclear(struct thread *); /* * One seltd per-thread allocated on demand as needed. * * t - protected by st_mtx * k - Only accessed by curthread or read-only */ struct seltd { STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ struct selfd *st_free1; /* (k) free fd for read set. */ struct selfd *st_free2; /* (k) free fd for write set. */ struct mtx st_mtx; /* Protects struct seltd */ struct cv st_wait; /* (t) Wait channel. */ int st_flags; /* (t) SELTD_ flags. */ }; #define SELTD_PENDING 0x0001 /* We have pending events. */ #define SELTD_RESCAN 0x0002 /* Doing a rescan. */ /* * One selfd allocated per-thread per-file-descriptor. * f - protected by sf_mtx */ struct selfd { STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ struct selinfo *sf_si; /* (f) selinfo when linked. */ struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ struct seltd *sf_td; /* (k) owning seltd. */ void *sf_cookie; /* (k) fd or pollfd. */ }; MALLOC_DEFINE(M_SELFD, "selfd", "selfd"); static struct mtx_pool *mtxpool_select; #ifdef __LP64__ size_t devfs_iosize_max(void) { return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? INT_MAX : SSIZE_MAX); } size_t iosize_max(void) { return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? 
INT_MAX : SSIZE_MAX); } #endif #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif int sys_read(struct thread *td, struct read_args *uap) { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_readv(td, uap->fd, &auio); return (error); } /* * Positioned read system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif int sys_pread(struct thread *td, struct pread_args *uap) { return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset) { struct uio auio; struct iovec aiov; int error; if (nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_preadv(td, fd, &auio, offset); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap) { return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } #endif /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int sys_readv(struct thread *td, struct readv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); freeuio(auio); return (error); } int kern_readv(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_read(td, fd, &cap_read_rights, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Scatter positioned read system call. */ #ifndef _SYS_SYSPROTO_H_ struct preadv_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int sys_preadv(struct thread *td, struct preadv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_preadv(td, uap->fd, auio, uap->offset); freeuio(auio); return (error); } int kern_preadv(struct thread *td, int fd, struct uio *auio, off_t offset) { struct file *fp; int error; error = fget_read(td, fd, &cap_pread_rights, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) error = EINVAL; else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for readv and preadv that reads data in * from a file using the passed in uio, offset, and flags. 
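 *
 * Userland reaches this through readv(2)/preadv(2); e.g. a scatter
 * read at an explicit offset (hypothetical sketch; fd is assumed to
 * be an open descriptor):
 *
 *	#include <sys/uio.h>
 *
 *	char hdr[16], body[4080];
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr)  },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = preadv(fd, iov, 2, 0);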
*/ static int dofileread(struct thread *td, int fd, struct file *fp, struct uio *auio, off_t offset, int flags) { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif AUDIT_ARG_FD(fd); /* Finish zero length reads right here */ if (auio->uio_resid == 0) { td->td_retval[0] = 0; return (0); } auio->uio_rw = UIO_READ; auio->uio_offset = offset; auio->uio_td = td; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_READ, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif int sys_write(struct thread *td, struct write_args *uap) { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_writev(td, uap->fd, &auio); return (error); } /* * Positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif int sys_pwrite(struct thread *td, struct pwrite_args *uap) { return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } int kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, off_t offset) { struct uio auio; struct iovec aiov; int error; if (nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_pwritev(td, fd, &auio, offset); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap) { return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } #endif /* * Gather write system call. */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int sys_writev(struct thread *td, struct writev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); freeuio(auio); return (error); } int kern_writev(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_write(td, fd, &cap_write_rights, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Gather positioned write system call. 
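 *
 * The mirror image of the scatter read sketched above: pwritev(2)
 * writes from several buffers at a fixed offset without moving the
 * file pointer (hypothetical sketch; fd assumed open for writing):
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = (void *)"hello ", .iov_len = 6 },
 *		{ .iov_base = (void *)"world\n", .iov_len = 6 },
 *	};
 *	ssize_t n = pwritev(fd, iov, 2, 0);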
*/ #ifndef _SYS_SYSPROTO_H_ struct pwritev_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int sys_pwritev(struct thread *td, struct pwritev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_pwritev(td, uap->fd, auio, uap->offset); freeuio(auio); return (error); } int kern_pwritev(struct thread *td, int fd, struct uio *auio, off_t offset) { struct file *fp; int error; error = fget_write(td, fd, &cap_pwrite_rights, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) error = EINVAL; else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for writev and pwritev that writes data to * a file using the passed in uio, offset, and flags. */ static int dofilewrite(struct thread *td, int fd, struct file *fp, struct uio *auio, off_t offset, int flags) { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif AUDIT_ARG_FD(fd); auio->uio_rw = UIO_WRITE; auio->uio_td = td; auio->uio_offset = offset; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; error = fo_write(fp, auio, td->td_ucred, flags, td); /* * Socket layer is responsible for special error handling, * see sousrsend(). */ if (error != 0 && fp->f_type != DTYPE_SOCKET) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { if (error == 0) ktruio->uio_resid = cnt; ktrgenio(fd, UIO_WRITE, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Truncate a file given a file descriptor. * * Can't use fget_write() here, since must return EINVAL and not EBADF if the * descriptor isn't writable. */ int kern_ftruncate(struct thread *td, int fd, off_t length) { struct file *fp; int error; AUDIT_ARG_FD(fd); if (length < 0) return (EINVAL); error = fget(td, fd, &cap_ftruncate_rights, &fp); if (error) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if (!(fp->f_flag & FWRITE)) { fdrop(fp, td); return (EINVAL); } error = fo_truncate(fp, length, td->td_ucred, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif int sys_ftruncate(struct thread *td, struct ftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif int oftruncate(struct thread *td, struct oftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* ARGSUSED */ int sys_ioctl(struct thread *td, struct ioctl_args *uap) { u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN); uint32_t com; int arg, error; u_int size; caddr_t data; #ifdef INVARIANTS if (uap->com > 0xffffffff) { printf( "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", td->td_proc->p_pid, td->td_name, uap->com); } #endif com = (uint32_t)uap->com; /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. 
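 *
 * The size and direction are encoded into the command word by the
 * _IO/_IOR/_IOW/_IOWR macros. FIONREAD, for instance, is defined as
 * _IOR('f', 127, int): IOCPARM_LEN() recovers sizeof(int), and the
 * IOC_OUT bit requests the copyout of the result, so a plain
 *
 *	int nread;
 *	ioctl(fd, FIONREAD, &nread);
 *
 * gets nread filled in by the copyout below.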
*/ size = IOCPARM_LEN(com); if ((size > IOCPARM_MAX) || ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) ((com & IOC_OUT) && size == 0) || #else ((com & (IOC_IN | IOC_OUT)) && size == 0) || #endif ((com & IOC_VOID) && size > 0 && size != sizeof(int))) return (ENOTTY); if (size > 0) { if (com & IOC_VOID) { /* Integer argument. */ arg = (intptr_t)uap->data; data = (void *)&arg; size = 0; } else { if (size > SYS_IOCTL_SMALL_SIZE) data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); else data = smalldata; } } else data = (void *)&uap->data; if (com & IOC_IN) { error = copyin(uap->data, data, (u_int)size); if (error != 0) goto out; } else if (com & IOC_OUT) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } error = kern_ioctl(td, uap->fd, com, data); if (error == 0 && (com & IOC_OUT)) error = copyout(data, uap->data, (u_int)size); out: if (size > SYS_IOCTL_SMALL_SIZE) free(data, M_IOCTLOPS); return (error); } int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; int error, tmp, locked; AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(com); fdp = td->td_proc->p_fd; switch (com) { case FIONCLEX: case FIOCLEX: FILEDESC_XLOCK(fdp); locked = LA_XLOCKED; break; default: #ifdef CAPABILITIES FILEDESC_SLOCK(fdp); locked = LA_SLOCKED; #else locked = LA_UNLOCKED; #endif break; } #ifdef CAPABILITIES if ((fp = fget_noref(fdp, fd)) == NULL) { error = EBADF; goto out; } if ((error = cap_ioctl_check(fdp, fd, com)) != 0) { fp = NULL; /* fhold() was not called yet */ goto out; } if (!fhold(fp)) { error = EBADF; fp = NULL; goto out; } if (locked == LA_SLOCKED) { FILEDESC_SUNLOCK(fdp); locked = LA_UNLOCKED; } #else error = fget(td, fd, &cap_ioctl_rights, &fp); if (error != 0) { fp = NULL; goto out; } #endif if ((fp->f_flag & (FREAD | FWRITE)) == 0) { error = EBADF; goto out; } switch (com) { case FIONCLEX: fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE; goto out; case FIOCLEX: fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE; goto out; case FIONBIO: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); data = (void *)&tmp; break; case FIOASYNC: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FASYNC); else atomic_clear_int(&fp->f_flag, FASYNC); data = (void *)&tmp; break; } error = fo_ioctl(fp, com, data, td->td_ucred, td); out: switch (locked) { case LA_XLOCKED: FILEDESC_XUNLOCK(fdp); break; #ifdef CAPABILITIES case LA_SLOCKED: FILEDESC_SUNLOCK(fdp); break; #endif default: FILEDESC_UNLOCK_ASSERT(fdp); break; } if (fp != NULL) fdrop(fp, td); return (error); } int sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap) { int error; error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len); return (kern_posix_error(td, error)); } int kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) { struct file *fp; int error; AUDIT_ARG_FD(fd); if (offset < 0 || len <= 0) return (EINVAL); /* Check for wrap. 
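 *
 * The test below is the overflow-safe form of "offset + len > OFF_MAX":
 * e.g. offset == OFF_MAX - 10 with len == 20 gives
 * OFF_MAX - 10 > OFF_MAX - 20, so the request is rejected with EFBIG
 * rather than letting the signed sum wrap around.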
*/ if (offset > OFF_MAX - len) return (EFBIG); AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_pwrite_rights, &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if ((fp->f_flag & FWRITE) == 0) { error = EBADF; goto out; } error = fo_fallocate(fp, offset, len, td); out: fdrop(fp, td); return (error); } int sys_fspacectl(struct thread *td, struct fspacectl_args *uap) { struct spacectl_range rqsr, rmsr; int error, cerror; error = copyin(uap->rqsr, &rqsr, sizeof(rqsr)); if (error != 0) return (error); error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags, &rmsr); if (uap->rmsr != NULL) { cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr)); if (error == 0) error = cerror; } return (error); } int kern_fspacectl(struct thread *td, int fd, int cmd, const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp) { struct file *fp; struct spacectl_range rmsr; int error; AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(cmd); AUDIT_ARG_FFLAGS(flags); if (rqsr == NULL) return (EINVAL); rmsr = *rqsr; if (rmsrp != NULL) *rmsrp = rmsr; if (cmd != SPACECTL_DEALLOC || rqsr->r_offset < 0 || rqsr->r_len <= 0 || rqsr->r_offset > OFF_MAX - rqsr->r_len || (flags & ~SPACECTL_F_SUPPORTED) != 0) return (EINVAL); error = fget_write(td, fd, &cap_pwrite_rights, &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if ((fp->f_flag & FWRITE) == 0) { error = EBADF; goto out; } error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags, td->td_ucred, td); /* fspacectl is not restarted after signals if the file is modified. */ if (rmsr.r_len != rqsr->r_len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (rmsrp != NULL) *rmsrp = rmsr; out: fdrop(fp, td); return (error); } int kern_specialfd(struct thread *td, int type, void *arg) { struct file *fp; struct specialfd_eventfd *ae; int error, fd, fflags; fflags = 0; error = falloc_noinstall(td, &fp); if (error != 0) return (error); switch (type) { case SPECIALFD_EVENTFD: ae = arg; if ((ae->flags & EFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; error = eventfd_create_file(td, fp, ae->initval, ae->flags); break; default: error = EINVAL; break; } if (error == 0) error = finstall(td, fp, &fd, fflags, NULL); fdrop(fp, td); if (error == 0) td->td_retval[0] = fd; return (error); } int sys___specialfd(struct thread *td, struct __specialfd_args *args) { struct specialfd_eventfd ae; int error; switch (args->type) { case SPECIALFD_EVENTFD: if (args->len != sizeof(struct specialfd_eventfd)) { error = EINVAL; break; } error = copyin(args->req, &ae, sizeof(ae)); if (error != 0) break; if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) != 0) { error = EINVAL; break; } error = kern_specialfd(td, args->type, &ae); break; default: error = EINVAL; break; } return (error); } int poll_no_poll(int events) { /* * Return true for read/write. If the user asked for something * special, return POLLNVAL, so that clients have a way of * determining reliably whether or not the extended * functionality is present without hard-coding knowledge * of specific filesystem implementations. 
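 *
 * A caller probing such a backend can detect the missing support
 * (hypothetical sketch; fd assumed valid):
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *	int n = poll(&pfd, 1, 0);
 *	int unsupported = n > 0 && (pfd.revents & POLLNVAL) != 0;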
*/ if (events & ~POLLSTANDARD) return (POLLNVAL); return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } int sys_pselect(struct thread *td, struct pselect_args *uap) { struct timespec ts; struct timeval tv, *tvp; sigset_t set, *uset; int error; if (uap->ts != NULL) { error = copyin(uap->ts, &ts, sizeof(ts)); if (error != 0) return (error); TIMESPEC_TO_TIMEVAL(&tv, &ts); tvp = &tv; } else tvp = NULL; if (uap->sm != NULL) { error = copyin(uap->sm, &set, sizeof(set)); if (error != 0) return (error); uset = &set; } else uset = NULL; return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, uset, NFDBITS)); } int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits) { int error; if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &td->td_oldsigmask, 0); if (error != 0) return (error); td->td_pflags |= TDP_OLDMASK; } error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); if (uset != NULL) { /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. If we didn't get interrupted, then the caller is * likely not expecting a signal to hit that should normally be * blocked by its signal mask, so we restore the mask before * any signals could be delivered. */ if (error == EINTR) { ast_sched(td, TDA_SIGSUSPEND); } else { /* * select(2) should never restart. */ MPASS(error != ERESTART); ast_sched(td, TDA_PSELECT); } } return (error); } #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif int sys_select(struct thread *td, struct select_args *uap) { struct timeval tv, *tvp; int error; if (uap->tv != NULL) { error = copyin(uap->tv, &tv, sizeof(tv)); if (error) return (error); tvp = &tv; } else tvp = NULL; return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, NFDBITS)); } /* * In the unlikely case when the user specified n greater than the last * open file descriptor, check that no bits are set after the last * valid fd. We must return EBADF if any is set. * * There are applications that rely on the behaviour. * * nd is fd_nfiles. */ static int select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits) { char *addr, *oaddr; int b, i, res; uint8_t bits; if (nd >= ndu || fd_in == NULL) return (0); oaddr = NULL; bits = 0; /* silence gcc */ for (i = nd; i < ndu; i++) { b = i / NBBY; #if BYTE_ORDER == LITTLE_ENDIAN addr = (char *)fd_in + b; #else addr = (char *)fd_in; if (abi_nfdbits == NFDBITS) { addr += rounddown(b, sizeof(fd_mask)) + sizeof(fd_mask) - 1 - b % sizeof(fd_mask); } else { addr += rounddown(b, sizeof(uint32_t)) + sizeof(uint32_t) - 1 - b % sizeof(uint32_t); } #endif if (addr != oaddr) { res = fubyte(addr); if (res == -1) return (EFAULT); oaddr = addr; bits = res; } if ((bits & (1 << (i % NBBY))) != 0) return (EBADF); } return (0); } int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) { struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256.
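 *
 * Worked out for LP64, where NFDBITS == 64: howmany(2048, NFDBITS)
 * is 32 fd_mask words, i.e. 256 bytes. A single non-NULL set at
 * nd == 1024 needs 2 * (1024 / NBBY) == 256 bytes (input plus output
 * halves), while three sets at the old FD_SETSIZE of 256 need
 * 6 * 32 == 192 bytes, so both cases fit without calling malloc().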
*/ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval rtv; sbintime_t asbt, precision, rsbt; u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; int error, lf, ndu; if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; ndu = nd; lf = fdp->fd_nfiles; if (nd > lf) nd = lf; error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits); if (error != 0) return (error); error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits); if (error != 0) return (error); error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits); if (error != 0) return (error); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. */ nfdbits = roundup(nd, NFDBITS); ncpbytes = nfdbits / NBBY; ncpubytes = roundup(nd, abi_nfdbits) / NBBY; nbufbytes = 0; if (fd_in != NULL) nbufbytes += 2 * ncpbytes; if (fd_ou != NULL) nbufbytes += 2 * ncpbytes; if (fd_ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. */ sbp = selbits; #define getbits(name, x) \ do { \ if (name == NULL) { \ ibits[x] = NULL; \ obits[x] = NULL; \ } else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpubytes); \ if (error != 0) \ goto done; \ if (ncpbytes != ncpubytes) \ bzero((char *)ibits[x] + ncpubytes, \ ncpbytes - ncpubytes); \ } \ } while (0) getbits(fd_in, 0); getbits(fd_ou, 1); getbits(fd_ex, 2); #undef getbits #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) /* * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, * we are running under 32-bit emulation. This should be more * generic. */ #define swizzle_fdset(bits) \ if (abi_nfdbits != NFDBITS && bits != NULL) { \ int i; \ for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ } #else #define swizzle_fdset(bits) #endif /* Make sure the bit order makes it through an ABI transition */ swizzle_fdset(ibits[0]); swizzle_fdset(ibits[1]); swizzle_fdset(ibits[2]); if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); precision = 0; if (tvp != NULL) { rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) { error = EINVAL; goto done; } if (!timevalisset(&rtv)) asbt = 0; else if (rtv.tv_sec <= INT32_MAX) { rsbt = tvtosbt(rtv); precision = rsbt; precision >>= tc_precexp; if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = -1; } else asbt = -1; } else asbt = -1; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = selscan(td, ibits, obits, nd); if (error || td->td_retval[0] != 0) break; error = seltdwait(td, asbt, precision); if (error) break; error = selrescan(td, ibits, obits); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* select is not restarted after signals... 
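 *
 * Userland must therefore be prepared to retry by hand, re-arming the
 * sets (select(2) overwrites them) and recomputing any remaining
 * timeout (hypothetical sketch; fd, tv and rset are assumed set up):
 *
 *	#include <sys/select.h>
 *	#include <errno.h>
 *
 *	int n;
 *	for (;;) {
 *		FD_ZERO(&rset);
 *		FD_SET(fd, &rset);
 *		n = select(fd + 1, &rset, NULL, NULL, &tv);
 *		if (n >= 0 || errno != EINTR)
 *			break;
 *	}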
*/ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; /* swizzle bit order back, if necessary */ swizzle_fdset(obits[0]); swizzle_fdset(obits[1]); swizzle_fdset(obits[2]); #undef swizzle_fdset #define putbits(name, x) \ if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ error = error2; if (error == 0) { int error2; putbits(fd_in, 0); putbits(fd_ou, 1); putbits(fd_ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); return (error); } /* * Convert a select bit set to poll flags. * * The backend always returns POLLHUP/POLLERR if appropriate and we * return this as a set bit in any set. */ static const int select_flags[3] = { POLLRDNORM | POLLHUP | POLLERR, POLLWRNORM | POLLHUP | POLLERR, POLLRDBAND | POLLERR }; /* * Compute the fo_poll flags required for a fd given by the index and * bit position in the fd_mask array. */ static __inline int selflags(fd_mask **ibits, int idx, fd_mask bit) { int flags; int msk; flags = 0; for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; flags |= select_flags[msk]; } return (flags); } /* * Set the appropriate output bits given a mask of fired events and the * input bits originally requested. */ static __inline int selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) { int msk; int n; n = 0; for (msk = 0; msk < 3; msk++) { if ((events & select_flags[msk]) == 0) continue; if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; /* * XXX Check for a duplicate set. This can occur because a * socket calls selrecord() twice for each poll() call * resulting in two selfds per real fd. selrescan() will * call selsetbits twice as a result. */ if ((obits[msk][idx] & bit) != 0) continue; obits[msk][idx] |= bit; n++; } return (n); } /* * Traverse the list of fds attached to this thread's seltd and check for * completion. */ static int selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) { struct filedesc *fdp; struct selinfo *si; struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct file *fp; fd_mask bit; int fd, ev, n, idx; int error; bool only_user; fdp = td->td_proc->p_fd; stp = td->td_sel; n = 0; only_user = FILEDESC_IS_ONLY_USER(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (int)(uintptr_t)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; if (only_user) error = fget_only_user(fdp, fd, &cap_event_rights, &fp); else error = fget_unlocked(td, fd, &cap_event_rights, &fp); if (__predict_false(error != 0)) return (error); idx = fd / NFDBITS; bit = (fd_mask)1 << (fd % NFDBITS); ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); if (only_user) fput_only_user(fdp, fp); else fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } stp->st_flags = 0; td->td_retval[0] = n; return (0); } /* * Perform the initial filedescriptor scan and register ourselves with * each selinfo. */ static int selscan(struct thread *td, fd_mask **ibits, fd_mask **obits, int nfd) { struct filedesc *fdp; struct file *fp; fd_mask bit; int ev, flags, end, fd; int n, idx; int error; bool only_user; fdp = td->td_proc->p_fd; n = 0; only_user = FILEDESC_IS_ONLY_USER(fdp); for (idx = 0, fd = 0; fd < nfd; idx++) { end = imin(fd + NFDBITS, nfd); for (bit = 1; fd < end; bit <<= 1, fd++) { /* Compute the list of events we're interested in. 
*/ flags = selflags(ibits, idx, bit); if (flags == 0) continue; if (only_user) error = fget_only_user(fdp, fd, &cap_event_rights, &fp); else error = fget_unlocked(td, fd, &cap_event_rights, &fp); if (__predict_false(error != 0)) return (error); selfdalloc(td, (void *)(uintptr_t)fd); ev = fo_poll(fp, flags, td->td_ucred, td); if (only_user) fput_only_user(fdp, fp); else fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } } td->td_retval[0] = n; return (0); } int sys_poll(struct thread *td, struct poll_args *uap) { struct timespec ts, *tsp; if (uap->timeout != INFTIM) { if (uap->timeout < 0) return (EINVAL); ts.tv_sec = uap->timeout / 1000; ts.tv_nsec = (uap->timeout % 1000) * 1000000; tsp = &ts; } else tsp = NULL; return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL)); } /* * kfds points to an array in the kernel. */ int kern_poll_kfds(struct thread *td, struct pollfd *kfds, u_int nfds, struct timespec *tsp, sigset_t *uset) { sbintime_t sbt, precision, tmp; time_t over; struct timespec ts; int error; precision = 0; if (tsp != NULL) { if (!timespecvalid_interval(tsp)) return (EINVAL); if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) sbt = 0; else { ts = *tsp; if (ts.tv_sec > INT32_MAX / 2) { over = ts.tv_sec - INT32_MAX / 2; ts.tv_sec -= over; } else over = 0; tmp = tstosbt(ts); precision = tmp; precision >>= tc_precexp; if (TIMESEL(&sbt, tmp)) sbt += tc_tick_sbt; sbt += tmp; } } else sbt = -1; if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &td->td_oldsigmask, 0); if (error) return (error); td->td_pflags |= TDP_OLDMASK; } seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = pollscan(td, kfds, nfds); if (error || td->td_retval[0] != 0) break; error = seltdwait(td, sbt, precision); if (error) break; error = pollrescan(td); if (error || td->td_retval[0] != 0) break; } seltdclear(td); /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (uset != NULL) { /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. If we didn't get interrupted, then the caller is * likely not expecting a signal to hit that should normally be * blocked by its signal mask, so we restore the mask before * any signals could be delivered. */ if (error == EINTR) ast_sched(td, TDA_SIGSUSPEND); else ast_sched(td, TDA_PSELECT); } return (error); } int sys_ppoll(struct thread *td, struct ppoll_args *uap) { struct timespec ts, *tsp; sigset_t set, *ssp; int error; if (uap->ts != NULL) { error = copyin(uap->ts, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; if (uap->set != NULL) { error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); ssp = &set; } else ssp = NULL; return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp)); } /* * ufds points to an array in user space. 
*/ int kern_poll(struct thread *td, struct pollfd *ufds, u_int nfds, struct timespec *tsp, sigset_t *set) { struct pollfd *kfds; struct pollfd stackfds[32]; int error; if (kern_poll_maxfds(nfds)) return (EINVAL); if (nfds > nitems(stackfds)) kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK); else kfds = stackfds; error = copyin(ufds, kfds, nfds * sizeof(*kfds)); if (error != 0) goto out; error = kern_poll_kfds(td, kfds, nfds, tsp, set); if (error == 0) error = pollout(td, kfds, ufds, nfds); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray("pollfd", UIO_USERSPACE, ufds, nfds, sizeof(*ufds)); #endif out: if (nfds > nitems(stackfds)) free(kfds, M_TEMP); return (error); } bool kern_poll_maxfds(u_int nfds) { /* * This is kinda bogus. We have fd limits, but that is not * really related to the size of the pollfd array. Make sure * we let the process use at least FD_SETSIZE entries and at * least enough for the system-wide limits. We want to be reasonably * safe, but not overly restrictive. */ return (nfds > maxfilesperproc && nfds > FD_SETSIZE); } static int pollrescan(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct selinfo *si; struct filedesc *fdp; struct file *fp; struct pollfd *fd; int n, error; bool only_user; n = 0; fdp = td->td_proc->p_fd; stp = td->td_sel; only_user = FILEDESC_IS_ONLY_USER(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (struct pollfd *)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; if (only_user) error = fget_only_user(fdp, fd->fd, &cap_event_rights, &fp); else error = fget_unlocked(td, fd->fd, &cap_event_rights, &fp); if (__predict_false(error != 0)) { fd->revents = POLLNVAL; n++; continue; } /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); if (only_user) fput_only_user(fdp, fp); else fdrop(fp, td); if (fd->revents != 0) n++; } stp->st_flags = 0; td->td_retval[0] = n; return (0); } static int pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd) { int error = 0; u_int i = 0; u_int n = 0; for (i = 0; i < nfd; i++) { error = copyout(&fds->revents, &ufds->revents, sizeof(ufds->revents)); if (error) return (error); if (fds->revents != 0) n++; fds++; ufds++; } td->td_retval[0] = n; return (0); } static int pollscan(struct thread *td, struct pollfd *fds, u_int nfd) { struct filedesc *fdp; struct file *fp; int i, n, error; bool only_user; n = 0; fdp = td->td_proc->p_fd; only_user = FILEDESC_IS_ONLY_USER(fdp); for (i = 0; i < nfd; i++, fds++) { if (fds->fd < 0) { fds->revents = 0; continue; } if (only_user) error = fget_only_user(fdp, fds->fd, &cap_event_rights, &fp); else error = fget_unlocked(td, fds->fd, &cap_event_rights, &fp); if (__predict_false(error != 0)) { fds->revents = POLLNVAL; n++; continue; } /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ selfdalloc(td, fds); fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); if (only_user) fput_only_user(fdp, fp); else fdrop(fp, td); /* * POSIX requires POLLOUT to be never * set simultaneously with POLLHUP. */ if ((fds->revents & POLLHUP) != 0) fds->revents &= ~POLLOUT; if (fds->revents != 0) n++; } td->td_retval[0] = n; return (0); } /* * XXX This was created specifically to support netncp and netsmb. This * allows the caller to specify a socket to wait for events on. 
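 * A minimal in-kernel sketch of the intended use (hypothetical caller,
 * for illustration only):
 *
 *	struct timeval tv = { 30, 0 };
 *
 *	error = selsocket(so, POLLIN, &tv, curthread);
 *
 * with error == EWOULDBLOCK meaning the 30 second timeout expired.
 *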
It returns * 0 if any events matched and an error otherwise. There is no way to * determine which events fired. */ int selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) { struct timeval rtv; sbintime_t asbt, precision, rsbt; int error; precision = 0; /* stupid gcc! */ if (tvp != NULL) { rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) return (EINVAL); if (!timevalisset(&rtv)) asbt = 0; else if (rtv.tv_sec <= INT32_MAX) { rsbt = tvtosbt(rtv); precision = rsbt; precision >>= tc_precexp; if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = -1; } else asbt = -1; } else asbt = -1; seltdinit(td); /* * Iterate until the timeout expires or the socket becomes ready. */ for (;;) { selfdalloc(td, NULL); if (so->so_proto->pr_sopoll(so, events, td) != 0) { error = 0; break; } error = seltdwait(td, asbt, precision); if (error) break; } seltdclear(td); /* XXX Duplicates ncp/smb behavior. */ if (error == ERESTART) error = 0; return (error); } /* * Preallocate two selfds associated with 'cookie'. Some fo_poll routines * have two select sets, one for read and another for write. */ static void selfdalloc(struct thread *td, void *cookie) { struct seltd *stp; stp = td->td_sel; if (stp->st_free1 == NULL) stp->st_free1 = malloc(sizeof(*stp->st_free1), M_SELFD, M_WAITOK|M_ZERO); stp->st_free1->sf_td = stp; stp->st_free1->sf_cookie = cookie; if (stp->st_free2 == NULL) stp->st_free2 = malloc(sizeof(*stp->st_free2), M_SELFD, M_WAITOK|M_ZERO); stp->st_free2->sf_td = stp; stp->st_free2->sf_cookie = cookie; } static void selfdfree(struct seltd *stp, struct selfd *sfp) { STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); /* * Paired with doselwakeup. */ if (atomic_load_acq_ptr((uintptr_t *)&sfp->sf_si) != (uintptr_t)NULL) { mtx_lock(sfp->sf_mtx); if (sfp->sf_si != NULL) { TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); } mtx_unlock(sfp->sf_mtx); } free(sfp, M_SELFD); } /* Drain the waiters tied to all the selfd belonging the specified selinfo. */ void seldrain(struct selinfo *sip) { /* * This feature is already provided by doselwakeup(), thus it is * enough to go for it. * Eventually, the context, should take care to avoid races * between thread calling select()/poll() and file descriptor * detaching, but, again, the races are just the same as * selwakeup(). */ doselwakeup(sip, -1); } /* * Record a select request. */ void selrecord(struct thread *selector, struct selinfo *sip) { struct selfd *sfp; struct seltd *stp; struct mtx *mtxp; stp = selector->td_sel; /* * Don't record when doing a rescan. */ if (stp->st_flags & SELTD_RESCAN) return; /* * Grab one of the preallocated descriptors. */ sfp = NULL; if ((sfp = stp->st_free1) != NULL) stp->st_free1 = NULL; else if ((sfp = stp->st_free2) != NULL) stp->st_free2 = NULL; else panic("selrecord: No free selfd on selq"); mtxp = sip->si_mtx; if (mtxp == NULL) mtxp = mtx_pool_find(mtxpool_select, sip); /* * Initialize the sfp and queue it in the thread. */ sfp->sf_si = sip; sfp->sf_mtx = mtxp; STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); /* * Now that we've locked the sip, check for initialization. */ mtx_lock(mtxp); if (sip->si_mtx == NULL) { sip->si_mtx = mtxp; TAILQ_INIT(&sip->si_tdlist); } /* * Add this thread to the list of selfds listening on this selinfo. */ TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); mtx_unlock(sip->si_mtx); } /* Wake up a selecting thread. 
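 * The producer side of the select machinery, as a hypothetical driver
 * sketch (foo_softc, sc_ready, and sc_rsel are assumptions): record the
 * polling thread while no data is pending, wake it when data arrives.
 *
 *	static int
 *	foo_poll(struct cdev *dev, int events, struct thread *td)
 *	{
 *		struct foo_softc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		if (sc->sc_ready)
 *			revents = events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(td, &sc->sc_rsel);
 *		return (revents);
 *	}
 *
 * and, from the data-arrival path:
 *
 *	sc->sc_ready = 1;
 *	selwakeup(&sc->sc_rsel);
 *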
*/ void selwakeup(struct selinfo *sip) { doselwakeup(sip, -1); } /* Wake up a selecting thread, and set its priority. */ void selwakeuppri(struct selinfo *sip, int pri) { doselwakeup(sip, pri); } /* * Do a wakeup when a selectable event occurs. */ static void doselwakeup(struct selinfo *sip, int pri) { struct selfd *sfp; struct selfd *sfn; struct seltd *stp; /* If it's not initialized there can't be any waiters. */ if (sip->si_mtx == NULL) return; /* * Locking the selinfo locks all selfds associated with it. */ mtx_lock(sip->si_mtx); TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { /* * Once we remove this sfp from the list and clear the * sf_si seltdclear will know to ignore this si. */ TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); stp = sfp->sf_td; mtx_lock(&stp->st_mtx); stp->st_flags |= SELTD_PENDING; cv_broadcastpri(&stp->st_wait, pri); mtx_unlock(&stp->st_mtx); /* * Paired with selfdfree. * * Storing this only after the wakeup provides an invariant that * stp is not used after selfdfree returns. */ atomic_store_rel_ptr((uintptr_t *)&sfp->sf_si, (uintptr_t)NULL); } mtx_unlock(sip->si_mtx); } static void seltdinit(struct thread *td) { struct seltd *stp; stp = td->td_sel; if (stp != NULL) { MPASS(stp->st_flags == 0); MPASS(STAILQ_EMPTY(&stp->st_selq)); return; } stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); cv_init(&stp->st_wait, "select"); stp->st_flags = 0; STAILQ_INIT(&stp->st_selq); td->td_sel = stp; } static int seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision) { struct seltd *stp; int error; stp = td->td_sel; /* * An event of interest may occur while we do not hold the seltd * locked so check the pending flag before we sleep. */ mtx_lock(&stp->st_mtx); /* * Any further calls to selrecord will be a rescan. */ stp->st_flags |= SELTD_RESCAN; if (stp->st_flags & SELTD_PENDING) { mtx_unlock(&stp->st_mtx); return (0); } if (sbt == 0) error = EWOULDBLOCK; else if (sbt != -1) error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx, sbt, precision, C_ABSOLUTE); else error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); mtx_unlock(&stp->st_mtx); return (error); } void seltdfini(struct thread *td) { struct seltd *stp; stp = td->td_sel; if (stp == NULL) return; MPASS(stp->st_flags == 0); MPASS(STAILQ_EMPTY(&stp->st_selq)); if (stp->st_free1) free(stp->st_free1, M_SELFD); if (stp->st_free2) free(stp->st_free2, M_SELFD); td->td_sel = NULL; cv_destroy(&stp->st_wait); mtx_destroy(&stp->st_mtx); free(stp, M_SELECT); } /* * Remove the references to the thread from all of the objects we were * polling. */ static void seltdclear(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; stp = td->td_sel; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) selfdfree(stp, sfp); stp->st_flags = 0; } static void selectinit(void *); SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); static void selectinit(void *dummy __unused) { mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); } /* * Set up a syscall return value that follows the convention specified for * posix_* functions. 
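 * Such interfaces report errors through their return value rather than
 * through errno: the error number becomes the (positive) syscall result
 * and the syscall itself "succeeds", with TDP_NERRNO noting that
 * td_retval[0] already carries the errno value.  A hypothetical handler
 * following the convention would end with
 *
 *	return (kern_posix_error(td, error));
 *
 * so that, e.g., an EINVAL failure surfaces to the caller as a return
 * value of EINVAL rather than -1 with errno set.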
*/ int kern_posix_error(struct thread *td, int error) { if (error <= 0) return (error); td->td_errno = error; td->td_pflags |= TDP_NERRNO; td->td_retval[0] = error; return (0); } int kcmp_cmp(uintptr_t a, uintptr_t b) { if (a == b) return (0); else if (a < b) return (1); return (2); } static int kcmp_pget(struct thread *td, pid_t pid, struct proc **pp) { int error; if (pid == td->td_proc->p_pid) { *pp = td->td_proc; return (0); } error = pget(pid, PGET_NOTID | PGET_CANDEBUG | PGET_NOTWEXIT | PGET_HOLD, pp); MPASS(*pp != td->td_proc); return (error); } int kern_kcmp(struct thread *td, pid_t pid1, pid_t pid2, int type, uintptr_t idx1, uintptr_t idx2) { struct proc *p1, *p2; struct file *fp1, *fp2; int error, res; res = -1; p1 = p2 = NULL; error = kcmp_pget(td, pid1, &p1); if (error == 0) error = kcmp_pget(td, pid2, &p2); if (error != 0) goto out; switch (type) { case KCMP_FILE: case KCMP_FILEOBJ: error = fget_remote(td, p1, idx1, &fp1); if (error == 0) { error = fget_remote(td, p2, idx2, &fp2); if (error == 0) { if (type == KCMP_FILEOBJ) res = fo_cmp(fp1, fp2, td); else res = kcmp_cmp((uintptr_t)fp1, (uintptr_t)fp2); fdrop(fp2, td); } fdrop(fp1, td); } break; case KCMP_FILES: res = kcmp_cmp((uintptr_t)p1->p_fd, (uintptr_t)p2->p_fd); break; case KCMP_SIGHAND: res = kcmp_cmp((uintptr_t)p1->p_sigacts, (uintptr_t)p2->p_sigacts); break; case KCMP_VM: res = kcmp_cmp((uintptr_t)p1->p_vmspace, (uintptr_t)p2->p_vmspace); break; default: error = EINVAL; break; } out: if (p1 != NULL && p1 != td->td_proc) PRELE(p1); if (p2 != NULL && p2 != td->td_proc) PRELE(p2); td->td_retval[0] = res; return (error); } int sys_kcmp(struct thread *td, struct kcmp_args *uap) { return (kern_kcmp(td, uap->pid1, uap->pid2, uap->type, uap->idx1, uap->idx2)); } int file_kcmp_generic(struct file *fp1, struct file *fp2, struct thread *td) { if (fp1->f_type != fp2->f_type) return (3); return (kcmp_cmp((uintptr_t)fp1->f_data, (uintptr_t)fp2->f_data)); } + +void +exterr_copyout(struct thread *td) +{ + struct uexterror ue; + ksiginfo_t ksi; + void *uloc; + size_t sz; + int error; + + MPASS((td->td_pflags2 & TDP2_UEXTERR) != 0); + + uloc = (char *)td->td_exterr_ptr + __offsetof(struct uexterror, + error); + if ((td->td_pflags2 & TDP2_EXTERR) == 0) { + ue.error = 0; + sz = sizeof(ue.error); + } else { + memset(&ue, 0, sizeof(ue)); + ue.error = td->td_kexterr.error; + ue.cat = td->td_kexterr.cat; + ue.src_line = td->td_kexterr.src_line; + ue.p1 = td->td_kexterr.p1; + ue.p2 = td->td_kexterr.p2; + if (td->td_kexterr.msg != NULL) + strlcpy(ue.msg, td->td_kexterr.msg, sizeof(ue.msg)); + sz = sizeof(ue) - __offsetof(struct uexterror, error); + } + error = copyout(&ue.error, uloc, sz); + if (error != 0) { + td->td_pflags2 &= ~TDP2_UEXTERR; + ksiginfo_init_trap(&ksi); + ksi.ksi_signo = SIGSEGV; + ksi.ksi_code = SEGV_ACCERR; + ksi.ksi_addr = uloc; + trapsignal(td, &ksi); + } +} + +int +sys_exterrctl(struct thread *td, struct exterrctl_args *uap) +{ + uint32_t ver; + int error; + + if ((uap->flags & ~(EXTERRCTLF_FORCE)) != 0) + return (EINVAL); + switch (uap->op) { + case EXTERRCTL_ENABLE: + if ((td->td_pflags2 & TDP2_UEXTERR) != 0 && + (uap->flags & EXTERRCTLF_FORCE) == 0) + return (EBUSY); + td->td_pflags2 &= ~TDP2_UEXTERR; + error = copyin(uap->ptr, &ver, sizeof(ver)); + if (error != 0) + return (error); + if (ver != UEXTERROR_VER) + return (EINVAL); + td->td_pflags2 |= TDP2_UEXTERR; + td->td_exterr_ptr = uap->ptr; + return (0); + case EXTERRCTL_DISABLE: + if ((td->td_pflags2 & TDP2_UEXTERR) == 0) + return (EINVAL); + td->td_pflags2 &= 
~TDP2_UEXTERR; + return (0); + default: + return (EINVAL); + } +} diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index 67396a4cabc5..08b557a7a540 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -1,3353 +1,3359 @@ ; System call name/number master file. ; Processed to created init_sysent.c, syscalls.c and syscall.h. ; New FreeBSD system calls should be added to the bottom of this file. ; Columns: number audit type name alt{name,tag,rtyp}/comments ; number system call number, must be in order ; audit the audit event associated with the system call ; A value of AUE_NULL means no auditing, but it also means that ; there is no audit event for the call at this time. For the ; case where the event exists, but we don't want auditing, the ; event should be #defined to AUE_NULL in audit_kevents.h. ; type one of STD, OBSOL, RESERVED, UNIMPL, SYSMUX, COMPAT*, ; NODEF, NOARGS, NOPROTO, NOSTD ; The COMPAT* options may be combined with one or more NO* ; options separated by '|' with no spaces (e.g. COMPAT|NOARGS) ; The CAPENABLED option may be ORed into a type. ; name pseudo-prototype of syscall routine ; If one of the following alts is different, then all appear: ; altname name of system call if different ; alttag name of args struct tag if different from [o]`name'"_args" ; altrtyp return type if not int (bogus - syscalls always return int) ; for UNIMPL/OBSOL, name continues with comments ; types: ; STD always included ; COMPAT included on COMPAT #ifdef ; COMPAT4 included on COMPAT_FREEBSD4 #ifdef (FreeBSD 4 compat) ; COMPAT6 included on COMPAT_FREEBSD6 #ifdef (FreeBSD 6 compat) ; COMPAT7 included on COMPAT_FREEBSD7 #ifdef (FreeBSD 7 compat) ; COMPAT10 included on COMPAT_FREEBSD10 #ifdef (FreeBSD 10 compat) ; COMPAT11 included on COMPAT_FREEBSD11 #ifdef (FreeBSD 11 compat) ; COMPAT12 included on COMPAT_FREEBSD12 #ifdef (FreeBSD 12 compat) ; COMPAT13 included on COMPAT_FREEBSD13 #ifdef (FreeBSD 13 compat) ; COMPAT14 included on COMPAT_FREEBSD14 #ifdef (FreeBSD 14 compat) ; OBSOL obsolete, not included in system, only specifies name ; RESERVED reserved for local or vendor use (not for FreeBSD) ; UNIMPL not implemented, placeholder only ; NOSTD implemented but as a lkm that can be statically ; compiled in; sysent entry will be filled with lkmressys ; so the SYSCALL_MODULE macro works ; NOARGS same as STD except do not create structure in sys/sysproto.h ; NODEF same as STD except only have the entry in the syscall table ; added. Meaning - do not create structure or function ; prototype in sys/sysproto.h ; NOPROTO same as STD except do not create structure or ; function prototype in sys/sysproto.h. Does add a ; definition to syscall.h besides adding a sysent. ; NOLIB don't create stubs in libc or libsys ; NOTSTATIC syscall is loadable ; SYSMUX syscall multiplexer. No prototype, argument struct, or ; handler is declared or used. Handled in MD syscall code. ; CAPENABLED syscall is allowed in capability mode ; ; To support programmatic generation of both the default ABI and 32-bit compat ; (freebsd32) we impose a number of restrictions on the types of system calls. ; For integer types: ; - Bare int and long are allowed (long is a sign of a bad interface). ; - Use u_int and u_long rather than "unsigned (int|long)". ; - size_t is allowed. ; - typedefs are allowed, but new signed types that vary between 32- and ; 64-bit ABIs must be added to config.known_abi_flags in ; sys/tools/syscalls/config.lua so it knows they require handling. 
; - Always-64-bit types other than dev_t, id_t, and off_t must be added to ; util.is64bitType in sys/tools/syscalls/tools/util.lua. ; For pointers: ; - Prefer structs to typedefs so an ABI-specific suffix (e.g., "32") can ; be prepended (e.g., ucontext_t -> struct ucontext -> struct ucontext32). ; - Pointers to objects (structs, unions, etc) containing any long, pointer, ; or time_t arguments need _Contains_ annotations. Such objects should be ; padded such that all 64-bit types are 64-bit aligned. ; annotations: ; SAL 2.0 annotations are used to specify how system calls treat ; arguments that are passed using pointers. There are three basic ; annotations. ; ; _In_ Object pointed to will be read and not modified. ; _Out_ Object pointed to will be written and not read. ; _Inout_ Object pointed to will be written and read. ; ; These annotations are used alone when the pointer refers to a single ; object i.e. scalar types, structs, and pointers, and not NULL. Adding ; the _opt_ suffix, e.g. _In_opt_, implies that the pointer may also ; refer to NULL. ; ; For pointers to arrays, additional suffixes are added: ; ; _In_z_, _Out_z_, _Inout_z_: ; for a NUL terminated array e.g. a string. ; _In_reads_z_(n),_Out_writes_z_(n), _Inout_updates_z_(n): ; for a NUL terminated array e.g. a string, of known length n bytes. ; _In_reads_(n),_Out_writes_(n),_Inout_updates_(n): ; for an array of n elements. ; _In_reads_bytes_(n), _Out_writes_bytes_(n), _Inout_updates_bytes(n): ; for a buffer of n-bytes. ; ; In addition to SAL annotations, pointers are annotated to indicate ; that they point to types that change between ABIs. That means that ; they contain long, pointer, or time_t types. This is indicated with ; a _Contains_ annotation followed immediately by one or more of: ; ; long_ Object contains a direct (or typedef'd) long value and varies ; between 32- and 64-bit ABIs. This includes size_t. ; ptr_ Object contains pointers (or intptr_t) and varies between ; 32- and 64-bit ABIs. ; timet_ Object contains a time_t and varies between i386 and other ; ABIs. ; #include's, #defines's, etc. may be included, and are copied to a ; limited set of output files. Before the first syscalls, #include lines will ; be copied and %%ABI_HEADERS%% expanded. Between system call entries, ; all lines beginning with # will be copied. Caveat Emptor. ; WARNING: this functionality is deprecated. #include #include #include %%ABI_HEADERS%% 0 AUE_NULL SYSMUX { int syscall( int number, ... ); } 1 AUE_EXIT STD|CAPENABLED { void exit( int rval ); } 2 AUE_FORK STD|CAPENABLED { int fork(void); } 3 AUE_READ STD|CAPENABLED { ssize_t read( int fd, _Out_writes_bytes_(nbyte) void *buf, size_t nbyte ); } 4 AUE_WRITE STD|CAPENABLED { ssize_t write( int fd, _In_reads_bytes_(nbyte) const void *buf, size_t nbyte ); } 5 AUE_OPEN_RWTC STD { int open( _In_z_ const char *path, int flags, mode_t mode ); } ; XXX should be { int open(const char *path, int flags, ...); } ; but we're not ready for varargs. 
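;
; The exterrctl(2) interface this change adds to sys/kern/sys_generic.c
; (new syscalls are registered at the bottom of this file) would be used
; from userspace roughly as follows.  This is a hypothetical sketch for
; illustration only; it assumes the version word, here called ver, leads
; struct uexterror, matching the uint32_t version check in
; sys_exterrctl():
;
;	static struct uexterror ue = { .ver = UEXTERROR_VER };
;
;	if (exterrctl(EXTERRCTL_ENABLE, 0, &ue) != 0)
;		err(1, "exterrctl");
;	/* After a failing syscall, the kernel fills in ue.error, ue.cat,
;	   ue.src_line, ue.p1, ue.p2, and ue.msg on return to usermode. */
;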
6 AUE_CLOSE STD|CAPENABLED { int close( int fd ); } 7 AUE_WAIT4 STD|CAPENABLED { int wait4( int pid, _Out_opt_ int *status, int options, _Out_opt_ _Contains_long_timet_ struct rusage *rusage ); } 8 AUE_CREAT COMPAT { int creat( _In_z_ const char *path, int mode ); } 9 AUE_LINK STD { int link( _In_z_ const char *path, _In_z_ const char *link ); } 10 AUE_UNLINK STD { int unlink( _In_z_ const char *path ); } 11 AUE_NULL OBSOL execv 12 AUE_CHDIR STD { int chdir( _In_z_ const char *path ); } 13 AUE_FCHDIR STD { int fchdir( int fd ); } 14 AUE_MKNOD COMPAT11 { int mknod( _In_z_ const char *path, int mode, uint32_t dev ); } 15 AUE_CHMOD STD { int chmod( _In_z_ const char *path, mode_t mode ); } 16 AUE_CHOWN STD { int chown( _In_z_ const char *path, int uid, int gid ); } 17 AUE_NULL STD|CAPENABLED { void *break( _In_ char *nsize ); } 18 AUE_GETFSSTAT COMPAT4 { int getfsstat( _Out_writes_bytes_opt_(bufsize) _Contains_long_ struct ostatfs *buf, long bufsize, int mode ); } 19 AUE_LSEEK COMPAT|CAPENABLED { long lseek( int fd, long offset, int whence ); } 20 AUE_GETPID STD|CAPENABLED { pid_t getpid(void); } 21 AUE_MOUNT STD { int mount( _In_z_ const char *type, _In_z_ const char *path, int flags, _In_opt_ void *data ); } 22 AUE_UMOUNT STD { int unmount( _In_z_ const char *path, int flags ); } 23 AUE_SETUID STD|CAPENABLED { int setuid( uid_t uid ); } 24 AUE_GETUID STD|CAPENABLED { uid_t getuid(void); } 25 AUE_GETEUID STD|CAPENABLED { uid_t geteuid(void); } 26 AUE_PTRACE STD { int ptrace( int req, pid_t pid, _Inout_opt_ _Contains_long_ptr_ caddr_t addr, int data ); } 27 AUE_RECVMSG STD|CAPENABLED { ssize_t recvmsg( int s, _Inout_ _Contains_ptr_ struct msghdr *msg, int flags ); } 28 AUE_SENDMSG STD|CAPENABLED { ssize_t sendmsg( int s, _In_ _Contains_ptr_ const struct msghdr *msg, int flags ); } 29 AUE_RECVFROM STD|CAPENABLED { ssize_t recvfrom( int s, _Out_writes_bytes_(len) void *buf, size_t len, int flags, _Out_writes_bytes_opt_(*fromlenaddr) struct sockaddr *from, _Inout_opt_ __socklen_t *fromlenaddr ); } 30 AUE_ACCEPT STD|CAPENABLED { int accept( int s, _Out_writes_bytes_opt_(*anamelen) struct sockaddr *name, _Inout_opt_ __socklen_t *anamelen ); } 31 AUE_GETPEERNAME STD|CAPENABLED { int getpeername( int fdes, _Out_writes_bytes_(*alen) struct sockaddr *asa, _Inout_opt_ __socklen_t *alen ); } 32 AUE_GETSOCKNAME STD|CAPENABLED { int getsockname( int fdes, _Out_writes_bytes_(*alen) struct sockaddr *asa, _Inout_ __socklen_t *alen ); } 33 AUE_ACCESS STD { int access( _In_z_ const char *path, int amode ); } 34 AUE_CHFLAGS STD { int chflags( _In_z_ const char *path, u_long flags ); } 35 AUE_FCHFLAGS STD|CAPENABLED { int fchflags( int fd, u_long flags ); } 36 AUE_SYNC STD|CAPENABLED { int sync(void); } 37 AUE_KILL STD|CAPENABLED { int kill( int pid, int signum ); } 38 AUE_STAT COMPAT { int stat( _In_z_ const char *path, _Out_ _Contains_timet_ struct ostat *ub ); } 39 AUE_GETPPID STD|CAPENABLED { pid_t getppid(void); } 40 AUE_LSTAT COMPAT { int lstat( _In_z_ const char *path, _Out_ _Contains_timet_ struct ostat *ub ); } 41 AUE_DUP STD|CAPENABLED { int dup( u_int fd ); } 42 AUE_PIPE COMPAT10|CAPENABLED { int pipe(void); } 43 AUE_GETEGID STD|CAPENABLED { gid_t getegid(void); } 44 AUE_PROFILE STD|CAPENABLED { int profil( _Out_writes_bytes_(size) char *samples, size_t size, size_t offset, u_int scale ); } 45 AUE_KTRACE STD { int ktrace( _In_z_ const char *fname, int ops, int facs, int pid ); } 46 AUE_SIGACTION COMPAT|CAPENABLED { int sigaction( int signum, _In_opt_ _Contains_ptr_ struct osigaction *nsa, _Out_opt_ 
_Contains_ptr_ struct osigaction *osa ); } 47 AUE_GETGID STD|CAPENABLED { gid_t getgid(void); } 48 AUE_SIGPROCMASK COMPAT|CAPENABLED { int sigprocmask( int how, osigset_t mask ); } ; XXX note nonstandard (bogus) calling convention - the libc stub passes ; us the mask, not a pointer to it, and we return the old mask as the ; (int) return value. 49 AUE_GETLOGIN STD|CAPENABLED { int getlogin( _Out_writes_z_(namelen) char *namebuf, u_int namelen ); } 50 AUE_SETLOGIN STD { int setlogin( _In_z_ const char *namebuf ); } 51 AUE_ACCT STD { int acct( _In_z_ const char *path ); } 52 AUE_SIGPENDING COMPAT|CAPENABLED { int sigpending(void); } 53 AUE_SIGALTSTACK STD|CAPENABLED { int sigaltstack( _In_opt_ _Contains_long_ptr_ const struct sigaltstack *ss, _Out_opt_ _Contains_long_ptr_ struct sigaltstack *oss ); } 54 AUE_IOCTL STD|CAPENABLED { int ioctl( int fd, u_long com, _Inout_opt_ _Contains_long_ptr_ char *data ); } 55 AUE_REBOOT STD { int reboot( int opt ); } 56 AUE_REVOKE STD { int revoke( _In_z_ const char *path ); } 57 AUE_SYMLINK STD { int symlink( _In_z_ const char *path, _In_z_ const char *link ); } 58 AUE_READLINK STD { ssize_t readlink( _In_z_ const char *path, _Out_writes_z_(count) char *buf, size_t count ); } 59 AUE_EXECVE STD { int execve( _In_z_ const char *fname, _In_z_ char **argv, _In_z_ char **envv ); } 60 AUE_UMASK STD|CAPENABLED { mode_t umask( mode_t newmask ); } 61 AUE_CHROOT STD { int chroot( _In_z_ const char *path ); } 62 AUE_FSTAT COMPAT|CAPENABLED { int fstat( int fd, _Out_ _Contains_timet_ struct ostat *sb ); } 63 AUE_NULL COMPAT { int getkerninfo( int op, _Out_writes_bytes_opt(*size) char *where, _Inout_opt_ size_t *size, int arg ); } 64 AUE_NULL COMPAT|CAPENABLED { int getpagesize(void); } 65 AUE_MSYNC STD|CAPENABLED { int msync( _In_ void *addr, size_t len, int flags ); } 66 AUE_VFORK STD|CAPENABLED { int vfork(void); } 67 AUE_NULL OBSOL vread 68 AUE_NULL OBSOL vwrite 69 AUE_NULL OBSOL sbrk 70 AUE_NULL OBSOL sstk 71 AUE_MMAP COMPAT|CAPENABLED { void *mmap( _In_ void *addr, int len, int prot, int flags, int fd, long pos ); } 72 AUE_O_VADVISE COMPAT11 { int vadvise( int anom ); } 73 AUE_MUNMAP STD|CAPENABLED { int munmap( _In_ void *addr, size_t len ); } 74 AUE_MPROTECT STD|CAPENABLED { int mprotect( _In_ void *addr, size_t len, int prot ); } 75 AUE_MADVISE STD|CAPENABLED { int madvise( _In_ void *addr, size_t len, int behav ); } 76 AUE_NULL OBSOL vhangup 77 AUE_NULL OBSOL vlimit 78 AUE_MINCORE STD|CAPENABLED { int mincore( _In_ const void *addr, size_t len, _Out_writes_bytes_(len/PAGE_SIZE) char *vec ); } 79 AUE_GETGROUPS STD|CAPENABLED { int getgroups( int gidsetsize, _Out_writes_opt_(gidsetsize) gid_t *gidset ); } 80 AUE_SETGROUPS STD { int setgroups( int gidsetsize, _In_reads_(gidsetsize) const gid_t *gidset ); } 81 AUE_GETPGRP STD|CAPENABLED { int getpgrp(void); } 82 AUE_SETPGRP STD { int setpgid( int pid, int pgid ); } 83 AUE_SETITIMER STD|CAPENABLED { int setitimer( int which, _In_ _Contains_timet_ const struct itimerval *itv, _Out_opt_ _Contains_timet_ struct itimerval *oitv ); } 84 AUE_WAIT4 COMPAT { int wait(void); } 85 AUE_SWAPON STD { int swapon( _In_z_ const char *name ); } 86 AUE_GETITIMER STD|CAPENABLED { int getitimer( int which, _Out_ _Contains_timet_ struct itimerval *itv ); } 87 AUE_SYSCTL COMPAT|CAPENABLED { int gethostname( _Out_writes_z_(len) char *hostname, u_int len ); } 88 AUE_SYSCTL COMPAT { int sethostname( _In_reads_z_(len) char *hostname, u_int len ); } 89 AUE_GETDTABLESIZE STD|CAPENABLED { int getdtablesize(void); } 90 AUE_DUP2 STD|CAPENABLED { 
int dup2( u_int from, u_int to ); } 91 AUE_NULL RESERVED 92 AUE_FCNTL STD|CAPENABLED { int fcntl( int fd, int cmd, intptr_t arg ); } ; XXX should be { int fcntl(int fd, int cmd, ...); } ; but we're not ready for varargs. 93 AUE_SELECT STD|CAPENABLED { int select( int nd, _Inout_opt_ fd_set *in, _Inout_opt_ fd_set *ou, _Inout_opt_ fd_set *ex, _In_opt_ _Contains_long_timet_ struct timeval *tv ); } 94 AUE_NULL RESERVED 95 AUE_FSYNC STD|CAPENABLED { int fsync( int fd ); } 96 AUE_SETPRIORITY STD|CAPENABLED { int setpriority( int which, int who, int prio ); } 97 AUE_SOCKET STD|CAPENABLED { int socket( int domain, int type, int protocol ); } 98 AUE_CONNECT STD { int connect( int s, _In_reads_bytes_(namelen) const struct sockaddr *name, __socklen_t namelen ); } 99 AUE_ACCEPT COMPAT|CAPENABLED { int accept( int s, _Out_writes_bytes_opt_(*anamelen) struct sockaddr *name, __socklen_t *anamelen ); } 100 AUE_GETPRIORITY STD|CAPENABLED { int getpriority( int which, int who ); } 101 AUE_SEND COMPAT|CAPENABLED { int send( int s, _In_reads_bytes_(len) const void *buf, int len, int flags ); } 102 AUE_RECV COMPAT|CAPENABLED { int recv( int s, _Out_writes_bytes_(len) void *buf, int len, int flags ); } 103 AUE_SIGRETURN COMPAT|CAPENABLED { int sigreturn( _In_ struct osigcontext *sigcntxp ); } 104 AUE_BIND STD { int bind( int s, _In_reads_bytes_(namelen) const struct sockaddr *name, __socklen_t namelen ); } 105 AUE_SETSOCKOPT STD|CAPENABLED { int setsockopt( int s, int level, int name, _In_reads_bytes_opt_(valsize) const void *val, __socklen_t valsize ); } 106 AUE_LISTEN STD|CAPENABLED { int listen( int s, int backlog ); } 107 AUE_NULL OBSOL vtimes 108 AUE_NULL COMPAT|CAPENABLED { int sigvec( int signum, _In_opt_ _Contains_ptr_ struct sigvec *nsv, _Out_opt_ _Contains_ptr_ struct sigvec *osv ); } 109 AUE_NULL COMPAT|CAPENABLED { int sigblock( int mask ); } 110 AUE_NULL COMPAT|CAPENABLED { int sigsetmask( int mask ); } 111 AUE_NULL COMPAT|CAPENABLED { int sigsuspend( osigset_t mask ); } ; XXX note nonstandard (bogus) calling convention - the libc stub passes ; us the mask, not a pointer to it. 
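;
; As an illustration of the generation described at the top of this file
; (a rough sketch, not verbatim tool output): entry 95 above produces a
; padded argument structure in sys/sysproto.h,
;
;	struct fsync_args {
;		char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
;	};
;
; a "#define SYS_fsync 95" in sys/syscall.h, and an init_sysent.c slot
; routing the trap to sys_fsync().
;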
112 AUE_NULL COMPAT|CAPENABLED { int sigstack( _In_opt_ _Contains_ptr_ struct sigstack *nss, _Out_opt_ _Contains_ptr_ struct sigstack *oss ); } 113 AUE_RECVMSG COMPAT|CAPENABLED { int recvmsg( int s, _Inout_ _Contains_ptr_ struct omsghdr *msg, int flags ); } 114 AUE_SENDMSG COMPAT|CAPENABLED { int sendmsg( int s, _In_ _Contains_ptr_ const struct omsghdr *msg, int flags ); } 115 AUE_NULL OBSOL vtrace 116 AUE_GETTIMEOFDAY STD|CAPENABLED { int gettimeofday( _Out_ _Contains_long_timet_ struct timeval *tp, _Out_opt_ struct timezone *tzp ); } 117 AUE_GETRUSAGE STD|CAPENABLED { int getrusage( int who, _Out_ _Contains_long_ struct rusage *rusage ); } 118 AUE_GETSOCKOPT STD|CAPENABLED { int getsockopt( int s, int level, int name, _Out_writes_bytes_opt_(*avalsize) void *val, _Inout_ __socklen_t *avalsize ); } 119 AUE_NULL RESERVED 120 AUE_READV STD|CAPENABLED { ssize_t readv( int fd, _In_reads_(iovcnt) _Contains_long_ptr_ const struct iovec *iovp, u_int iovcnt ); } 121 AUE_WRITEV STD|CAPENABLED { ssize_t writev( int fd, _In_reads_(iovcnt) _Contains_long_ptr_ const struct iovec *iovp, u_int iovcnt ); } 122 AUE_SETTIMEOFDAY STD { int settimeofday( _In_ _Contains_long_timet_ const struct timeval *tv, _In_opt_ const struct timezone *tzp ); } 123 AUE_FCHOWN STD|CAPENABLED { int fchown( int fd, int uid, int gid ); } 124 AUE_FCHMOD STD|CAPENABLED { int fchmod( int fd, mode_t mode ); } 125 AUE_RECVFROM COMPAT|CAPENABLED { int recvfrom( int s, _Out_writes_(len) void *buf, size_t len, int flags, _Out_writes_bytes_(*fromlenaddr) struct sockaddr *from, _Inout_ __socklen_t *fromlenaddr ); } 126 AUE_SETREUID STD|CAPENABLED { int setreuid( int ruid, int euid ); } 127 AUE_SETREGID STD|CAPENABLED { int setregid( int rgid, int egid ); } 128 AUE_RENAME STD { int rename( _In_z_ const char *from, _In_z_ const char *to ); } 129 AUE_TRUNCATE COMPAT { int truncate( _In_z_ const char *path, long length ); } 130 AUE_FTRUNCATE COMPAT|CAPENABLED { int ftruncate( int fd, long length ); } 131 AUE_FLOCK STD|CAPENABLED { int flock( int fd, int how ); } 132 AUE_MKFIFO STD { int mkfifo( _In_z_ const char *path, mode_t mode ); } 133 AUE_SENDTO STD|CAPENABLED { ssize_t sendto( int s, _In_reads_bytes_(len) const void *buf, size_t len, int flags, _In_reads_bytes_opt_(tolen) const struct sockaddr *to, __socklen_t tolen ); } 134 AUE_SHUTDOWN STD|CAPENABLED { int shutdown( int s, int how ); } 135 AUE_SOCKETPAIR STD|CAPENABLED { int socketpair( int domain, int type, int protocol, _Out_writes_(2) int *rsv ); } 136 AUE_MKDIR STD { int mkdir( _In_z_ const char *path, mode_t mode ); } 137 AUE_RMDIR STD { int rmdir( _In_z_ const char *path ); } 138 AUE_UTIMES STD { int utimes( _In_z_ const char *path, _In_ _Contains_long_timet_ const struct timeval *tptr ); } 139 AUE_NULL OBSOL sigreturn 140 AUE_ADJTIME STD { int adjtime( _In_ _Contains_long_timet_ const struct timeval *delta, _Out_opt_ _Contains_long_timet_ struct timeval *olddelta ); } 141 AUE_GETPEERNAME COMPAT|CAPENABLED { int getpeername( int fdes, _Out_writes_bytes_(*alen) struct sockaddr *asa, _Inout_opt_ __socklen_t *alen ); } 142 AUE_SYSCTL COMPAT|CAPENABLED { long gethostid(void); } 143 AUE_SYSCTL COMPAT { int sethostid( long hostid ); } 144 AUE_GETRLIMIT COMPAT|CAPENABLED { int getrlimit( u_int which, _Out_ struct orlimit *rlp ); } 145 AUE_SETRLIMIT COMPAT|CAPENABLED { int setrlimit( u_int which, _Out_ struct orlimit *rlp ); } 146 AUE_KILLPG COMPAT { int killpg( int pgid, int signum ); } 147 AUE_SETSID STD|CAPENABLED { int setsid(void); } 148 AUE_QUOTACTL STD { int quotactl( _In_z_ 
const char *path, int cmd, int uid, _In_ void *arg ); } 149 AUE_O_QUOTA COMPAT { int quota(void); } 150 AUE_GETSOCKNAME COMPAT|CAPENABLED { int getsockname( int fdes, _Out_writes_bytes_(*alen) struct sockaddr *asa, _Inout_ __socklen_t *alen ); } 151-153 AUE_NULL RESERVED ; 154 is initialised by the NLM code, if present. 154 AUE_NULL NOSTD { int nlm_syscall( int debug_level, int grace_period, int addr_count, _In_reads_(addr_count) char **addrs ); } ; 155 is initialized by the NFS code, if present. 155 AUE_NFS_SVC NOSTD { int nfssvc( int flag, _In_ void *argp ); } 156 AUE_GETDIRENTRIES COMPAT|CAPENABLED { int getdirentries( int fd, _Out_writes_bytes_(count) char *buf, u_int count, _Out_opt_ long *basep ); } 157 AUE_STATFS COMPAT4 { int statfs( _In_z_ const char *path, _Out_ _Contains_long_ struct ostatfs *buf ); } 158 AUE_FSTATFS COMPAT4|CAPENABLED { int fstatfs( int fd, _Out_ _Contains_long_ struct ostatfs *buf ); } 159 AUE_NULL RESERVED 160 AUE_LGETFH STD { int lgetfh( _In_z_ const char *fname, _Out_ struct fhandle *fhp ); } 161 AUE_NFS_GETFH STD { int getfh( _In_z_ const char *fname, _Out_ struct fhandle *fhp ); } 162 AUE_SYSCTL COMPAT4|CAPENABLED { int getdomainname( _Out_writes_z_(len) char *domainname, int len ); } 163 AUE_SYSCTL COMPAT4 { int setdomainname( _In_reads_z_(len) char *domainname, int len ); } 164 AUE_NULL COMPAT4 { int uname( _Out_ struct utsname *name ); } 165 AUE_SYSARCH STD|CAPENABLED { int sysarch( int op, _In_z_ char *parms ); } 166 AUE_RTPRIO STD|CAPENABLED { int rtprio( int function, pid_t pid, _Inout_ struct rtprio *rtp ); } 167-168 AUE_NULL RESERVED 169 AUE_SEMSYS NOSTD { int semsys( int which, int a2, int a3, int a4, int a5 ); } ; XXX should be { int semsys(int which, ...); } 170 AUE_MSGSYS NOSTD { int msgsys( int which, int a2, int a3, int a4, int a5, int a6 ); } ; XXX should be { int msgsys(int which, ...); } 171 AUE_SHMSYS NOSTD { int shmsys( int which, int a2, int a3, int a4 ); } ; XXX should be { int shmsys(int which, ...); } 172 AUE_NULL RESERVED 173 AUE_PREAD COMPAT6|CAPENABLED { ssize_t pread( int fd, _Out_writes_bytes_(nbyte) void *buf, size_t nbyte, int pad, off_t offset ); } 174 AUE_PWRITE COMPAT6|CAPENABLED { ssize_t pwrite( int fd, _In_reads_bytes_(nbyte) const void *buf, size_t nbyte, int pad, off_t offset ); } 175 AUE_SETFIB STD { int setfib( int fibnum ); } 176 AUE_NTP_ADJTIME STD { int ntp_adjtime( _Inout_ _Contains_long_ struct timex *tp ); } 177-180 AUE_NULL RESERVED 181 AUE_SETGID STD|CAPENABLED { int setgid( gid_t gid ); } 182 AUE_SETEGID STD|CAPENABLED { int setegid( gid_t egid ); } 183 AUE_SETEUID STD|CAPENABLED { int seteuid( uid_t euid ); } 184 AUE_NULL OBSOL lfs_bmapv 185 AUE_NULL OBSOL lfs_markv 186 AUE_NULL OBSOL lfs_segclean 187 AUE_NULL OBSOL lfs_segwait 188 AUE_STAT COMPAT11 { int stat( _In_z_ const char *path, _Out_ _Contains_timet_ struct freebsd11_stat *ub ); } 189 AUE_FSTAT COMPAT11|CAPENABLED { int fstat( int fd, _Out_ _Contains_timet_ struct freebsd11_stat *sb ); } 190 AUE_LSTAT COMPAT11 { int lstat( _In_z_ const char *path, _Out_ _Contains_timet_ struct freebsd11_stat *ub ); } 191 AUE_PATHCONF STD { int pathconf( _In_z_ const char *path, int name ); } 192 AUE_FPATHCONF STD|CAPENABLED { int fpathconf( int fd, int name ); } 193 AUE_NULL RESERVED 194 AUE_GETRLIMIT STD|CAPENABLED { int getrlimit( u_int which, _Out_ struct rlimit *rlp ); } 195 AUE_SETRLIMIT STD|CAPENABLED { int setrlimit( u_int which, _In_ struct rlimit *rlp ); } 196 AUE_GETDIRENTRIES COMPAT11|CAPENABLED { int getdirentries( int fd, _Out_writes_bytes_(count) char 
*buf, u_int count, _Out_opt_ long *basep ); } 197 AUE_MMAP COMPAT6|CAPENABLED { void *mmap( _In_ void *addr, size_t len, int prot, int flags, int fd, int pad, off_t pos ); } 198 AUE_NULL SYSMUX { int __syscall( int64_t number, ... ); } 199 AUE_LSEEK COMPAT6|CAPENABLED { off_t lseek( int fd, int pad, off_t offset, int whence ); } 200 AUE_TRUNCATE COMPAT6 { int truncate( _In_z_ const char *path, int pad, off_t length ); } 201 AUE_FTRUNCATE COMPAT6|CAPENABLED { int ftruncate( int fd, int pad, off_t length ); } 202 AUE_SYSCTL STD|CAPENABLED { int __sysctl( _In_reads_(namelen) int *name, u_int namelen, _Out_writes_bytes_opt_(*oldlenp) void *old, _Inout_opt_ size_t *oldlenp, _In_reads_bytes_opt_(newlen) const void *new, size_t newlen ); } 203 AUE_MLOCK STD|CAPENABLED { int mlock( _In_ const void *addr, size_t len ); } 204 AUE_MUNLOCK STD|CAPENABLED { int munlock( _In_ const void *addr, size_t len ); } 205 AUE_UNDELETE STD { int undelete( _In_z_ const char *path ); } 206 AUE_FUTIMES STD|CAPENABLED { int futimes( int fd, _In_reads_(2) _Contains_long_timet_ const struct timeval *tptr ); } 207 AUE_GETPGID STD|CAPENABLED { int getpgid( pid_t pid ); } 208 AUE_NULL RESERVED 209 AUE_POLL STD|CAPENABLED { int poll( _Inout_updates_(nfds) struct pollfd *fds, u_int nfds, int timeout ); } ; ; The following are reserved for loadable syscalls ; 210 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 211 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 212 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 213 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 214 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 215 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 216 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 217 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 218 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 219 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int 220 AUE_SEMCTL COMPAT7|NOSTD { int __semctl( int semid, int semnum, int cmd, _Contains_ptr_ union semun_old *arg ); } 221 AUE_SEMGET NOSTD { int semget( key_t key, int nsems, int semflg ); } 222 AUE_SEMOP NOSTD { int semop( int semid, _In_reads_(nsops) struct sembuf *sops, size_t nsops ); } 223 AUE_NULL OBSOL semconfig 224 AUE_MSGCTL COMPAT7|NOSTD { int msgctl( int msqid, int cmd, _Contains_long_ptr_timet_ struct msqid_ds_old *buf ); } 225 AUE_MSGGET NOSTD { int msgget( key_t key, int msgflg ); } 226 AUE_MSGSND NOSTD { int msgsnd( int msqid, _In_reads_bytes_(msgsz) _Contains_long_ const void *msgp, size_t msgsz, int msgflg ); } 227 AUE_MSGRCV NOSTD { ssize_t msgrcv( int msqid, _Out_writes_bytes_(msgsz) _Contains_long_ void *msgp, size_t msgsz, long msgtyp, int msgflg ); } 228 AUE_SHMAT NOSTD { void *shmat( int shmid, _In_ const void *shmaddr, int shmflg ); } 229 AUE_SHMCTL COMPAT7|NOSTD { int shmctl( int shmid, int cmd, _Inout_opt_ _Contains_long_ struct shmid_ds_old *buf ); } 230 AUE_SHMDT NOSTD { int shmdt( _In_ const void *shmaddr ); } 231 AUE_SHMGET NOSTD { int shmget( key_t key, size_t size, int shmflg ); } 232 AUE_NULL STD|CAPENABLED { int clock_gettime( clockid_t clock_id, _Out_ _Contains_long_timet_ struct timespec *tp ); } 233 AUE_CLOCK_SETTIME STD { int clock_settime( clockid_t clock_id, _In_ _Contains_long_timet_ const struct timespec *tp ); } 234 AUE_NULL STD|CAPENABLED { int clock_getres( clockid_t clock_id, _Out_ _Contains_long_timet_ struct timespec *tp ); } 235 AUE_NULL STD|CAPENABLED { int ktimer_create( clockid_t clock_id, _In_ 
_Contains_long_ptr_ struct sigevent *evp, _Out_ int *timerid ); } 236 AUE_NULL STD|CAPENABLED { int ktimer_delete( int timerid ); } 237 AUE_NULL STD|CAPENABLED { int ktimer_settime( int timerid, int flags, _In_ _Contains_long_timet_ const struct itimerspec *value, _Out_opt_ _Contains_long_timet_ struct itimerspec *ovalue ); } 238 AUE_NULL STD|CAPENABLED { int ktimer_gettime( int timerid, _Out_ _Contains_long_timet_ struct itimerspec *value ); } 239 AUE_NULL STD|CAPENABLED { int ktimer_getoverrun( int timerid ); } 240 AUE_NULL STD|CAPENABLED { int nanosleep( _In_ _Contains_long_timet_ const struct timespec *rqtp, _Out_opt_ _Contains_long_timet_ struct timespec *rmtp ); } 241 AUE_NULL STD { int ffclock_getcounter( _Out_ ffcounter *ffcount ); } 242 AUE_NULL STD { int ffclock_setestimate( _In_ _Contains_timet_ struct ffclock_estimate *cest ); } 243 AUE_NULL STD { int ffclock_getestimate( _Out_ _Contains_timet_ struct ffclock_estimate *cest ); } 244 AUE_NULL STD { int clock_nanosleep( clockid_t clock_id, int flags, _In_ _Contains_long_timet_ const struct timespec *rqtp, _Out_opt_ _Contains_long_timet_ struct timespec *rmtp ); } 245-246 AUE_NULL RESERVED 247 AUE_NULL STD { int clock_getcpuclockid2( id_t id, int which, _Out_ clockid_t *clock_id ); } 248 AUE_NULL STD|CAPENABLED { int ntp_gettime( _Out_ _Contains_long_timet_ struct ntptimeval *ntvp ); } 249 AUE_NULL RESERVED 250 AUE_MINHERIT STD|CAPENABLED { int minherit( _In_ void *addr, size_t len, int inherit ); } 251 AUE_RFORK STD|CAPENABLED { int rfork( int flags ); } 252 AUE_POLL OBSOL openbsd_poll 253 AUE_ISSETUGID STD|CAPENABLED { int issetugid(void); } 254 AUE_LCHOWN STD { int lchown( _In_z_ const char *path, int uid, int gid ); } 255 AUE_AIO_READ STD|CAPENABLED { int aio_read( _Inout_ _Contains_long_ptr_ struct aiocb *aiocbp ); } 256 AUE_AIO_WRITE STD|CAPENABLED { int aio_write( _Inout_ _Contains_long_ptr_ struct aiocb *aiocbp ); } 257 AUE_LIO_LISTIO STD|CAPENABLED { int lio_listio( int mode, _Inout_updates_(nent) _Contains_long_ptr_ struct aiocb * const *acb_list, int nent, _In_opt_ _Contains_long_ptr_ struct sigevent *sig ); } 258-271 AUE_NULL RESERVED 272 AUE_O_GETDENTS COMPAT11|CAPENABLED { int getdents( int fd, _Out_writes_bytes_(count) char *buf, size_t count ); } 273 AUE_NULL RESERVED 274 AUE_LCHMOD STD { int lchmod( _In_z_ const char *path, mode_t mode ); } 275 AUE_NULL OBSOL netbsd_lchown 276 AUE_LUTIMES STD { int lutimes( _In_z_ const char *path, _In_ _Contains_long_timet_ const struct timeval *tptr ); } 277 AUE_NULL OBSOL netbsd_msync 278 AUE_STAT COMPAT11 { int nstat( _In_z_ const char *path, _Out_ _Contains_long_timet_ struct nstat *ub ); } 279 AUE_FSTAT COMPAT11 { int nfstat( int fd, _Out_ _Contains_long_timet_ struct nstat *sb ); } 280 AUE_LSTAT COMPAT11 { int nlstat( _In_z_ const char *path, _Out_ _Contains_long_timet_ struct nstat *ub ); } 281-288 AUE_NULL RESERVED 289 AUE_PREADV STD|CAPENABLED { ssize_t preadv( int fd, _In_reads_(iovcnt) _Contains_long_ptr_ struct iovec *iovp, u_int iovcnt, off_t offset ); } 290 AUE_PWRITEV STD|CAPENABLED { ssize_t pwritev( int fd, _In_reads_(iovcnt) _Contains_long_ptr_ struct iovec *iovp, u_int iovcnt, off_t offset ); } 291-296 AUE_NULL RESERVED 297 AUE_FHSTATFS COMPAT4 { int fhstatfs( _In_ const struct fhandle *u_fhp, _Out_ _Contains_long_ struct ostatfs *buf ); } 298 AUE_FHOPEN STD { int fhopen( _In_ const struct fhandle *u_fhp, int flags ); } 299 AUE_FHSTAT COMPAT11 { int fhstat( _In_ const struct fhandle *u_fhp, _Out_ _Contains_long_timet_ struct freebsd11_stat *sb ); } 300 AUE_NULL 
STD { int modnext( int modid ); } 301 AUE_NULL STD { int modstat( int modid, _Out_ _Contains_long_ struct module_stat *stat ); } 302 AUE_NULL STD { int modfnext( int modid ); } 303 AUE_NULL STD { int modfind( _In_z_ const char *name ); } 304 AUE_MODLOAD STD { int kldload( _In_z_ const char *file ); } 305 AUE_MODUNLOAD STD { int kldunload( int fileid ); } 306 AUE_NULL STD { int kldfind( _In_z_ const char *file ); } 307 AUE_NULL STD { int kldnext( int fileid ); } 308 AUE_NULL STD { int kldstat( int fileid, _Out_ _Contains_long_ptr_ struct kld_file_stat *stat ); } 309 AUE_NULL STD { int kldfirstmod( int fileid ); } 310 AUE_GETSID STD|CAPENABLED { int getsid( pid_t pid ); } 311 AUE_SETRESUID STD|CAPENABLED { int setresuid( uid_t ruid, uid_t euid, uid_t suid ); } 312 AUE_SETRESGID STD|CAPENABLED { int setresgid( gid_t rgid, gid_t egid, gid_t sgid ); } 313 AUE_NULL OBSOL signanosleep 314 AUE_AIO_RETURN STD|CAPENABLED { ssize_t aio_return( _Inout_ _Contains_long_ptr_ struct aiocb *aiocbp ); } 315 AUE_AIO_SUSPEND STD|CAPENABLED { int aio_suspend( _Inout_updates_(nent) _Contains_long_ptr_ const struct aiocb * const * aiocbp, int nent, _In_opt_ _Contains_long_timet_ const struct timespec *timeout ); } 316 AUE_AIO_CANCEL STD|CAPENABLED { int aio_cancel( int fd, _In_opt_ _Contains_long_ptr_ struct aiocb *aiocbp ); } 317 AUE_AIO_ERROR STD|CAPENABLED { int aio_error( _In_ _Contains_long_ptr_ struct aiocb *aiocbp ); } 318 AUE_AIO_READ COMPAT6|CAPENABLED { int aio_read( _Inout_ _Contains_long_ptr_ struct oaiocb *aiocbp ); } 319 AUE_AIO_WRITE COMPAT6|CAPENABLED { int aio_write( _Inout_ _Contains_long_ptr_ struct oaiocb *aiocbp ); } 320 AUE_LIO_LISTIO COMPAT6|CAPENABLED { int lio_listio( int mode, _Inout_updates_(nent) _Contains_long_ptr_ struct oaiocb * const *acb_list, int nent, _In_opt_ _Contains_ptr_ struct osigevent *sig ); } 321 AUE_NULL STD|CAPENABLED|NOLIB { int yield(void); } 322 AUE_NULL OBSOL thr_sleep 323 AUE_NULL OBSOL thr_wakeup 324 AUE_MLOCKALL STD|CAPENABLED { int mlockall( int how ); } 325 AUE_MUNLOCKALL STD|CAPENABLED { int munlockall(void); } 326 AUE_GETCWD STD { int __getcwd( _Out_writes_z_(buflen) char *buf, size_t buflen ); } 327 AUE_NULL STD|CAPENABLED { int sched_setparam( pid_t pid, _In_ const struct sched_param *param ); } 328 AUE_NULL STD|CAPENABLED { int sched_getparam( pid_t pid, _Out_ struct sched_param *param ); } 329 AUE_NULL STD|CAPENABLED { int sched_setscheduler( pid_t pid, int policy, _In_ const struct sched_param *param ); } 330 AUE_NULL STD|CAPENABLED { int sched_getscheduler( pid_t pid ); } 331 AUE_NULL STD|CAPENABLED { int sched_yield(void); } 332 AUE_NULL STD|CAPENABLED { int sched_get_priority_max( int policy ); } 333 AUE_NULL STD|CAPENABLED { int sched_get_priority_min( int policy ); } 334 AUE_NULL STD|CAPENABLED { int sched_rr_get_interval( pid_t pid, _Out_ _Contains_long_timet_ struct timespec *interval ); } 335 AUE_NULL STD|CAPENABLED { int utrace( _In_reads_bytes_(len) const void *addr, size_t len ); } 336 AUE_SENDFILE COMPAT4|CAPENABLED { int sendfile( int fd, int s, off_t offset, size_t nbytes, _In_opt_ _Contains_ptr_ struct sf_hdtr *hdtr, _Out_opt_ off_t *sbytes, int flags ); } 337 AUE_NULL STD { int kldsym( int fileid, int cmd, _In_ _Contains_long_ptr_ void *data ); } 338 AUE_JAIL STD { int jail( _In_ _Contains_ptr_ struct jail *jail ); } 339 AUE_NULL NOSTD|NOTSTATIC { int nnpfs_syscall( int operation, char *a_pathP, int a_opcode, void *a_paramsP, int a_followSymlinks ); } 340 AUE_SIGPROCMASK STD|CAPENABLED { int sigprocmask( int how, _In_opt_ const 
sigset_t *set, _Out_opt_ sigset_t *oset ); } 341 AUE_SIGSUSPEND STD|CAPENABLED { int sigsuspend( _In_ const sigset_t *sigmask ); } 342 AUE_SIGACTION COMPAT4|CAPENABLED { int sigaction( int sig, _In_opt_ _Contains_ptr_ const struct sigaction *act, _Out_opt_ _Contains_ptr_ struct sigaction *oact ); } 343 AUE_SIGPENDING STD|CAPENABLED { int sigpending( _In_ sigset_t *set ); } 344 AUE_SIGRETURN COMPAT4|CAPENABLED { int sigreturn( _In_ _Contains_long_ptr_ const struct freebsd4_ucontext *sigcntxp ); } 345 AUE_SIGWAIT STD|CAPENABLED { int sigtimedwait( _In_ const sigset_t *set, _Out_opt_ _Contains_long_ptr_ struct __siginfo *info, _In_opt_ _Contains_long_timet_ const struct timespec *timeout ); } 346 AUE_NULL STD|CAPENABLED { int sigwaitinfo( _In_ const sigset_t *set, _Out_opt_ _Contains_long_ptr_ struct __siginfo *info ); } 347 AUE_ACL_GET_FILE STD { int __acl_get_file( _In_z_ const char *path, __acl_type_t type, _Out_ struct acl *aclp ); } 348 AUE_ACL_SET_FILE STD { int __acl_set_file( _In_z_ const char *path, __acl_type_t type, _In_ struct acl *aclp ); } 349 AUE_ACL_GET_FD STD|CAPENABLED { int __acl_get_fd( int filedes, __acl_type_t type, _Out_ struct acl *aclp ); } 350 AUE_ACL_SET_FD STD|CAPENABLED { int __acl_set_fd( int filedes, __acl_type_t type, _In_ struct acl *aclp ); } 351 AUE_ACL_DELETE_FILE STD { int __acl_delete_file( _In_z_ const char *path, __acl_type_t type ); } 352 AUE_ACL_DELETE_FD STD|CAPENABLED { int __acl_delete_fd( int filedes, __acl_type_t type ); } 353 AUE_ACL_CHECK_FILE STD { int __acl_aclcheck_file( _In_z_ const char *path, __acl_type_t type, _In_ struct acl *aclp ); } 354 AUE_ACL_CHECK_FD STD|CAPENABLED { int __acl_aclcheck_fd( int filedes, __acl_type_t type, _In_ struct acl *aclp ); } 355 AUE_EXTATTRCTL STD { int extattrctl( _In_z_ const char *path, int cmd, _In_z_opt_ const char *filename, int attrnamespace, _In_z_ const char *attrname ); } 356 AUE_EXTATTR_SET_FILE STD { ssize_t extattr_set_file( _In_z_ const char *path, int attrnamespace, _In_z_ const char *attrname, _In_reads_bytes_(nbytes) void *data, size_t nbytes ); } 357 AUE_EXTATTR_GET_FILE STD { ssize_t extattr_get_file( _In_z_ const char *path, int attrnamespace, _In_z_ const char *attrname, _Out_writes_bytes_(nbytes) void *data, size_t nbytes ); } 358 AUE_EXTATTR_DELETE_FILE STD { int extattr_delete_file( _In_z_ const char *path, int attrnamespace, _In_z_ const char *attrname ); } 359 AUE_AIO_WAITCOMPLETE STD|CAPENABLED { ssize_t aio_waitcomplete( _Outptr_result_maybenull_ struct aiocb **aiocbp, _In_opt_ _Contains_long_timet_ struct timespec *timeout ); } 360 AUE_GETRESUID STD|CAPENABLED { int getresuid( _Out_opt_ uid_t *ruid, _Out_opt_ uid_t *euid, _Out_opt_ uid_t *suid ); } 361 AUE_GETRESGID STD|CAPENABLED { int getresgid( _Out_opt_ gid_t *rgid, _Out_opt_ gid_t *egid, _Out_opt_ gid_t *sgid ); } 362 AUE_KQUEUE STD|CAPENABLED { int kqueue(void); } 363 AUE_KEVENT COMPAT11|CAPENABLED { int kevent( int fd, _In_reads_opt_(nchanges) _Contains_ptr_ const struct freebsd11_kevent *changelist, int nchanges, _Out_writes_opt_(nevents) _Contains_ptr_ struct freebsd11_kevent *eventlist, int nevents, _In_opt_ _Contains_long_timet_ const struct timespec *timeout ); } 364 AUE_NULL OBSOL __cap_get_proc 365 AUE_NULL OBSOL __cap_set_proc 366 AUE_NULL OBSOL __cap_get_fd 367 AUE_NULL OBSOL __cap_get_file 368 AUE_NULL OBSOL __cap_set_fd 369 AUE_NULL OBSOL __cap_set_file 370 AUE_NULL RESERVED 371 AUE_EXTATTR_SET_FD STD|CAPENABLED { ssize_t extattr_set_fd( int fd, int attrnamespace, _In_z_ const char *attrname, 
_In_reads_bytes_(nbytes) void *data, size_t nbytes ); } 372 AUE_EXTATTR_GET_FD STD|CAPENABLED { ssize_t extattr_get_fd( int fd, int attrnamespace, _In_z_ const char *attrname, _Out_writes_bytes_(nbytes) void *data, size_t nbytes ); } 373 AUE_EXTATTR_DELETE_FD STD|CAPENABLED { int extattr_delete_fd( int fd, int attrnamespace, _In_z_ const char *attrname ); } 374 AUE_SETUGID STD { int __setugid( int flag ); } 375 AUE_NULL OBSOL nfsclnt 376 AUE_EACCESS STD { int eaccess( _In_z_ const char *path, int amode ); } 377 AUE_NULL NOSTD|NOTSTATIC { int afs3_syscall( long syscall, long parm1, long parm2, long parm3, long parm4, long parm5, long parm6 ); } 378 AUE_NMOUNT STD { int nmount( _In_reads_(iovcnt) _Contains_long_ptr_ struct iovec *iovp, unsigned int iovcnt, int flags ); } 379 AUE_NULL OBSOL kse_exit 380 AUE_NULL OBSOL kse_wakeup 381 AUE_NULL OBSOL kse_create 382 AUE_NULL OBSOL kse_thr_interrupt 383 AUE_NULL OBSOL kse_release 384 AUE_NULL STD|CAPENABLED { int __mac_get_proc( _In_ _Contains_long_ptr_ struct mac *mac_p ); } 385 AUE_NULL STD|CAPENABLED { int __mac_set_proc( _In_ _Contains_long_ptr_ struct mac *mac_p ); } 386 AUE_NULL STD|CAPENABLED { int __mac_get_fd( int fd, _In_ _Contains_long_ptr_ struct mac *mac_p ); } 387 AUE_NULL STD { int __mac_get_file( _In_z_ const char *path_p, _In_ _Contains_long_ptr_ struct mac *mac_p ); } 388 AUE_NULL STD|CAPENABLED { int __mac_set_fd( int fd, _In_ _Contains_long_ptr_ struct mac *mac_p ); } 389 AUE_NULL STD { int __mac_set_file( _In_z_ const char *path_p, _In_ _Contains_long_ptr_ struct mac *mac_p ); } 390 AUE_NULL STD { int kenv( int what, _In_z_opt_ const char *name, _Inout_updates_opt_(len) char *value, int len ); } 391 AUE_LCHFLAGS STD { int lchflags( _In_z_ const char *path, u_long flags ); } 392 AUE_NULL STD|CAPENABLED { int uuidgen( _Out_writes_(count) struct uuid *store, int count ); } 393 AUE_SENDFILE STD|CAPENABLED { int sendfile( int fd, int s, off_t offset, size_t nbytes, _In_opt_ _Contains_ptr_ struct sf_hdtr *hdtr, _Out_opt_ off_t *sbytes, int flags ); } 394 AUE_NULL STD { int mac_syscall( _In_z_ const char *policy, int call, _In_opt_ void *arg ); } 395 AUE_GETFSSTAT COMPAT11 { int getfsstat( _Out_writes_bytes_opt_(bufsize) struct freebsd11_statfs *buf, long bufsize, int mode ); } 396 AUE_STATFS COMPAT11 { int statfs( _In_z_ const char *path, _Out_ struct freebsd11_statfs *buf ); } 397 AUE_FSTATFS COMPAT11|CAPENABLED { int fstatfs( int fd, _Out_ struct freebsd11_statfs *buf ); } 398 AUE_FHSTATFS COMPAT11 { int fhstatfs( _In_ const struct fhandle *u_fhp, _Out_ struct freebsd11_statfs *buf ); } 399 AUE_NULL RESERVED 400 AUE_SEMCLOSE NOSTD { int ksem_close( semid_t id ); } 401 AUE_SEMPOST NOSTD { int ksem_post( semid_t id ); } 402 AUE_SEMWAIT NOSTD { int ksem_wait( semid_t id ); } 403 AUE_SEMTRYWAIT NOSTD { int ksem_trywait( semid_t id ); } 404 AUE_SEMINIT NOSTD { int ksem_init( _Out_ semid_t *idp, unsigned int value ); } 405 AUE_SEMOPEN NOSTD { int ksem_open( _Out_ semid_t *idp, _In_z_ const char *name, int oflag, mode_t mode, unsigned int value ); } 406 AUE_SEMUNLINK NOSTD { int ksem_unlink( _In_z_ const char *name ); } 407 AUE_SEMGETVALUE NOSTD { int ksem_getvalue( semid_t id, _Out_ int *val ); } 408 AUE_SEMDESTROY NOSTD { int ksem_destroy( semid_t id ); } 409 AUE_NULL STD { int __mac_get_pid( pid_t pid, _In_ _Contains_long_ptr_ struct mac *mac_p ); } 410 AUE_NULL STD { int __mac_get_link( _In_z_ const char *path_p, _In_ _Contains_long_ptr_ struct mac *mac_p ); } 411 AUE_NULL STD { int __mac_set_link( _In_z_ const char *path_p, _In_ 
_Contains_long_ptr_ struct mac *mac_p ); } 412 AUE_EXTATTR_SET_LINK STD { ssize_t extattr_set_link( _In_z_ const char *path, int attrnamespace, _In_z_ const char *attrname, _In_reads_bytes_(nbytes) void *data, size_t nbytes ); } 413 AUE_EXTATTR_GET_LINK STD { ssize_t extattr_get_link( _In_z_ const char *path, int attrnamespace, _In_z_ const char *attrname, _Out_writes_bytes_(nbytes) void *data, size_t nbytes ); } 414 AUE_EXTATTR_DELETE_LINK STD { int extattr_delete_link( _In_z_ const char *path, int attrnamespace, _In_z_ const char *attrname ); } 415 AUE_NULL STD { int __mac_execve( _In_z_ const char *fname, _In_ char **argv, _In_ char **envv, _In_ _Contains_long_ptr_ struct mac *mac_p ); } 416 AUE_SIGACTION STD|CAPENABLED { int sigaction( int sig, _In_opt_ _Contains_ptr_ const struct sigaction *act, _Out_opt_ _Contains_ptr_ struct sigaction *oact ); } 417 AUE_SIGRETURN STD|CAPENABLED { int sigreturn( _In_ _Contains_long_ptr_ const struct __ucontext *sigcntxp ); } 418-420 AUE_NULL RESERVED 421 AUE_NULL STD|CAPENABLED { int getcontext( _Out_ _Contains_long_ptr_ struct __ucontext *ucp ); } 422 AUE_NULL STD|CAPENABLED { int setcontext( _In_ _Contains_long_ptr_ const struct __ucontext *ucp ); } 423 AUE_NULL STD { int swapcontext( _Out_ _Contains_long_ptr_ struct __ucontext *oucp, _In_ _Contains_long_ptr_ const struct __ucontext *ucp ); } 424 AUE_SWAPOFF COMPAT13 { int swapoff( _In_z_ const char *name ); } 425 AUE_ACL_GET_LINK STD { int __acl_get_link( _In_z_ const char *path, __acl_type_t type, _Out_ struct acl *aclp ); } 426 AUE_ACL_SET_LINK STD { int __acl_set_link( _In_z_ const char *path, __acl_type_t type, _In_ struct acl *aclp ); } 427 AUE_ACL_DELETE_LINK STD { int __acl_delete_link( _In_z_ const char *path, __acl_type_t type ); } 428 AUE_ACL_CHECK_LINK STD { int __acl_aclcheck_link( _In_z_ const char *path, __acl_type_t type, _In_ struct acl *aclp ); } 429 AUE_SIGWAIT STD|CAPENABLED { int sigwait( _In_ const sigset_t *set, _Out_ int *sig ); } 430 AUE_THR_CREATE STD|CAPENABLED { int thr_create( _In_ _Contains_long_ptr_ ucontext_t *ctx, _Out_ long *id, int flags ); } 431 AUE_THR_EXIT STD|CAPENABLED { void thr_exit( _Out_opt_ long *state ); } 432 AUE_NULL STD|CAPENABLED { int thr_self( _Out_ long *id ); } 433 AUE_THR_KILL STD|CAPENABLED { int thr_kill( long id, int sig ); } 434 AUE_NULL COMPAT10 { int _umtx_lock( _Inout_ struct umtx *umtx ); } 435 AUE_NULL COMPAT10 { int _umtx_unlock( _Inout_ struct umtx *umtx ); } 436 AUE_JAIL_ATTACH STD { int jail_attach( int jid ); } 437 AUE_EXTATTR_LIST_FD STD|CAPENABLED { ssize_t extattr_list_fd( int fd, int attrnamespace, _Out_writes_bytes_opt_(nbytes) void *data, size_t nbytes ); } 438 AUE_EXTATTR_LIST_FILE STD { ssize_t extattr_list_file( _In_z_ const char *path, int attrnamespace, _Out_writes_bytes_opt_(nbytes) void *data, size_t nbytes ); } 439 AUE_EXTATTR_LIST_LINK STD { ssize_t extattr_list_link( _In_z_ const char *path, int attrnamespace, _Out_writes_bytes_opt_(nbytes) void *data, size_t nbytes ); } 440 AUE_NULL OBSOL kse_switchin 441 AUE_SEMWAIT NOSTD { int ksem_timedwait( semid_t id, _In_opt_ _Contains_long_timet_ const struct timespec *abstime ); } 442 AUE_NULL STD|CAPENABLED { int thr_suspend( _In_opt_ _Contains_long_timet_ const struct timespec *timeout ); } 443 AUE_NULL STD|CAPENABLED { int thr_wake( long id ); } 444 AUE_MODUNLOAD STD { int kldunloadf( int fileid, int flags ); } 445 AUE_AUDIT STD { int audit( _In_reads_bytes_(length) const void *record, u_int length ); } 446 AUE_AUDITON STD { int auditon( int cmd, _In_opt_ void *data, 
u_int length ); } 447 AUE_GETAUID STD|CAPENABLED { int getauid( _Out_ uid_t *auid ); } 448 AUE_SETAUID STD|CAPENABLED { int setauid( _In_ uid_t *auid ); } 449 AUE_GETAUDIT STD|CAPENABLED { int getaudit( _Out_ struct auditinfo *auditinfo ); } 450 AUE_SETAUDIT STD|CAPENABLED { int setaudit( _In_ struct auditinfo *auditinfo ); } 451 AUE_GETAUDIT_ADDR STD|CAPENABLED { int getaudit_addr( _Out_writes_bytes_(length) struct auditinfo_addr *auditinfo_addr, u_int length ); } 452 AUE_SETAUDIT_ADDR STD|CAPENABLED { int setaudit_addr( _In_reads_bytes_(length) struct auditinfo_addr *auditinfo_addr, u_int length ); } 453 AUE_AUDITCTL STD { int auditctl( _In_z_ const char *path ); } 454 AUE_NULL STD|CAPENABLED { int _umtx_op( _Inout_ void *obj, int op, u_long val, _In_ void *uaddr1, _In_ void *uaddr2 ); } 455 AUE_THR_NEW STD|CAPENABLED { int thr_new( _In_ _Contains_long_ptr_ struct thr_param *param, int param_size ); } 456 AUE_NULL STD|CAPENABLED { int sigqueue( pid_t pid, int signum, _In_ void *value ); } 457 AUE_MQ_OPEN NOSTD { int kmq_open( _In_z_ const char *path, int flags, mode_t mode, _In_opt_ _Contains_long_ const struct mq_attr *attr ); } 458 AUE_MQ_SETATTR NOSTD|CAPENABLED { int kmq_setattr( int mqd, _In_opt_ _Contains_long_ const struct mq_attr *attr, _Out_opt_ _Contains_long_ struct mq_attr *oattr ); } 459 AUE_MQ_TIMEDRECEIVE NOSTD|CAPENABLED { int kmq_timedreceive( int mqd, _Out_writes_bytes_(msg_len) char *msg_ptr, size_t msg_len, _Out_opt_ unsigned *msg_prio, _In_opt_ _Contains_long_timet_ const struct timespec *abs_timeout ); } 460 AUE_MQ_TIMEDSEND NOSTD|CAPENABLED { int kmq_timedsend( int mqd, _In_reads_bytes_(msg_len) const char *msg_ptr, size_t msg_len, unsigned msg_prio, _In_opt_ _Contains_long_timet_ const struct timespec *abs_timeout ); } 461 AUE_MQ_NOTIFY NOSTD|CAPENABLED { int kmq_notify( int mqd, _In_opt_ _Contains_long_ptr_ const struct sigevent *sigev ); } 462 AUE_MQ_UNLINK NOSTD { int kmq_unlink( _In_z_ const char *path ); } 463 AUE_NULL STD|CAPENABLED { void abort2( _In_z_ const char *why, int nargs, _In_reads_(nargs) void **args ); } 464 AUE_NULL STD|CAPENABLED { int thr_set_name( long id, _In_z_ const char *name ); } 465 AUE_AIO_FSYNC STD|CAPENABLED { int aio_fsync( int op, _In_ _Contains_long_ptr_ struct aiocb *aiocbp ); } 466 AUE_RTPRIO STD|CAPENABLED { int rtprio_thread( int function, lwpid_t lwpid, _Inout_ struct rtprio *rtp ); } 467-470 AUE_NULL RESERVED 471 AUE_SCTP_PEELOFF NOSTD|CAPENABLED { int sctp_peeloff( int sd, uint32_t name ); } 472 AUE_SCTP_GENERIC_SENDMSG NOSTD|CAPENABLED { int sctp_generic_sendmsg( int sd, _In_reads_bytes_(mlen) void *msg, int mlen, _In_reads_bytes_(tolen) const struct sockaddr *to, __socklen_t tolen, _In_opt_ struct sctp_sndrcvinfo *sinfo, int flags ); } 473 AUE_SCTP_GENERIC_SENDMSG_IOV NOSTD|CAPENABLED { int sctp_generic_sendmsg_iov( int sd, _In_reads_(iovlen) _Contains_long_ptr_ struct iovec *iov, int iovlen, _In_reads_bytes_(tolen) const struct sockaddr *to, __socklen_t tolen, _In_opt_ struct sctp_sndrcvinfo *sinfo, int flags ); } 474 AUE_SCTP_GENERIC_RECVMSG NOSTD|CAPENABLED { int sctp_generic_recvmsg( int sd, _In_reads_(iovlen) _Contains_long_ptr_ struct iovec *iov, int iovlen, _Out_writes_bytes_(*fromlenaddr) struct sockaddr *from, _Out_ __socklen_t *fromlenaddr, _In_opt_ struct sctp_sndrcvinfo *sinfo, _Out_opt_ int *msg_flags ); } 475 AUE_PREAD STD|CAPENABLED { ssize_t pread( int fd, _Out_writes_bytes_(nbyte) void *buf, size_t nbyte, off_t offset ); } 476 AUE_PWRITE STD|CAPENABLED { ssize_t pwrite( int fd, _In_reads_bytes_(nbyte) 
const void *buf, size_t nbyte, off_t offset ); } 477 AUE_MMAP STD|CAPENABLED { void *mmap( _In_ void *addr, size_t len, int prot, int flags, int fd, off_t pos ); } 478 AUE_LSEEK STD|CAPENABLED { off_t lseek( int fd, off_t offset, int whence ); } 479 AUE_TRUNCATE STD { int truncate( _In_z_ const char *path, off_t length ); } 480 AUE_FTRUNCATE STD|CAPENABLED { int ftruncate( int fd, off_t length ); } 481 AUE_THR_KILL2 STD { int thr_kill2( pid_t pid, long id, int sig ); } 482 AUE_SHMOPEN COMPAT12|CAPENABLED { int shm_open( _In_z_ const char *path, int flags, mode_t mode ); } 483 AUE_SHMUNLINK STD { int shm_unlink( _In_z_ const char *path ); } 484 AUE_NULL STD { int cpuset( _Out_ cpusetid_t *setid ); } 485 AUE_NULL STD { int cpuset_setid( cpuwhich_t which, id_t id, cpusetid_t setid ); } 486 AUE_NULL STD { int cpuset_getid( cpulevel_t level, cpuwhich_t which, id_t id, _Out_ cpusetid_t *setid ); } 487 AUE_NULL STD|CAPENABLED { int cpuset_getaffinity( cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, _Out_ cpuset_t *mask ); } 488 AUE_NULL STD|CAPENABLED { int cpuset_setaffinity( cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, _Out_ const cpuset_t *mask ); } 489 AUE_FACCESSAT STD|CAPENABLED { int faccessat( int fd, _In_z_ const char *path, int amode, int flag ); } 490 AUE_FCHMODAT STD|CAPENABLED { int fchmodat( int fd, _In_z_ const char *path, mode_t mode, int flag ); } 491 AUE_FCHOWNAT STD|CAPENABLED { int fchownat( int fd, _In_z_ const char *path, uid_t uid, gid_t gid, int flag ); } 492 AUE_FEXECVE STD|CAPENABLED { int fexecve( int fd, _In_ char **argv, _In_ char **envv ); } 493 AUE_FSTATAT COMPAT11|CAPENABLED { int fstatat( int fd, _In_z_ const char *path, _Out_ _Contains_long_timet_ struct freebsd11_stat *buf, int flag ); } 494 AUE_FUTIMESAT STD|CAPENABLED { int futimesat( int fd, _In_z_ const char *path, _In_reads_(2) _Contains_long_timet_ const struct timeval *times ); } 495 AUE_LINKAT STD|CAPENABLED { int linkat( int fd1, _In_z_ const char *path1, int fd2, _In_z_ const char *path2, int flag ); } 496 AUE_MKDIRAT STD|CAPENABLED { int mkdirat( int fd, _In_z_ const char *path, mode_t mode ); } 497 AUE_MKFIFOAT STD|CAPENABLED { int mkfifoat( int fd, _In_z_ const char *path, mode_t mode ); } 498 AUE_MKNODAT COMPAT11|CAPENABLED { int mknodat( int fd, _In_z_ const char *path, mode_t mode, uint32_t dev ); } ; XXX: see the comment for open 499 AUE_OPENAT_RWTC STD|CAPENABLED { int openat( int fd, _In_z_ const char *path, int flag, mode_t mode ); } 500 AUE_READLINKAT STD|CAPENABLED { ssize_t readlinkat( int fd, _In_z_ const char *path, _Out_writes_bytes_(bufsize) char *buf, size_t bufsize ); } 501 AUE_RENAMEAT STD|CAPENABLED { int renameat( int oldfd, _In_z_ const char *old, int newfd, _In_z_ const char *new ); } 502 AUE_SYMLINKAT STD|CAPENABLED { int symlinkat( _In_z_ const char *path1, int fd, _In_z_ const char *path2 ); } 503 AUE_UNLINKAT STD|CAPENABLED { int unlinkat( int fd, _In_z_ const char *path, int flag ); } 504 AUE_POSIX_OPENPT STD { int posix_openpt( int flags ); } 505 AUE_NULL OBSOL kgssapi 506 AUE_JAIL_GET STD { int jail_get( _In_reads_(iovcnt) _Contains_long_ptr_ struct iovec *iovp, unsigned int iovcnt, int flags ); } 507 AUE_JAIL_SET STD { int jail_set( _In_reads_(iovcnt) _Contains_long_ptr_ struct iovec *iovp, unsigned int iovcnt, int flags ); } 508 AUE_JAIL_REMOVE STD { int jail_remove( int jid ); } 509 AUE_CLOSEFROM COMPAT12|CAPENABLED { int closefrom( int lowfd ); } 510 AUE_SEMCTL NOSTD { int __semctl( int semid, int semnum, int cmd, _Inout_ 
_Contains_ptr_ union semun *arg ); } 511 AUE_MSGCTL NOSTD { int msgctl( int msqid, int cmd, _Inout_opt_ _Contains_long_ptr_ struct msqid_ds *buf ); } 512 AUE_SHMCTL NOSTD { int shmctl( int shmid, int cmd, _Inout_opt_ _Contains_long_ struct shmid_ds *buf ); } 513 AUE_LPATHCONF STD { int lpathconf( _In_z_ const char *path, int name ); } 514 AUE_NULL OBSOL cap_new 515 AUE_CAP_RIGHTS_GET STD|CAPENABLED { int __cap_rights_get( int version, int fd, _Out_ cap_rights_t *rightsp ); } 516 AUE_CAP_ENTER STD|CAPENABLED { int cap_enter(void); } 517 AUE_CAP_GETMODE STD|CAPENABLED { int cap_getmode( _Out_ u_int *modep ); } 518 AUE_PDFORK STD|CAPENABLED { int pdfork( _Out_ int *fdp, int flags ); } 519 AUE_PDKILL STD|CAPENABLED { int pdkill( int fd, int signum ); } 520 AUE_PDGETPID STD|CAPENABLED { int pdgetpid( int fd, _Out_ pid_t *pidp ); } 521 AUE_NULL RESERVED 522 AUE_SELECT STD|CAPENABLED { int pselect( int nd, _Inout_opt_ fd_set *in, _Inout_opt_ fd_set *ou, _Inout_opt_ fd_set *ex, _In_opt_ _Contains_long_timet_ const struct timespec *ts, _In_opt_ const sigset_t *sm ); } 523 AUE_GETLOGINCLASS STD|CAPENABLED { int getloginclass( _Out_writes_z_(namelen) char *namebuf, size_t namelen ); } 524 AUE_SETLOGINCLASS STD { int setloginclass( _In_z_ const char *namebuf ); } 525 AUE_NULL STD { int rctl_get_racct( _In_reads_bytes_(inbuflen) const void *inbufp, size_t inbuflen, _Out_writes_bytes_(outbuflen) void *outbufp, size_t outbuflen ); } 526 AUE_NULL STD { int rctl_get_rules( _In_reads_bytes_(inbuflen) const void *inbufp, size_t inbuflen, _Out_writes_bytes_(outbuflen) void *outbufp, size_t outbuflen ); } 527 AUE_NULL STD { int rctl_get_limits( _In_reads_bytes_(inbuflen) const void *inbufp, size_t inbuflen, _Out_writes_bytes_(outbuflen) void *outbufp, size_t outbuflen ); } 528 AUE_NULL STD { int rctl_add_rule( _In_reads_bytes_(inbuflen) const void *inbufp, size_t inbuflen, _Out_writes_bytes_(outbuflen) void *outbufp, size_t outbuflen ); } 529 AUE_NULL STD { int rctl_remove_rule( _In_reads_bytes_(inbuflen) const void *inbufp, size_t inbuflen, _Out_writes_bytes_(outbuflen) void *outbufp, size_t outbuflen ); } 530 AUE_POSIX_FALLOCATE STD|CAPENABLED { int posix_fallocate( int fd, off_t offset, off_t len ); } 531 AUE_POSIX_FADVISE STD|CAPENABLED { int posix_fadvise( int fd, off_t offset, off_t len, int advice ); } 532 AUE_WAIT6 STD { int wait6( idtype_t idtype, id_t id, _Out_opt_ int *status, int options, _Out_opt_ _Contains_long_ struct __wrusage *wrusage, _Out_opt_ _Contains_long_ptr_ struct __siginfo *info ); } 533 AUE_CAP_RIGHTS_LIMIT STD|CAPENABLED { int cap_rights_limit( int fd, _In_ cap_rights_t *rightsp ); } 534 AUE_CAP_IOCTLS_LIMIT STD|CAPENABLED { int cap_ioctls_limit( int fd, _In_reads_(ncmds) const u_long *cmds, size_t ncmds ); } 535 AUE_CAP_IOCTLS_GET STD|CAPENABLED { ssize_t cap_ioctls_get( int fd, _Out_writes_(maxcmds) u_long *cmds, size_t maxcmds ); } 536 AUE_CAP_FCNTLS_LIMIT STD|CAPENABLED { int cap_fcntls_limit( int fd, uint32_t fcntlrights ); } 537 AUE_CAP_FCNTLS_GET STD|CAPENABLED { int cap_fcntls_get( int fd, _Out_ uint32_t *fcntlrightsp ); } 538 AUE_BINDAT STD|CAPENABLED { int bindat( int fd, int s, _In_reads_bytes_(namelen) const struct sockaddr *name, __socklen_t namelen ); } 539 AUE_CONNECTAT STD|CAPENABLED { int connectat( int fd, int s, _In_reads_bytes_(namelen) const struct sockaddr *name, __socklen_t namelen ); } 540 AUE_CHFLAGSAT STD|CAPENABLED { int chflagsat( int fd, _In_z_ const char *path, u_long flags, int atflag ); } 541 AUE_ACCEPT STD|CAPENABLED { int accept4( int s, 
_Out_writes_bytes_opt_(*anamelen) struct sockaddr *name, _Inout_opt_ __socklen_t *anamelen, int flags ); } 542 AUE_PIPE STD|CAPENABLED { int pipe2( _Out_writes_(2) int *fildes, int flags ); } 543 AUE_AIO_MLOCK STD { int aio_mlock( _In_ _Contains_long_ptr_ struct aiocb *aiocbp ); } 544 AUE_PROCCTL STD { int procctl( idtype_t idtype, id_t id, int com, _In_opt_ void *data ); } 545 AUE_POLL STD|CAPENABLED { int ppoll( _Inout_updates_(nfds) struct pollfd *fds, u_int nfds, _In_opt_ _Contains_long_timet_ const struct timespec *ts, _In_opt_ const sigset_t *set ); } 546 AUE_FUTIMES STD|CAPENABLED { int futimens( int fd, _In_reads_(2) _Contains_long_timet_ const struct timespec *times ); } 547 AUE_FUTIMESAT STD|CAPENABLED { int utimensat( int fd, _In_z_ const char *path, _In_reads_(2) _Contains_long_timet_ const struct timespec *times, int flag ); } 548 AUE_NULL OBSOL numa_getaffinity 549 AUE_NULL OBSOL numa_setaffinity 550 AUE_FSYNC STD|CAPENABLED { int fdatasync( int fd ); } 551 AUE_FSTAT STD|CAPENABLED { int fstat( int fd, _Out_ _Contains_long_timet_ struct stat *sb ); } 552 AUE_FSTATAT STD|CAPENABLED { int fstatat( int fd, _In_z_ const char *path, _Out_ _Contains_long_timet_ struct stat *buf, int flag ); } 553 AUE_FHSTAT STD { int fhstat( _In_ const struct fhandle *u_fhp, _Out_ _Contains_long_timet_ struct stat *sb ); } 554 AUE_GETDIRENTRIES STD|CAPENABLED { ssize_t getdirentries( int fd, _Out_writes_bytes_(count) char *buf, size_t count, _Out_opt_ off_t *basep ); } 555 AUE_STATFS STD { int statfs( _In_z_ const char *path, _Out_ struct statfs *buf ); } 556 AUE_FSTATFS STD|CAPENABLED { int fstatfs( int fd, _Out_ struct statfs *buf ); } 557 AUE_GETFSSTAT STD { int getfsstat( _Out_writes_bytes_opt_(bufsize) struct statfs *buf, long bufsize, int mode ); } 558 AUE_FHSTATFS STD { int fhstatfs( _In_ const struct fhandle *u_fhp, _Out_ struct statfs *buf ); } 559 AUE_MKNODAT STD|CAPENABLED { int mknodat( int fd, _In_z_ const char *path, mode_t mode, dev_t dev ); } 560 AUE_KEVENT STD|CAPENABLED { int kevent( int fd, _In_reads_opt_(nchanges) _Contains_ptr_ const struct kevent *changelist, int nchanges, _Out_writes_opt_(nevents) _Contains_ptr_ struct kevent *eventlist, int nevents, _In_opt_ _Contains_long_timet_ const struct timespec *timeout ); } 561 AUE_NULL STD|CAPENABLED { int cpuset_getdomain( cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, _Out_writes_bytes_(domainsetsize) domainset_t *mask, _Out_ int *policy ); } 562 AUE_NULL STD|CAPENABLED { int cpuset_setdomain( cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, _In_ domainset_t *mask, int policy ); } 563 AUE_NULL STD|CAPENABLED { int getrandom( _Out_writes_bytes_(buflen) void *buf, size_t buflen, unsigned int flags ); } 564 AUE_NULL STD { int getfhat( int fd, _In_z_ char *path, _Out_ struct fhandle *fhp, int flags ); } 565 AUE_NULL STD { int fhlink( _In_ struct fhandle *fhp, _In_z_ const char *to ); } 566 AUE_NULL STD { int fhlinkat( _In_ struct fhandle *fhp, int tofd, _In_z_ const char *to, ); } 567 AUE_NULL STD { int fhreadlink( _In_ struct fhandle *fhp, _Out_writes_(bufsize) char *buf, size_t bufsize ); } 568 AUE_UNLINKAT STD|CAPENABLED { int funlinkat( int dfd, _In_z_ const char *path, int fd, int flag ); } 569 AUE_NULL STD|CAPENABLED { ssize_t copy_file_range( int infd, _Inout_opt_ off_t *inoffp, int outfd, _Inout_opt_ off_t *outoffp, size_t len, unsigned int flags ); } 570 AUE_SYSCTL STD|CAPENABLED { int __sysctlbyname( _In_reads_(namelen) const char *name, size_t namelen, _Out_writes_bytes_opt_(*oldlenp) 
		    void *old, _Inout_opt_ size_t *oldlenp, _In_reads_bytes_opt_(newlen) void *new, size_t newlen); }
571	AUE_SHMOPEN	STD|CAPENABLED	{ int shm_open2(_In_z_ const char *path, int flags, mode_t mode, int shmflags, _In_z_ const char *name); }
572	AUE_SHMRENAME	STD	{ int shm_rename(_In_z_ const char *path_from, _In_z_ const char *path_to, int flags); }
573	AUE_NULL	STD|CAPENABLED	{ int sigfastblock(int cmd, _Inout_updates_bytes_opt_(4) void *ptr); }
574	AUE_REALPATHAT	STD	{ int __realpathat(int fd, _In_z_ const char *path, _Out_writes_z_(size) char *buf, size_t size, int flags); }
575	AUE_CLOSERANGE	STD|CAPENABLED	{ int close_range(u_int lowfd, u_int highfd, int flags); }
; 576 is initialised by the krpc code, if present.
576	AUE_NULL	NOSTD	{ int rpctls_syscall(uint64_t socookie); }
577	AUE_SPECIALFD	STD|CAPENABLED	{ int __specialfd(int type, _In_reads_bytes_(len) const void *req, size_t len); }
578	AUE_AIO_WRITEV	STD|CAPENABLED	{ int aio_writev(_Inout_ _Contains_long_ptr_ struct aiocb *aiocbp); }
579	AUE_AIO_READV	STD|CAPENABLED	{ int aio_readv(_Inout_ _Contains_long_ptr_ struct aiocb *aiocbp); }
580	AUE_FSPACECTL	STD|CAPENABLED	{ int fspacectl(int fd, int cmd, _In_ const struct spacectl_range *rqsr, int flags, _Out_opt_ struct spacectl_range *rmsr); }
581	AUE_NULL	STD|CAPENABLED	{ int sched_getcpu(void); }
582	AUE_SWAPOFF	STD	{ int swapoff(_In_z_ const char *name, u_int flags); }
583	AUE_KQUEUE	STD|CAPENABLED	{ int kqueuex(u_int flags); }
584	AUE_NULL	STD|CAPENABLED	{ int membarrier(int cmd, unsigned flags, int cpu_id); }
585	AUE_TIMERFD	STD|CAPENABLED	{ int timerfd_create(int clockid, int flags); }
586	AUE_TIMERFD	STD|CAPENABLED	{ int timerfd_gettime(int fd, _Out_ _Contains_long_timet_ struct itimerspec *curr_value); }
587	AUE_TIMERFD	STD|CAPENABLED	{ int timerfd_settime(int fd, int flags, _In_ _Contains_long_timet_ const struct itimerspec *new_value, _Out_opt_ _Contains_long_timet_ struct itimerspec *old_value); }
588	AUE_NULL	STD	{ int kcmp(pid_t pid1, pid_t pid2, int type, uintptr_t idx1, uintptr_t idx2); }
589	AUE_NULL	STD|CAPENABLED	{ int getrlimitusage(u_int which, int flags, _Out_ rlim_t *res); }
590	AUE_NULL	STD	{ int fchroot(int fd); }
591	AUE_SETCRED	STD|CAPENABLED	{ int setcred(u_int flags, _In_reads_bytes_(size) _Contains_ptr_ const struct setcred *wcred, size_t size); }
-
+592	AUE_NULL	STD {
+		int exterrctl(
+			u_int op,
+			u_int flags,
+			_In_reads_bytes_(4) void *ptr
+		);
+	}
; vim: syntax=off
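Read together with the userspace declarations in the sys/exterrvar.h hunk below, the new entry suggests a two-step contract: a thread registers a per-thread buffer whose leading 32-bit ver word the kernel validates at registration time (which is what the _In_reads_bytes_(4) annotation captures, given that uint32_t ver is the first member of struct uexterror), and later reads extended error details back from that buffer. The following is a minimal illustrative sketch, not part of the patch; the assumption that the kernel fills the buffer after a failing syscall follows from the struct uexterror layout but is not shown in these hunks.

#include <sys/types.h>
#include <sys/exterrvar.h>

#include <stdio.h>

/* Static so the registered buffer stays valid for the thread's lifetime. */
static struct uexterror ue;

int
main(void)
{
	ue.ver = UEXTERROR_VER;		/* version word the kernel checks */
	if (exterrctl(EXTERRCTL_ENABLE, 0, &ue) != 0) {
		perror("exterrctl");
		return (1);
	}
	/* ... a syscall fails here; the kernel would fill 'ue' ... */
	printf("cat %u, error %u, src line %u, msg \"%s\"\n",
	    ue.cat, ue.error, ue.src_line, ue.msg);
	return (0);
}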
diff --git a/sys/sys/exterrvar.h b/sys/sys/exterrvar.h
index 6e392ff2c18c..5afcd82b136a 100644
--- a/sys/sys/exterrvar.h
+++ b/sys/sys/exterrvar.h
@@ -1,58 +1,71 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2025 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 */

#ifndef _SYS_EXTERRVAR_H_
#define _SYS_EXTERRVAR_H_

#include
#include
#include

struct uexterror {
	uint32_t ver;
	uint32_t error;
	uint32_t cat;
	uint32_t src_line;
	uint64_t p1;
	uint64_t p2;
	uint64_t rsrv1[4];
	char msg[128];
};

+#define	UEXTERROR_VER	0x10010001
+
+#define	EXTERRCTL_ENABLE	1
+#define	EXTERRCTL_DISABLE	2
+
+#define	EXTERRCTLF_FORCE	0x00000001
+
#ifdef _KERNEL

#ifndef EXTERR_CATEGORY
#error "Specify error category before including sys/exterrvar.h"
#endif

#ifdef BLOW_KERNEL_WITH_EXTERR
#define	SET_ERROR_MSG(mmsg)	_Td->td_kexterr.msg = mmsg
#else
#define	SET_ERROR_MSG(mmsg)	_Td->td_kexterr.msg = NULL
#endif

#define	SET_ERROR2(eerror, mmsg, pp1, pp2) do {			\
	struct thread *_Td = curthread;				\
	if ((_Td->td_pflags2 & TDP2_UEXTERR) != 0) {		\
		_Td->td_pflags2 |= TDP2_EXTERR;			\
		_Td->td_kexterr.error = eerror;			\
		_Td->td_kexterr.cat = EXTERR_CATEGORY;		\
		SET_ERROR_MSG(mmsg);				\
		_Td->td_kexterr.p1 = (uintptr_t)pp1;		\
		_Td->td_kexterr.p2 = (uintptr_t)pp2;		\
		_Td->td_kexterr.src_line = __LINE__;		\
	}							\
} while (0)
#define	SET_ERROR0(eerror, mmsg)	SET_ERROR2(eerror, mmsg, 0, 0)
#define	SET_ERROR1(eerror, mmsg, pp1)	SET_ERROR2(eerror, mmsg, pp1, 0)

+#else /* _KERNEL */
+
+__BEGIN_DECLS
+int exterrctl(u_int op, u_int flags, void *ptr);
+__END_DECLS
+
#endif /* _KERNEL */
#endif
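On the kernel side, the SET_ERROR* macros above are the producer half: they record error details into td_kexterr only when the thread has opted in via TDP2_UEXTERR, i.e. after a successful exterrctl(EXTERRCTL_ENABLE, ...). A minimal consumer sketch, not from the patch; the category value and the surrounding function are invented here for illustration, since the real named category constants are not shown in this hunk.

/* Editor's illustration only; "1" is an invented placeholder category. */
#define	EXTERR_CATEGORY	1	/* must be defined before the include */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/exterrvar.h>

static int
example_validate(size_t len)
{
	if (len > IOSIZE_MAX) {
		/* No-op unless curthread set TDP2_UEXTERR. */
		SET_ERROR1(EINVAL, "request length too large", len);
		return (EINVAL);
	}
	return (0);
}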
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index cab487719c31..b48681420028 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -1,1350 +1,1352 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _SYS_PROC_H_
#define	_SYS_PROC_H_

#include		/* For struct callout. */
#include		/* For struct klist. */
#ifdef _KERNEL
#include
#endif
#include
#include
#ifndef _KERNEL
#include
#endif
#include
#include
#include
#include
#include
#include
#include		/* XXX. */
#include
#include
#include
#include
#include
#ifndef _KERNEL
#include		/* For structs itimerval, timeval. */
#else
#include
#include
#endif
#include
#include
#include
#include
#include		/* Machine-dependent proc substruct. */
#ifdef _KERNEL
#include
#endif

/*
 * One structure allocated per session.
 *
 * List of locks
 * (m)		locked by s_mtx mtx
 * (e)		locked by proctree_lock sx
 * (c)		const until freeing
 */
struct session {
	u_int	s_count;	/* Ref cnt; pgrps in session - atomic. */
	struct proc *s_leader;	/* (m + e) Session leader. */
	struct vnode *s_ttyvp;	/* (m) Vnode of controlling tty. */
	struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */
	struct tty *s_ttyp;	/* (e) Controlling tty. */
	pid_t	s_sid;		/* (c) Session ID. */
	/* (m) Setlogin() name: */
	char	s_login[roundup(MAXLOGNAME, sizeof(long))];
	struct mtx s_mtx;	/* Mutex to protect members. */
};

/*
 * One structure allocated per process group.
 *
 * List of locks
 * (m)		locked by pg_mtx mtx
 * (e)		locked by proctree_lock sx
 * (c)		const until freeing
 */
struct pgrp {
	LIST_ENTRY(pgrp) pg_hash;	/* (e) Hash chain. */
	LIST_HEAD(, proc) pg_members;	/* (m + e) Pointer to pgrp members. */
	struct session *pg_session;	/* (c) Pointer to session. */
	struct sigiolst pg_sigiolst;	/* (m) List of sigio sources. */
	pid_t	pg_id;			/* (c) Process group id. */
	struct mtx pg_mtx;		/* Mutex to protect members */
	int	pg_flags;		/* (m) PGRP_ flags */
	struct sx pg_killsx;		/* Mutual exclusion between group member
					 * fork() and killpg() */
};

#define	PGRP_ORPHANED	0x00000001	/* Group is orphaned */

/*
 * pargs, used to hold a copy of the command line, if it had a sane length.
 */
struct pargs {
	u_int	ar_ref;		/* Reference count. */
	u_int	ar_length;	/* Length. */
	u_char	ar_args[1];	/* Arguments. */
};

/*-
 * Description of a process.
 *
 * This structure contains the information needed to manage a thread of
 * control, known in UN*X as a process; it has references to substructures
 * containing descriptions of things that the process uses, but may share
 * with related processes.  The process structure and the substructures
 * are always addressable except for those marked "(CPU)" below,
 * which might be addressable only on a processor on which the process
 * is running.
 *
 * Below is a key of locks used to protect each member of struct proc.  The
 * lock is indicated by a reference to a specific character in parens in the
 * associated comment.
* * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * kx- only accessed by curthread and by debugger * l - the attaching proc or attaching proc parent * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * s - see sleepq_switch(), sleeping_on_old_rtc(), and sleep(9) * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kcov_info; struct kdtrace_proc; struct kdtrace_thread; struct kmsan_td; struct kq_timer_cb_data; struct mqueue_notifier; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct socket; struct td_sched; struct thread; struct trapframe; struct turnstile; struct vm_map; struct vm_map_entry; struct epoch_tracker; struct syscall_args { u_int code; u_int original_code; struct sysent *callp; register_t args[8]; }; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ union { TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ struct thread *td_zombie; /* Zombie list linkage */ }; TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct domainset_ref td_domain; /* (a) NUMA policy */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. 
*/ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ void *td_pad1; /* Available */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ u_char td_allocdomain; /* (b) NUMA domain backing this struct thread. */ u_char td_base_ithread_pri; /* (t) Base ithread pri */ struct kmsan_td *td_kmsan; /* (k) KMSAN state */ /* Cleared during fork1(), thread_create(), or kthread_add(). */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_ast; /* (t) TDA_* indicators */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_pflags2; /* (k) Private thread (TDP2_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ const void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ u_char _td_pad0[2]; /* Available. */ int td_locks; /* (k) Debug: count of non-spin locks */ int td_rw_rlocks; /* (k) Count of rwlock read locks. */ int td_sx_slocks; /* (k) Count of sx shared locks. */ int td_lk_slocks; /* (k) Count of lockmgr shared locks. */ struct lock_object *td_wantedlock; /* (k) Lock we are contending on */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_realucred; /* (k) Reference to credentials. */ struct ucred *td_ucred; /* (k) Used credentials, temporarily switchable. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ siginfo_t td_si; /* (c) For debugger or core file */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. 
 */
	struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */
	pid_t	td_dbg_forked;	/* (c) Child pid for debugger. */
	u_int	td_no_sleeping;	/* (k) Sleeping disabled count. */
	struct vnode *td_vp_reserved; /* (k) Preallocated vnode. */
	void	*td_su;		/* (k) FFS SU private */
	sbintime_t td_sleeptimo; /* (t) Sleep timeout. */
	int	td_rtcgen;	/* (s) rtc_generation of abs. sleep */
	int	td_errno;	/* (k) Error from last syscall. */
	size_t	td_vslock_sz;	/* (k) amount of vslock-ed space */
	struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */
	long	td_ucredref;	/* (k) references on td_realucred */
	struct kexterr td_kexterr;
#define	td_endzero td_sigmask

/* Copied during fork1(), thread_create(), or kthread_add(). */
#define	td_startcopy td_endzero
	sigset_t td_sigmask;	/* (c) Current signal mask. */
	u_char	td_rqindex;	/* (t) Run queue index. */
	u_char	td_base_pri;	/* (t) Thread base kernel priority. */
	u_char	td_priority;	/* (t) Thread active priority. */
	u_char	td_pri_class;	/* (t) Scheduling class. */
	u_char	td_user_pri;	/* (t) User pri from estcpu and nice. */
	u_char	td_base_user_pri; /* (t) Base user pri */
	uintptr_t td_rb_list;	/* (k) Robust list head. */
	uintptr_t td_rbp_list;	/* (k) Robust priv list head. */
	uintptr_t td_rb_inact;	/* (k) Current in-action mutex loc. */
	struct syscall_args td_sa; /* (kx) Syscall parameters. Copied on
				      fork for child tracing. */
	void	*td_sigblock_ptr; /* (k) uptr for fast sigblock. */
	uint32_t td_sigblock_val; /* (k) fast sigblock value read at
				     td_sigblock_ptr on kern entry */
+	void	*td_exterr_ptr;
#define	td_endcopy td_pcb

/*
 * Fields that must be manually set in fork1(), thread_create(), kthread_add(),
 * or already have been set in the allocator, constructor, etc.
 */
	struct pcb *td_pcb;	/* (k) Kernel VA of pcb and kstack. */
	enum td_states {
		TDS_INACTIVE = 0x0,
		TDS_INHIBITED,
		TDS_CAN_RUN,
		TDS_RUNQ,
		TDS_RUNNING
	} td_state;		/* (t) thread state */
	/* Note: td_state must be accessed using TD_{GET,SET}_STATE(). */
	union {
		syscallarg_t tdu_retval[2];
		off_t	tdu_off;
	} td_uretoff;		/* (k) Syscall aux returns. */
#define	td_retval td_uretoff.tdu_retval
	u_int	td_cowgen;	/* (k) Generation of COW pointers. */
	/* LP64 hole */
	struct callout td_slpcallout; /* (h) Callout for sleep. */
	struct trapframe *td_frame; /* (k) */
	vm_offset_t td_kstack;	/* (a) Kernel VA of kstack. */
	u_short	td_kstack_pages; /* (a) Size of the kstack. */
	u_short	td_kstack_domain; /* (a) Domain backing kstack KVA. */
	volatile u_int td_critnest; /* (k*) Critical section nest level. */
	struct mdthread td_md;	/* (k) Any machine-dependent fields. */
	struct kaudit_record *td_ar; /* (k) Active audit record, if any. */
	struct lpohead td_lprof[2]; /* (a) lock profiling objects. */
	struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */
	struct vnet *td_vnet;	/* (k) Effective vnet. */
	const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */
	struct trapframe *td_intr_frame; /* (k) Frame of the current irq */
	struct proc *td_rfppwait_p; /* (k) The vforked child */
	struct vm_page **td_ma;	/* (k) uio pages held */
	int	td_ma_cnt;	/* (k) size of *td_ma */
	/* LP64 hole */
	void	*td_emuldata;	/* Emulator state data */
	int	td_lastcpu;	/* (t) Last cpu we were on. */
	int	td_oncpu;	/* (t) Which cpu we are on. */
	void	*td_lkpi_task;	/* LinuxKPI task struct pointer */
	int	td_pmcpend;
	void	*td_remotereq;	/* (c) dbg remote request.
*/ off_t td_ktr_io_lim; /* (k) limit for ktrace file size */ #ifdef EPOCH_TRACE SLIST_HEAD(, epoch_tracker) td_epochs; #endif }; struct thread0_storage { struct thread t0st_thread; uint64_t t0st_sched[10]; }; struct mtx *thread_lock_block(struct thread *); void thread_lock_block_wait(struct thread *); void thread_lock_set(struct thread *, struct mtx *); void thread_lock_unblock(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ mtx_assert((td)->td_lock, (type)) #define THREAD_LOCK_BLOCKED_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m; \ __m = (td)->td_lock; \ KASSERT(__m == (lock), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define THREAD_LOCKPTR_BLOCKED_ASSERT(td, lock) \ do { \ struct mtx *__m; \ __m = (td)->td_lock; \ KASSERT(__m == (lock) || __m == &blocked_lock, \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) do { \ KASSERT(SCHEDULER_STOPPED() || (td)->td_locks > 0, \ ("Thread %p owns no locks", (td))); \ (td)->td_locks--; \ } while (0) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define THREAD_LOCKPTR_BLOCKED_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_UNUSED11 0x00000040 /* Available */ #define TDF_SIGWAIT 0x00000080 /* Ignore ignored signals */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_UNUSED1 0x00000800 /* Available */ #define TDF_UNUSED2 0x00001000 /* Available */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_UNUSED3 0x00008000 /* Available */ #define TDF_UNUSED4 0x00010000 /* Available */ #define TDF_UNUSED5 0x00020000 /* Available */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_SERESTART 0x00080000 /* ERESTART on stop attempts. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. 
*/ #define TDF_UNUSED12 0x00400000 /* Available */ #define TDF_UNUSED6 0x00800000 /* Available */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_UNUSED7 0x10000000 /* Available */ #define TDF_UNUSED8 0x20000000 /* Available */ #define TDF_UNUSED9 0x40000000 /* Available */ #define TDF_UNUSED10 0x80000000 /* Available */ enum { TDA_AST = 0, /* Special: call all non-flagged AST handlers */ TDA_OWEUPC, TDA_HWPMC, TDA_VFORK, TDA_ALRM, TDA_PROF, TDA_MAC, TDA_SCHED, TDA_UFS, TDA_GEOM, TDA_KQUEUE, TDA_RACCT, TDA_MOD1, /* For third party use, before signals are */ TDA_MOD2, /* processed .. */ TDA_PSELECT, /* For discarding temporary signal mask */ TDA_SIG, TDA_KTRACE, TDA_SUSPEND, TDA_SIGSUSPEND, TDA_MOD3, /* .. and after */ TDA_MOD4, TDA_MAX, }; #define TDAI(tda) (1U << (tda)) #define td_ast_pending(td, tda) ((td->td_ast & TDAI(tda)) != 0) /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ #define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */ #define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */ #define TDB_STEP 0x00002000 /* (x86) PSL_T set for PT_STEP */ #define TDB_SSWITCH 0x00004000 /* Suspended in ptracestop */ #define TDB_BOUNDARY 0x00008000 /* ptracestop() at boundary */ #define TDB_COREDUMPREQ 0x00010000 /* Coredump request */ #define TDB_SCREMOTEREQ 0x00020000 /* Remote syscall request */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_SIGFASTBLOCK 0x00000100 /* Fast sigblock active */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. 
 */
#define	TDP_SCHED1	0x00001000 /* Reserved for scheduler private use */
#define	TDP_SCHED2	0x00002000 /* Reserved for scheduler private use */
#define	TDP_SCHED3	0x00004000 /* Reserved for scheduler private use */
#define	TDP_SCHED4	0x00008000 /* Reserved for scheduler private use */
#define	TDP_GEOM	0x00010000 /* Settle GEOM before finishing syscall */
#define	TDP_SOFTDEP	0x00020000 /* Stuck processing softdep worklist */
#define	TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */
#define	TDP_WAKEUP	0x00080000 /* Don't sleep in umtx cond_wait */
#define	TDP_INBDFLUSH	0x00100000 /* Already in BO_BDFLUSH, do not recurse */
#define	TDP_KTHREAD	0x00200000 /* This is an official kernel thread */
#define	TDP_CALLCHAIN	0x00400000 /* Capture thread's callchain */
#define	TDP_IGNSUSP	0x00800000 /* Permission to ignore the MNTK_SUSPEND* */
#define	TDP_AUDITREC	0x01000000 /* Audit record pending on thread */
#define	TDP_RFPPWAIT	0x02000000 /* Handle RFPPWAIT on syscall exit */
#define	TDP_RESETSPUR	0x04000000 /* Reset spurious page fault history. */
#define	TDP_NERRNO	0x08000000 /* Last errno is already in td_errno */
#define	TDP_UIOHELD	0x10000000 /* Current uio has pages held in td_ma */
#define	TDP_EFIRT	0x20000000 /* In firmware (EFI RT) call */
#define	TDP_EXECVMSPC	0x40000000 /* Execve destroyed old vmspace */
#define	TDP_SIGFASTPENDING 0x80000000 /* Pending signal due to sigfastblock */

#define	TDP2_SBPAGES	0x00000001 /* Owns sbusy on some pages */
#define	TDP2_COMPAT32RB	0x00000002 /* compat32 ABI for robust lists */
#define	TDP2_ACCT	0x00000004 /* Doing accounting */
#define	TDP2_SAN_QUIET	0x00000008 /* Disable warnings from K(A|M)SAN */
#define	TDP2_EXTERR	0x00000010 /* Kernel reported ext error */
+#define	TDP2_UEXTERR	0x00000020 /* User set ext error reporting ptr */

/*
 * Reasons that the current thread can not be run yet.
 * More than one may apply.
 */
#define	TDI_SUSPENDED	0x0001	/* On suspension queue. */
#define	TDI_SLEEPING	0x0002	/* Actually asleep! (tricky). */
#define	TDI_LOCK	0x0008	/* Stopped on a lock. */
#define	TDI_IWAIT	0x0010	/* Awaiting interrupt. */

#define	TD_IS_SLEEPING(td)	((td)->td_inhibitors & TDI_SLEEPING)
#define	TD_ON_SLEEPQ(td)	((td)->td_wchan != NULL)
#define	TD_IS_SUSPENDED(td)	((td)->td_inhibitors & TDI_SUSPENDED)
#define	TD_ON_LOCK(td)		((td)->td_inhibitors & TDI_LOCK)
#define	TD_AWAITING_INTR(td)	((td)->td_inhibitors & TDI_IWAIT)
#ifdef _KERNEL
#define	TD_GET_STATE(td)	atomic_load_int(&(td)->td_state)
#else
#define	TD_GET_STATE(td)	((td)->td_state)
#endif
#define	TD_IS_RUNNING(td)	(TD_GET_STATE(td) == TDS_RUNNING)
#define	TD_ON_RUNQ(td)		(TD_GET_STATE(td) == TDS_RUNQ)
#define	TD_CAN_RUN(td)		(TD_GET_STATE(td) == TDS_CAN_RUN)
#define	TD_IS_INHIBITED(td)	(TD_GET_STATE(td) == TDS_INHIBITED)
#define	TD_ON_UPILOCK(td)	((td)->td_flags & TDF_UPIBLOCKED)
#define	TD_IS_IDLETHREAD(td)	((td)->td_flags & TDF_IDLETD)
#define	TD_CAN_ABORT(td)	(TD_ON_SLEEPQ((td)) &&		\
				    ((td)->td_flags & TDF_SINTR) != 0)
#define	KTDSTATE(td)						\
	(((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" :	\
	((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \
	((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" :	\
	((td)->td_inhibitors & TDI_IWAIT) != 0 ?
"iwait" : "yielding") #define TD_SET_INHIB(td, inhib) do { \ TD_SET_STATE(td, TDS_INHIBITED); \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ TD_SET_STATE(td, TDS_CAN_RUN); \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #ifdef _KERNEL #define TD_SET_STATE(td, state) atomic_store_int(&(td)->td_state, state) #else #define TD_SET_STATE(td, state) (td)->td_state = state #endif #define TD_SET_RUNNING(td) TD_SET_STATE(td, TDS_RUNNING) #define TD_SET_RUNQ(td) TD_SET_STATE(td, TDS_RUNQ) #define TD_SET_CAN_RUN(td) TD_SET_STATE(td, TDS_CAN_RUN) #define TD_SBDRY_INTR(td) \ (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0) #define TD_SBDRY_ERRNO(td) \ (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART) /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pwddesc *p_pd; /* (b) Cwd, chroot, jail, umask */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum p_states { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals pid_t p_oppid; /* (c + e) Real parent pid. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_vmspace struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. 
*/ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct ktr_io_params *p_ktrioparms; /* (c + o) Params for ktrace. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ struct vnode *p_textdvp; /* (b) Dir containing textvp. */ char *p_binname; /* (b) Binary hardlink name. */ u_int p_lock; /* (c) Prevent exit. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_int p_ptevents; /* (c + e) ptrace() event mask. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* (c) how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ int p_pdeathsig; /* (c) Signal from parent on exit. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ uint32_t p_fctl0; /* (x) ABI feature control, ELF note */ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. */ uint64_t p_elf_flags; /* (x) ELF flags */ void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for non ELF binaries. */ sbintime_t p_umtx_min_timeout; /* End area that is copied on creation. */ #define p_endcopy p_xexit u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. 
*/ int p_throttled; /* (c) Flag for racct pcpu throttling */ /* * An orphan is a child that has been re-parented to the * debugger as a result of attaching to it. We need to keep * track of orphans so that the parent can collect the exit * status of what used to be its children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ TAILQ_HEAD(, kq_timer_cb_data) p_kqtim_stop; /* (c) */ LIST_ENTRY(proc) p_jaillist; /* (d) Jail process linkage. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. */ #define NOCPU_OLD (255) #define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) #define PROC_STATLOCK(p) mtx_lock_spin(&(p)->p_statmtx) #define PROC_STATUNLOCK(p) mtx_unlock_spin(&(p)->p_statmtx) #define PROC_STATLOCK_ASSERT(p, type) mtx_assert(&(p)->p_statmtx, (type)) #define PROC_ITIMLOCK(p) mtx_lock_spin(&(p)->p_itimmtx) #define PROC_ITIMUNLOCK(p) mtx_unlock_spin(&(p)->p_itimmtx) #define PROC_ITIMLOCK_ASSERT(p, type) mtx_assert(&(p)->p_itimmtx, (type)) #define PROC_PROFLOCK(p) mtx_lock_spin(&(p)->p_profmtx) #define PROC_PROFUNLOCK(p) mtx_unlock_spin(&(p)->p_profmtx) #define PROC_PROFLOCK_ASSERT(p, type) mtx_assert(&(p)->p_profmtx, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00000001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00000002 /* Has a controlling terminal. */ #define P_KPROC 0x00000004 /* Kernel process. */ #define P_IDLEPROC 0x00000008 /* Container for system idle threads. */ #define P_PPWAIT 0x00000010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00000020 /* Has started profiling. */ #define P_STOPPROF 0x00000040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00000080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00000100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00000200 /* System proc: no sigs or stats. */ #define P_SINGLE_EXIT 0x00000400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00000800 /* Debugged process being traced. */ #define P_WAITED 0x00001000 /* Someone is waiting for us. */ #define P_WEXIT 0x00002000 /* Working on exiting. */ #define P_EXEC 0x00004000 /* Process called exec. */ #define P_WKILLED 0x00008000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x00010000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x00020000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x00040000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x00080000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x00100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x00200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x00400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x00800000 /* Process is using HWPMCs */ #define P_JAILED 0x01000000 /* Process is in jail. */ #define P_TOTAL_STOP 0x02000000 /* Stopped in stop_all_proc. */ #define P_INEXEC 0x04000000 /* Process is in execve(). */ #define P_STATCHILD 0x08000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory, always set.
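* (nothing clears it any more; retained for compatibility)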
*/ #define P_UNUSED1 0x20000000 /* --available-- */ #define P_UNUSED2 0x40000000 /* --available-- */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOTRACE on exec(2). */ #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_PTRACE_FSTP 0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */ #define P2_TRAPCAP 0x00000020 /* SIGTRAP on ENOTCAPABLE */ #define P2_ASLR_ENABLE 0x00000040 /* Force enable ASLR. */ #define P2_ASLR_DISABLE 0x00000080 /* Force disable ASLR. */ #define P2_ASLR_IGNSTART 0x00000100 /* Enable ASLR to consume sbrk area. */ #define P2_PROTMAX_ENABLE 0x00000200 /* Force enable implied PROT_MAX. */ #define P2_PROTMAX_DISABLE 0x00000400 /* Force disable implied PROT_MAX. */ #define P2_STKGAP_DISABLE 0x00000800 /* Disable stack gap for MAP_STACK */ #define P2_STKGAP_DISABLE_EXEC 0x00001000 /* Stack gap disabled after exec */ #define P2_ITSTOPPED 0x00002000 /* itimers stopped */ #define P2_PTRACEREQ 0x00004000 /* Active ptrace req */ #define P2_NO_NEW_PRIVS 0x00008000 /* Ignore setuid */ #define P2_WXORX_DISABLE 0x00010000 /* WX mappings enabled */ #define P2_WXORX_ENABLE_EXEC 0x00020000 /* WXORX enabled after exec */ #define P2_WEXIT 0x00040000 /* exit just started, no external thread_single() is permitted */ #define P2_REAPKILLED 0x00080000 /* REAP_KILL pass touched me */ #define P2_MEMBAR_PRIVE 0x00100000 /* membar private expedited registered */ #define P2_MEMBAR_PRIVE_SYNCORE 0x00200000 /* membar private expedited sync core registered */ #define P2_MEMBAR_GLOBE 0x00400000 /* membar global expedited registered */ #define P2_LOGSIGEXIT_ENABLE 0x00800000 /* Enable logging on sigexit */ #define P2_LOGSIGEXIT_CTL 0x01000000 /* Override kern.logsigexit */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ #define P_TREE_REAPER 0x00000004 /* Reaper of subtree */ #define P_TREE_GRPEXITED 0x00000008 /* exit1() done with job ctl */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(9). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_OWEPREEMPT 1 /* Switching due to owepreempt. */ #define SWT_TURNSTILE 2 /* Turnstile contention. */ #define SWT_SLEEPQ 3 /* Sleepq wait. */ #define SWT_RELINQUISH 4 /* yield call. */ #define SWT_NEEDRESCHED 5 /* NEEDRESCHED was set. */ #define SWT_IDLE 6 /* Switching from the idle thread. */ #define SWT_IWAIT 7 /* Waiting for interrupts. */ #define SWT_SUSPEND 8 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 9 /* Remote processor preempted.
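* For example (illustrative only, not from the original header), a handler reacting to a remote-preemption request would context switch with mi_switch(SW_INVOL | SW_PREEMPT | SWT_REMOTEPREEMPT).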
*/ #define SWT_REMOTEWAKEIDLE 10 /* Remote processor preempted idle. */ #define SWT_BIND 11 /* Thread bound to a new CPU. */ #define SWT_COUNT 12 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* Values for the 'how' argument of thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #define SINGLE_ALLPROC 3 #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID (PID_MAX + 1) #define THREAD0_TID NO_PID extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_WAIT_UNLOCKED(p) mtx_wait_unlocked(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* * A non-zero p_lock prevents the process from exiting; the exiting process * sleeps in exit1() until the count reaches zero. * * PHOLD() asserts that the process (other than the current process) is * not exiting and increments p_lock. * _PHOLD() is the same as PHOLD(), but the caller must already hold the * process lock. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process %p not held", p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process %p held", p)); \ } while (0) #define PROC_UPDATE_COW(p) do { \ struct proc *_p = (p); \ PROC_LOCK_ASSERT((_p), MA_OWNED); \ atomic_store_int(&_p->p_cowgen, _p->p_cowgen + 1); \ } while (0) #define PROC_COW_CHANGECOUNT(td, p) ({ \ struct thread *_td = (td); \ struct proc *_p = (p); \ MPASS(_td == curthread); \ PROC_LOCK_ASSERT(_p, MA_OWNED); \ _p->p_cowgen - _td->td_cowgen; \ }) /* Control whether or not it is safe for curthread to sleep.
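* * Illustrative sketch (not part of the original header; compute_nosleep() is a hypothetical helper): * THREAD_NO_SLEEPING(); * v = compute_nosleep(); * THREAD_SLEEPING_OK();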
*/ #define THREAD_NO_SLEEPING() do { \ curthread->td_no_sleeping++; \ MPASS(curthread->td_no_sleeping > 0); \ } while (0) #define THREAD_SLEEPING_OK() do { \ MPASS(curthread->td_no_sleeping > 0); \ curthread->td_no_sleeping--; \ } while (0) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define THREAD_CONTENDS_ON_LOCK(lo) do { \ MPASS(curthread->td_wantedlock == NULL); \ curthread->td_wantedlock = lo; \ } while (0) #define THREAD_CONTENTION_DONE(lo) do { \ MPASS(curthread->td_wantedlock == lo); \ curthread->td_wantedlock = NULL; \ } while (0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) #define PIDHASHLOCK(pid) (&pidhashtbl_lock[((pid) & pidhashlock)]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern struct sx *pidhashtbl_lock; extern u_long pidhash; extern u_long pidhashlock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct mtx procid_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. */ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; extern struct uma_zone *pgrp_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_any(pid_t); /* Find (zombie) process by id. */ struct proc *pfind_any_locked(pid_t pid); /* Find process by id, locked. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ void pidhash_slockall(void); /* Shared lock all pid hash lists. */ void pidhash_sunlockall(void); /* Shared unlock all pid hash lists. */ struct fork_req { int fr_flags; int fr_pages; int *fr_pidp; struct proc **fr_procp; int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; int fr_flags2; #define FR2_DROPSIG_CAUGHT 0x00000001 /* Drop caught non-DFL signals */ #define FR2_SHARE_PATHS 0x00000002 /* Invert sense of RFFDG for paths */ #define FR2_KPROC 0x00000004 /* Create a kernel process */ }; /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. 
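* * A typical lookup (illustrative sketch, not from the original header); on success pget() returns with the process locked: * error = pget(pid, PGET_CANDEBUG | PGET_NOTWEXIT, &p); * if (error == 0) { ...; PROC_UNLOCK(p); }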
*/ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); /* ast_register() flags */ #define ASTR_ASTF_REQUIRED 0x0001 /* td_ast TDAI(TDA_X) flag set is required for call */ #define ASTR_TDP 0x0002 /* td_pflags flag set is required */ #define ASTR_KCLEAR 0x0004 /* call me on ast_kclear() */ #define ASTR_UNCOND 0x0008 /* call me always */ void ast(struct trapframe *framep); void ast_kclear(struct thread *td); void ast_register(int ast, int ast_flags, int tdp, void (*f)(struct thread *td, int asts)); void ast_deregister(int tda); void ast_sched_locked(struct thread *td, int tda); void ast_sched_mask(struct thread *td, int ast); void ast_sched(struct thread *td, int tda); void ast_unsched_locked(struct thread *td, int tda); struct thread *choosethread(void); int cr_bsd_visible(struct ucred *u1, struct ucred *u2); int cr_cansee(struct ucred *u1, struct ucred *u2); int cr_canseesocket(struct ucred *cred, struct socket *so); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void itimer_proc_continue(struct proc *p); void kqtimer_proc_continue(struct proc *p); void kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry, int *resident_count, bool *super); void kern_yield(int); void killjobc(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); int pgrp_calc_jobc(struct pgrp *pgrp); void proc_add_orphan(struct proc *child, struct proc *parent); int proc_get_binpath(struct proc *p, char *binname, char **fullpath, char **freepath); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); int proc_iterate(int (*cb)(struct proc *, void *), void *cbarg); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent, bool set_oppid); void proc_set_p2_wexit(struct proc *p); void proc_set_traced(struct proc *p, bool stop); void proc_wkilled(struct proc *p); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); void proc_clear_orphan(struct proc *p); void reaper_abandon_children(struct proc *p, bool exiting); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); void setrunnable(struct thread *, int); void setsugid(struct proc *p); bool 
should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_sync_core(void); void cpu_throw(struct thread *, struct thread *) __dead2; void cpu_update_pcb(struct thread *); bool curproc_sigkilled(void); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); bool cpu_exec_vmspace_reuse(struct proc *p, struct vm_map *map); int cpu_fetch_syscall_args(struct thread *td); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); int cpu_procctl(struct thread *td, int idtype, id_t id, int com, void *data); void cpu_set_syscall_retval(struct thread *, int); int cpu_set_upcall(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base, int flags); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); struct thread *thread_alloc(int pages); int thread_check_susp(struct thread *td, bool sleep); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); void thread_cow_synced(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap_barrier(void); int thread_recycle(struct thread *, int pages); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); void thread_run_flash(struct thread *td); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); bool stop_all_proc_block(void); void stop_all_proc_unblock(void); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } static __inline int curthread_pflags2_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags2 & flags); td->td_pflags2 |= flags; return (save); } static __inline void curthread_pflags2_restore(int save) { curthread->td_pflags2 &= save; } static __inline __pure2 struct td_sched * td_get_sched(struct thread *td) { return ((struct td_sched *)&td[1]); } #define 
PROC_ID_PID 0 #define PROC_ID_GROUP 1 #define PROC_ID_SESSION 2 #define PROC_ID_REAP 3 void proc_id_set(int type, pid_t id); void proc_id_set_cond(int type, pid_t id); void proc_id_clear(int type, pid_t id); EVENTHANDLER_LIST_DECLARE(process_ctor); EVENTHANDLER_LIST_DECLARE(process_dtor); EVENTHANDLER_LIST_DECLARE(process_init); EVENTHANDLER_LIST_DECLARE(process_fini); EVENTHANDLER_LIST_DECLARE(process_exit); EVENTHANDLER_LIST_DECLARE(process_fork); EVENTHANDLER_LIST_DECLARE(process_exec); EVENTHANDLER_LIST_DECLARE(thread_ctor); EVENTHANDLER_LIST_DECLARE(thread_dtor); EVENTHANDLER_LIST_DECLARE(thread_init); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ diff --git a/sys/sys/uio.h b/sys/sys/uio.h index ec4e92d852a6..05c1ed640b63 100644 --- a/sys/sys/uio.h +++ b/sys/sys/uio.h @@ -1,118 +1,119 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_UIO_H_ #define _SYS_UIO_H_ #include <sys/cdefs.h> #include <sys/_iovec.h> #include <sys/_types.h> #include <sys/_uio.h> #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifdef _KERNEL struct uio { struct iovec *uio_iov; /* scatter/gather list */ int uio_iovcnt; /* length of scatter/gather list */ off_t uio_offset; /* offset in target object */ ssize_t uio_resid; /* remaining bytes to process */ enum uio_seg uio_segflg; /* address space */ enum uio_rw uio_rw; /* operation */ struct thread *uio_td; /* owner */ }; /* * Limits * * N.B.: UIO_MAXIOV must be no less than IOV_MAX from <sys/syslimits.h> * which in turn must be no less than _XOPEN_IOV_MAX from <limits.h>. If * we ever make this tunable (probably pointless), then IOV_MAX should be * removed from <sys/syslimits.h> and applications would be expected to use * sysconf(3) to find out the correct value, or else assume the worst * (_XOPEN_IOV_MAX). Perhaps UIO_MAXIOV should be simply defined as * IOV_MAX.
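* (Both IOV_MAX and UIO_MAXIOV are 1024 on FreeBSD today.)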
*/ #define UIO_MAXIOV 1024 /* max 1K of iov's */ struct vm_object; struct vm_page; struct bus_dma_segment; struct uio *allocuio(u_int iovcnt); void freeuio(struct uio *uio); struct uio *cloneuio(struct uio *uiop); int copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error); int copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop); int copyout_map(struct thread *td, vm_offset_t *addr, size_t sz); int copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz); +void exterr_copyout(struct thread *td); int physcopyin(void *src, vm_paddr_t dst, size_t len); int physcopyout(vm_paddr_t src, void *dst, size_t len); int physcopyin_vlist(struct bus_dma_segment *src, off_t offset, vm_paddr_t dst, size_t len); int physcopyout_vlist(vm_paddr_t src, struct bus_dma_segment *dst, off_t offset, size_t len); void uioadvance(struct uio *, size_t); int uiomove(void *cp, int n, struct uio *uio); int uiomove_frombuf(void *buf, int buflen, struct uio *uio); int uiomove_fromphys(struct vm_page *ma[], vm_offset_t offset, int n, struct uio *uio); int uiomove_nofault(void *cp, int n, struct uio *uio); int uiomove_object(struct vm_object *obj, off_t obj_size, struct uio *uio); #else /* !_KERNEL */ #if defined(_FORTIFY_SOURCE) && _FORTIFY_SOURCE > 0 #include <ssp/uio.h> #endif __BEGIN_DECLS ssize_t readv(int, const struct iovec *, int); ssize_t writev(int, const struct iovec *, int); #if __BSD_VISIBLE ssize_t preadv(int, const struct iovec *, int, off_t); ssize_t pwritev(int, const struct iovec *, int, off_t); #endif __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_UIO_H_ */
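A minimal userland sketch of the scatter/gather interface declared above (illustrative only, not part of the change): it gathers two buffers into a single writev(2) call, whose iovec count must stay at or below IOV_MAX.

#include <sys/uio.h>
#include <unistd.h>

int
main(void)
{
	char hdr[] = "header: ";
	char body[] = "payload\n";
	struct iovec iov[2];

	iov[0].iov_base = hdr;
	iov[0].iov_len = sizeof(hdr) - 1;
	iov[1].iov_base = body;
	iov[1].iov_len = sizeof(body) - 1;

	/* Two segments, well under IOV_MAX, gathered in one system call. */
	if (writev(STDOUT_FILENO, iov, 2) == -1)
		return (1);
	return (0);
}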