Index: head/sys/kern/kern_acl.c =================================================================== --- head/sys/kern/kern_acl.c (revision 167231) +++ head/sys/kern/kern_acl.c (revision 167232) @@ -1,431 +1,431 @@ /*- * Copyright (c) 1999-2006 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL system calls and other functions common across different ACL types. * Type-specific routines go into subr_acl_.c. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uma_zone_t acl_zone; static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * These calls wrap the real vnode operations, and are called by the syscall * code once the syscall has converted the path or file descriptor to a vnode * (unlocked). The aclp pointer is assumed still to point to userland, so * this should not be consumed within the kernel except by syscall code. * Other code should directly invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. */ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; struct mount *mp; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_setacl(td->td_ucred, vp, type, &inkernacl); if (error != 0) goto out; #endif error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_getacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_deleteacl(td->td_ucred, vp, type); if (error) goto out; #endif error = VOP_SETACL(vp, type, 0, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't * need to lock, as the vacl_ code will get/release any locks required. */ /* * Given a file path, get an ACL for it */ int __acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, get an ACL for it; don't follow links. */ int __acl_get_link(struct thread *td, struct __acl_get_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, set an ACL for it. */ int __acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, set an ACL for it; don't follow links. */ int __acl_set_link(struct thread *td, struct __acl_set_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* - * Given a file descriptor, get an ACL for it + * Given a file descriptor, get an ACL for it. */ int __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* - * Given a file descriptor, set an ACL for it + * Given a file descriptor, set an ACL for it. */ int __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file path, delete an ACL from it. */ int __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, delete an ACL from it; don't follow links. */ int __acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, delete an ACL from it. */ int __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_delete(td, fp->f_vnode, uap->type); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file path, check an ACL for it. */ int __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, check an ACL for it; don't follow links. */ int __acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file descriptor, check an ACL for it. */ int __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* ARGUSED */ static void aclinit(void *dummy __unused) { acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL) Index: head/sys/kern/kern_context.c =================================================================== --- head/sys/kern/kern_context.c (revision 167231) +++ head/sys/kern/kern_context.c (revision 167232) @@ -1,130 +1,130 @@ /*- * Copyright (c) 2002 Daniel M. Eischen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include /* - * The first two fields of a ucontext_t are the signal mask and - * the machine context. The next field is uc_link; we want to - * avoid destroying the link when copying out contexts. + * The first two fields of a ucontext_t are the signal mask and the machine + * context. The next field is uc_link; we want to avoid destroying the link + * when copying out contexts. */ #define UC_COPY_SIZE offsetof(ucontext_t, uc_link) #ifndef _SYS_SYSPROTO_H_ struct getcontext_args { struct __ucontext *ucp; } struct setcontext_args { const struct __ucontext_t *ucp; } struct swapcontext_args { struct __ucontext *oucp; const struct __ucontext_t *ucp; } #endif int getcontext(struct thread *td, struct getcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->ucp, UC_COPY_SIZE); } return (ret); } int setcontext(struct thread *td, struct setcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { SIG_CANTMASK(uc.uc_sigmask); PROC_LOCK(td->td_proc); td->td_sigmask = uc.uc_sigmask; PROC_UNLOCK(td->td_proc); } } } return (ret == 0 ? EJUSTRETURN : ret); } int swapcontext(struct thread *td, struct swapcontext_args *uap) { ucontext_t uc; int ret; if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->oucp, UC_COPY_SIZE); if (ret == 0) { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { SIG_CANTMASK(uc.uc_sigmask); PROC_LOCK(td->td_proc); td->td_sigmask = uc.uc_sigmask; PROC_UNLOCK(td->td_proc); } } } } return (ret == 0 ? EJUSTRETURN : ret); } Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 167231) +++ head/sys/kern/kern_descrip.c (revision 167232) @@ -1,2722 +1,2722 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); static uma_zone_t file_zone; /* How to treat 'new' parameter when allocating a fd for do_dup(). */ enum dup_type { DUP_VARIABLE, DUP_FIXED }; static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval); static int fd_first_free(struct filedesc *, int, int); static int fd_last_used(struct filedesc *, int, int); static void fdgrowtable(struct filedesc *, int); static int fdrop_locked(struct file *fp, struct thread *td); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); /* * A process is initially started out with NDFILE descriptors stored within * this structure, selected to be enough for typical applications based on * the historical limit of 20 open files (and the usage of descriptors by * shells). If these descriptors are exhausted, a larger descriptor table * may be allocated, up to a process' resource limit; the internal arrays * are then unused. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * Storage required per open file descriptor. */ #define OFILESIZE (sizeof(struct file *) + sizeof(char)) /* * Basic allocation of descriptors: * one of the above, plus arrays for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; /* * These arrays are used when the number of open files is * <= NDFILE, and are then pointed to by the pointers above. */ struct file *fd_dfiles[NDFILE]; char fd_dfileflags[NDFILE]; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ struct filelist filehead; /* head of list of open files */ int openfiles; /* actual number of open files */ struct sx filelist_lock; /* sx to protect filelist */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* A mutex to protect the association between a proc and filedesc. */ static struct mtx fdesc_mtx; /* * Find the first zero bit in the given bitmap, starting at low and not * exceeding size - 1. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at low and * not exceeding size - 1. */ static int fd_last_used(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; if (low >= size) return (-1); off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(low); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (low - 1); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused(struct filedesc *fdp, int fd) { FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(!fdisused(fdp, fd), ("fd already used")); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(fdisused(fdp, fd), ("fd is already unused")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("fd is still in use")); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, 0, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int getdtablesize(struct thread *td, struct getdtablesize_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); return (0); } /* * Duplicate a file descriptor to a particular value. * - * note: keep in mind that a potential race condition exists when closing + * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int dup2(struct thread *td, struct dup2_args *uap) { return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, td->td_retval)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int dup(struct thread *td, struct dup_args *uap) { return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; intptr_t arg; int error; error = 0; switch (uap->cmd) { case F_GETLK: case F_SETLK: case F_SETLKW: error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); arg = (intptr_t)&fl; break; default: arg = uap->arg; break; } if (error) return (error); error = kern_fcntl(td, uap->fd, uap->cmd, arg); if (error) return (error); if (uap->cmd == F_GETLK) error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp; struct proc *p; char *pop; struct vnode *vp; u_int newmin; int error, flg, tmp; int giant_locked; /* * XXXRW: Some fcntl() calls require Giant -- others don't. Try to * avoid grabbing Giant for calls we know don't need it. */ switch (cmd) { case F_DUPFD: case F_GETFD: case F_SETFD: case F_GETFL: giant_locked = 0; break; default: giant_locked = 1; mtx_lock(&Giant); } error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; FILEDESC_LOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } pop = &fdp->fd_ofileflags[fd]; switch (cmd) { case F_DUPFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ FILEDESC_UNLOCK(fdp); newmin = arg; PROC_LOCK(p); if (newmin >= lim_cur(p, RLIMIT_NOFILE) || newmin >= maxfilesperproc) { PROC_UNLOCK(p); error = EINVAL; break; } PROC_UNLOCK(p); error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval); break; case F_GETFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; FILEDESC_UNLOCK(fdp); break; case F_SETFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ *pop = (*pop &~ UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); FILEDESC_UNLOCK(fdp); break; case F_GETFL: /* mtx_assert(&Giant, MA_NOTOWNED); */ FILE_LOCK(fp); td->td_retval[0] = OFLAGS(fp->f_flag); FILE_UNLOCK(fp); FILEDESC_UNLOCK(fdp); break; case F_SETFL: mtx_assert(&Giant, MA_OWNED); FILE_LOCK(fp); fhold_locked(fp); fp->f_flag &= ~FCNTLFLAGS; fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; FILE_UNLOCK(fp); FILEDESC_UNLOCK(fdp); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } FILE_LOCK(fp); fp->f_flag &= ~FNONBLOCK; FILE_UNLOCK(fp); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: mtx_assert(&Giant, MA_OWNED); fhold(fp); FILEDESC_UNLOCK(fdp); error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: mtx_assert(&Giant, MA_OWNED); fhold(fp); FILEDESC_UNLOCK(fdp); tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLKW: mtx_assert(&Giant, MA_OWNED); flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: mtx_assert(&Giant, MA_OWNED); if (fp->f_type != DTYPE_VNODE) { FILEDESC_UNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { if (fp->f_offset < 0 || (flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start)) { FILEDESC_UNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. */ fhold(fp); FILEDESC_UNLOCK(fdp); vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); break; default: error = EINVAL; break; } /* Check for race with close */ FILEDESC_LOCK_FAST(fdp); if ((unsigned) fd >= fdp->fd_nfiles || fp != fdp->fd_ofiles[fd]) { FILEDESC_UNLOCK_FAST(fdp); flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } else FILEDESC_UNLOCK_FAST(fdp); fdrop(fp, td); break; case F_GETLK: mtx_assert(&Giant, MA_OWNED); if (fp->f_type != DTYPE_VNODE) { FILEDESC_UNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { FILEDESC_UNLOCK(fdp); error = EINVAL; break; } if (flp->l_whence == SEEK_CUR) { if ((flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && fp->f_offset < OFF_MIN - flp->l_start)) { FILEDESC_UNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. */ fhold(fp); FILEDESC_UNLOCK(fdp); vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; default: FILEDESC_UNLOCK(fdp); error = EINVAL; break; } done2: if (giant_locked) mtx_unlock(&Giant); return (error); } /* * Common code for dup, dup2, and fcntl(F_DUPFD). */ static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval) { struct filedesc *fdp; struct proc *p; struct file *fp; struct file *delfp; int error, holdleaders, maxfd; KASSERT((type == DUP_VARIABLE || type == DUP_FIXED), ("invalid dup type %d", type)); p = td->td_proc; fdp = p->p_fd; /* * Verify we have a valid descriptor to dup from and possibly to * dup to. */ if (old < 0 || new < 0) return (EBADF); PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if (new >= maxfd) return (EMFILE); FILEDESC_LOCK(fdp); if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (type == DUP_FIXED && old == new) { *retval = new; FILEDESC_UNLOCK(fdp); return (0); } fp = fdp->fd_ofiles[old]; fhold(fp); /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. Since the filedesc * lock may be temporarily dropped in the process, we have to look * out for a race. */ if (type == DUP_FIXED) { if (new >= fdp->fd_nfiles) fdgrowtable(fdp, new + 1); if (fdp->fd_ofiles[new] == NULL) fdused(fdp, new); } else { if ((error = fdalloc(td, new, &new)) != 0) { FILEDESC_UNLOCK(fdp); fdrop(fp, td); return (error); } } /* * If the old file changed out from under us then treat it as a * bad file descriptor. Userland should do its own locking to * avoid this case. */ if (fdp->fd_ofiles[old] != fp) { /* we've allocated a descriptor which we won't use */ if (fdp->fd_ofiles[new] == NULL) fdunused(fdp, new); FILEDESC_UNLOCK(fdp); fdrop(fp, td); return (EBADF); } KASSERT(old != new, ("new fd is same as old")); /* * Save info on the descriptor being overwritten. We cannot close * it without introducing an ownership race for the slot, since we * need to drop the filedesc lock to call closef(). * * XXX this duplicates parts of close(). */ delfp = fdp->fd_ofiles[new]; holdleaders = 0; if (delfp != NULL) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } } /* * Duplicate the source descriptor */ fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; /* * If we dup'd over a valid file, we now own the reference to it * and must dispose of it using closef() semantics (as if a * close() were performed on it). * * XXX this duplicates parts of close(). */ if (delfp != NULL) { knote_fdclose(td, new); if (delfp->f_type == DTYPE_MQUEUE) mq_fdclose(td, new, delfp); FILEDESC_UNLOCK(fdp); (void) closef(delfp, td); if (holdleaders) { FILEDESC_LOCK_FAST(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_UNLOCK_FAST(fdp); } } else { FILEDESC_UNLOCK(fdp); } return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int close(td, uap) struct thread *td; struct close_args *uap; { return (kern_close(td, uap->fd)); } int kern_close(td, fd) struct thread *td; int fd; { struct filedesc *fdp; struct file *fp; int error; int holdleaders; error = 0; holdleaders = 0; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_LOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; fdunused(fdp, fd); if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } /* * We now hold the fp reference that used to be owned by the descriptor * array. * We have to unlock the FILEDESC *AFTER* knote_fdclose to prevent a * race of the fd getting opened, a knote added, and deleteing a knote * for the new fd. */ knote_fdclose(td, fd); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_UNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_LOCK_FAST(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_UNLOCK_FAST(fdp); } return (error); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG(fd, fd); if ((error = fget(td, fd, &fp)) != 0) return (error); AUDIT_ARG(file, td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int nfstat(struct thread *td, struct nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int fpathconf(struct thread *td, struct fpathconf_args *uap) { struct file *fp; struct vnode *vp; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); /* If asynchronous I/O is available, it works for all descriptors. */ if (uap->name == _PC_ASYNC_IO) { td->td_retval[0] = async_io_version; goto out; } vp = fp->f_vnode; if (vp != NULL) { int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_PATHCONF(vp, uap->name, td->td_retval); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (uap->name != _PC_PIPE_BUF) { error = EINVAL; } else { td->td_retval[0] = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Grow the file table to accomodate (at least) nfd descriptors. This may * block and drop the filedesc lock, but it will reacquire it before * returning. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct file **ntable; char *nfileflags; int nnfiles, onfiles; NDSLOTTYPE *nmap; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* compute the size of the new table */ onfiles = fdp->fd_nfiles; nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* allocate a new table and (if required) new bitmaps */ FILEDESC_UNLOCK(fdp); MALLOC(ntable, struct file **, nnfiles * OFILESIZE, M_FILEDESC, M_ZERO | M_WAITOK); nfileflags = (char *)&ntable[nnfiles]; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); else nmap = NULL; FILEDESC_LOCK(fdp); /* * We now have new tables ready to go. Since we dropped the * filedesc lock to call malloc(), watch out for a race. */ onfiles = fdp->fd_nfiles; if (onfiles >= nnfiles) { /* we lost the race, but that's OK */ free(ntable, M_FILEDESC); if (nmap != NULL) free(nmap, M_FILEDESC); return; } bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); bcopy(fdp->fd_ofileflags, nfileflags, onfiles); if (onfiles > NDFILE) free(fdp->fd_ofiles, M_FILEDESC); fdp->fd_ofiles = ntable; fdp->fd_ofileflags = nfileflags; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); fdp->fd_map = nmap; } fdp->fd_nfiles = nnfiles; } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd = -1, maxfd; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); /* * Search the bitmap for a free descriptor. If none is found, try * to grow the file table. Keep at it until we either get a file * descriptor or run into process or system limits; fdgrowtable() * may drop the filedesc lock, so we're in a race. */ for (;;) { fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd < fdp->fd_nfiles) break; fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("free descriptor isn't")); fdp->fd_ofileflags[fd] = 0; /* XXX needed? */ fdused(fdp, fd); *result = fd; return (0); } /* * Check to see whether n user file descriptors * are available to the process p. */ int fdavail(struct thread *td, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = td->td_proc->p_fd; struct file **fpp; int i, lim, last; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); PROC_LOCK(p); lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { if (*fpp == NULL && --n <= 0) return (1); } return (0); } /* * Create a new open file structure and allocate * a file decriptor for the process that refers to it. * We add one reference to the file for the descriptor table * and one reference for resultfp. This is to prevent us being * preempted and the entry in the descriptor table closed after * we release the FILEDESC lock. */ int falloc(struct thread *td, struct file **resultfp, int *resultfd) { struct proc *p = td->td_proc; struct file *fp, *fq; int error, i; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); sx_xlock(&filelist_lock); if ((openfiles >= maxuserfiles && priv_check_cred(td->td_ucred, PRIV_MAXFILES, SUSER_RUID) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", td->td_ucred->cr_ruid); } sx_xunlock(&filelist_lock); uma_zfree(file_zone, fp); return (ENFILE); } openfiles++; /* * If the process has file descriptor zero open, add the new file * descriptor to the list of open files at that point, otherwise * put it at the front of the list of open files. */ fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep); fp->f_count = 1; if (resultfp) fp->f_count++; fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; FILEDESC_LOCK(p->p_fd); if ((fq = p->p_fd->fd_ofiles[0])) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); } sx_xunlock(&filelist_lock); if ((error = fdalloc(td, 0, &i))) { FILEDESC_UNLOCK(p->p_fd); fdrop(fp, td); if (resultfp) fdrop(fp, td); return (error); } p->p_fd->fd_ofiles[i] = fp; FILEDESC_UNLOCK(p->p_fd); if (resultfp) *resultfp = fp; if (resultfd) *resultfd = i; return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. */ struct filedesc * fdinit(struct filedesc *fdp) { struct filedesc0 *newfdp; newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); if (fdp != NULL) { FILEDESC_LOCK(fdp); newfdp->fd_fd.fd_cdir = fdp->fd_cdir; if (newfdp->fd_fd.fd_cdir) VREF(newfdp->fd_fd.fd_cdir); newfdp->fd_fd.fd_rdir = fdp->fd_rdir; if (newfdp->fd_fd.fd_rdir) VREF(newfdp->fd_fd.fd_rdir); newfdp->fd_fd.fd_jdir = fdp->fd_jdir; if (newfdp->fd_fd.fd_jdir) VREF(newfdp->fd_fd.fd_jdir); FILEDESC_UNLOCK(fdp); } /* Create the file descriptor table. */ newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_holdcnt = 1; newfdp->fd_fd.fd_cmask = CMASK; newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; newfdp->fd_fd.fd_nfiles = NDFILE; newfdp->fd_fd.fd_map = newfdp->fd_dmap; newfdp->fd_fd.fd_lastfile = -1; return (&newfdp->fd_fd); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; mtx_lock(&fdesc_mtx); fdp = p->p_fd; if (fdp != NULL) fdp->fd_holdcnt++; mtx_unlock(&fdesc_mtx); return (fdp); } static void fddrop(struct filedesc *fdp) { int i; mtx_lock(&fdesc_mtx); i = --fdp->fd_holdcnt; mtx_unlock(&fdesc_mtx); if (i > 0) return; mtx_destroy(&fdp->fd_mtx); FREE(fdp, M_FILEDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { FILEDESC_LOCK_FAST(fdp); fdp->fd_refcnt++; FILEDESC_UNLOCK_FAST(fdp); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct proc *p, struct thread *td) { FILEDESC_LOCK_FAST(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; FILEDESC_UNLOCK_FAST(p->p_fd); tmp = fdcopy(p->p_fd); fdfree(td); p->p_fd = tmp; } else FILEDESC_UNLOCK_FAST(p->p_fd); } /* * Copy a filedesc structure. * A NULL pointer in returns a NULL reference, this is to ease callers, * not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return (NULL); newfdp = fdinit(fdp); FILEDESC_LOCK_FAST(fdp); while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_UNLOCK_FAST(fdp); FILEDESC_LOCK(newfdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_UNLOCK(newfdp); FILEDESC_LOCK_FAST(fdp); } /* copy everything except kqueue descriptors */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { if (fdisused(fdp, i) && fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) { newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; fhold(newfdp->fd_ofiles[i]); newfdp->fd_lastfile = i; } else { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; } } FILEDESC_UNLOCK_FAST(fdp); FILEDESC_LOCK(newfdp); for (i = 0; i <= newfdp->fd_lastfile; ++i) if (newfdp->fd_ofiles[i] != NULL) fdused(newfdp, i); FILEDESC_UNLOCK(newfdp); FILEDESC_LOCK_FAST(fdp); if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_UNLOCK_FAST(fdp); return (newfdp); } /* * Release a filedesc structure. */ void fdfree(struct thread *td) { struct filedesc *fdp; struct file **fpp; int i, locked; struct filedesc_to_leader *fdtol; struct file *fp; struct vnode *cdir, *jdir, *rdir, *vp; struct flock lf; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { FILEDESC_LOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0, fpp = fdp->fd_ofiles; i <= fdp->fd_lastfile; i++, fpp++) { if (*fpp == NULL || (*fpp)->f_type != DTYPE_VNODE) continue; fp = *fpp; fhold(fp); FILEDESC_UNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; locked = VFS_LOCK_GIANT(vp->v_mount); (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc-> p_leader, F_UNLCK, &lf, F_POSIX); VFS_UNLOCK_GIANT(locked); FILEDESC_LOCK(fdp); fdrop(fp, td); fpp = fdp->fd_ofiles + i; } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or do_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx, PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader * remains valid in closef(). */ fdtol->fdl_wakeup = 1; msleep(fdtol, &fdp->fd_mtx, PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; td->td_proc->p_fdtol = NULL; FILEDESC_UNLOCK(fdp); if (fdtol != NULL) FREE(fdtol, M_FILEDESC_TO_LEADER); } FILEDESC_LOCK_FAST(fdp); i = --fdp->fd_refcnt; FILEDESC_UNLOCK_FAST(fdp); if (i > 0) return; /* * We are the last reference to the structure, so we can * safely assume it will not change out from under us. */ fpp = fdp->fd_ofiles; for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { if (*fpp) (void) closef(*fpp, td); } FILEDESC_LOCK(fdp); /* XXX This should happen earlier. */ mtx_lock(&fdesc_mtx); td->td_proc->p_fd = NULL; mtx_unlock(&fdesc_mtx); if (fdp->fd_nfiles > NDFILE) FREE(fdp->fd_ofiles, M_FILEDESC); if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) FREE(fdp->fd_map, M_FILEDESC); fdp->fd_nfiles = 0; cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_UNLOCK(fdp); if (cdir) { locked = VFS_LOCK_GIANT(cdir->v_mount); vrele(cdir); VFS_UNLOCK_GIANT(locked); } if (rdir) { locked = VFS_LOCK_GIANT(rdir->v_mount); vrele(rdir); VFS_UNLOCK_GIANT(locked); } if (jdir) { locked = VFS_LOCK_GIANT(jdir->v_mount); vrele(jdir); VFS_UNLOCK_GIANT(locked); } fddrop(fdp); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static int is_unsafe(struct file *fp) { if (fp->f_type == DTYPE_VNODE) { struct vnode *vp = fp->f_vnode; if ((vp->v_vflag & VV_PROCDEP) != 0) return (1); } return (0); } /* * Make this setguid thing safe, if at all possible. */ void setugidsafety(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* * Note: fdp->fd_ofiles may be reallocated out from under us while * we are blocked in a close. Be careful! */ FILEDESC_LOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); FILEDESC_UNLOCK(fdp); (void) closef(fp, td); FILEDESC_LOCK(fdp); } } FILEDESC_UNLOCK(fdp); } /* * If a specific file object occupies a specific file descriptor, * close the file descriptor entry and drop a reference on the file * object. This is a convenience function to handle a subsequent * error in a function that calls falloc() that handles the race that * another thread might have closed the file descriptor out from under * the thread creating the file object. */ void fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) { FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[idx] == fp) { fdp->fd_ofiles[idx] = NULL; fdunused(fdp, idx); FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else { FILEDESC_UNLOCK(fdp); } } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; FILEDESC_LOCK(fdp); /* * We cannot cache fd_ofiles or fd_ofileflags since operations * may block and rip them out from under us. */ for (i = 0; i <= fdp->fd_lastfile; i++) { if (fdp->fd_ofiles[i] != NULL && (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, i, fp); FILEDESC_UNLOCK(fdp); (void) closef(fp, td); FILEDESC_LOCK(fdp); } } FILEDESC_UNLOCK(fdp); } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct nameidata nd; struct filedesc *fdp; struct file *fp; register_t retval; int fd, i, error, flags, devnull; fdp = td->td_proc->p_fd; if (fdp == NULL) return (0); KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); devnull = -1; error = 0; for (i = 0; i < 3; i++) { if (fdp->fd_ofiles[i] != NULL) continue; if (devnull < 0) { int vfslocked; error = falloc(td, &fp, &fd); if (error != 0) break; /* Note extra ref on `fp' held for us by falloc(). */ KASSERT(fd == i, ("oof, we didn't get our fd")); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, "/dev/null", td); flags = FREAD | FWRITE; error = vn_open(&nd, &flags, 0, fd); if (error != 0) { /* * Someone may have closed the entry in the * file descriptor table, so check it hasn't * changed before dropping the reference count. */ FILEDESC_LOCK(fdp); KASSERT(fdp->fd_ofiles[fd] == fp, ("table not shared, how did it change?")); fdp->fd_ofiles[fd] = NULL; fdunused(fdp, fd); FILEDESC_UNLOCK(fdp); fdrop(fp, td); fdrop(fp, td); break; } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); fp->f_flag = flags; fp->f_vnode = nd.ni_vp; if (fp->f_data == NULL) fp->f_data = nd.ni_vp; if (fp->f_ops == &badfileops) fp->f_ops = &vnops; fp->f_type = DTYPE_VNODE; VOP_UNLOCK(nd.ni_vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); devnull = fd; fdrop(fp, td); } else { error = do_dup(td, DUP_FIXED, devnull, i, &retval); if (error != 0) break; } } return (error); } /* * Internal form of close. * Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { int vfslocked; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table * is shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_UNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_LOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_UNLOCK(fdp); } VFS_UNLOCK_GIANT(vfslocked); } return (fdrop(fp, td)); } /* * Extract the file pointer associated with the specified descriptor for * the current user process. * * If the descriptor doesn't exist, EBADF is returned. * * If the descriptor exists but doesn't match 'flags' then * return EBADF for read attempts and EINVAL for write attempts. * * If 'hold' is set (non-zero) the file's refcount will be bumped on return. * It should be dropped with fdrop(). * If it is not set, then the refcount will not be bumped however the * thread's filedesc struct will be returned locked (for fgetsock). * * If an error occured the non-zero error is returned and *fpp is set to NULL. * Otherwise *fpp is set and zero is returned. */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) { struct filedesc *fdp; struct file *fp; *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); FILEDESC_LOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { FILEDESC_UNLOCK(fdp); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. * * Only one flag, or 0, may be specified. */ if (flags == FREAD && (fp->f_flag & FREAD) == 0) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (hold) { fhold(fp); FILEDESC_UNLOCK(fdp); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, 0, 1)); } int fget_read(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FREAD, 1)); } int fget_write(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FWRITE, 1)); } /* * Like fget() but loads the underlying vnode, or returns an error if * the descriptor does not represent a vnode. Note that pipes use vnodes * but never have VM objects. The returned vnode will be vref()d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) { struct file *fp; int error; *vpp = NULL; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } FILEDESC_UNLOCK(td->td_proc->p_fd); return (error); } int fgetvp(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, 0)); } int fgetvp_read(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FREAD)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FWRITE)); } #endif /* * Like fget() but loads the underlying socket, or returns an error if * the descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX * lock in the future. * * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely * on their file descriptor reference to prevent the socket from being * freed during use. */ int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) { struct file *fp; int error; NET_ASSERT_GIANT(); *spp = NULL; if (fflagp != NULL) *fflagp = 0; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; } else { *spp = fp->f_data; if (fflagp) *fflagp = fp->f_flag; SOCK_LOCK(*spp); soref(*spp); SOCK_UNLOCK(*spp); } FILEDESC_UNLOCK(td->td_proc->p_fd); return (error); } /* * Drop the reference count on the socket and XXX release the SX lock in the * future. The last reference closes the socket. * * XXXRW: fputsock() is deprecated, see comment for fgetsock(). */ void fputsock(struct socket *so) { NET_ASSERT_GIANT(); ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } int fdrop(struct file *fp, struct thread *td) { FILE_LOCK(fp); return (fdrop_locked(fp, td)); } /* * Drop reference on struct file passed in, may call closef if the * reference hits zero. * Expects struct file locked, and will unlock it. */ static int fdrop_locked(struct file *fp, struct thread *td) { int error; FILE_LOCK_ASSERT(fp, MA_OWNED); if (--fp->f_count > 0) { FILE_UNLOCK(fp); return (0); } /* * We might have just dropped the last reference to a file * object that is for a UNIX domain socket whose message * buffers are being examined in unp_gc(). If that is the * case, FWAIT will be set in f_gcflag and we need to wait for * unp_gc() to finish its scan. */ while (fp->f_gcflag & FWAIT) msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0); /* We have the last ref so we can proceed without the file lock. */ FILE_UNLOCK(fp); if (fp->f_count < 0) panic("fdrop: count < 0"); if (fp->f_ops != &badfileops) error = fo_close(fp, td); else error = 0; sx_xlock(&filelist_lock); LIST_REMOVE(fp, f_list); openfiles--; sx_xunlock(&filelist_lock); crfree(fp->f_cred); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * - * Just attempt to get a record lock of the requested type on - * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). + * Just attempt to get a record lock of the requested type on the entire file + * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } mtx_lock(&Giant); vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; FILE_LOCK(fp); fp->f_flag &= ~FHASLOCK; FILE_UNLOCK(fp); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } FILE_LOCK(fp); fp->f_flag |= FHASLOCK; FILE_UNLOCK(fp); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); mtx_unlock(&Giant); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error) { struct file *wfp; struct file *fp; /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_LOCK(fdp); if (dfd < 0 || dfd >= fdp->fd_nfiles || (wfp = fdp->fd_ofiles[dfd]) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor * (indx) and return. * * For ENXIO steal away the file structure from (dfd) and * store it in (indx). (dfd) is effectively closed by * this operation. * * Any other error code is just returned. */ switch (error) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ FILE_LOCK(wfp); if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { FILE_UNLOCK(wfp); FILEDESC_UNLOCK(fdp); return (EACCES); } fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = wfp; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; if (fp == NULL) fdused(fdp, indx); fhold_locked(wfp); FILE_UNLOCK(wfp); FILEDESC_UNLOCK(fdp); if (fp != NULL) { /* * We now own the reference to fp that the ofiles[] * array used to own. Release it. */ FILE_LOCK(fp); fdrop_locked(fp, td); } return (0); case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; fdp->fd_ofiles[dfd] = NULL; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; fdp->fd_ofileflags[dfd] = 0; fdunused(fdp, dfd); if (fp == NULL) fdused(fdp, indx); if (fp != NULL) FILE_LOCK(fp); /* * We now own the reference to fp that the ofiles[] array * used to own. Release it. */ if (fp != NULL) fdrop_locked(fp, td); FILEDESC_UNLOCK(fdp); return (0); default: FILEDESC_UNLOCK(fdp); return (error); } /* NOTREACHED */ } /* * Scan all active processes to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new * mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { fdp = fdhold(p); if (fdp == NULL) continue; nrele = 0; FILEDESC_LOCK_FAST(fdp); if (fdp->fd_cdir == olddp) { vref(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vref(newdp); fdp->fd_rdir = newdp; nrele++; } FILEDESC_UNLOCK_FAST(fdp); fddrop(fdp); while (nrele--) vrele(olddp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); vref(newdp); rootvnode = newdp; } } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; MALLOC(fdtol, struct filedesc_to_leader *, sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_LOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_UNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } /* * Get file structures. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; /* * Note: because the number of file descriptors is calculated * in different ways for sizing vs returning the data, * there is information leakage from the first loop. However, * it is of a similar order of magnitude to the leakage from * global system statistics such as kern.openfiles. */ error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 16; /* A slight overestimate. */ sx_slock(&filelist_lock); LIST_FOREACH(fp, &filehead, f_list) { /* * We should grab the lock, but this is an * estimate, so does it really matter? */ /* mtx_lock(fp->f_mtxp); */ n += fp->f_count; /* mtx_unlock(f->f_mtxp); */ } sx_sunlock(&filelist_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; PROC_UNLOCK(p); fdp = fdhold(p); if (fdp == NULL) continue; FILEDESC_LOCK_FAST(fdp); for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n]) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = fp->f_msgcount; xf.xf_offset = fp->f_offset; xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_UNLOCK_FAST(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnod"); case DTYPE_SOCKET: return ("sock"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kque"); case DTYPE_CRYPTO: return ("crpt"); case DTYPE_MQUEUE: return ("mque"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n]) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { struct proc *p; if (header) db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File", "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct file *fp; int header; header = 1; LIST_FOREACH(fp, &filehead, f_list) { db_print_file(fp, header); header = 0; } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &openfiles, 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); sx_init(&filelist_lock, "filelist lock"); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (EBADF); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, }; /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL) Index: head/sys/kern/kern_environment.c =================================================================== --- head/sys/kern/kern_environment.c (revision 167231) +++ head/sys/kern/kern_environment.c (revision 167232) @@ -1,558 +1,557 @@ /*- * Copyright (c) 1998 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The unified bootloader passes us a pointer to a preserved copy of * bootstrap/kernel environment variables. We convert them to a * dynamic array of strings later when the VM subsystem is up. * * We make these available through the kenv(2) syscall for userland * and through getenv()/freeenv() setenv() unsetenv() testenv() for * the kernel. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment"); #define KENV_SIZE 512 /* Maximum number of environment strings */ /* pointer to the static environment */ char *kern_envp; static char *kernenv_next(char *); /* dynamic environment variables */ char **kenvp; struct mtx kenv_lock; /* - * No need to protect this with a mutex - * since SYSINITS are single threaded. + * No need to protect this with a mutex since SYSINITS are single threaded. */ int dynamic_kenv = 0; #define KENV_CHECK if (!dynamic_kenv) \ panic("%s: called before SI_SUB_KMEM", __func__) int kenv(td, uap) struct thread *td; struct kenv_args /* { int what; const char *name; char *value; int len; } */ *uap; { char *name, *value, *buffer = NULL; size_t len, done, needed; int error, i; KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0")); error = 0; if (uap->what == KENV_DUMP) { #ifdef MAC error = mac_check_kenv_dump(td->td_ucred); if (error) return (error); #endif done = needed = 0; if (uap->len > 0 && uap->value != NULL) buffer = malloc(uap->len, M_TEMP, M_WAITOK|M_ZERO); mtx_lock(&kenv_lock); for (i = 0; kenvp[i] != NULL; i++) { len = strlen(kenvp[i]) + 1; needed += len; len = min(len, uap->len - done); /* * If called with a NULL or insufficiently large * buffer, just keep computing the required size. */ if (uap->value != NULL && buffer != NULL && len > 0) { bcopy(kenvp[i], buffer + done, len); done += len; } } mtx_unlock(&kenv_lock); if (buffer != NULL) { error = copyout(buffer, uap->value, done); free(buffer, M_TEMP); } td->td_retval[0] = ((done == needed) ? 0 : needed); return (error); } switch (uap->what) { case KENV_SET: error = priv_check(td, PRIV_KENV_SET); if (error) return (error); break; case KENV_UNSET: error = priv_check(td, PRIV_KENV_UNSET); if (error) return (error); break; } name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->name, name, KENV_MNAMELEN, NULL); if (error) goto done; switch (uap->what) { case KENV_GET: #ifdef MAC error = mac_check_kenv_get(td->td_ucred, name); if (error) goto done; #endif value = getenv(name); if (value == NULL) { error = ENOENT; goto done; } len = strlen(value) + 1; if (len > uap->len) len = uap->len; error = copyout(value, uap->value, len); freeenv(value); if (error) goto done; td->td_retval[0] = len; break; case KENV_SET: len = uap->len; if (len < 1) { error = EINVAL; goto done; } if (len > KENV_MVALLEN) len = KENV_MVALLEN; value = malloc(len, M_TEMP, M_WAITOK); error = copyinstr(uap->value, value, len, NULL); if (error) { free(value, M_TEMP); goto done; } #ifdef MAC error = mac_check_kenv_set(td->td_ucred, name, value); if (error == 0) #endif setenv(name, value); free(value, M_TEMP); break; case KENV_UNSET: #ifdef MAC error = mac_check_kenv_unset(td->td_ucred, name); if (error) goto done; #endif error = unsetenv(name); if (error) error = ENOENT; break; default: error = EINVAL; break; } done: free(name, M_TEMP); return (error); } /* * Setup the dynamic kernel environment. */ static void init_dynamic_kenv(void *data __unused) { char *cp; int len, i; kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV, M_WAITOK | M_ZERO); i = 0; for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { len = strlen(cp) + 1; if (i < KENV_SIZE) { kenvp[i] = malloc(len, M_KENV, M_WAITOK); strcpy(kenvp[i++], cp); } else printf( "WARNING: too many kenv strings, ignoring %s\n", cp); } kenvp[i] = NULL; mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF); dynamic_kenv = 1; } SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL); void freeenv(char *env) { if (dynamic_kenv) free(env, M_KENV); } /* * Internal functions for string lookup. */ static char * _getenv_dynamic(const char *name, int *idx) { char *cp; int len, i; mtx_assert(&kenv_lock, MA_OWNED); len = strlen(name); for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) { if ((strncmp(cp, name, len) == 0) && (cp[len] == '=')) { if (idx != NULL) *idx = i; return (cp + len + 1); } } return (NULL); } static char * _getenv_static(const char *name) { char *cp, *ep; int len; for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { for (ep = cp; (*ep != '=') && (*ep != 0); ep++) ; if (*ep != '=') continue; len = ep - cp; ep++; if (!strncmp(name, cp, len) && name[len] == 0) return (ep); } return (NULL); } /* * Look up an environment variable by name. * Return a pointer to the string if found. * The pointer has to be freed with freeenv() * after use. */ char * getenv(const char *name) { char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1]; char *ret, *cp; int len; if (dynamic_kenv) { mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, NULL); if (cp != NULL) { strcpy(buf, cp); mtx_unlock(&kenv_lock); len = strlen(buf) + 1; ret = malloc(len, M_KENV, M_WAITOK); strcpy(ret, buf); } else { mtx_unlock(&kenv_lock); ret = NULL; } } else ret = _getenv_static(name); return (ret); } /* * Test if an environment variable is defined. */ int testenv(const char *name) { char *cp; if (dynamic_kenv) { mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, NULL); mtx_unlock(&kenv_lock); } else cp = _getenv_static(name); if (cp != NULL) return (1); return (0); } /* * Set an environment variable by name. */ int setenv(const char *name, const char *value) { char *buf, *cp, *oldenv; int namelen, vallen, i; KENV_CHECK; namelen = strlen(name) + 1; if (namelen > KENV_MNAMELEN) return (-1); vallen = strlen(value) + 1; if (vallen > KENV_MVALLEN) return (-1); buf = malloc(namelen + vallen, M_KENV, M_WAITOK); sprintf(buf, "%s=%s", name, value); mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; kenvp[i] = buf; mtx_unlock(&kenv_lock); free(oldenv, M_KENV); } else { /* We add the option if it wasn't found */ for (i = 0; (cp = kenvp[i]) != NULL; i++) ; /* Bounds checking */ if (i < 0 || i >= KENV_SIZE) { free(buf, M_KENV); mtx_unlock(&kenv_lock); return (-1); } kenvp[i] = buf; kenvp[i + 1] = NULL; mtx_unlock(&kenv_lock); } return (0); } /* * Unset an environment variable string. */ int unsetenv(const char *name) { char *cp, *oldenv; int i, j; KENV_CHECK; mtx_lock(&kenv_lock); cp = _getenv_dynamic(name, &i); if (cp != NULL) { oldenv = kenvp[i]; for (j = i + 1; kenvp[j] != NULL; j++) kenvp[i++] = kenvp[j]; kenvp[i] = NULL; mtx_unlock(&kenv_lock); free(oldenv, M_KENV); return (0); } mtx_unlock(&kenv_lock); return (-1); } /* * Return a string value from an environment variable. */ int getenv_string(const char *name, char *data, int size) { char *tmp; tmp = getenv(name); if (tmp != NULL) { strlcpy(data, tmp, size); freeenv(tmp); return (1); } else return (0); } /* * Return an integer value from an environment variable. */ int getenv_int(const char *name, int *data) { quad_t tmp; int rval; rval = getenv_quad(name, &tmp); if (rval) *data = (int) tmp; return (rval); } /* * Return a long value from an environment variable. */ long getenv_long(const char *name, long *data) { quad_t tmp; long rval; rval = getenv_quad(name, &tmp); if (rval) *data = (long) tmp; return (rval); } /* * Return an unsigned long value from an environment variable. */ unsigned long getenv_ulong(const char *name, unsigned long *data) { quad_t tmp; long rval; rval = getenv_quad(name, &tmp); if (rval) *data = (unsigned long) tmp; return (rval); } /* * Return a quad_t value from an environment variable. */ int getenv_quad(const char *name, quad_t *data) { char *value; char *vtp; quad_t iv; value = getenv(name); if (value == NULL) return (0); iv = strtoq(value, &vtp, 0); if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) { freeenv(value); return (0); } switch (vtp[0]) { case 't': case 'T': iv *= 1024; case 'g': case 'G': iv *= 1024; case 'm': case 'M': iv *= 1024; case 'k': case 'K': iv *= 1024; case '\0': break; default: freeenv(value); return (0); } *data = iv; freeenv(value); return (1); } /* * Find the next entry after the one which (cp) falls within, return a * pointer to its start or NULL if there are no more. */ static char * kernenv_next(char *cp) { if (cp != NULL) { while (*cp != 0) cp++; cp++; if (*cp == 0) cp = NULL; } return (cp); } void tunable_int_init(void *data) { struct tunable_int *d = (struct tunable_int *)data; TUNABLE_INT_FETCH(d->path, d->var); } void tunable_long_init(void *data) { struct tunable_long *d = (struct tunable_long *)data; TUNABLE_LONG_FETCH(d->path, d->var); } void tunable_ulong_init(void *data) { struct tunable_ulong *d = (struct tunable_ulong *)data; TUNABLE_ULONG_FETCH(d->path, d->var); } void tunable_str_init(void *data) { struct tunable_str *d = (struct tunable_str *)data; TUNABLE_STR_FETCH(d->path, d->var, d->size); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 167231) +++ head/sys/kern/kern_exec.c (revision 167232) @@ -1,1299 +1,1299 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); static void exec_free_args(struct image_args *); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int execve(td, uap) struct thread *td; struct execve_args /* { char *fname; char **argv; char **envv; } */ *uap; { int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int __mac_execve(td, uap) struct thread *td; struct __mac_execve_args /* { char *fname; char **argv; char **envv; struct mac *mac_p; } */ *uap; { #ifdef MAC int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); return (error); #else return (ENOSYS); #endif } /* - * XXX: kern_execve has the astonishing property of not always - * returning to the caller. If sufficiently bad things happen during - * the call to do_execve(), it can end up calling exit1(); as a result, - * callers must avoid doing anything which they might need to undo - * (e.g., allocating memory). + * XXX: kern_execve has the astonishing property of not always returning to + * the caller. If sufficiently bad things happen during the call to + * do_execve(), it can end up calling exit1(); as a result, callers must + * avoid doing anything which they might need to undo (e.g., allocating + * memory). */ int kern_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; int error; AUDIT_ARG(argv, args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG(envv, args->begin_envv, args->envc, args->endp - args->begin_envv); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); if (thread_single(SINGLE_BOUNDARY)) { PROC_UNLOCK(p); exec_free_args(args); return (ERESTART); /* Try again later. */ } PROC_UNLOCK(p); } error = do_execve(td, args, mac_p); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(SINGLE_EXIT); else thread_single_end(); PROC_UNLOCK(p); } return (error); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL; int credential_changing; int vfslocked; int textset; #ifdef MAC struct label *interplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif vfslocked = 0; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ imgp->proc = p; imgp->execlabel = NULL; imgp->attr = &attr; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX; imgp->auxargs = NULL; imgp->vp = NULL; imgp->object = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; imgp->args = args; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif imgp->image_header = NULL; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ ndp = &nd; NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); interpret: error = namei(ndp); if (error) goto exec_fail; vfslocked = NDHASGIANT(ndp); imgp->vp = ndp->ni_vp; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = imgp->vp->v_vflag & VV_TEXT; imgp->vp->v_vflag |= VV_TEXT; error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) imgp->vp->v_vflag &= ~VV_TEXT; error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ imgp->vp->v_vflag &= ~VV_TEXT; /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); #ifdef MAC interplabel = mac_vnode_label_alloc(); mac_copy_vnode_label(ndp->ni_vp->v_label, interplabel); #endif vput(ndp->ni_vp); vm_object_deallocate(imgp->object); imgp->object = NULL; VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ fdunshare(p, td); /* * Malloc things before we need locks. */ newcred = crget(); euip = uifind(attr.va_uid); i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* close files on exec */ VOP_UNLOCK(imgp->vp, 0, td); fdcloseexec(td); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); /* Get a reference to the vnode prior to locking the proc */ VREF(ndp->ni_vp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ PROC_LOCK(p); if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; PROC_UNLOCK(p); newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); PROC_LOCK(p); p->p_sigacts = newsigacts; } else oldsigacts = NULL; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup(p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ oldcred = p->p_ucred; credential_changing = 0; credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_execve_will_transition(oldcred, imgp->vp, interplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracevp != NULL && priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, SUSER_ALLOWJAIL)) { mtx_lock(&ktrace_mtx); p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); } #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); setugidsafety(td); VOP_UNLOCK(imgp->vp, 0, td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); if (error != 0) goto done1; PROC_LOCK(p); /* * Set the new credentials. */ crcopy(newcred, oldcred); if (attr.va_mode & VSUID) change_euid(newcred, euip); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_execve_transition(oldcred, newcred, imgp->vp, interplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { crcopy(newcred, oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. * Use tdsignal to deliver signal to current thread, use * psignal may cause the signal to be delivered to wrong thread * because that thread will exit, remember we are going to enter * single thread mode. */ if (p->p_flag & P_TRACED) tdsignal(p, td, SIGTRAP, NULL); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); else exec_setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); vfs_mark_atime(imgp->vp, td); done1: /* * Free any resources malloc'd earlier that we didn't use. */ uifree(euip); if (newcred == NULL) crfree(oldcred); else crfree(newcred); VOP_UNLOCK(imgp->vp, 0, td); /* * Handle deferred decrement of ref counts. */ if (textvp != NULL) { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(textvp->v_mount); vrele(textvp); VFS_UNLOCK_GIANT(tvfslocked); } if (ndp->ni_vp && error != 0) vrele(ndp->ni_vp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); if (oldargs != NULL) pargs_drop(oldargs); if (newargs != NULL) pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(imgp->vp); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); if (error == 0) { /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); done2: #ifdef MAC mac_execve_exit(imgp); if (interplabel != NULL) mac_vnode_label_free(interplabel); #endif VFS_UNLOCK_GIANT(vfslocked); exec_free_args(args); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ } return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_LOCK(object); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { if (ma[i]->valid) break; if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy) break; vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { vm_page_lock_queues(); vm_page_free(ma[0]); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(object); return (EIO); } } vm_page_lock_queues(); vm_page_hold(ma[0]); vm_page_unlock_queues(); vm_page_wakeup(ma[0]); VM_OBJECT_UNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock_queues(); vm_page_unhold(m); vm_page_unlock_queues(); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_offset_t stack_addr; vm_map_t map; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* Called with Giant held, do not depend on it! */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Here is as good a place as any to do any resource limit cleanups. * This is needed if a 64 bit binary exec's a 32 bit binary - the * data size limit may need to be changed to a value that makes * sense for the 32 bit binary. */ if (sv->sv_fixlimits != NULL) sv->sv_fixlimits(p); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Allocate a new stack */ stack_addr = sv->sv_usrstack - maxssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); #ifdef __ia64__ /* Allocate a new register stack */ stack_addr = IA64_BACKINGSTORE; error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP); if (error) return (error); #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz; return (0); } /* - * Copy out argument and environment strings from the old process - * address space into the temporary string buffer. + * Copy out argument and environment strings from the old process address + * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { char *argp, *envp; int error; size_t length; error = 0; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate temporary demand zeroed space for argument and * environment strings: * * o ARG_MAX for argument and environment; * o MAXSHELLCMDLEN for the name of interpreters. */ args->buf = (char *) kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); if (args->buf == NULL) return (ENOMEM); args->begin_argv = args->buf; args->endp = args->begin_argv; args->stringspace = ARG_MAX; args->fname = args->buf + ARG_MAX; /* * Copy the file name. */ error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; /* * extract arguments first */ while ((argp = (caddr_t) (intptr_t) fuword(argv++))) { if (argp == (caddr_t) -1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(argp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { if (envp == (caddr_t)-1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(envp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } static void exec_free_args(struct image_args *args) { if (args->buf) { kmem_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); args->buf = NULL; } } /* - * Copy strings out to the new process address space, constructing - * new arg and env vector tables. Return a pointer to the base - * so that it can be used as the initial stack pointer. + * Copy strings out to the new process address space, constructing new arg + * and env vector tables. Return a pointer to the base so that it can be used + * as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* XXXKSE */ /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred, td); if (error) return (error); #ifdef MAC error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Insure that at least one execute bit is on - otherwise root * will always succeed, and we don't want to happen unless the * file really is executable. * 3) Insure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1); return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_exit.c =================================================================== --- head/sys/kern/kern_exit.c (revision 167231) +++ head/sys/kern/kern_exit.c (revision 167232) @@ -1,899 +1,898 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include /* Required to be non-static for SysVR4 emulator */ MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); /* Hook for NFS teardown procedure. */ void (*nlminfo_release_p)(struct proc *p); /* - * exit -- - * Death of process. + * exit -- death of process. */ void sys_exit(struct thread *td, struct sys_exit_args *uap) { exit1(td, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* - * Exit: deallocate address space and other resources, change proc state - * to zombie, and unlink proc from allproc and parent's lists. Save exit - * status and rusage for wait(). Check for child processes and orphan them. + * Exit: deallocate address space and other resources, change proc state to + * zombie, and unlink proc from allproc and parent's lists. Save exit status + * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct proc *p, *nq, *q; struct tty *tp; struct vnode *ttyvp; struct vnode *vtmp; #ifdef KTRACE struct vnode *tracevp; struct ucred *tracecred; #endif struct plimit *plim; int locked; /* * Drop Giant if caller has it. Eventually we should warn about * being called with Giant held. */ while (mtx_owned(&Giant)) mtx_unlock(&Giant); p = td->td_proc; if (p == initproc) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); if (p->p_flag & P_HADTHREADS) { retry: /* * First check if some other thread got here before us.. * if so, act apropriatly, (exit or suspend); */ thread_suspend_check(0); /* * Kill off the other threads. This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (thread_single(SINGLE_EXIT)) goto retry; /* * All other activity in this process is now stopped. * Threading support has been turned off. */ } /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have * released their reference to us. Note that if they have * requested S_EXIT stops we will block here until they ack * via PIOCCONT. */ _STOPEVENT(p, S_EXIT, rv); /* * Note that we are exiting and do another wakeup of anyone in * PIOCWAIT in case they aren't listening for S_EXIT stops or * decided to wait again after we told them we are exiting. */ p->p_flag |= P_WEXIT; wakeup(&p->p_stype); /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); PROC_UNLOCK(p); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG(exit, WEXITSTATUS(rv), 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader? */ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), M_ZOMBIE, M_WAITOK); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * If this process has an nlminfo data area (for lockd), release it */ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL) (*nlminfo_release_p)(p); /* * Close open files and release open-file table. * This may block! */ fdfree(td); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Remove ourself from our leader's peer list and wake our leader. */ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); vmspace_exit(td); mtx_lock(&Giant); /* XXX TTY */ sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp; sp = p->p_session; if (sp->s_ttyvp) { /* * Controlling process. * Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. */ if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { tp = sp->s_ttyp; if (sp->s_ttyp->t_pgrp) { PGRP_LOCK(sp->s_ttyp->t_pgrp); pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); PGRP_UNLOCK(sp->s_ttyp->t_pgrp); } /* XXX tp should be locked. */ sx_xunlock(&proctree_lock); (void) ttywait(tp); sx_xlock(&proctree_lock); /* * The tty could have been revoked * if we blocked. */ if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); sx_xunlock(&proctree_lock); VOP_LOCK(ttyvp, LK_EXCLUSIVE, td); VOP_REVOKE(ttyvp, REVOKEALL); vput(ttyvp); sx_xlock(&proctree_lock); } } if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); vrele(ttyvp); } /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. * (for logging and informational purposes) */ } SESS_LOCK(p->p_session); sp->s_leader = NULL; SESS_UNLOCK(p->p_session); } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); mtx_unlock(&Giant); #ifdef KTRACE /* * Drain any pending records on the thread and release the trace * file. It might be better if drain-and-clear were atomic. */ ktrprocexit(td); PROC_LOCK(p); mtx_lock(&ktrace_mtx); p->p_traceflag = 0; /* don't trace the vrele() */ tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) { locked = VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(locked); } if (tracecred != NULL) crfree(tracecred); #endif /* * Release reference to text vnode */ if ((vtmp = p->p_textvp) != NULL) { p->p_textvp = NULL; locked = VFS_LOCK_GIANT(vtmp->v_mount); vrele(vtmp); VFS_UNLOCK_GIANT(locked); } /* * Release our limits structure. */ PROC_LOCK(p); plim = p->p_limit; p->p_limit = NULL; PROC_UNLOCK(p); lim_free(plim); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); /* * Reparent all of our children to init. */ sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(initproc); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); proc_reparent(q, initproc); q->p_sigparent = SIGCHLD; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Save exit status and finalize rusage info except for times, * adding in child rusage info later when our time is locked. */ PROC_LOCK(p); p->p_xstat = rv; p->p_xthread = td; p->p_stats->p_ru.ru_nvcsw++; *p->p_ru = p->p_stats->p_ru; /* * Notify interested parties of our demise. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ knlist_clear(&p->p_klist, 1); /* * Notify parent that we're gone. If parent has the PS_NOCLDWAIT * flag set, or if the handler is set to SIG_IGN, notify process * 1 instead (and hope it will handle this situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, initproc); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * If this was the last child of our parent, notify * parent, so in case he was wait(2)ing, he will * continue. */ if (LIST_EMPTY(&pp->p_children)) wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == initproc) psignal(p->p_pptr, SIGCHLD); else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) childproc_exited(p); else /* LINUX thread */ psignal(p->p_pptr, p->p_sigparent); } PROC_UNLOCK(p->p_pptr); PROC_UNLOCK(p); /* * Finally, call machine-dependent code to release the remaining * resources including address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, &proctree_lock.sx_object, "process (pid %d) exiting", p->p_pid); PROC_LOCK(p); PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); /* * The state PRS_ZOMBIE prevents other proesses from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); mtx_lock_spin(&sched_lock); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); sched_exit(p->p_pptr, td); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. */ knlist_destroy(&p->p_klist); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifndef _SYS_SYSPROTO_H_ struct abort2_args { char *why; int nargs; void **args; }; #endif int abort2(struct thread *td, struct abort2_args *uap) { struct proc *p = td->td_proc; struct sbuf *sb; void *uargs[16]; int error, i, sig; error = 0; /* satisfy compiler */ /* * Do it right now so we can log either proper call of abort2(), or * note, that invalid argument was passed. 512 is big enough to * handle 16 arguments' descriptions with additional comments. */ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN); sbuf_clear(sb); sbuf_printf(sb, "%s(pid %d uid %d) aborted: ", p->p_comm, p->p_pid, td->td_ucred->cr_uid); /* * Since we can't return from abort2(), send SIGKILL in cases, where * abort2() was called improperly */ sig = SIGKILL; /* Prevent from DoSes from user-space. */ if (uap->nargs < 0 || uap->nargs > 16) goto out; if (uap->args == NULL) goto out; error = copyin(uap->args, uargs, uap->nargs * sizeof(void *)); if (error != 0) goto out; /* * Limit size of 'reason' string to 128. Will fit even when * maximal number of arguments was chosen to be logged. */ if (uap->why != NULL) { error = sbuf_copyin(sb, uap->why, 128); if (error < 0) goto out; } else { sbuf_printf(sb, "(null)"); } if (uap->nargs) { sbuf_printf(sb, "("); for (i = 0;i < uap->nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); sbuf_printf(sb, ")"); } /* * Final stage: arguments were proper, string has been * successfully copied from userspace, and copying pointers * from user-space succeed. */ sig = SIGABRT; out: if (sig == SIGKILL) { sbuf_trim(sb); sbuf_printf(sb, " (Reason text inaccessible)"); } sbuf_cat(sb, "\n"); sbuf_finish(sb); log(LOG_INFO, "%s", sbuf_data(sb)); sbuf_delete(sb); exit1(td, W_EXITCODE(0, sig)); return (0); } #ifdef COMPAT_43 /* * The dirty work is handled by kern_wait(). */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * The dirty work is handled by kern_wait(). */ int wait4(struct thread *td, struct wait_args *uap) { struct rusage ru, *rup; int error, status; if (uap->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, uap->pid, &status, uap->options, rup); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { struct proc *p, *q, *t; int error, nfound; AUDIT_ARG(pid, pid); q = td->td_proc; if (pid == 0) { PROC_LOCK(q); pid = -q->p_pgid; PROC_UNLOCK(q); } if (options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE)) return (EINVAL); loop: if (q->p_flag & P_STATCHILD) { PROC_LOCK(q); q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { PROC_LOCK(p); if (pid != WAIT_ANY && p->p_pid != pid && p->p_pgid != -pid) { PROC_UNLOCK(p); continue; } if (p_canwait(td, p)) { PROC_UNLOCK(p); continue; } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); continue; } nfound++; if (p->p_state == PRS_ZOMBIE) { /* * It is possible that the last thread of this * process is still running on another CPU * in thread_exit() after having dropped the process * lock via PROC_UNLOCK() but before it has completed * cpu_throw(). In that case, the other thread must * still hold sched_lock, so simply by acquiring * sched_lock once we will wait long enough for the * thread to exit in that case. */ mtx_lock_spin(&sched_lock); mtx_unlock_spin(&sched_lock); td->td_retval[0] = p->p_pid; if (status) *status = p->p_xstat; /* convert to int */ if (rusage) { *rusage = *p->p_ru; calcru(p, &rusage->ru_utime, &rusage->ru_stime); } PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); /* * If we got the child via a ptrace 'attach', * we need to give it back to the old parent. */ PROC_UNLOCK(p); if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { PROC_LOCK(p); p->p_oppid = 0; proc_reparent(p, t); PROC_UNLOCK(p); tdsignal(t, NULL, SIGCHLD, p->p_ksi); wakeup(t); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return (0); } /* * Remove other references to this process to ensure * we have an exclusive reference. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); leavepgrp(p); sx_xunlock(&proctree_lock); /* * As a side effect of this lock, we know that * all other writes to this proc are visible now, so * no more locking is needed for p. */ PROC_LOCK(p); p->p_xstat = 0; /* XXX: why? */ PROC_UNLOCK(p); PROC_LOCK(q); ruadd(&q->p_stats->p_cru, &q->p_crux, p->p_ru, &p->p_rux); PROC_UNLOCK(q); FREE(p->p_ru, M_ZOMBIE); p->p_ru = NULL; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Free credentials, arguments, and sigacts. */ crfree(p->p_ucred); p->p_ucred = NULL; pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * Do any thread-system specific cleanups. */ thread_wait(p); /* * Give vm and machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. */ vm_waitproc(p); #ifdef MAC mac_destroy_proc(p); #endif #ifdef AUDIT audit_proc_free(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("kern_wait: no residual thread!")); uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; sx_xunlock(&allproc_lock); return (0); } mtx_lock_spin(&sched_lock); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || options & WUNTRACED)) { mtx_unlock_spin(&sched_lock); p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if (status) *status = W_STOPCODE(p->p_xstat); PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); PROC_UNLOCK(p); return (0); } mtx_unlock_spin(&sched_lock); if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; p->p_flag &= ~P_CONTINUED; PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); PROC_UNLOCK(p); if (status) *status = SIGCONT; return (0); } PROC_UNLOCK(p); } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; error = 0; } else error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) return (error); goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; PROC_LOCK(child->p_pptr); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); child->p_pptr = parent; } Index: head/sys/kern/kern_ktrace.c =================================================================== --- head/sys/kern/kern_ktrace.c (revision 167231) +++ head/sys/kern/kern_ktrace.c (revision 167232) @@ -1,1018 +1,1012 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ktrace facility allows the tracing of certain key events in user space * processes, such as system calls, signal delivery, context switches, and * user generated events using utrace(2). It works by streaming event * records and data to a vnode associated with the process using the * ktrace(2) system call. In general, records can be written directly from * the context that generates the event. One important exception to this is * during a context switch, where sleeping is not permitted. To handle this * case, trace events are generated using in-kernel ktr_request records, and * then delivered to disk at a convenient moment -- either immediately, the * next traceable event, at system call return, or at process exit. * * When dealing with multiple threads or processes writing to the same event * log, ordering guarantees are weak: specifically, if an event has multiple * records (i.e., system call enter and return), they may be interlaced with * records from another event. Process and thread ID information is provided * in the record, and user applications can de-interlace events if required. */ static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); #ifdef KTRACE #ifndef KTRACE_REQUEST_POOL #define KTRACE_REQUEST_POOL 100 #endif struct ktr_request { struct ktr_header ktr_header; void *ktr_buffer; union { struct ktr_syscall ktr_syscall; struct ktr_sysret ktr_sysret; struct ktr_genio ktr_genio; struct ktr_psig ktr_psig; struct ktr_csw ktr_csw; } ktr_data; STAILQ_ENTRY(ktr_request) ktr_list; }; static int data_lengths[] = { 0, /* none */ offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */ sizeof(struct ktr_sysret), /* KTR_SYSRET */ 0, /* KTR_NAMEI */ sizeof(struct ktr_genio), /* KTR_GENIO */ sizeof(struct ktr_psig), /* KTR_PSIG */ sizeof(struct ktr_csw), /* KTR_CSW */ 0 /* KTR_USER */ }; static STAILQ_HEAD(, ktr_request) ktr_free; static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options"); static u_int ktr_requestpool = KTRACE_REQUEST_POOL; TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool); static u_int ktr_geniosize = PAGE_SIZE; TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize); SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize, 0, "Maximum size of genio event payload"); static int print_message = 1; struct mtx ktrace_mtx; static struct sx ktrace_sx; static void ktrace_init(void *dummy); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS); static u_int ktrace_resize_pool(u_int newsize); static struct ktr_request *ktr_getrequest(int type); static void ktr_submitrequest(struct thread *td, struct ktr_request *req); static void ktr_freerequest(struct ktr_request *req); static void ktr_writerequest(struct thread *td, struct ktr_request *req); static int ktrcanset(struct thread *,struct proc *); static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *); static int ktrops(struct thread *,struct proc *,int,int,struct vnode *); /* * ktrace itself generates events, such as context switches, which we do not * wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine * whether or not it is in a region where tracing of events should be * suppressed. */ static void ktrace_enter(struct thread *td) { KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set")); td->td_pflags |= TDP_INKTRACE; } static void ktrace_exit(struct thread *td) { KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set")); td->td_pflags &= ~TDP_INKTRACE; } static void ktrace_assert(struct thread *td) { KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set")); } static void ktrace_init(void *dummy) { struct ktr_request *req; int i; mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET); sx_init(&ktrace_sx, "ktrace_sx"); STAILQ_INIT(&ktr_free); for (i = 0; i < ktr_requestpool; i++) { req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); } } SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS) { struct thread *td; u_int newsize, oldsize, wantsize; int error; /* Handle easy read-only case first to avoid warnings from GCC. */ if (!req->newptr) { mtx_lock(&ktrace_mtx); oldsize = ktr_requestpool; mtx_unlock(&ktrace_mtx); return (SYSCTL_OUT(req, &oldsize, sizeof(u_int))); } error = SYSCTL_IN(req, &wantsize, sizeof(u_int)); if (error) return (error); td = curthread; ktrace_enter(td); mtx_lock(&ktrace_mtx); oldsize = ktr_requestpool; newsize = ktrace_resize_pool(wantsize); mtx_unlock(&ktrace_mtx); ktrace_exit(td); error = SYSCTL_OUT(req, &oldsize, sizeof(u_int)); if (error) return (error); if (wantsize > oldsize && newsize < wantsize) return (ENOSPC); return (0); } SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW, &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", ""); static u_int ktrace_resize_pool(u_int newsize) { struct ktr_request *req; int bound; mtx_assert(&ktrace_mtx, MA_OWNED); print_message = 1; bound = newsize - ktr_requestpool; if (bound == 0) return (ktr_requestpool); if (bound < 0) /* Shrink pool down to newsize if possible. */ while (bound++ < 0) { req = STAILQ_FIRST(&ktr_free); if (req == NULL) return (ktr_requestpool); STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); ktr_requestpool--; mtx_unlock(&ktrace_mtx); free(req, M_KTRACE); mtx_lock(&ktrace_mtx); } else /* Grow pool up to newsize. */ while (bound-- > 0) { mtx_unlock(&ktrace_mtx); req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); mtx_lock(&ktrace_mtx); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); ktr_requestpool++; } return (ktr_requestpool); } static struct ktr_request * ktr_getrequest(int type) { struct ktr_request *req; struct thread *td = curthread; struct proc *p = td->td_proc; int pm; ktrace_enter(td); /* XXX: In caller instead? */ mtx_lock(&ktrace_mtx); if (!KTRCHECK(td, type)) { mtx_unlock(&ktrace_mtx); ktrace_exit(td); return (NULL); } req = STAILQ_FIRST(&ktr_free); if (req != NULL) { STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); req->ktr_header.ktr_type = type; if (p->p_traceflag & KTRFAC_DROP) { req->ktr_header.ktr_type |= KTR_DROP; p->p_traceflag &= ~KTRFAC_DROP; } mtx_unlock(&ktrace_mtx); microtime(&req->ktr_header.ktr_time); req->ktr_header.ktr_pid = p->p_pid; req->ktr_header.ktr_tid = td->td_tid; bcopy(p->p_comm, req->ktr_header.ktr_comm, MAXCOMLEN + 1); req->ktr_buffer = NULL; req->ktr_header.ktr_len = 0; } else { p->p_traceflag |= KTRFAC_DROP; pm = print_message; print_message = 0; mtx_unlock(&ktrace_mtx); if (pm) printf("Out of ktrace request objects.\n"); ktrace_exit(td); } return (req); } /* * Some trace generation environments don't permit direct access to VFS, * such as during a context switch where sleeping is not allowed. Under these * circumstances, queue a request to the thread to be written asynchronously * later. */ static void ktr_enqueuerequest(struct thread *td, struct ktr_request *req) { mtx_lock(&ktrace_mtx); STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list); mtx_unlock(&ktrace_mtx); ktrace_exit(td); } /* * Drain any pending ktrace records from the per-thread queue to disk. This * is used both internally before committing other records, and also on * system call return. We drain all the ones we can find at the time when * drain is requested, but don't keep draining after that as those events * may me approximately "after" the current event. */ static void ktr_drain(struct thread *td) { struct ktr_request *queued_req; STAILQ_HEAD(, ktr_request) local_queue; ktrace_assert(td); sx_assert(&ktrace_sx, SX_XLOCKED); STAILQ_INIT(&local_queue); /* XXXRW: needed? */ if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) { mtx_lock(&ktrace_mtx); STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr); mtx_unlock(&ktrace_mtx); while ((queued_req = STAILQ_FIRST(&local_queue))) { STAILQ_REMOVE_HEAD(&local_queue, ktr_list); ktr_writerequest(td, queued_req); ktr_freerequest(queued_req); } } } /* * Submit a trace record for immediate commit to disk -- to be used only * where entering VFS is OK. First drain any pending records that may have * been cached in the thread. */ static void ktr_submitrequest(struct thread *td, struct ktr_request *req) { ktrace_assert(td); sx_xlock(&ktrace_sx); ktr_drain(td); ktr_writerequest(td, req); ktr_freerequest(req); sx_xunlock(&ktrace_sx); ktrace_exit(td); } static void ktr_freerequest(struct ktr_request *req) { if (req->ktr_buffer != NULL) free(req->ktr_buffer, M_KTRACE); mtx_lock(&ktrace_mtx); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); mtx_unlock(&ktrace_mtx); } void ktrsyscall(code, narg, args) int code, narg; register_t args[]; { struct ktr_request *req; struct ktr_syscall *ktp; size_t buflen; char *buf = NULL; buflen = sizeof(register_t) * narg; if (buflen > 0) { buf = malloc(buflen, M_KTRACE, M_WAITOK); bcopy(args, buf, buflen); } req = ktr_getrequest(KTR_SYSCALL); if (req == NULL) { if (buf != NULL) free(buf, M_KTRACE); return; } ktp = &req->ktr_data.ktr_syscall; ktp->ktr_code = code; ktp->ktr_narg = narg; if (buflen > 0) { req->ktr_header.ktr_len = buflen; req->ktr_buffer = buf; } ktr_submitrequest(curthread, req); } void ktrsysret(code, error, retval) int code, error; register_t retval; { struct ktr_request *req; struct ktr_sysret *ktp; req = ktr_getrequest(KTR_SYSRET); if (req == NULL) return; ktp = &req->ktr_data.ktr_sysret; ktp->ktr_code = code; ktp->ktr_error = error; ktp->ktr_retval = retval; /* what about val2 ? */ ktr_submitrequest(curthread, req); } /* * When a process exits, drain per-process asynchronous trace records. */ void ktrprocexit(struct thread *td) { ktrace_enter(td); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); ktrace_exit(td); } /* * When a thread returns, drain any asynchronous records generated by the * system call. */ void ktruserret(struct thread *td) { ktrace_enter(td); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); ktrace_exit(td); } void ktrnamei(path) char *path; { struct ktr_request *req; int namelen; char *buf = NULL; namelen = strlen(path); if (namelen > 0) { buf = malloc(namelen, M_KTRACE, M_WAITOK); bcopy(path, buf, namelen); } req = ktr_getrequest(KTR_NAMEI); if (req == NULL) { if (buf != NULL) free(buf, M_KTRACE); return; } if (namelen > 0) { req->ktr_header.ktr_len = namelen; req->ktr_buffer = buf; } ktr_submitrequest(curthread, req); } void ktrgenio(fd, rw, uio, error) int fd; enum uio_rw rw; struct uio *uio; int error; { struct ktr_request *req; struct ktr_genio *ktg; int datalen; char *buf; if (error) { free(uio, M_IOV); return; } uio->uio_offset = 0; uio->uio_rw = UIO_WRITE; datalen = imin(uio->uio_resid, ktr_geniosize); buf = malloc(datalen, M_KTRACE, M_WAITOK); error = uiomove(buf, datalen, uio); free(uio, M_IOV); if (error) { free(buf, M_KTRACE); return; } req = ktr_getrequest(KTR_GENIO); if (req == NULL) { free(buf, M_KTRACE); return; } ktg = &req->ktr_data.ktr_genio; ktg->ktr_fd = fd; ktg->ktr_rw = rw; req->ktr_header.ktr_len = datalen; req->ktr_buffer = buf; ktr_submitrequest(curthread, req); } void ktrpsig(sig, action, mask, code) int sig; sig_t action; sigset_t *mask; int code; { struct ktr_request *req; struct ktr_psig *kp; req = ktr_getrequest(KTR_PSIG); if (req == NULL) return; kp = &req->ktr_data.ktr_psig; kp->signo = (char)sig; kp->action = action; kp->mask = *mask; kp->code = code; ktr_enqueuerequest(curthread, req); } void ktrcsw(out, user) int out, user; { struct ktr_request *req; struct ktr_csw *kc; req = ktr_getrequest(KTR_CSW); if (req == NULL) return; kc = &req->ktr_data.ktr_csw; kc->out = out; kc->user = user; ktr_enqueuerequest(curthread, req); } #endif /* KTRACE */ /* Interface and common routines */ -/* - * ktrace system call - */ #ifndef _SYS_SYSPROTO_H_ struct ktrace_args { char *fname; int ops; int facs; int pid; }; #endif /* ARGSUSED */ int ktrace(td, uap) struct thread *td; register struct ktrace_args *uap; { #ifdef KTRACE register struct vnode *vp = NULL; register struct proc *p; struct pgrp *pg; int facs = uap->facs & ~KTRFAC_ROOT; int ops = KTROP(uap->ops); int descend = uap->ops & KTRFLAG_DESCEND; int nfound, ret = 0; int flags, error = 0, vfslocked; struct nameidata nd; struct ucred *cred; /* * Need something to (un)trace. */ if (ops != KTROP_CLEARFILE && facs == 0) return (EINVAL); ktrace_enter(td); if (ops != KTROP_CLEAR) { /* * an operation which requires a file argument. */ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE, uap->fname, td); flags = FREAD | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, 0, -1); if (error) { ktrace_exit(td); return (error); } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); ktrace_exit(td); return (EACCES); } VFS_UNLOCK_GIANT(vfslocked); } /* * Clear all uses of the tracefile. */ if (ops == KTROP_CLEARFILE) { int vrele_count; vrele_count = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_tracevp == vp) { if (ktrcanset(td, p)) { mtx_lock(&ktrace_mtx); cred = p->p_tracecred; p->p_tracecred = NULL; p->p_tracevp = NULL; p->p_traceflag = 0; mtx_unlock(&ktrace_mtx); vrele_count++; crfree(cred); } else error = EPERM; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); if (vrele_count > 0) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); while (vrele_count-- > 0) vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } goto done; } /* * do it */ sx_slock(&proctree_lock); if (uap->pid < 0) { /* * by process group */ pg = pgfind(-uap->pid); if (pg == NULL) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } /* * ktrops() may call vrele(). Lock pg_members * by the proctree_lock rather than pg_mtx. */ PGRP_UNLOCK(pg); nfound = 0; LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p_cansee(td, p) != 0) { PROC_UNLOCK(p); continue; } PROC_UNLOCK(p); nfound++; if (descend) ret |= ktrsetchildren(td, p, ops, facs, vp); else ret |= ktrops(td, p, ops, facs, vp); } if (nfound == 0) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } } else { /* * by pid */ p = pfind(uap->pid); if (p == NULL) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } error = p_cansee(td, p); /* * The slock of the proctree lock will keep this process * from going away, so unlocking the proc here is ok. */ PROC_UNLOCK(p); if (error) { sx_sunlock(&proctree_lock); goto done; } if (descend) ret |= ktrsetchildren(td, p, ops, facs, vp); else ret |= ktrops(td, p, ops, facs, vp); } sx_sunlock(&proctree_lock); if (!ret) error = EPERM; done: if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); (void) vn_close(vp, FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); } ktrace_exit(td); return (error); #else /* !KTRACE */ return (ENOSYS); #endif /* KTRACE */ } -/* - * utrace system call - */ /* ARGSUSED */ int utrace(td, uap) struct thread *td; register struct utrace_args *uap; { #ifdef KTRACE struct ktr_request *req; void *cp; int error; if (!KTRPOINT(td, KTR_USER)) return (0); if (uap->len > KTR_USER_MAXLEN) return (EINVAL); cp = malloc(uap->len, M_KTRACE, M_WAITOK); error = copyin(uap->addr, cp, uap->len); if (error) { free(cp, M_KTRACE); return (error); } req = ktr_getrequest(KTR_USER); if (req == NULL) { free(cp, M_KTRACE); return (ENOMEM); } req->ktr_buffer = cp; req->ktr_header.ktr_len = uap->len; ktr_submitrequest(td, req); return (0); #else /* !KTRACE */ return (ENOSYS); #endif /* KTRACE */ } #ifdef KTRACE static int ktrops(td, p, ops, facs, vp) struct thread *td; struct proc *p; int ops, facs; struct vnode *vp; { struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; PROC_LOCK(p); if (!ktrcanset(td, p)) { PROC_UNLOCK(p); return (0); } mtx_lock(&ktrace_mtx); if (ops == KTROP_SET) { if (p->p_tracevp != vp) { /* * if trace file already in use, relinquish below */ tracevp = p->p_tracevp; VREF(vp); p->p_tracevp = vp; } if (p->p_tracecred != td->td_ucred) { tracecred = p->p_tracecred; p->p_tracecred = crhold(td->td_ucred); } p->p_traceflag |= facs; if (priv_check_cred(td->td_ucred, PRIV_KTRACE, SUSER_ALLOWJAIL) == 0) p->p_traceflag |= KTRFAC_ROOT; } else { /* KTROP_CLEAR */ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { /* no more tracing */ p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; } } mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) { int vfslocked; vfslocked = VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(vfslocked); } if (tracecred != NULL) crfree(tracecred); return (1); } static int ktrsetchildren(td, top, ops, facs, vp) struct thread *td; struct proc *top; int ops, facs; struct vnode *vp; { register struct proc *p; register int ret = 0; p = top; sx_assert(&proctree_lock, SX_LOCKED); for (;;) { ret |= ktrops(td, p, ops, facs, vp); /* * If this process has children, descend to them next, * otherwise do any siblings, and if done with this level, * follow back up the tree (but not past top). */ if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) return (ret); if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } /*NOTREACHED*/ } static void ktr_writerequest(struct thread *td, struct ktr_request *req) { struct ktr_header *kth; struct vnode *vp; struct proc *p; struct ucred *cred; struct uio auio; struct iovec aiov[3]; struct mount *mp; int datalen, buflen, vrele_count; int error, vfslocked; /* * We hold the vnode and credential for use in I/O in case ktrace is * disabled on the process as we write out the request. * * XXXRW: This is not ideal: we could end up performing a write after * the vnode has been closed. */ mtx_lock(&ktrace_mtx); vp = td->td_proc->p_tracevp; if (vp != NULL) VREF(vp); cred = td->td_proc->p_tracecred; if (cred != NULL) crhold(cred); mtx_unlock(&ktrace_mtx); /* * If vp is NULL, the vp has been cleared out from under this * request, so just drop it. Make sure the credential and vnode are * in sync: we should have both or neither. */ if (vp == NULL) { KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL")); return; } KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL")); kth = &req->ktr_header; datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP]; buflen = kth->ktr_len; auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; aiov[0].iov_base = (caddr_t)kth; aiov[0].iov_len = sizeof(struct ktr_header); auio.uio_resid = sizeof(struct ktr_header); auio.uio_iovcnt = 1; auio.uio_td = td; if (datalen != 0) { aiov[1].iov_base = (caddr_t)&req->ktr_data; aiov[1].iov_len = datalen; auio.uio_resid += datalen; auio.uio_iovcnt++; kth->ktr_len += datalen; } if (buflen != 0) { KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write")); aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer; aiov[auio.uio_iovcnt].iov_len = buflen; auio.uio_resid += buflen; auio.uio_iovcnt++; } vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); (void)VOP_LEASE(vp, td, cred, LEASE_WRITE); #ifdef MAC error = mac_check_vnode_write(cred, NOCRED, vp); if (error == 0) #endif error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); if (!error) return; /* * If error encountered, give up tracing on this vnode. We defer * all the vrele()'s on the vnode until after we are finished walking * the various lists to avoid needlessly holding locks. */ log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", error); vrele_count = 0; /* * First, clear this vnode from being used by any processes in the * system. * XXX - If one process gets an EPERM writing to the vnode, should * we really do this? Other processes might have suitable * credentials for the operation. */ cred = NULL; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_tracevp == vp) { mtx_lock(&ktrace_mtx); p->p_tracevp = NULL; p->p_traceflag = 0; cred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); vrele_count++; } PROC_UNLOCK(p); if (cred != NULL) { crfree(cred); cred = NULL; } } sx_sunlock(&allproc_lock); /* * We can't clear any pending requests in threads that have cached * them but not yet committed them, as those are per-thread. The * thread will have to clear it itself on system call return. */ vfslocked = VFS_LOCK_GIANT(vp->v_mount); while (vrele_count-- > 0) vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } /* * Return true if caller has permission to set the ktracing state * of target. Essentially, the target can't possess any * more permissions than the caller. KTRFAC_ROOT signifies that * root previously set the tracing status on the target process, and * so, only root may further change it. */ static int ktrcanset(td, targetp) struct thread *td; struct proc *targetp; { PROC_LOCK_ASSERT(targetp, MA_OWNED); if (targetp->p_traceflag & KTRFAC_ROOT && priv_check_cred(td->td_ucred, PRIV_KTRACE, SUSER_ALLOWJAIL)) return (0); if (p_candebug(td, targetp) != 0) return (0); return (1); } #endif /* KTRACE */ Index: head/sys/kern/kern_ntptime.c =================================================================== --- head/sys/kern/kern_ntptime.c (revision 167231) +++ head/sys/kern/kern_ntptime.c (revision 167232) @@ -1,972 +1,972 @@ /*- *********************************************************************** * * * Copyright (c) David L. Mills 1993-2001 * * * * Permission to use, copy, modify, and distribute this software and * * its documentation for any purpose and without fee is hereby * * granted, provided that the above copyright notice appears in all * * copies and that both the copyright notice and this permission * * notice appear in supporting documentation, and that the name * * University of Delaware not be used in advertising or publicity * * pertaining to distribution of the software without specific, * * written prior permission. The University of Delaware makes no * * representations about the suitability this software for any * * purpose. It is provided "as is" without express or implied * * warranty. * * * **********************************************************************/ /* * Adapted from the original sources for FreeBSD and timecounters by: * Poul-Henning Kamp . * * The 32bit version of the "LP" macros seems a bit past its "sell by" * date so I have retained only the 64bit version and included it directly * in this file. * * Only minor changes done to interface with the timecounters over in * sys/kern/kern_clock.c. Some of the comments below may be (even more) * confusing and/or plain wrong in that context. */ #include __FBSDID("$FreeBSD$"); #include "opt_ntp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Single-precision macros for 64-bit machines */ typedef int64_t l_fp; #define L_ADD(v, u) ((v) += (u)) #define L_SUB(v, u) ((v) -= (u)) #define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32) #define L_NEG(v) ((v) = -(v)) #define L_RSHIFT(v, n) \ do { \ if ((v) < 0) \ (v) = -(-(v) >> (n)); \ else \ (v) = (v) >> (n); \ } while (0) #define L_MPY(v, a) ((v) *= (a)) #define L_CLR(v) ((v) = 0) #define L_ISNEG(v) ((v) < 0) #define L_LINT(v, a) ((v) = (int64_t)(a) << 32) #define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32) /* * Generic NTP kernel interface * * These routines constitute the Network Time Protocol (NTP) interfaces * for user and daemon application programs. The ntp_gettime() routine * provides the time, maximum error (synch distance) and estimated error * (dispersion) to client user application programs. The ntp_adjtime() * routine is used by the NTP daemon to adjust the system clock to an * externally derived time. The time offset and related variables set by * this routine are used by other routines in this module to adjust the * phase and frequency of the clock discipline loop which controls the * system clock. * * When the kernel time is reckoned directly in nanoseconds (NTP_NANO * defined), the time at each tick interrupt is derived directly from * the kernel time variable. When the kernel time is reckoned in * microseconds, (NTP_NANO undefined), the time is derived from the * kernel time variable together with a variable representing the * leftover nanoseconds at the last tick interrupt. In either case, the * current nanosecond time is reckoned from these values plus an * interpolated value derived by the clock routines in another * architecture-specific module. The interpolation can use either a * dedicated counter or a processor cycle counter (PCC) implemented in * some architectures. * * Note that all routines must run at priority splclock or higher. */ /* * Phase/frequency-lock loop (PLL/FLL) definitions * * The nanosecond clock discipline uses two variable types, time * variables and frequency variables. Both types are represented as 64- * bit fixed-point quantities with the decimal point between two 32-bit * halves. On a 32-bit machine, each half is represented as a single * word and mathematical operations are done using multiple-precision * arithmetic. On a 64-bit machine, ordinary computer arithmetic is * used. * * A time variable is a signed 64-bit fixed-point number in ns and * fraction. It represents the remaining time offset to be amortized * over succeeding tick interrupts. The maximum time offset is about * 0.5 s and the resolution is about 2.3e-10 ns. * * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |s s s| ns | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | fraction | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * A frequency variable is a signed 64-bit fixed-point number in ns/s * and fraction. It represents the ns and fraction to be added to the * kernel time variable at each second. The maximum frequency offset is * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s. * * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |s s s s s s s s s s s s s| ns/s | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | fraction | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ /* * The following variables establish the state of the PLL/FLL and the * residual time and frequency offset of the local clock. */ #define SHIFT_PLL 4 /* PLL loop gain (shift) */ #define SHIFT_FLL 2 /* FLL loop gain (shift) */ static int time_state = TIME_OK; /* clock state */ static int time_status = STA_UNSYNC; /* clock status bits */ static long time_tai; /* TAI offset (s) */ static long time_monitor; /* last time offset scaled (ns) */ static long time_constant; /* poll interval (shift) (s) */ static long time_precision = 1; /* clock precision (ns) */ static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */ static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */ static long time_reftime; /* time at last adjustment (s) */ static l_fp time_offset; /* time offset (ns) */ static l_fp time_freq; /* frequency offset (ns/s) */ static l_fp time_adj; /* tick adjust (ns/s) */ static int64_t time_adjtime; /* correction from adjtime(2) (usec) */ #ifdef PPS_SYNC /* * The following variables are used when a pulse-per-second (PPS) signal * is available and connected via a modem control lead. They establish * the engineering parameters of the clock discipline loop when * controlled by the PPS signal. */ #define PPS_FAVG 2 /* min freq avg interval (s) (shift) */ #define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */ #define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */ #define PPS_PAVG 4 /* phase avg interval (s) (shift) */ #define PPS_VALID 120 /* PPS signal watchdog max (s) */ #define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */ #define PPS_POPCORN 2 /* popcorn spike threshold (shift) */ static struct timespec pps_tf[3]; /* phase median filter */ static l_fp pps_freq; /* scaled frequency offset (ns/s) */ static long pps_fcount; /* frequency accumulator */ static long pps_jitter; /* nominal jitter (ns) */ static long pps_stabil; /* nominal stability (scaled ns/s) */ static long pps_lastsec; /* time at last calibration (s) */ static int pps_valid; /* signal watchdog counter */ static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */ static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */ static int pps_intcnt; /* wander counter */ /* * PPS signal quality monitors */ static long pps_calcnt; /* calibration intervals */ static long pps_jitcnt; /* jitter limit exceeded */ static long pps_stbcnt; /* stability limit exceeded */ static long pps_errcnt; /* calibration errors */ #endif /* PPS_SYNC */ /* * End of phase/frequency-lock loop (PLL/FLL) definitions */ static void ntp_init(void); static void hardupdate(long offset); static void ntp_gettime1(struct ntptimeval *ntvp); static void ntp_gettime1(struct ntptimeval *ntvp) { struct timespec atv; /* nanosecond time */ GIANT_REQUIRED; nanotime(&atv); ntvp->time.tv_sec = atv.tv_sec; ntvp->time.tv_nsec = atv.tv_nsec; ntvp->maxerror = time_maxerror; ntvp->esterror = time_esterror; ntvp->tai = time_tai; ntvp->time_state = time_state; /* * Status word error decode. If any of these conditions occur, * an error is returned, instead of the status word. Most * applications will care only about the fact the system clock * may not be trusted, not about the details. * * Hardware or software error */ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || /* * PPS signal lost when either time or frequency synchronization * requested */ (time_status & (STA_PPSFREQ | STA_PPSTIME) && !(time_status & STA_PPSSIGNAL)) || /* * PPS jitter exceeded when time synchronization requested */ (time_status & STA_PPSTIME && time_status & STA_PPSJITTER) || /* * PPS wander exceeded or calibration error when frequency * synchronization requested */ (time_status & STA_PPSFREQ && time_status & (STA_PPSWANDER | STA_PPSERROR))) ntvp->time_state = TIME_ERROR; } /* * ntp_gettime() - NTP user application interface * - * See the timex.h header file for synopsis and API description. Note - * that the TAI offset is returned in the ntvtimeval.tai structure - * member. + * See the timex.h header file for synopsis and API description. Note that + * the TAI offset is returned in the ntvtimeval.tai structure member. */ #ifndef _SYS_SYSPROTO_H_ struct ntp_gettime_args { struct ntptimeval *ntvp; }; #endif /* ARGSUSED */ int ntp_gettime(struct thread *td, struct ntp_gettime_args *uap) { struct ntptimeval ntv; mtx_lock(&Giant); ntp_gettime1(&ntv); mtx_unlock(&Giant); td->td_retval[0] = ntv.time_state; return (copyout(&ntv, uap->ntvp, sizeof(ntv))); } static int ntp_sysctl(SYSCTL_HANDLER_ARGS) { struct ntptimeval ntv; /* temporary structure */ ntp_gettime1(&ntv); return (sysctl_handle_opaque(oidp, &ntv, sizeof(ntv), req)); } SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, ""); SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", ""); #ifdef PPS_SYNC SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, ""); SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, ""); SYSCTL_INT(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD, &time_monitor, 0, ""); SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", ""); SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", ""); #endif + /* * ntp_adjtime() - NTP daemon application interface * - * See the timex.h header file for synopsis and API description. Note - * that the timex.constant structure member has a dual purpose to set - * the time constant and to set the TAI offset. + * See the timex.h header file for synopsis and API description. Note that + * the timex.constant structure member has a dual purpose to set the time + * constant and to set the TAI offset. */ #ifndef _SYS_SYSPROTO_H_ struct ntp_adjtime_args { struct timex *tp; }; #endif int ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap) { struct timex ntv; /* temporary structure */ long freq; /* frequency ns/s) */ int modes; /* mode bits from structure */ int s; /* caller priority */ int error; error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv)); if (error) return(error); /* * Update selected clock variables - only the superuser can * change anything. Note that there is no error checking here on * the assumption the superuser should know what it is doing. * Note that either the time constant or TAI offset are loaded * from the ntv.constant member, depending on the mode bits. If * the STA_PLL bit in the status word is cleared, the state and * status words are reset to the initial values at boot. */ mtx_lock(&Giant); modes = ntv.modes; if (modes) error = priv_check(td, PRIV_NTP_ADJTIME); if (error) goto done2; s = splclock(); if (modes & MOD_MAXERROR) time_maxerror = ntv.maxerror; if (modes & MOD_ESTERROR) time_esterror = ntv.esterror; if (modes & MOD_STATUS) { if (time_status & STA_PLL && !(ntv.status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; #ifdef PPS_SYNC pps_shift = PPS_FAVG; #endif /* PPS_SYNC */ } time_status &= STA_RONLY; time_status |= ntv.status & ~STA_RONLY; } if (modes & MOD_TIMECONST) { if (ntv.constant < 0) time_constant = 0; else if (ntv.constant > MAXTC) time_constant = MAXTC; else time_constant = ntv.constant; } if (modes & MOD_TAI) { if (ntv.constant > 0) /* XXX zero & negative numbers ? */ time_tai = ntv.constant; } #ifdef PPS_SYNC if (modes & MOD_PPSMAX) { if (ntv.shift < PPS_FAVG) pps_shiftmax = PPS_FAVG; else if (ntv.shift > PPS_FAVGMAX) pps_shiftmax = PPS_FAVGMAX; else pps_shiftmax = ntv.shift; } #endif /* PPS_SYNC */ if (modes & MOD_NANO) time_status |= STA_NANO; if (modes & MOD_MICRO) time_status &= ~STA_NANO; if (modes & MOD_CLKB) time_status |= STA_CLK; if (modes & MOD_CLKA) time_status &= ~STA_CLK; if (modes & MOD_FREQUENCY) { freq = (ntv.freq * 1000LL) >> 16; if (freq > MAXFREQ) L_LINT(time_freq, MAXFREQ); else if (freq < -MAXFREQ) L_LINT(time_freq, -MAXFREQ); else { /* * ntv.freq is [PPM * 2^16] = [us/s * 2^16] * time_freq is [ns/s * 2^32] */ time_freq = ntv.freq * 1000LL * 65536LL; } #ifdef PPS_SYNC pps_freq = time_freq; #endif /* PPS_SYNC */ } if (modes & MOD_OFFSET) { if (time_status & STA_NANO) hardupdate(ntv.offset); else hardupdate(ntv.offset * 1000); } /* * Retrieve all clock variables. Note that the TAI offset is * returned only by ntp_gettime(); */ if (time_status & STA_NANO) ntv.offset = L_GINT(time_offset); else ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */ ntv.freq = L_GINT((time_freq / 1000LL) << 16); ntv.maxerror = time_maxerror; ntv.esterror = time_esterror; ntv.status = time_status; ntv.constant = time_constant; if (time_status & STA_NANO) ntv.precision = time_precision; else ntv.precision = time_precision / 1000; ntv.tolerance = MAXFREQ * SCALE_PPM; #ifdef PPS_SYNC ntv.shift = pps_shift; ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16); if (time_status & STA_NANO) ntv.jitter = pps_jitter; else ntv.jitter = pps_jitter / 1000; ntv.stabil = pps_stabil; ntv.calcnt = pps_calcnt; ntv.errcnt = pps_errcnt; ntv.jitcnt = pps_jitcnt; ntv.stbcnt = pps_stbcnt; #endif /* PPS_SYNC */ splx(s); error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv)); if (error) goto done2; /* * Status word error decode. See comments in * ntp_gettime() routine. */ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || (time_status & (STA_PPSFREQ | STA_PPSTIME) && !(time_status & STA_PPSSIGNAL)) || (time_status & STA_PPSTIME && time_status & STA_PPSJITTER) || (time_status & STA_PPSFREQ && time_status & (STA_PPSWANDER | STA_PPSERROR))) { td->td_retval[0] = TIME_ERROR; } else { td->td_retval[0] = time_state; } done2: mtx_unlock(&Giant); return (error); } /* * second_overflow() - called after ntp_tick_adjust() * * This routine is ordinarily called immediately following the above * routine ntp_tick_adjust(). While these two routines are normally * combined, they are separated here only for the purposes of * simulation. */ void ntp_update_second(int64_t *adjustment, time_t *newsec) { int tickrate; l_fp ftemp; /* 32/64-bit temporary */ /* * On rollover of the second both the nanosecond and microsecond * clocks are updated and the state machine cranked as * necessary. The phase adjustment to be used for the next * second is calculated and the maximum error is increased by * the tolerance. */ time_maxerror += MAXFREQ / 1000; /* * Leap second processing. If in leap-insert state at * the end of the day, the system clock is set back one * second; if in leap-delete state, the system clock is * set ahead one second. The nano_time() routine or * external clock driver will insure that reported time * is always monotonic. */ switch (time_state) { /* * No warning. */ case TIME_OK: if (time_status & STA_INS) time_state = TIME_INS; else if (time_status & STA_DEL) time_state = TIME_DEL; break; /* * Insert second 23:59:60 following second * 23:59:59. */ case TIME_INS: if (!(time_status & STA_INS)) time_state = TIME_OK; else if ((*newsec) % 86400 == 0) { (*newsec)--; time_state = TIME_OOP; time_tai++; } break; /* * Delete second 23:59:59. */ case TIME_DEL: if (!(time_status & STA_DEL)) time_state = TIME_OK; else if (((*newsec) + 1) % 86400 == 0) { (*newsec)++; time_tai--; time_state = TIME_WAIT; } break; /* * Insert second in progress. */ case TIME_OOP: time_state = TIME_WAIT; break; /* * Wait for status bits to clear. */ case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; } /* * Compute the total time adjustment for the next second * in ns. The offset is reduced by a factor depending on * whether the PPS signal is operating. Note that the * value is in effect scaled by the clock frequency, * since the adjustment is added at each tick interrupt. */ ftemp = time_offset; #ifdef PPS_SYNC /* XXX even if PPS signal dies we should finish adjustment ? */ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) L_RSHIFT(ftemp, pps_shift); else L_RSHIFT(ftemp, SHIFT_PLL + time_constant); #else L_RSHIFT(ftemp, SHIFT_PLL + time_constant); #endif /* PPS_SYNC */ time_adj = ftemp; L_SUB(time_offset, ftemp); L_ADD(time_adj, time_freq); /* * Apply any correction from adjtime(2). If more than one second * off we slew at a rate of 5ms/s (5000 PPM) else 500us/s (500PPM) * until the last second is slewed the final < 500 usecs. */ if (time_adjtime != 0) { if (time_adjtime > 1000000) tickrate = 5000; else if (time_adjtime < -1000000) tickrate = -5000; else if (time_adjtime > 500) tickrate = 500; else if (time_adjtime < -500) tickrate = -500; else tickrate = time_adjtime; time_adjtime -= tickrate; L_LINT(ftemp, tickrate * 1000); L_ADD(time_adj, ftemp); } *adjustment = time_adj; #ifdef PPS_SYNC if (pps_valid > 0) pps_valid--; else time_status &= ~STA_PPSSIGNAL; #endif /* PPS_SYNC */ } /* * ntp_init() - initialize variables and structures * * This routine must be called after the kernel variables hz and tick * are set or changed and before the next tick interrupt. In this * particular implementation, these values are assumed set elsewhere in * the kernel. The design allows the clock frequency and tick interval * to be changed while the system is running. So, this routine should * probably be integrated with the code that does that. */ static void ntp_init() { /* * The following variables are initialized only at startup. Only * those structures not cleared by the compiler need to be * initialized, and these only in the simulator. In the actual * kernel, any nonzero values here will quickly evaporate. */ L_CLR(time_offset); L_CLR(time_freq); #ifdef PPS_SYNC pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0; pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0; pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0; pps_fcount = 0; L_CLR(pps_freq); #endif /* PPS_SYNC */ } SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL) /* * hardupdate() - local clock update * * This routine is called by ntp_adjtime() to update the local clock * phase and frequency. The implementation is of an adaptive-parameter, * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new * time and frequency offset estimates for each call. If the kernel PPS * discipline code is configured (PPS_SYNC), the PPS signal itself * determines the new time offset, instead of the calling argument. * Presumably, calls to ntp_adjtime() occur only when the caller * believes the local clock is valid within some bound (+-128 ms with * NTP). If the caller's time is far different than the PPS time, an * argument will ensue, and it's not clear who will lose. * * For uncompensated quartz crystal oscillators and nominal update * intervals less than 256 s, operation should be in phase-lock mode, * where the loop is disciplined to phase. For update intervals greater * than 1024 s, operation should be in frequency-lock mode, where the * loop is disciplined to frequency. Between 256 s and 1024 s, the mode * is selected by the STA_MODE status bit. */ static void hardupdate(offset) long offset; /* clock offset (ns) */ { long mtemp; l_fp ftemp; /* * Select how the phase is to be controlled and from which * source. If the PPS signal is present and enabled to * discipline the time, the PPS offset is used; otherwise, the * argument offset is used. */ if (!(time_status & STA_PLL)) return; if (!(time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)) { if (offset > MAXPHASE) time_monitor = MAXPHASE; else if (offset < -MAXPHASE) time_monitor = -MAXPHASE; else time_monitor = offset; L_LINT(time_offset, time_monitor); } /* * Select how the frequency is to be controlled and in which * mode (PLL or FLL). If the PPS signal is present and enabled * to discipline the frequency, the PPS frequency is used; * otherwise, the argument offset is used to compute it. */ if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) { time_reftime = time_second; return; } if (time_status & STA_FREQHOLD || time_reftime == 0) time_reftime = time_second; mtemp = time_second - time_reftime; L_LINT(ftemp, time_monitor); L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1); L_MPY(ftemp, mtemp); L_ADD(time_freq, ftemp); time_status &= ~STA_MODE; if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { L_LINT(ftemp, (time_monitor << 4) / mtemp); L_RSHIFT(ftemp, SHIFT_FLL + 4); L_ADD(time_freq, ftemp); time_status |= STA_MODE; } time_reftime = time_second; if (L_GINT(time_freq) > MAXFREQ) L_LINT(time_freq, MAXFREQ); else if (L_GINT(time_freq) < -MAXFREQ) L_LINT(time_freq, -MAXFREQ); } #ifdef PPS_SYNC /* * hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS interrupt in order to discipline * the CPU clock oscillator to the PPS signal. There are two independent * first-order feedback loops, one for the phase, the other for the * frequency. The phase loop measures and grooms the PPS phase offset * and leaves it in a handy spot for the seconds overflow routine. The * frequency loop averages successive PPS phase differences and * calculates the PPS frequency offset, which is also processed by the * seconds overflow routine. The code requires the caller to capture the * time and architecture-dependent hardware counter values in * nanoseconds at the on-time PPS signal transition. * * Note that, on some Unix systems this routine runs at an interrupt * priority level higher than the timer interrupt routine hardclock(). * Therefore, the variables used are distinct from the hardclock() * variables, except for the actual time and frequency variables, which * are determined by this routine and updated atomically. */ void hardpps(tsp, nsec) struct timespec *tsp; /* time at PPS */ long nsec; /* hardware counter at PPS */ { long u_sec, u_nsec, v_nsec; /* temps */ l_fp ftemp; /* * The signal is first processed by a range gate and frequency * discriminator. The range gate rejects noise spikes outside * the range +-500 us. The frequency discriminator rejects input * signals with apparent frequency outside the range 1 +-500 * PPM. If two hits occur in the same second, we ignore the * later hit; if not and a hit occurs outside the range gate, * keep the later hit for later comparison, but do not process * it. */ time_status |= STA_PPSSIGNAL | STA_PPSJITTER; time_status &= ~(STA_PPSWANDER | STA_PPSERROR); pps_valid = PPS_VALID; u_sec = tsp->tv_sec; u_nsec = tsp->tv_nsec; if (u_nsec >= (NANOSECOND >> 1)) { u_nsec -= NANOSECOND; u_sec++; } v_nsec = u_nsec - pps_tf[0].tv_nsec; if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND - MAXFREQ) return; pps_tf[2] = pps_tf[1]; pps_tf[1] = pps_tf[0]; pps_tf[0].tv_sec = u_sec; pps_tf[0].tv_nsec = u_nsec; /* * Compute the difference between the current and previous * counter values. If the difference exceeds 0.5 s, assume it * has wrapped around, so correct 1.0 s. If the result exceeds * the tick interval, the sample point has crossed a tick * boundary during the last second, so correct the tick. Very * intricate. */ u_nsec = nsec; if (u_nsec > (NANOSECOND >> 1)) u_nsec -= NANOSECOND; else if (u_nsec < -(NANOSECOND >> 1)) u_nsec += NANOSECOND; pps_fcount += u_nsec; if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ) return; time_status &= ~STA_PPSJITTER; /* * A three-stage median filter is used to help denoise the PPS * time. The median sample becomes the time offset estimate; the * difference between the other two samples becomes the time * dispersion (jitter) estimate. */ if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) { if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) { v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */ u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec; } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) { v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */ u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec; } else { v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */ u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec; } } else { if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) { v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */ u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec; } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) { v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */ u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec; } else { v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */ u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec; } } /* * Nominal jitter is due to PPS signal noise and interrupt * latency. If it exceeds the popcorn threshold, the sample is * discarded. otherwise, if so enabled, the time offset is * updated. We can tolerate a modest loss of data here without * much degrading time accuracy. */ if (u_nsec > (pps_jitter << PPS_POPCORN)) { time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { time_monitor = -v_nsec; L_LINT(time_offset, time_monitor); } pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG; u_sec = pps_tf[0].tv_sec - pps_lastsec; if (u_sec < (1 << pps_shift)) return; /* * At the end of the calibration interval the difference between * the first and last counter values becomes the scaled * frequency. It will later be divided by the length of the * interval to determine the frequency update. If the frequency * exceeds a sanity threshold, or if the actual calibration * interval is not equal to the expected length, the data are * discarded. We can tolerate a modest loss of data here without * much degrading frequency accuracy. */ pps_calcnt++; v_nsec = -pps_fcount; pps_lastsec = pps_tf[0].tv_sec; pps_fcount = 0; u_nsec = MAXFREQ << pps_shift; if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 << pps_shift)) { time_status |= STA_PPSERROR; pps_errcnt++; return; } /* * Here the raw frequency offset and wander (stability) is * calculated. If the wander is less than the wander threshold * for four consecutive averaging intervals, the interval is * doubled; if it is greater than the threshold for four * consecutive intervals, the interval is halved. The scaled * frequency offset is converted to frequency offset. The * stability metric is calculated as the average of recent * frequency changes, but is used only for performance * monitoring. */ L_LINT(ftemp, v_nsec); L_RSHIFT(ftemp, pps_shift); L_SUB(ftemp, pps_freq); u_nsec = L_GINT(ftemp); if (u_nsec > PPS_MAXWANDER) { L_LINT(ftemp, PPS_MAXWANDER); pps_intcnt--; time_status |= STA_PPSWANDER; pps_stbcnt++; } else if (u_nsec < -PPS_MAXWANDER) { L_LINT(ftemp, -PPS_MAXWANDER); pps_intcnt--; time_status |= STA_PPSWANDER; pps_stbcnt++; } else { pps_intcnt++; } if (pps_intcnt >= 4) { pps_intcnt = 4; if (pps_shift < pps_shiftmax) { pps_shift++; pps_intcnt = 0; } } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) { pps_intcnt = -4; if (pps_shift > PPS_FAVG) { pps_shift--; pps_intcnt = 0; } } if (u_nsec < 0) u_nsec = -u_nsec; pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG; /* * The PPS frequency is recalculated and clamped to the maximum * MAXFREQ. If enabled, the system clock frequency is updated as * well. */ L_ADD(pps_freq, ftemp); u_nsec = L_GINT(pps_freq); if (u_nsec > MAXFREQ) L_LINT(pps_freq, MAXFREQ); else if (u_nsec < -MAXFREQ) L_LINT(pps_freq, -MAXFREQ); if (time_status & STA_PPSFREQ) time_freq = pps_freq; } #endif /* PPS_SYNC */ #ifndef _SYS_SYSPROTO_H_ struct adjtime_args { struct timeval *delta; struct timeval *olddelta; }; #endif /* ARGSUSED */ int adjtime(struct thread *td, struct adjtime_args *uap) { struct timeval delta, olddelta, *deltap; int error; if (uap->delta) { error = copyin(uap->delta, &delta, sizeof(delta)); if (error) return (error); deltap = δ } else deltap = NULL; error = kern_adjtime(td, deltap, &olddelta); if (uap->olddelta && error == 0) error = copyout(&olddelta, uap->olddelta, sizeof(olddelta)); return (error); } int kern_adjtime(struct thread *td, struct timeval *delta, struct timeval *olddelta) { struct timeval atv; int error; if ((error = priv_check(td, PRIV_ADJTIME))) return (error); mtx_lock(&Giant); if (olddelta) { atv.tv_sec = time_adjtime / 1000000; atv.tv_usec = time_adjtime % 1000000; if (atv.tv_usec < 0) { atv.tv_usec += 1000000; atv.tv_sec--; } *olddelta = atv; } if (delta) time_adjtime = (int64_t)delta->tv_sec * 1000000 + delta->tv_usec; mtx_unlock(&Giant); return (error); } Index: head/sys/kern/kern_prot.c =================================================================== --- head/sys/kern/kern_prot.c (revision 167231) +++ head/sys/kern/kern_prot.c (revision 167232) @@ -1,2050 +1,2048 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2000-2001 Robert N. M. Watson. * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 */ /* * System calls related to processes and protection */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_CRED, "cred", "credentials"); SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy"); #ifndef _SYS_SYSPROTO_H_ struct getpid_args { int dummy; }; #endif /* ARGSUSED */ int getpid(struct thread *td, struct getpid_args *uap) { struct proc *p = td->td_proc; td->td_retval[0] = p->p_pid; #if defined(COMPAT_43) PROC_LOCK(p); td->td_retval[1] = p->p_pptr->p_pid; PROC_UNLOCK(p); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getppid_args { int dummy; }; #endif /* ARGSUSED */ int getppid(struct thread *td, struct getppid_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pptr->p_pid; PROC_UNLOCK(p); return (0); } /* * Get process group ID; note that POSIX getpgrp takes no parameter. */ #ifndef _SYS_SYSPROTO_H_ struct getpgrp_args { int dummy; }; #endif int getpgrp(struct thread *td, struct getpgrp_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* Get an arbitary pid's process group id */ #ifndef _SYS_SYSPROTO_H_ struct getpgid_args { pid_t pid; }; #endif int getpgid(struct thread *td, struct getpgid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* * Get an arbitary pid's session id. */ #ifndef _SYS_SYSPROTO_H_ struct getsid_args { pid_t pid; }; #endif int getsid(struct thread *td, struct getsid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_session->s_sid; PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct getuid_args { int dummy; }; #endif /* ARGSUSED */ int getuid(struct thread *td, struct getuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_ruid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_uid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct geteuid_args { int dummy; }; #endif /* ARGSUSED */ int geteuid(struct thread *td, struct geteuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_uid; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgid_args { int dummy; }; #endif /* ARGSUSED */ int getgid(struct thread *td, struct getgid_args *uap) { td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_groups[0]; #endif return (0); } /* * Get effective group ID. The "egid" is groups[0], and could be obtained * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; }; #endif /* ARGSUSED */ int getegid(struct thread *td, struct getegid_args *uap) { td->td_retval[0] = td->td_ucred->cr_groups[0]; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif int getgroups(struct thread *td, register struct getgroups_args *uap) { gid_t groups[NGROUPS]; u_int ngrp; int error; ngrp = MIN(uap->gidsetsize, NGROUPS); error = kern_getgroups(td, &ngrp, groups); if (error) return (error); if (uap->gidsetsize > 0) error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t)); if (error == 0) td->td_retval[0] = ngrp; return (error); } int kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups) { struct ucred *cred; cred = td->td_ucred; if (*ngrp == 0) { *ngrp = cred->cr_ngroups; return (0); } if (*ngrp < cred->cr_ngroups) return (EINVAL); *ngrp = cred->cr_ngroups; bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t)); return (0); } #ifndef _SYS_SYSPROTO_H_ struct setsid_args { int dummy; }; #endif /* ARGSUSED */ int setsid(register struct thread *td, struct setsid_args *uap) { struct pgrp *pgrp; int error; struct proc *p = td->td_proc; struct pgrp *newpgrp; struct session *newsess; error = 0; pgrp = NULL; MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); error = EPERM; } else { (void)enterpgrp(p, p->p_pid, newpgrp, newsess); td->td_retval[0] = p->p_pid; newpgrp = NULL; newsess = NULL; } sx_xunlock(&proctree_lock); if (newpgrp != NULL) FREE(newpgrp, M_PGRP); if (newsess != NULL) FREE(newsess, M_SESSION); return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pid must be caller or child of caller (ESRCH) * if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ #ifndef _SYS_SYSPROTO_H_ struct setpgid_args { int pid; /* target process id */ int pgid; /* target pgrp id */ }; #endif /* ARGSUSED */ int setpgid(struct thread *td, register struct setpgid_args *uap) { struct proc *curp = td->td_proc; register struct proc *targp; /* target process */ register struct pgrp *pgrp; /* target pgrp */ int error; struct pgrp *newpgrp; if (uap->pgid < 0) return (EINVAL); error = 0; MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; goto done; } if (!inferior(targp)) { PROC_UNLOCK(targp); error = ESRCH; goto done; } if ((error = p_cansee(td, targp))) { PROC_UNLOCK(targp); goto done; } if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) { PROC_UNLOCK(targp); error = EPERM; goto done; } if (targp->p_flag & P_EXEC) { PROC_UNLOCK(targp); error = EACCES; goto done; } PROC_UNLOCK(targp); } else targp = curp; if (SESS_LEADER(targp)) { error = EPERM; goto done; } if (uap->pgid == 0) uap->pgid = targp->p_pid; if ((pgrp = pgfind(uap->pgid)) == NULL) { if (uap->pgid == targp->p_pid) { error = enterpgrp(targp, uap->pgid, newpgrp, NULL); if (error == 0) newpgrp = NULL; } else error = EPERM; } else { if (pgrp == targp->p_pgrp) { PGRP_UNLOCK(pgrp); goto done; } if (pgrp->pg_id != targp->p_pid && pgrp->pg_session != curp->p_session) { PGRP_UNLOCK(pgrp); error = EPERM; goto done; } PGRP_UNLOCK(pgrp); error = enterthispgrp(targp, pgrp); } done: sx_xunlock(&proctree_lock); KASSERT((error == 0) || (newpgrp != NULL), ("setpgid failed and newpgrp is NULL")); if (newpgrp != NULL) FREE(newpgrp, M_PGRP); return (error); } /* * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD * compatible. It says that setting the uid/gid to euid/egid is a special * case of "appropriate privilege". Once the rules are expanded out, this * basically means that setuid(nnn) sets all three id's, in all permitted * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) * does not set the saved id - this is dangerous for traditional BSD * programs. For this reason, we *really* do not want to set * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. */ #define POSIX_APPENDIX_B_4_2_2 #ifndef _SYS_SYSPROTO_H_ struct setuid_args { uid_t uid; }; #endif /* ARGSUSED */ int setuid(struct thread *td, struct setuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t uid; struct uidinfo *uip; int error; uid = uap->uid; AUDIT_ARG(uid, uid); newcred = crget(); uip = uifind(uid); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setuid(p, oldcred, uid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setuid(geteuid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setuid(xx)" sets all * three id's (assuming you have privs). * * Notes on the logic. We do things in three steps. * 1: We determine if the euid is going to change, and do EPERM * right away. We unconditionally change the euid later if this * test is satisfied, simplifying that part of the logic. * 2: We determine if the real and/or saved uids are going to * change. Determined by compile options. * 3: Change euid last. (after tests in #2 for "appropriate privs") */ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ #ifdef _POSIX_SAVED_IDS uid != oldcred->cr_svuid && /* allow setuid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETUID, SUSER_ALLOWJAIL)) != 0) goto fail; /* * Copy credentials so other references do not see our changes. */ crcopy(newcred, oldcred); #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or uid == euid) * If so, we are changing the real uid and/or saved uid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ uid == oldcred->cr_uid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETUID, SUSER_ALLOWJAIL) == 0) #endif { /* * Set the real uid and transfer proc count to new user. */ if (uid != oldcred->cr_ruid) { change_ruid(newcred, uip); setsugid(p); } /* * Set saved uid * * XXX always set saved uid even if not _POSIX_SAVED_IDS, as * the security of seteuid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (uid != oldcred->cr_svuid) { change_svuid(newcred, uid); setsugid(p); } } /* * In all permitted cases, we are changing the euid. */ if (uid != oldcred->cr_uid) { change_euid(newcred, uip); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); uifree(uip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(uip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct seteuid_args { uid_t euid; }; #endif /* ARGSUSED */ int seteuid(struct thread *td, struct seteuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid; struct uidinfo *euip; int error; euid = uap->euid; AUDIT_ARG(euid, euid); newcred = crget(); euip = uifind(euid); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_seteuid(p, oldcred, euid); if (error) goto fail; #endif if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, SUSER_ALLOWJAIL)) != 0) goto fail; /* * Everything's okay, do it. Copy credentials so other references do * not see our changes. */ crcopy(newcred, oldcred); if (oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgid_args { gid_t gid; }; #endif /* ARGSUSED */ int setgid(struct thread *td, struct setgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t gid; int error; gid = uap->gid; AUDIT_ARG(gid, gid); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setgid(p, oldcred, gid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setgid(getegid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setgid(xx)" sets all * three id's (assuming you have privs). * * For notes on the logic here, see setuid() above. */ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ #ifdef _POSIX_SAVED_IDS gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or gid == egid) * If so, we are changing the real uid and saved gid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ gid == oldcred->cr_groups[0] || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID, SUSER_ALLOWJAIL) == 0) #endif { /* * Set real gid */ if (oldcred->cr_rgid != gid) { change_rgid(newcred, gid); setsugid(p); } /* * Set saved gid * * XXX always set saved gid even if not _POSIX_SAVED_IDS, as * the security of setegid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } /* * In all cases permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes. */ if (oldcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setegid_args { gid_t egid; }; #endif /* ARGSUSED */ int setegid(struct thread *td, struct setegid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid; int error; egid = uap->egid; AUDIT_ARG(egid, egid); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setegid(p, oldcred, egid); if (error) goto fail; #endif if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif /* ARGSUSED */ int setgroups(struct thread *td, struct setgroups_args *uap) { gid_t groups[NGROUPS]; int error; if (uap->gidsetsize > NGROUPS) return (EINVAL); error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t)); if (error) return (error); return (kern_setgroups(td, uap->gidsetsize, groups)); } int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int error; if (ngrp > NGROUPS) return (EINVAL); AUDIT_ARG(groupset, groups, ngrp); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setgroups(p, oldcred, ngrp, groups); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, SUSER_ALLOWJAIL); if (error) goto fail; /* * XXX A little bit lazy here. We could test if anything has * changed before crcopy() and setting P_SUGID. */ crcopy(newcred, oldcred); if (ngrp < 1) { /* * setgroups(0, NULL) is a legitimate way of clearing the * groups vector on non-BSD systems (which generally do not * have the egid in the groups[0]). We risk security holes * when running non-BSD software if we do not do the same. */ newcred->cr_ngroups = 1; } else { bcopy(groups, newcred->cr_groups, ngrp * sizeof(gid_t)); newcred->cr_ngroups = ngrp; } setsugid(p); p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setreuid_args { uid_t ruid; uid_t euid; }; #endif /* ARGSUSED */ int setreuid(register struct thread *td, struct setreuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; AUDIT_ARG(euid, euid); AUDIT_ARG(ruid, ruid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setreuid(p, oldcred, ruid, euid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid) || (euid != (uid_t)-1 && euid != oldcred->cr_uid && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && newcred->cr_svuid != newcred->cr_uid) { change_svuid(newcred, newcred->cr_uid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setregid_args { gid_t rgid; gid_t egid; }; #endif /* ARGSUSED */ int setregid(register struct thread *td, struct setregid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid; int error; egid = uap->egid; rgid = uap->rgid; AUDIT_ARG(egid, egid); AUDIT_ARG(rgid, rgid); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setregid(p, oldcred, rgid, egid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && newcred->cr_svgid != newcred->cr_groups[0]) { change_svgid(newcred, newcred->cr_groups[0]); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } /* - * setresuid(ruid, euid, suid) is like setreuid except control over the - * saved uid is explicit. + * setresuid(ruid, euid, suid) is like setreuid except control over the saved + * uid is explicit. */ - #ifndef _SYS_SYSPROTO_H_ struct setresuid_args { uid_t ruid; uid_t euid; uid_t suid; }; #endif /* ARGSUSED */ int setresuid(register struct thread *td, struct setresuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid, suid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; suid = uap->suid; AUDIT_ARG(euid, euid); AUDIT_ARG(ruid, ruid); AUDIT_ARG(suid, suid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setresuid(p, oldcred, ruid, euid, suid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid && ruid != oldcred->cr_uid) || (euid != (uid_t)-1 && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid && euid != oldcred->cr_uid) || (suid != (uid_t)-1 && suid != oldcred->cr_ruid && suid != oldcred->cr_svuid && suid != oldcred->cr_uid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { change_svuid(newcred, suid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } /* - * setresgid(rgid, egid, sgid) is like setregid except control over the - * saved gid is explicit. + * setresgid(rgid, egid, sgid) is like setregid except control over the saved + * gid is explicit. */ - #ifndef _SYS_SYSPROTO_H_ struct setresgid_args { gid_t rgid; gid_t egid; gid_t sgid; }; #endif /* ARGSUSED */ int setresgid(register struct thread *td, struct setresgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid, sgid; int error; egid = uap->egid; rgid = uap->rgid; sgid = uap->sgid; AUDIT_ARG(egid, egid); AUDIT_ARG(rgid, rgid); AUDIT_ARG(sgid, sgid); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setresgid(p, oldcred, rgid, egid, sgid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && rgid != oldcred->cr_groups[0]) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && egid != oldcred->cr_groups[0]) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && sgid != oldcred->cr_groups[0])) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { change_svgid(newcred, sgid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getresuid_args { uid_t *ruid; uid_t *euid; uid_t *suid; }; #endif /* ARGSUSED */ int getresuid(register struct thread *td, struct getresuid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->ruid) error1 = copyout(&cred->cr_ruid, uap->ruid, sizeof(cred->cr_ruid)); if (uap->euid) error2 = copyout(&cred->cr_uid, uap->euid, sizeof(cred->cr_uid)); if (uap->suid) error3 = copyout(&cred->cr_svuid, uap->suid, sizeof(cred->cr_svuid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct getresgid_args { gid_t *rgid; gid_t *egid; gid_t *sgid; }; #endif /* ARGSUSED */ int getresgid(register struct thread *td, struct getresgid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->rgid) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) error2 = copyout(&cred->cr_groups[0], uap->egid, sizeof(cred->cr_groups[0])); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct issetugid_args { int dummy; }; #endif /* ARGSUSED */ int issetugid(register struct thread *td, struct issetugid_args *uap) { struct proc *p = td->td_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use P_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ PROC_LOCK(p); td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; PROC_UNLOCK(p); return (0); } int __setugid(struct thread *td, struct __setugid_args *uap) { #ifdef REGRESSION struct proc *p; p = td->td_proc; switch (uap->flag) { case 0: PROC_LOCK(p); p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); return (0); case 1: PROC_LOCK(p); p->p_flag |= P_SUGID; PROC_UNLOCK(p); return (0); default: return (EINVAL); } #else /* !REGRESSION */ return (ENOSYS); #endif /* REGRESSION */ } /* * Check if gid is a member of the group set. */ int groupmember(gid_t gid, struct ucred *cred) { register gid_t *gp; gid_t *egp; egp = &(cred->cr_groups[cred->cr_ngroups]); for (gp = cred->cr_groups; gp < egp; gp++) if (*gp == gid) return (1); return (0); } /* * Test the active securelevel against a given level. securelevel_gt() * implements (securelevel > level). securelevel_ge() implements * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. */ int securelevel_gt(struct ucred *cr, int level) { int active_securelevel; active_securelevel = securelevel; KASSERT(cr != NULL, ("securelevel_gt: null cr")); if (cr->cr_prison != NULL) active_securelevel = imax(cr->cr_prison->pr_securelevel, active_securelevel); return (active_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { int active_securelevel; active_securelevel = securelevel; KASSERT(cr != NULL, ("securelevel_ge: null cr")); if (cr->cr_prison != NULL) active_securelevel = imax(cr->cr_prison->pr_securelevel, active_securelevel); return (active_securelevel >= level ? EPERM : 0); } /* * 'see_other_uids' determines whether or not visibility of processes * and sockets with credentials holding different real uids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_uids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, &see_other_uids, 0, "Unprivileged processes may see subjects/objects with different real uid"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_other_uids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_seeotheruids(struct ucred *u1, struct ucred *u2) { if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, SUSER_ALLOWJAIL) != 0) return (ESRCH); } return (0); } /* * 'see_other_gids' determines whether or not visibility of processes * and sockets with credentials holding different real gids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_gids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, &see_other_gids, 0, "Unprivileged processes may see subjects/objects with different real gid"); /* * Determine if u1 can "see" the subject specified by u2, according to the * 'see_other_gids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_seeothergids(struct ucred *u1, struct ucred *u2) { int i, match; if (!see_other_gids) { match = 0; for (i = 0; i < u1->cr_ngroups; i++) { if (groupmember(u1->cr_groups[i], u2)) match = 1; if (match) break; } if (!match) { if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, SUSER_ALLOWJAIL) != 0) return (ESRCH); } } return (0); } /*- * Determine if u1 "can see" the subject specified by u2. * Returns: 0 for permitted, an errno value otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_cansee(struct ucred *u1, struct ucred *u2) { int error; if ((error = prison_check(u1, u2))) return (error); #ifdef MAC if ((error = mac_check_cred_visible(u1, u2))) return (error); #endif if ((error = cr_seeotheruids(u1, u2))) return (error); if ((error = cr_seeothergids(u1, u2))) return (error); return (0); } /*- * Determine if td "can see" the subject specified by p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect p->p_ucred must be held. td really * should be curthread. * References: td and p must be valid for the lifetime of the call */ int p_cansee(struct thread *td, struct proc *p) { /* Wrap cr_cansee() for all functionality. */ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); return (cr_cansee(td->td_ucred, p->p_ucred)); } /* * 'conservative_signals' prevents the delivery of a broad class of * signals by unprivileged processes to processes that have changed their * credentials since the last invocation of execve(). This can prevent * the leakage of cached information or retained privileges as a result * of a common class of signal-related vulnerabilities. However, this * may interfere with some applications that expect to be able to * deliver these signals to peer processes after having given up * privilege. */ static int conservative_signals = 1; SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW, &conservative_signals, 0, "Unprivileged processes prevented from " "sending certain signals to processes whose credentials have changed"); /*- * Determine whether cred may deliver the specified signal to proc. * Returns: 0 for permitted, an errno value otherwise. * Locks: A lock must be held for proc. * References: cred and proc must be valid for the lifetime of the call. */ int cr_cansignal(struct ucred *cred, struct proc *proc, int signum) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); /* * Jail semantics limit the scope of signalling to proc in the * same jail as cred, if cred is in jail. */ error = prison_check(cred, proc->p_ucred); if (error) return (error); #ifdef MAC if ((error = mac_check_proc_signal(cred, proc, signum))) return (error); #endif if ((error = cr_seeotheruids(cred, proc->p_ucred))) return (error); if ((error = cr_seeothergids(cred, proc->p_ucred))) return (error); /* * UNIX signal semantics depend on the status of the P_SUGID * bit on the target process. If the bit is set, then additional * restrictions are placed on the set of available signals. */ if (conservative_signals && (proc->p_flag & P_SUGID)) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: /* * Generally, permit job and terminal control * signals. */ break; default: /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, SUSER_ALLOWJAIL); if (error) return (error); } } /* * Generally, the target credential's ruid or svuid must match the * subject credential's ruid or euid. */ if (cred->cr_ruid != proc->p_ucred->cr_ruid && cred->cr_ruid != proc->p_ucred->cr_svuid && cred->cr_uid != proc->p_ucred->cr_ruid && cred->cr_uid != proc->p_ucred->cr_svuid) { /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, SUSER_ALLOWJAIL); if (error) return (error); } return (0); } /*- * Determine whether td may deliver the specified signal to p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must be * held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansignal(struct thread *td, struct proc *p, int signum) { KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); /* * UNIX signalling semantics require that processes in the same * session always be able to deliver SIGCONT to one another, * overriding the remaining protections. */ /* XXX: This will require an additional lock of some sort. */ if (signum == SIGCONT && td->td_proc->p_session == p->p_session) return (0); /* * Some compat layers use SIGTHR and higher signals for * communication between different kernel threads of the same * process, so that they expect that it's always possible to * deliver them, even for suid applications where cr_cansignal() can * deny such ability for security consideration. It should be * pretty safe to do since the only way to create two processes * with the same p_leader is via rfork(2). */ if (td->td_proc->p_leader != NULL && signum >= SIGTHR && signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader) return (0); return (cr_cansignal(td->td_ucred, p, signum)); } /*- * Determine whether td may reschedule p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansched(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_check_proc_sched(td->td_ucred, p))) return (error); #endif if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_seeothergids(td->td_ucred, p->p_ucred))) return (error); if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid && td->td_ucred->cr_uid != p->p_ucred->cr_ruid) { error = priv_check_cred(td->td_ucred, PRIV_SCHED_DIFFCRED, SUSER_ALLOWJAIL); if (error) return (error); } return (0); } /* * The 'unprivileged_proc_debug' flag may be used to disable a variety of * unprivileged inter-process debugging services, including some procfs * functionality, ptrace(), and ktrace(). In the past, inter-process * debugging has been involved in a variety of security problems, and sites * not requiring the service might choose to disable it when hardening * systems. * * XXX: Should modifying and reading this variable require locking? * XXX: data declarations should be together near the beginning of the file. */ static int unprivileged_proc_debug = 1; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW, &unprivileged_proc_debug, 0, "Unprivileged processes may use process debugging facilities"); /*- * Determine whether td may debug p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_candebug(struct thread *td, struct proc *p) { int credentialchanged, error, grpsubset, i, uidsubset; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (!unprivileged_proc_debug) { error = priv_check_cred(td->td_ucred, PRIV_DEBUG_UNPRIV, SUSER_ALLOWJAIL); if (error) return (error); } if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_check_proc_debug(td->td_ucred, p))) return (error); #endif if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_seeothergids(td->td_ucred, p->p_ucred))) return (error); /* * Is p's group set a subset of td's effective group set? This * includes p's egid, group access list, rgid, and svgid. */ grpsubset = 1; for (i = 0; i < p->p_ucred->cr_ngroups; i++) { if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { grpsubset = 0; break; } } grpsubset = grpsubset && groupmember(p->p_ucred->cr_rgid, td->td_ucred) && groupmember(p->p_ucred->cr_svgid, td->td_ucred); /* * Are the uids present in p's credential equal to td's * effective uid? This includes p's euid, svuid, and ruid. */ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && td->td_ucred->cr_uid == p->p_ucred->cr_svuid && td->td_ucred->cr_uid == p->p_ucred->cr_ruid); /* * Has the credential of the process changed since the last exec()? */ credentialchanged = (p->p_flag & P_SUGID); /* * If p's gids aren't a subset, or the uids aren't a subset, * or the credential has changed, require appropriate privilege * for td to debug p. */ if (!grpsubset || !uidsubset) { error = priv_check_cred(td->td_ucred, PRIV_DEBUG_DIFFCRED, SUSER_ALLOWJAIL); if (error) return (error); } if (credentialchanged) { error = priv_check_cred(td->td_ucred, PRIV_DEBUG_SUGID, SUSER_ALLOWJAIL); if (error) return (error); } /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); } /* * Can't trace a process that's currently exec'ing. * * XXX: Note, this is not a security policy decision, it's a * basic correctness/functionality decision. Therefore, this check * should be moved to the caller's of p_candebug(). */ if ((p->p_flag & P_INEXEC) != 0) return (EAGAIN); return (0); } /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseesocket(struct ucred *cred, struct socket *so) { int error; error = prison_check(cred, so->so_cred); if (error) return (ENOENT); #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_visible(cred, so); SOCK_UNLOCK(so); if (error) return (error); #endif if (cr_seeotheruids(cred, so->so_cred)) return (ENOENT); if (cr_seeothergids(cred, so->so_cred)) return (ENOENT); return (0); } /*- * Determine whether td can wait for the exit of p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_canwait(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_check_proc_wait(td->td_ucred, p))) return (error); #endif #if 0 /* XXXMAC: This could have odd effects on some shells. */ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); #endif return (0); } /* * Allocate a zeroed cred structure. */ struct ucred * crget(void) { register struct ucred *cr; MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); refcount_init(&cr->cr_ref, 1); #ifdef MAC mac_init_cred(cr); #endif return (cr); } /* * Claim another reference to a ucred structure. */ struct ucred * crhold(struct ucred *cr) { refcount_acquire(&cr->cr_ref); return (cr); } /* * Free a cred structure. Throws away space when ref count gets to 0. */ void crfree(struct ucred *cr) { KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); if (refcount_release(&cr->cr_ref)) { /* * Some callers of crget(), such as nfs_statfs(), * allocate a temporary credential, but don't * allocate a uidinfo structure. */ if (cr->cr_uidinfo != NULL) uifree(cr->cr_uidinfo); if (cr->cr_ruidinfo != NULL) uifree(cr->cr_ruidinfo); /* * Free a prison, if any. */ if (jailed(cr)) prison_free(cr->cr_prison); #ifdef MAC mac_destroy_cred(cr); #endif FREE(cr, M_CRED); } } /* * Check to see if this ucred is shared. */ int crshared(struct ucred *cr) { return (cr->cr_ref > 1); } /* * Copy a ucred's contents from a template. Does not block. */ void crcopy(struct ucred *dest, struct ucred *src) { KASSERT(crshared(dest) == 0, ("crcopy of shared ucred")); bcopy(&src->cr_startcopy, &dest->cr_startcopy, (unsigned)((caddr_t)&src->cr_endcopy - (caddr_t)&src->cr_startcopy)); uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); if (jailed(dest)) prison_hold(dest->cr_prison); #ifdef MAC mac_copy_cred(src, dest); #endif } /* * Dup cred struct to a new held one. */ struct ucred * crdup(struct ucred *cr) { struct ucred *newcr; newcr = crget(); crcopy(newcr, cr); return (newcr); } /* * Fill in a struct xucred based on a struct ucred. */ void cru2x(struct ucred *cr, struct xucred *xcr) { bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = cr->cr_uid; xcr->cr_ngroups = cr->cr_ngroups; bcopy(cr->cr_groups, xcr->cr_groups, sizeof(cr->cr_groups)); } /* * small routine to swap a thread's current ucred for the correct one taken * from the process. */ void cred_update_thread(struct thread *td) { struct proc *p; struct ucred *cred; p = td->td_proc; cred = td->td_ucred; PROC_LOCK(p); td->td_ucred = crhold(p->p_ucred); PROC_UNLOCK(p); if (cred != NULL) crfree(cred); } /* * Get login name, if available. */ #ifndef _SYS_SYSPROTO_H_ struct getlogin_args { char *namebuf; u_int namelen; }; #endif /* ARGSUSED */ int getlogin(struct thread *td, struct getlogin_args *uap) { int error; char login[MAXLOGNAME]; struct proc *p = td->td_proc; if (uap->namelen > MAXLOGNAME) uap->namelen = MAXLOGNAME; PROC_LOCK(p); SESS_LOCK(p->p_session); bcopy(p->p_session->s_login, login, uap->namelen); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); error = copyout(login, uap->namebuf, uap->namelen); return(error); } /* * Set login name. */ #ifndef _SYS_SYSPROTO_H_ struct setlogin_args { char *namebuf; }; #endif /* ARGSUSED */ int setlogin(struct thread *td, struct setlogin_args *uap) { struct proc *p = td->td_proc; int error; char logintmp[MAXLOGNAME]; error = priv_check_cred(td->td_ucred, PRIV_PROC_SETLOGIN, SUSER_ALLOWJAIL); if (error) return (error); error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL); if (error == ENAMETOOLONG) error = EINVAL; else if (!error) { PROC_LOCK(p); SESS_LOCK(p->p_session); (void) memcpy(p->p_session->s_login, logintmp, sizeof(logintmp)); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); } return (error); } void setsugid(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_SUGID; if (!(p->p_pfsflags & PF_ISUGID)) p->p_stops = 0; } /*- * Change a process's effective uid. * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_euid(struct ucred *newcred, struct uidinfo *euip) { newcred->cr_uid = euip->ui_uid; uihold(euip); uifree(newcred->cr_uidinfo); newcred->cr_uidinfo = euip; } /*- * Change a process's effective gid. * Side effects: newcred->cr_gid will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_egid(struct ucred *newcred, gid_t egid) { newcred->cr_groups[0] = egid; } /*- * Change a process's real uid. * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo * will be updated, and the old and new cr_ruidinfo proc * counts will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_ruid(struct ucred *newcred, struct uidinfo *ruip) { (void)chgproccnt(newcred->cr_ruidinfo, -1, 0); newcred->cr_ruid = ruip->ui_uid; uihold(ruip); uifree(newcred->cr_ruidinfo); newcred->cr_ruidinfo = ruip; (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); } /*- * Change a process's real gid. * Side effects: newcred->cr_rgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_rgid(struct ucred *newcred, gid_t rgid) { newcred->cr_rgid = rgid; } /*- * Change a process's saved uid. * Side effects: newcred->cr_svuid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svuid(struct ucred *newcred, uid_t svuid) { newcred->cr_svuid = svuid; } /*- * Change a process's saved gid. * Side effects: newcred->cr_svgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svgid(struct ucred *newcred, gid_t svgid) { newcred->cr_svgid = svgid; } Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 167231) +++ head/sys/kern/kern_resource.c (revision 167232) @@ -1,1260 +1,1257 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct mtx uihashtbl_mtx; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); static int donice(struct thread *td, struct proc *chgp, int n); static struct uidinfo *uilookup(uid_t uid); /* * Resource controls and accounting. */ - #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif int getpriority(td, uap) struct thread *td; register struct getpriority_args *uap; { struct proc *p; struct pgrp *pg; int error, low; error = 0; low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) low = td->td_proc->p_nice; else { p = pfind(uap->who); if (p == NULL) break; if (p_cansee(td, p) == 0) low = p->p_nice; PROC_UNLOCK(p); } break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (!p_cansee(td, p)) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* Do not bother to check PRS_NEW processes */ if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); if (!p_cansee(td, p) && p->p_ucred->cr_uid == uap->who) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (low == PRIO_MAX + 1 && error == 0) error = ESRCH; td->td_retval[0] = low; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif int setpriority(td, uap) struct thread *td; struct setpriority_args *uap; { struct proc *curp, *p; struct pgrp *pg; int found = 0, error = 0; curp = td->td_proc; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) { PROC_LOCK(curp); error = donice(td, curp, uap->prio); PROC_UNLOCK(curp); } else { p = pfind(uap->who); if (p == 0) break; if (p_cansee(td, p) == 0) error = donice(td, p, uap->prio); PROC_UNLOCK(p); } found++; break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (!p_cansee(td, p)) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_ucred->cr_uid == uap->who && !p_cansee(td, p)) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (found == 0 && error == 0) error = ESRCH; return (error); } /* * Set "nice" for a (whole) process. */ static int donice(struct thread *td, struct proc *p, int n) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = p_cansched(td, p))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); mtx_lock_spin(&sched_lock); sched_nice(p, n); mtx_unlock_spin(&sched_lock); return (0); } /* * Set realtime priority for LWP. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_thread_args { int function; lwpid_t lwpid; struct rtprio *rtp; }; #endif - int rtprio_thread(struct thread *td, struct rtprio_thread_args *uap) { struct proc *curp; struct proc *p; struct rtprio rtp; struct thread *td1; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; curp = td->td_proc; /* * Though lwpid is unique, only current process is supported * since there is no efficient way to look up a LWP yet. */ p = curp; PROC_LOCK(p); switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; mtx_lock_spin(&sched_lock); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else td1 = thread_find(p, uap->lwpid); if (td1 != NULL) pri_to_rtp(td1, &rtp); else error = ESRCH; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ if (priv_check(td, PRIV_SCHED_RTPRIO) != 0) { /* can't set realtime priority */ /* * Realtime priority has to be restricted for reasons which should be * obvious. However, for idle priority, there is a potential for * system deadlock if an idleprio process gains a lock on a resource * that other processes need (and the idleprio process can't run * due to a CPU-bound normal process). Fix me! XXX */ #if 0 if (RTP_PRIO_IS_REALTIME(rtp.type)) { #else if (rtp.type != RTP_PRIO_NORMAL) { #endif error = EPERM; break; } } mtx_lock_spin(&sched_lock); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else td1 = thread_find(p, uap->lwpid); if (td1 != NULL) error = rtp_to_pri(&rtp, td1); else error = ESRCH; mtx_unlock_spin(&sched_lock); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } /* * Set realtime priority. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif - int rtprio(td, uap) struct thread *td; /* curthread */ register struct rtprio_args *uap; { struct proc *curp; struct proc *p; struct thread *tdp; struct rtprio rtp; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; curp = td->td_proc; if (uap->pid == 0) { p = curp; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; mtx_lock_spin(&sched_lock); /* * Return OUR priority if no pid specified, * or if one is, report the highest priority * in the process. There isn't much more you can do as * there is only room to return a single priority. * XXXKSE: maybe need a new interface to report * priorities of multiple system scope threads. * Note: specifying our own pid is not the same * as leaving it zero. */ if (uap->pid == 0) { pri_to_rtp(td, &rtp); } else { struct rtprio rtp2; rtp.type = RTP_PRIO_IDLE; rtp.prio = RTP_PRIO_MAX; FOREACH_THREAD_IN_PROC(p, tdp) { pri_to_rtp(tdp, &rtp2); if (rtp2.type < rtp.type || (rtp2.type == rtp.type && rtp2.prio < rtp.prio)) { rtp.type = rtp2.type; rtp.prio = rtp2.prio; } } } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ if (priv_check(td, PRIV_SCHED_RTPRIO) != 0) { /* can't set someone else's */ if (uap->pid) { error = EPERM; break; } /* can't set realtime priority */ /* * Realtime priority has to be restricted for reasons which should be * obvious. However, for idle priority, there is a potential for * system deadlock if an idleprio process gains a lock on a resource * that other processes need (and the idleprio process can't run * due to a CPU-bound normal process). Fix me! XXX */ #if 0 if (RTP_PRIO_IS_REALTIME(rtp.type)) { #else if (rtp.type != RTP_PRIO_NORMAL) { #endif error = EPERM; break; } } /* * If we are setting our own priority, set just our * thread but if we are doing another process, * do all the threads on that process. If we * specify our own pid we do the latter. */ mtx_lock_spin(&sched_lock); if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { FOREACH_THREAD_IN_PROC(p, td) { if ((error = rtp_to_pri(&rtp, td)) != 0) break; } } mtx_unlock_spin(&sched_lock); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } int rtp_to_pri(struct rtprio *rtp, struct thread *td) { u_char newpri; mtx_assert(&sched_lock, MA_OWNED); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: newpri = PRI_MIN_REALTIME + rtp->prio; break; case RTP_PRIO_NORMAL: newpri = PRI_MIN_TIMESHARE + rtp->prio; break; case RTP_PRIO_IDLE: newpri = PRI_MIN_IDLE + rtp->prio; break; default: return (EINVAL); } sched_class(td, rtp->type); /* XXX fix */ sched_user_prio(td, newpri); if (curthread == td) sched_prio(curthread, td->td_user_pri); /* XXX dubious */ return (0); } void pri_to_rtp(struct thread *td, struct rtprio *rtp) { mtx_assert(&sched_lock, MA_OWNED); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; break; case PRI_TIMESHARE: rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE; break; case PRI_IDLE: rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE; break; default: break; } rtp->type = td->td_pri_class; } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int osetrlimit(td, uap) struct thread *td; register struct osetrlimit_args *uap; { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; error = kern_setrlimit(td, uap->which, &lim); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int ogetrlimit(td, uap) struct thread *td; register struct ogetrlimit_args *uap; { struct orlimit olim; struct rlimit rl; struct proc *p; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); p = td->td_proc; PROC_LOCK(p); lim_rlimit(p, uap->which, &rl); PROC_UNLOCK(p); /* * XXX would be more correct to convert only RLIM_INFINITY to the * old RLIM_INFINITY and fail with EOVERFLOW for other larger * values. Most 64->32 and 32->16 conversions, including not * unimportant ones of uids are even more broken than what we * do here (they blindly truncate). We don't do this correctly * here since we have little experience with EOVERFLOW yet. * Elsewhere, getuid() can't fail... */ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur; olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max; error = copyout(&olim, uap->rlp, sizeof(olim)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif int setrlimit(td, uap) struct thread *td; register struct __setrlimit_args *uap; { struct rlimit alim; int error; if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit)))) return (error); error = kern_setrlimit(td, uap->which, &alim); return (error); } int kern_setrlimit(td, which, limp) struct thread *td; u_int which; struct rlimit *limp; { struct plimit *newlim, *oldlim; struct proc *p; register struct rlimit *alimp; rlim_t oldssiz; int error; if (which >= RLIM_NLIMITS) return (EINVAL); /* * Preserve historical bugs by treating negative limits as unsigned. */ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; oldssiz = 0; p = td->td_proc; newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; alimp = &oldlim->pl_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = priv_check_cred(td->td_ucred, PRIV_PROC_SETRLIMIT, SUSER_ALLOWJAIL))) { PROC_UNLOCK(p); lim_free(newlim); return (error); } if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; lim_copy(newlim, oldlim); alimp = &newlim->pl_rlimit[which]; switch (which) { case RLIMIT_CPU: mtx_lock_spin(&sched_lock); p->p_cpulimit = limp->rlim_cur; mtx_unlock_spin(&sched_lock); break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) limp->rlim_cur = maxdsiz; if (limp->rlim_max > maxdsiz) limp->rlim_max = maxdsiz; break; case RLIMIT_STACK: if (limp->rlim_cur > maxssiz) limp->rlim_cur = maxssiz; if (limp->rlim_max > maxssiz) limp->rlim_max = maxssiz; oldssiz = alimp->rlim_cur; break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; if (limp->rlim_cur < 1) limp->rlim_cur = 1; if (limp->rlim_max < 1) limp->rlim_max = 1; break; } *alimp = *limp; p->p_limit = newlim; PROC_UNLOCK(p); lim_free(oldlim); if (which == RLIMIT_STACK) { /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. */ if (limp->rlim_cur != oldssiz) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > oldssiz) { prot = p->p_sysent->sv_stackprot; size = limp->rlim_cur - oldssiz; addr = p->p_sysent->sv_usrstack - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = oldssiz - limp->rlim_cur; addr = p->p_sysent->sv_usrstack - oldssiz; } addr = trunc_page(addr); size = round_page(size); (void)vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE); } } /* * The data size limit may need to be changed to a value * that makes sense for the 32 bit binary. */ if (p->p_sysent->sv_fixlimits != NULL) p->p_sysent->sv_fixlimits(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int getrlimit(td, uap) struct thread *td; register struct __getrlimit_args *uap; { struct rlimit rlim; struct proc *p; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); p = td->td_proc; PROC_LOCK(p); lim_rlimit(p, uap->which, &rlim); PROC_UNLOCK(p); error = copyout(&rlim, uap->rlp, sizeof(struct rlimit)); return (error); } /* * Transform the running time and tick information for children of proc p * into user and system time usage. */ void calccru(p, up, sp) struct proc *p; struct timeval *up; struct timeval *sp; { PROC_LOCK_ASSERT(p, MA_OWNED); calcru1(p, &p->p_crux, up, sp); } /* * Transform the running time and tick information in proc p into user * and system time usage. If appropriate, include the current time slice * on this CPU. */ void calcru(struct proc *p, struct timeval *up, struct timeval *sp) { struct rusage_ext rux; struct thread *td; uint64_t u; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_lock_spin(&sched_lock); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ if (curthread->td_proc == p) { td = curthread; u = cpu_ticks(); p->p_rux.rux_runtime += u - PCPU_GET(switchtime); PCPU_SET(switchtime, u); p->p_rux.rux_uticks += td->td_uticks; td->td_uticks = 0; p->p_rux.rux_iticks += td->td_iticks; td->td_iticks = 0; p->p_rux.rux_sticks += td->td_sticks; td->td_sticks = 0; } /* Work on a copy of p_rux so we can let go of sched_lock */ rux = p->p_rux; mtx_unlock_spin(&sched_lock); calcru1(p, &rux, up, sp); /* Update the result from the p_rux copy */ p->p_rux.rux_uu = rux.rux_uu; p->p_rux.rux_su = rux.rux_su; p->p_rux.rux_tu = rux.rux_tu; } static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp) { /* {user, system, interrupt, total} {ticks, usec}: */ u_int64_t ut, uu, st, su, it, tt, tu; ut = ruxp->rux_uticks; st = ruxp->rux_sticks; it = ruxp->rux_iticks; tt = ut + st + it; if (tt == 0) { /* Avoid divide by zero */ st = 1; tt = 1; } tu = cputick2usec(ruxp->rux_runtime); if ((int64_t)tu < 0) { /* XXX: this should be an assert /phk */ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n", (intmax_t)tu, p->p_pid, p->p_comm); tu = ruxp->rux_tu; } if (tu >= ruxp->rux_tu) { /* * The normal case, time increased. * Enforce monotonicity of bucketed numbers. */ uu = (tu * ut) / tt; if (uu < ruxp->rux_uu) uu = ruxp->rux_uu; su = (tu * st) / tt; if (su < ruxp->rux_su) su = ruxp->rux_su; } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) { /* * When we calibrate the cputicker, it is not uncommon to * see the presumably fixed frequency increase slightly over * time as a result of thermal stabilization and NTP * discipline (of the reference clock). We therefore ignore * a bit of backwards slop because we expect to catch up * shortly. We use a 3 microsecond limit to catch low * counts and a 1% limit for high counts. */ uu = ruxp->rux_uu; su = ruxp->rux_su; tu = ruxp->rux_tu; } else { /* tu < ruxp->rux_tu */ /* * What happene here was likely that a laptop, which ran at * a reduced clock frequency at boot, kicked into high gear. * The wisdom of spamming this message in that case is * dubious, but it might also be indicative of something * serious, so lets keep it and hope laptops can be made * more truthful about their CPU speed via ACPI. */ printf("calcru: runtime went backwards from %ju usec " "to %ju usec for pid %d (%s)\n", (uintmax_t)ruxp->rux_tu, (uintmax_t)tu, p->p_pid, p->p_comm); uu = (tu * ut) / tt; su = (tu * st) / tt; } ruxp->rux_uu = uu; ruxp->rux_su = su; ruxp->rux_tu = tu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif int getrusage(td, uap) register struct thread *td; register struct getrusage_args *uap; { struct rusage ru; int error; error = kern_getrusage(td, uap->who, &ru); if (error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_getrusage(td, who, rup) struct thread *td; int who; struct rusage *rup; { struct proc *p; p = td->td_proc; PROC_LOCK(p); switch (who) { case RUSAGE_SELF: *rup = p->p_stats->p_ru; calcru(p, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_CHILDREN: *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); break; default: PROC_UNLOCK(p); return (EINVAL); } PROC_UNLOCK(p); return (0); } void ruadd(ru, rux, ru2, rux2) struct rusage *ru; struct rusage_ext *rux; struct rusage *ru2; struct rusage_ext *rux2; { register long *ip, *ip2; register int i; rux->rux_runtime += rux2->rux_runtime; rux->rux_uticks += rux2->rux_uticks; rux->rux_sticks += rux2->rux_sticks; rux->rux_iticks += rux2->rux_iticks; rux->rux_uu += rux2->rux_uu; rux->rux_su += rux2->rux_su; rux->rux_tu += rux2->rux_tu; if (ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } /* * Allocate a new resource limits structure and initialize its * reference count and mutex pointer. */ struct plimit * lim_alloc() { struct plimit *limp; limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK); refcount_init(&limp->pl_refcnt, 1); return (limp); } struct plimit * lim_hold(limp) struct plimit *limp; { refcount_acquire(&limp->pl_refcnt); return (limp); } void lim_free(limp) struct plimit *limp; { KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow")); if (refcount_release(&limp->pl_refcnt)) free((void *)limp, M_PLIMIT); } /* * Make a copy of the plimit structure. * We share these structures copy-on-write after fork. */ void lim_copy(dst, src) struct plimit *dst, *src; { KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit")); bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit)); } /* * Return the hard limit for a particular system resource. The * which parameter specifies the index into the rlimit array. */ rlim_t lim_max(struct proc *p, int which) { struct rlimit rl; lim_rlimit(p, which, &rl); return (rl.rlim_max); } /* * Return the current (soft) limit for a particular system resource. * The which parameter which specifies the index into the rlimit array */ rlim_t lim_cur(struct proc *p, int which) { struct rlimit rl; lim_rlimit(p, which, &rl); return (rl.rlim_cur); } /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'. */ void lim_rlimit(struct proc *p, int which, struct rlimit *rlp) { PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = p->p_limit->pl_rlimit[which]; } /* * Find the uidinfo structure for a uid. This structure is used to * track the total resource consumption (process count, socket buffer * size, etc.) for the uid and impose limits. */ void uihashinit() { uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF); } /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_mtx must be locked. */ static struct uidinfo * uilookup(uid) uid_t uid; { struct uihashhead *uipp; struct uidinfo *uip; mtx_assert(&uihashtbl_mtx, MA_OWNED); uipp = UIHASH(uid); LIST_FOREACH(uip, uipp, ui_hash) if (uip->ui_uid == uid) break; return (uip); } /* * Find or allocate a struct uidinfo for a particular uid. * Increase refcount on uidinfo struct returned. * uifree() should be called on a struct uidinfo when released. */ struct uidinfo * uifind(uid) uid_t uid; { struct uidinfo *old_uip, *uip; mtx_lock(&uihashtbl_mtx); uip = uilookup(uid); if (uip == NULL) { mtx_unlock(&uihashtbl_mtx); uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO); mtx_lock(&uihashtbl_mtx); /* * There's a chance someone created our uidinfo while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate uidinfo. */ if ((old_uip = uilookup(uid)) != NULL) { /* Someone else beat us to it. */ free(uip, M_UIDINFO); uip = old_uip; } else { uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep); uip->ui_uid = uid; LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash); } } uihold(uip); mtx_unlock(&uihashtbl_mtx); return (uip); } /* * Place another refcount on a uidinfo struct. */ void uihold(uip) struct uidinfo *uip; { UIDINFO_LOCK(uip); uip->ui_ref++; UIDINFO_UNLOCK(uip); } /*- * Since uidinfo structs have a long lifetime, we use an * opportunistic refcounting scheme to avoid locking the lookup hash * for each release. * * If the refcount hits 0, we need to free the structure, * which means we need to lock the hash. * Optimal case: * After locking the struct and lowering the refcount, if we find * that we don't need to free, simply unlock and return. * Suboptimal case: * If refcount lowering results in need to free, bump the count * back up, lose the lock and aquire the locks in the proper * order to try again. */ void uifree(uip) struct uidinfo *uip; { /* Prepare for optimal case. */ UIDINFO_LOCK(uip); if (--uip->ui_ref != 0) { UIDINFO_UNLOCK(uip); return; } /* Prepare for suboptimal case. */ uip->ui_ref++; UIDINFO_UNLOCK(uip); mtx_lock(&uihashtbl_mtx); UIDINFO_LOCK(uip); /* * We must subtract one from the count again because we backed out * our initial subtraction before dropping the lock. * Since another thread may have added a reference after we dropped the * initial lock we have to test for zero again. */ if (--uip->ui_ref == 0) { LIST_REMOVE(uip, ui_hash); mtx_unlock(&uihashtbl_mtx); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %jd\n", uip->ui_uid, (intmax_t)uip->ui_sbsize); if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); UIDINFO_UNLOCK(uip); FREE(uip, M_UIDINFO); return; } mtx_unlock(&uihashtbl_mtx); UIDINFO_UNLOCK(uip); } /* * Change the count associated with number of processes * a given user is using. When 'max' is 0, don't enforce a limit */ int chgproccnt(uip, diff, max) struct uidinfo *uip; int diff; int max; { UIDINFO_LOCK(uip); /* Don't allow them to exceed max, but allow subtraction. */ if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) { UIDINFO_UNLOCK(uip); return (0); } uip->ui_proccnt += diff; if (uip->ui_proccnt < 0) printf("negative proccnt for uid = %d\n", uip->ui_uid); UIDINFO_UNLOCK(uip); return (1); } /* * Change the total socket buffer size a user has used. */ int chgsbsize(uip, hiwat, to, max) struct uidinfo *uip; u_int *hiwat; u_int to; rlim_t max; { rlim_t new; UIDINFO_LOCK(uip); new = uip->ui_sbsize + to - *hiwat; /* Don't allow them to exceed max, but allow subtraction. */ if (to > *hiwat && new > max) { UIDINFO_UNLOCK(uip); return (0); } uip->ui_sbsize = new; UIDINFO_UNLOCK(uip); *hiwat = to; if (new < 0) printf("negative sbsize for uid = %d\n", uip->ui_uid); return (1); } Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 167231) +++ head/sys/kern/kern_sig.c (revision 167232) @@ -1,3299 +1,3289 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ static int coredump(struct thread *); static char *expand_name(const char *, uid_t, pid_t); static int killpg1(struct thread *td, int sig, int pgid, int all); static int issignal(struct thread *p); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static void sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); #ifdef KSE static int do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *); #endif static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo); SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ #define SA_PROC 0x80 /* deliverable to any thread */ static int sigproptbl[NSIG] = { SA_KILL|SA_PROC, /* SIGHUP */ SA_KILL|SA_PROC, /* SIGINT */ SA_KILL|SA_CORE|SA_PROC, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE|SA_PROC, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL|SA_PROC, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL|SA_PROC, /* SIGPIPE */ SA_KILL|SA_PROC, /* SIGALRM */ SA_KILL|SA_PROC, /* SIGTERM */ SA_IGNORE|SA_PROC, /* SIGURG */ SA_STOP|SA_PROC, /* SIGSTOP */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTSTP */ SA_IGNORE|SA_CONT|SA_PROC, /* SIGCONT */ SA_IGNORE|SA_PROC, /* SIGCHLD */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTIN */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTOU */ SA_IGNORE|SA_PROC, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL|SA_PROC, /* SIGVTALRM */ SA_KILL|SA_PROC, /* SIGPROF */ SA_IGNORE|SA_PROC, /* SIGWINCH */ SA_IGNORE|SA_PROC, /* SIGINFO */ SA_KILL|SA_PROC, /* SIGUSR1 */ SA_KILL|SA_PROC, /* SIGUSR2 */ }; static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_kill, signo)) { count++; SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if ((si->ksi_flags & KSI_TRAP) != 0) { if (ret != 0) SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } if (ret != 0) return (ret); out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); } void sigqueue_collect_set(sigqueue_t *sq, sigset_t *set) { ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); TAILQ_FOREACH(ksi, &sq->sq_list, ksi_link) SIGADDSET(*set, ksi->ksi_signo); SIGSETOR(*set, sq->sq_kill); } void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp) { sigset_t tmp, set; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); /* * make a copy, this allows setp to point to src or dst * sq_signals without trouble. */ set = *setp; p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_signals; SIGSETAND(tmp, set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); /* Finally, rescan src queue and set pending bits for it */ sigqueue_collect_set(src, &src->sq_signals); } void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } void sigqueue_delete_set(sigqueue_t *sq, sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_signals, *set); /* Finally, rescan queue and set pending bits for it */ sigqueue_collect_set(sq, &sq->sq_signals); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ void sigqueue_delete_set_proc(struct proc *p, sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); mtx_unlock_spin(&sched_lock); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). - * - * MP SAFE. */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); mtx_assert(&sched_lock, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { struct proc *p; #ifdef KSE sigset_t set, saved; #else sigset_t set; #endif p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If our mask changed we may have to move signal that were * previously masked by all threads to our sigqueue. */ set = p->p_sigqueue.sq_signals; #ifdef KSE if (p->p_flag & P_SA) saved = p->p_sigqueue.sq_signals; #endif SIGSETNAND(set, td->td_sigmask); if (! SIGISEMPTY(set)) sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set); if (SIGPENDING(td)) { mtx_lock_spin(&sched_lock); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; mtx_unlock_spin(&sched_lock); } #ifdef KSE if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) { /* pending set changed */ p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } } #endif } int sigonstack(size_t sp) { struct thread *td = curthread; return ((td->td_pflags & TDP_ALTSTACK) ? #if defined(COMPAT_43) ((td->td_sigstk.ss_size == 0) ? (td->td_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)) #else ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < NSIG) return (sigproptbl[_SIG_IDX(sig)]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(td, sig, act, oact, flags) struct thread *td; register int sig; struct sigaction *act, *oact; int flags; { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; oact->sa_flags = 0; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) oact->sa_flags |= SA_SIGINFO; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (act->sa_flags & SA_SIGINFO) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!(act->sa_flags & SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (act->sa_flags & SA_ONSTACK) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (act->sa_flags & SA_RESETHAND) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (act->sa_flags & SA_NODEFER) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { #ifdef KSE if ((p->p_flag & P_SA) && SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) { p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } #endif /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sigaction(td, uap) struct thread *td; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(td, uap) struct thread *td; register struct freebsd4_sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(td, uap) struct thread *td; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(td, uap) struct thread *td; struct osigreturn_args *uap; { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(ps->ps_sigignore, i); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); SIGDELSET(ps->ps_sigcatch, sig); if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); sigqueue_delete_proc(p, sig); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(td, how, set, oset, old) struct thread *td; int how; sigset_t *set, *oset; int old; { int error; PROC_LOCK(td->td_proc); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); SIGSETOR(td->td_sigmask, *set); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); break; case SIG_SETMASK: SIG_CANTMASK(*set); if (old) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; signotify(td); break; default: error = EINVAL; break; } } PROC_UNLOCK(td->td_proc); return (error); } -/* - * sigprocmask() - MP SAFE - */ - #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sigprocmask(td, uap) register struct thread *td; struct sigprocmask_args *uap; { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ -/* - * osigprocmask() - MP SAFE - */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(td, uap) register struct thread *td; struct osigprocmask_args *uap; { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t savedmask; struct proc *p; int error, sig, hz, i, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; sig = 0; SIG_CANTMASK(waitset); PROC_LOCK(p); ps = p->p_sigacts; savedmask = td->td_sigmask; if (timeout) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); ets = rts; timespecadd(&ets, timeout); } } restart: for (i = 1; i <= _SIG_MAXSIG; ++i) { if (!SIGISMEMBER(waitset, i)) continue; if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) { if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) { #ifdef KSE if (p->p_flag & P_SA) { p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } #endif sigqueue_move(&p->p_sigqueue, &td->td_sigqueue, i); } else continue; } SIGFILLSET(td->td_sigmask); SIG_CANTMASK(td->td_sigmask); SIGDELSET(td->td_sigmask, i); mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); if (sig) goto out; else { /* * Because cursig() may have stopped current thread, * after it is resumed, things may have already been * changed, it should rescan any pending signals. */ goto restart; } } if (error) goto out; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout) { if (!timevalid) { error = EINVAL; goto out; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; goto out; } ts = ets; timespecsub(&ts, &rts); TIMESPEC_TO_TIMEVAL(&tv, &ts); hz = tvtohz(&tv); } else hz = 0; td->td_sigmask = savedmask; SIGSETNAND(td->td_sigmask, waitset); signotify(td); error = msleep(&ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", hz); if (timeout) { if (error == ERESTART) { /* timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* will calculate timeout by ourself. */ error = 0; } } goto restart; out: td->td_sigmask = savedmask; signotify(td); if (sig) { ksiginfo_init(ksi); sigqueue_get(&td->td_sigqueue, sig, ksi); ksi->ksi_signo = sig; if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); error = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, 0); } #endif if (sig == SIGKILL) sigexit(td, sig); } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sigpending(td, uap) struct thread *td; struct sigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(td, uap) struct thread *td; struct osigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(td, uap) struct thread *td; register struct osigvec_args *uap; { struct sigvec vec; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(td, uap) register struct thread *td; struct osigblock_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETOR(td->td_sigmask, set); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(td, uap) struct thread *td; struct osigsetmask_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETLO(td->td_sigmask, set); signotify(td); PROC_UNLOCK(p); return (0); } #endif /* COMPAT_43 */ /* - * Suspend calling thread until signal, providing mask to be set - * in the meantime. + * Suspend calling thread until signal, providing mask to be set in the + * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sigsuspend(td, uap) struct thread *td; struct sigsuspend_args *uap; { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; SIG_CANTMASK(mask); td->td_sigmask = mask; signotify(td); while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(td, uap) struct thread *td; struct osigsuspend_args *uap; { struct proc *p = td->td_proc; sigset_t mask; PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; OSIG2SIG(uap->mask, mask); SIG_CANTMASK(mask); SIGSETLO(td->td_sigmask, mask); signotify(td); while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(td, uap) struct thread *td; register struct osigstack_args *uap; { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sigaltstack(td, uap) struct thread *td; register struct sigaltstack_args *uap; { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(td, sig, pgid, all) register struct thread *td; int sig, pgid, all; { register struct proc *p; struct pgrp *pgrp; int nfound = 0; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW ) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (nfound ? 0 : ESRCH); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int kill(td, uap) register struct thread *td; register struct kill_args *uap; { register struct proc *p; int error; AUDIT_ARG(signum, uap->signum); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); if (uap->pid > 0) { /* kill single process */ if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } AUDIT_ARG(process, p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } AUDIT_ARG(pid, uap->pid); switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0)); } /* NOTREACHED */ } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(td, uap) struct thread *td; register struct okillpg_args *uap; { AUDIT_ARG(signum, uap->signum); AUDIT_ARG(pid, uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); return (killpg1(td, uap->signum, uap->pgid, 0)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif - int sigqueue(struct thread *td, struct sigqueue_args *uap) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (uap->pid <= 0) return (EINVAL); if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum != 0) { ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value.sival_ptr = uap->value; error = tdsignal(p, NULL, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(pgid, sig) int pgid, sig; { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(pgrp, sig, checkctty) struct pgrp *pgrp; int sig, checkctty; { register struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (checkctty == 0 || p->p_flag & P_CONTROLT) psignal(p, sig); PROC_UNLOCK(p); } } } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; #ifdef KSE int error; #endif int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); #ifdef KSE if (td->td_pflags & TDP_SA) { if (td->td_mailbox == NULL) thread_user_enter(td); PROC_LOCK(p); SIGDELSET(td->td_sigmask, sig); mtx_lock_spin(&sched_lock); /* * Force scheduling an upcall, so UTS has chance to * process the signal before thread runs again in * userland. */ if (td->td_upcall) td->td_upcall->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); } else { PROC_LOCK(p); } #else PROC_LOCK(p); #endif ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { p->p_stats->p_ru.ru_nsignals++; #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif #ifdef KSE if (!(td->td_pflags & TDP_SA)) (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); #else (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); #endif #ifdef KSE else if (td->td_mailbox == NULL) { mtx_unlock(&ps->ps_mtx); /* UTS caused a sync signal */ p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ sigexit(td, sig); } else { mtx_unlock(&ps->ps_mtx); SIGADDSET(td->td_sigmask, sig); PROC_UNLOCK(p); error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig, sizeof(siginfo_t)); PROC_LOCK(p); /* UTS memory corrupted */ if (error) sigexit(td, SIGSEGV); mtx_lock(&ps->ps_mtx); } #endif SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if current thread can handle the signal without * switching conetxt to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); mtx_unlock_spin(&sched_lock); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. */ void psignal(struct proc *p, int sig) { (void) tdsignal(p, NULL, sig, NULL); } int psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td = NULL; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue")); /* * ksi_code and other fields should be set before * calling this function. */ ksi->ksi_signo = sigev->sigev_signo; ksi->ksi_value = sigev->sigev_value; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = thread_find(p, sigev->sigev_notify_thread_id); if (td == NULL) return (ESRCH); } return (tdsignal(p, td, ksi->ksi_signo, ksi)); } int tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { #ifdef KSE sigset_t saved; int ret; if (p->p_flag & P_SA) saved = p->p_sigqueue.sq_signals; ret = do_tdsignal(p, td, sig, ksi); if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) { /* pending set changed */ p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } } return (ret); } static int do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { #endif sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) #ifdef KSE panic("do_tdsignal(): invalid signal %d", sig); #else panic("tdsignal(): invalid signal %d", sig); #endif #ifdef KSE KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue")); #else KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue")); #endif /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); /* * If the signal is blocked and not destined for this thread, then * assign it to the process so that we can find it later in the first * thread that unblocks it. Otherwise, assign it to this thread now. */ if (td == NULL) { td = sigtd(p, sig, prop); if (SIGISMEMBER(td->td_sigmask, sig)) sigqueue = &p->p_sigqueue; else sigqueue = &td->td_sigqueue; } else { KASSERT(td->td_proc == p, ("invalid thread")); sigqueue = &td->td_sigqueue; } /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SA_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SA_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SA_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* * SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediatly, such as * waking up threads so that they can cross the user boundary. * We try do the per-process part here. */ if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. All the threads should be * either winding down or already on the suspended queue. */ if (p->p_flag & P_TRACED) { /* * The traced process is already stopped, * so no further action is necessary. * No signal can restart us. */ goto out; } if (sig == SIGKILL) { /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SA_CONT) { /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; if (p->p_numthreads == p->p_suspcount) { p->p_flag |= P_CONTINUED; p->p_xstat = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); } if (action == SIG_DFL) { sigqueue_delete(sigqueue, sig); } else if (action == SIG_CATCH) { #ifdef KSE /* * The process wants to catch it so it needs * to run at least one thread, but which one? * It would seem that the answer would be to * run an upcall in the next KSE to run, and * deliver the signal that way. In a NON KSE * process, we need to make sure that the * single thread is runnable asap. * XXXKSE for now however, make them all run. */ #else /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ #endif goto runfast; } /* * The signal is not ignored or caught. */ mtx_lock_spin(&sched_lock); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); goto out; } if (prop & SA_STOP) { /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ mtx_lock_spin(&sched_lock); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) sleepq_abort(td, intrval); mtx_unlock_spin(&sched_lock); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { mtx_lock_spin(&sched_lock); tdsigwakeup(td, sig, action, intrval); mtx_unlock_spin(&sched_lock); goto out; } MPASS(action == SIG_DFL); if (prop & SA_STOP) { if (p->p_flag & P_PPWAIT) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); mtx_unlock_spin(&sched_lock); sigqueue_delete_proc(p, p->p_xstat); } else mtx_unlock_spin(&sched_lock); goto out; } else goto runfast; /* NOTREACHED */ } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: mtx_lock_spin(&sched_lock); tdsigwakeup(td, sig, action, intrval); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); out: /* If we jump here, sched_lock should not be owned. */ mtx_assert(&sched_lock, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); prop = sigprop(sig); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. */ if (action == SIG_DFL && (prop & SA_KILL)) { if (p->p_nice > 0) sched_nice(td->td_proc, 0); if (td->td_priority > PUSER) sched_prio(td, PUSER); } if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) return; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { mtx_unlock_spin(&sched_lock); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); mtx_lock_spin(&sched_lock); return; } /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER) sched_prio(td, PUSER); sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } } static void sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td2) { if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR) && !TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } else { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } } } int ptracestop(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Stopping for traced signal"); mtx_lock_spin(&sched_lock); td->td_flags |= TDF_XSIG; mtx_unlock_spin(&sched_lock); td->td_xsig = sig; while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) { if (p->p_flag & P_SINGLE_EXIT) { mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_XSIG; mtx_unlock_spin(&sched_lock); return (sig); } /* * Just make wait() to work, the last stopped thread * will win. */ p->p_xstat = sig; p->p_xthread = td; p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE); mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 0); stopme: thread_stopped(p); thread_suspend_one(td); PROC_UNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL, NULL); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); if (!(p->p_flag & P_TRACED)) break; if (td->td_flags & TDF_DBSUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; mtx_lock_spin(&sched_lock); goto stopme; } } return (td->td_xsig); } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(td) struct thread *td; { struct proc *p; struct sigacts *ps; sigset_t sigpending; int sig, prop, newsig; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETNAND(sigpending, td->td_sigmask); if (p->p_flag & P_PPWAIT) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); sig = sig_ffs(&sigpending); if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); #ifdef KSE if (td->td_pflags & TDP_SA) SIGADDSET(td->td_sigmask, sig); #endif continue; } if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { /* * If traced, always stop. */ mtx_unlock(&ps->ps_mtx); newsig = ptracestop(td, sig); mtx_lock(&ps->ps_mtx); #ifdef KSE if (td->td_pflags & TDP_SA) SIGADDSET(td->td_sigmask, sig); #endif if (sig != newsig) { ksiginfo_t ksi; /* * clear old signal. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ sigqueue_get(&td->td_sigqueue, sig, &ksi); /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; * otherwise we just look for signals again. */ if (newsig == 0) continue; sig = newsig; /* * Put the new signal into td_sigqueue. If the * signal is being masked, look for other signals. */ SIGADDSET(td->td_sigqueue.sq_signals, sig); #ifdef KSE if (td->td_pflags & TDP_SA) SIGDELSET(td->td_sigmask, sig); #endif if (SIGISMEMBER(td->td_sigmask, sig)) continue; signotify(td); } /* * If the traced bit got turned off, go back up * to the top to rescan signals. This ensures * that p_sig* and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) continue; } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process * with default action, stop here, * then clear the signal. However, * if process is member of an orphaned * process group, ignore tty stop signals. */ if (prop & SA_STOP) { if (p->p_flag & P_TRACED || (p->p_pgrp->pg_jobc == 0 && prop & SA_TTYSTOP)) break; /* == ignore */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 0); thread_stopped(p); thread_suspend_one(td); PROC_UNLOCK(p); DROP_GIANT(); mi_switch(SW_INVOL, NULL); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SA_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { mtx_unlock_spin(&sched_lock); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); } } /* * Take the action for the specified signal * from the current set of pending signals. */ void postsig(sig) register int sig; { struct thread *td = curthread; register struct proc *p = td->td_proc; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; int code; KASSERT(sig != 0, ("postsig")); PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); sigqueue_get(&td->td_sigqueue, sig, &ksi); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, 0); #endif if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } #ifdef KSE if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) { #else if (action == SIG_DFL) { #endif /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); sigexit(td, sig); /* NOTREACHED */ } else { #ifdef KSE if (td->td_pflags & TDP_SA) { if (sig == SIGKILL) { mtx_unlock(&ps->ps_mtx); sigexit(td, sig); } } #endif /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig), ("postsig action")); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } p->p_stats->p_ru.ru_nsignals++; if (p->p_sig != sig) { code = 0; } else { code = p->p_code; p->p_code = 0; p->p_sig = 0; } #ifdef KSE if (td->td_pflags & TDP_SA) thread_signal_add(curthread, &ksi); else (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); #else (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); #endif } } /* * Kill the current process for stated reason. */ void killproc(p, why) struct proc *p; char *why; { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(td, sig) struct thread *td; int sig; { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, W_EXITCODE(0, sig)); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int status) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, status); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xstat); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason; int status = p->p_xstat; /* convert to int */ reason = CLD_EXITED; if (WCOREDUMP(status)) reason = CLD_DUMPED; else if (WIFSIGNALED(status)) reason = CLD_KILLED; /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } static char corefilename[MAXPATHLEN] = {"%N.core"}; SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, sizeof(corefilename), "process corefile name format string"); /* * expand_name(name, uid, pid) * Expand the name described in corefilename, using name, uid, and pid. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static char * expand_name(name, uid, pid) const char *name; uid_t uid; pid_t pid; { const char *format, *appendstr; char *temp; char buf[11]; /* Buffer for pid/uid -- max 4B */ size_t i, l, n; format = corefilename; temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); if (temp == NULL) return (NULL); for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': appendstr = "%"; break; case 'N': /* process name */ appendstr = name; break; case 'P': /* process id */ sprintf(buf, "%u", pid); appendstr = buf; break; case 'U': /* user id */ sprintf(buf, "%u", uid); appendstr = buf; break; default: appendstr = ""; log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); } l = strlen(appendstr); if ((n + l) >= MAXPATHLEN) goto toolong; memcpy(temp + n, appendstr, l); n += l; break; default: temp[n++] = format[i]; } } if (format[i] != '\0') goto toolong; return (temp); toolong: log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n", (long)pid, name, (u_long)uid); free(temp, M_TEMP); return (NULL); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; register struct vnode *vp; register struct ucred *cred = td->td_ucred; struct flock lf; struct nameidata nd; struct vattr vattr; int error, error1, flags, locked; struct mount *mp; char *name; /* name of corefile */ off_t limit; int vfslocked; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(p, RLIMIT_CORE); PROC_UNLOCK(p); if (limit == 0) return (EFBIG); restart: name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid); if (name == NULL) return (EINVAL); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td); flags = O_CREAT | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, -1); free(name, M_TEMP); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* Don't dump to non-regular files or files with links. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) { VOP_UNLOCK(vp, 0, td); error = EFAULT; goto close; } VOP_UNLOCK(vp, 0, td); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { lf.l_type = F_UNLCK; if (locked) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); if ((error = vn_close(vp, FWRITE, cred, td)) != 0) goto out; if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; VFS_UNLOCK_GIANT(vfslocked); goto restart; } VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VOP_LEASE(vp, td, cred, LEASE_WRITE); VOP_SETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); error = p->p_sysent->sv_coredump ? p->p_sysent->sv_coredump(td, vp, limit) : ENOSYS; if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } close: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; out: VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(td, args) struct thread *td; struct nosys_args *args; { struct proc *p = td->td_proc; PROC_LOCK(p); psignal(p, SIGSYS); PROC_UNLOCK(p); return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(sigiop, sig, checkctty) struct sigio **sigiop; int sig, checkctty; { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(&p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(&p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); ps->ps_refcnt = 1; mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt--; if (ps->ps_refcnt == 0) { mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } else mtx_unlock(&ps->ps_mtx); } struct sigacts * sigacts_hold(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt++; mtx_unlock(&ps->ps_mtx); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { int shared; mtx_lock(&ps->ps_mtx); shared = ps->ps_refcnt > 1; mtx_unlock(&ps->ps_mtx); return (shared); } Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 167231) +++ head/sys/kern/kern_synch.c (revision 167232) @@ -1,577 +1,577 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL) int hogticks; int lbolt; static int pause_wchan; static struct callout loadav_callout; static struct callout lbolt_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ static int fscale __unused = FSCALE; SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); static void loadav(void *arg); static void lboltcb(void *arg); void sleepinit(void) { hogticks = (hz / 10) * 2; /* Default only. */ init_sleepqueues(); } /* * General sleep call. Suspends the current thread until a wakeup is * performed on the specified identifier. The thread will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The mutex argument is unlocked before the caller is suspended, and * re-locked before msleep returns. If priority includes the PDROP * flag the mutex is not re-locked before returning. */ int msleep(ident, mtx, priority, wmesg, timo) void *ident; struct mtx *mtx; int priority, timo; const char *wmesg; { struct thread *td; struct proc *p; int catch, rval, flags, pri; WITNESS_SAVE_DECL(mtx); td = curthread; p = td->td_proc; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0); #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, mtx == NULL ? NULL : &mtx->mtx_object, "Sleeping on \"%s\"", wmesg); KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL || ident == &lbolt, ("sleeping without a mutex")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); if (cold) { /* * During autoconfiguration, just return; * don't run any other threads or panic below, * in case this is the idle thread and already asleep. * XXX: this used to do "s = splhigh(); splx(safepri); * splx(s);" to give interrupts a chance, but there is * no way to give interrupts a chance now. */ if (mtx != NULL && priority & PDROP) mtx_unlock(mtx); return (0); } catch = priority & PCATCH; rval = 0; /* * If we are already on a sleep queue, then remove us from that * sleep queue first. We have to do this to handle recursive * sleeps. */ if (TD_ON_SLEEPQ(td)) sleepq_remove(td, td->td_wchan); if (ident == &pause_wchan) flags = SLEEPQ_PAUSE; else flags = SLEEPQ_MSLEEP; if (catch) flags |= SLEEPQ_INTERRUPTIBLE; sleepq_lock(ident); CTR5(KTR_PROC, "msleep: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, p->p_comm, wmesg, ident); DROP_GIANT(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->mtx_object, mtx); mtx_unlock(mtx); } /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, * and a wakeup or a SIGCONT (or both) could occur while we were * stopped without resuming us. Thus, we must be ready for sleep * when cursig() is called. If the wakeup happens while we're * stopped, then td will no longer be on a sleep queue upon * return from cursig(). */ sleepq_add(ident, ident == &lbolt ? NULL : &mtx->mtx_object, wmesg, flags, 0); if (timo) sleepq_set_timeout(ident, timo); /* * Adjust this thread's priority, if necessary. */ pri = priority & PRIMASK; if (pri != 0 && pri != td->td_priority) { mtx_lock_spin(&sched_lock); sched_prio(td, pri); mtx_unlock_spin(&sched_lock); } if (timo && catch) rval = sleepq_timedwait_sig(ident); else if (timo) rval = sleepq_timedwait(ident); else if (catch) rval = sleepq_wait_sig(ident); else { sleepq_wait(ident); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); #endif PICKUP_GIANT(); if (mtx != NULL && !(priority & PDROP)) { mtx_lock(mtx); WITNESS_RESTORE(&mtx->mtx_object, mtx); } return (rval); } int msleep_spin(ident, mtx, wmesg, timo) void *ident; struct mtx *mtx; const char *wmesg; int timo; { struct thread *td; struct proc *p; int rval; WITNESS_SAVE_DECL(mtx); td = curthread; p = td->td_proc; KASSERT(mtx != NULL, ("sleeping without a mutex")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); if (cold) { /* * During autoconfiguration, just return; * don't run any other threads or panic below, * in case this is the idle thread and already asleep. * XXX: this used to do "s = splhigh(); splx(safepri); * splx(s);" to give interrupts a chance, but there is * no way to give interrupts a chance now. */ return (0); } sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, p->p_comm, wmesg, ident); DROP_GIANT(); mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->mtx_object, mtx); mtx_unlock_spin(mtx); /* * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->mtx_object, wmesg, SLEEPQ_MSLEEP, 0); if (timo) sleepq_set_timeout(ident, timo); /* * Can't call ktrace with any spin locks held so it can lock the * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold * any spin lock. Thus, we have to drop the sleepq spin lock while * we handle those requests. This is safe since we have placed our * thread on the sleep queue already. */ #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { sleepq_release(ident); ktrcsw(1, 0); sleepq_lock(ident); } #endif #ifdef WITNESS sleepq_release(ident); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"", wmesg); sleepq_lock(ident); #endif if (timo) rval = sleepq_timedwait(ident); else { sleepq_wait(ident); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); #endif PICKUP_GIANT(); mtx_lock_spin(mtx); WITNESS_RESTORE(&mtx->mtx_object, mtx); return (rval); } /* * pause() is like tsleep() except that the intention is to not be * explicitly woken up by another thread. Instead, the current thread * simply wishes to sleep until the timeout expires. It is * implemented using a dummy wait channel. */ int pause(wmesg, timo) const char *wmesg; int timo; { KASSERT(timo != 0, ("pause: timeout required")); return (tsleep(&pause_wchan, 0, wmesg, timo)); } /* * Make all threads sleeping on the specified identifier runnable. */ void wakeup(ident) register void *ident; { sleepq_lock(ident); sleepq_broadcast(ident, SLEEPQ_MSLEEP, -1, 0); } /* * Make a thread sleeping on the specified identifier runnable. * May wake more than one thread if a target thread is currently * swapped out. */ void wakeup_one(ident) register void *ident; { sleepq_lock(ident); sleepq_signal(ident, SLEEPQ_MSLEEP, -1, 0); } /* * The machine independent parts of context switching. */ void mi_switch(int flags, struct thread *newtd) { uint64_t new_switchtime; struct thread *td; struct proc *p; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1 || (td->td_critnest == 2 && (td->td_owepreempt) && (flags & SW_INVOL) != 0 && newtd == NULL) || panicstr, ("mi_switch: switch in a critical section")); KASSERT((flags & (SW_INVOL | SW_VOL)) != 0, ("mi_switch: switch must be voluntary or involuntary")); KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself")); /* * Don't perform context switches from the debugger. */ if (kdb_active) { mtx_unlock_spin(&sched_lock); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } if (flags & SW_VOL) p->p_stats->p_ru.ru_nvcsw++; else p->p_stats->p_ru.ru_nivcsw++; /* * Compute the amount of time during which the current * process was running, and add that to its total so far. */ new_switchtime = cpu_ticks(); p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime)); p->p_rux.rux_uticks += td->td_uticks; td->td_uticks = 0; p->p_rux.rux_iticks += td->td_iticks; td->td_iticks = 0; p->p_rux.rux_sticks += td->td_sticks; td->td_sticks = 0; td->td_generation++; /* bump preempt-detect counter */ /* * Check if the process exceeds its cpu resource allocation. If * it reaches the max, arrange to kill the process in ast(). */ if (p->p_cpulimit != RLIM_INFINITY && p->p_rux.rux_runtime >= p->p_cpulimit * cpu_tickrate()) { p->p_sflag |= PS_XCPU; td->td_flags |= TDF_ASTPENDING; } /* * Finish up stats for outgoing thread. */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); CTR4(KTR_PROC, "mi_switch: old thread %ld (kse %p, pid %ld, %s)", td->td_tid, td->td_sched, p->p_pid, p->p_comm); #if (KTR_COMPILE & KTR_SCHED) != 0 if (td == PCPU_GET(idlethread)) CTR3(KTR_SCHED, "mi_switch: %p(%s) prio %d idle", td, td->td_proc->p_comm, td->td_priority); else if (newtd != NULL) CTR5(KTR_SCHED, "mi_switch: %p(%s) prio %d preempted by %p(%s)", td, td->td_proc->p_comm, td->td_priority, newtd, newtd->td_proc->p_comm); else CTR6(KTR_SCHED, "mi_switch: %p(%s) prio %d inhibit %d wmesg %s lock %s", td, td->td_proc->p_comm, td->td_priority, td->td_inhibitors, td->td_wmesg, td->td_lockname); #endif /* * We call thread_switchout after the KTR_SCHED prints above so kse * selecting a new thread to run does not show up as a preemption. */ #ifdef KSE if ((flags & SW_VOL) && (td->td_proc->p_flag & P_SA)) newtd = thread_switchout(td, flags, newtd); #endif sched_switch(td, newtd, flags); CTR3(KTR_SCHED, "mi_switch: running %p(%s) prio %d", td, td->td_proc->p_comm, td->td_priority); CTR4(KTR_PROC, "mi_switch: new thread %ld (kse %p, pid %ld, %s)", td->td_tid, td->td_sched, p->p_pid, p->p_comm); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. */ void setrunnable(struct thread *td) { struct proc *p; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); default: break; } switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return; case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return; /* XXX: intentional fall-through ? */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((p->p_sflag & PS_INMEM) == 0) { if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; /* * due to a LOR between sched_lock and * the sleepqueue chain locks, use * lower level scheduling functions. */ kick_proc0(); } } else sched_wakeup(td); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. * XXXKSE Needs complete rewrite when correct info is available. * Completely Bogus.. only works with 1:1 (but compiles ok now :-) */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; nrun = sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), loadav, NULL); } static void lboltcb(void *arg) { wakeup(&lbolt); callout_reset(&lbolt_callout, hz, lboltcb, NULL); } /* ARGSUSED */ static void synch_setup(dummy) void *dummy; { callout_init(&loadav_callout, CALLOUT_MPSAFE); callout_init(&lbolt_callout, CALLOUT_MPSAFE); /* Kick off timeout driven events by calling first time. */ loadav(NULL); lboltcb(NULL); } /* - * General purpose yield system call + * General purpose yield system call. */ int yield(struct thread *td, struct yield_args *uap) { mtx_assert(&Giant, MA_NOTOWNED); (void)uap; sched_relinquish(td); return (0); } Index: head/sys/kern/kern_sysctl.c =================================================================== --- head/sys/kern/kern_sysctl.c (revision 167231) +++ head/sys/kern/kern_sysctl.c (revision 167232) @@ -1,1610 +1,1609 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); /* * Locking - this locks the sysctl tree in memory. */ static struct sx sysctllock; #define SYSCTL_LOCK() sx_xlock(&sysctllock) #define SYSCTL_UNLOCK() sx_xunlock(&sysctllock) #define SYSCTL_INIT() sx_init(&sysctllock, "sysctl lock") static int sysctl_root(SYSCTL_HANDLER_ARGS); struct sysctl_oid_list sysctl__children; /* root list */ static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SLIST_FOREACH(oidp, list, oid_link) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. */ void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p; struct sysctl_oid *q; /* * First check if another oid with the same name already * exists in the parent's list. */ p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { printf("can't re-use a leaf (%s)!\n", p->oid_name); return; } } /* * If this oid has a number OID_AUTO, give it a number which * is greater than any current oid. * NOTE: DO NOT change the starting value here, change it in * , and make sure it is at least 256 to * accomodate e.g. net.inet.raw as a static sysctl node. */ if (oidp->oid_number == OID_AUTO) { static int newoid = CTL_AUTO_START; oidp->oid_number = newoid++; if (newoid == 0x7fffffff) panic("out of oids"); } #if 0 else if (oidp->oid_number >= CTL_AUTO_START) { /* do not panic; this happens when unregistering sysctl sets */ printf("static sysctl oid too high: %d", oidp->oid_number); } #endif /* * Insert the oid into the parent's list in order. */ q = NULL; SLIST_FOREACH(p, parent, oid_link) { if (oidp->oid_number < p->oid_number) break; q = p; } if (q) SLIST_INSERT_AFTER(q, oidp, oid_link); else SLIST_INSERT_HEAD(parent, oidp, oid_link); } void sysctl_unregister_oid(struct sysctl_oid *oidp) { struct sysctl_oid *p; int error; error = ENOENT; if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp) { SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); error = 0; break; } } } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) printf("%s: failed to unregister sysctl\n", __func__); } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now... */ TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occured. * e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) return(EBUSY); /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if(e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); free(e, M_SYSCTLOID); return (0); } else return (ENOENT); } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p; int error; if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("can't remove non-dynamic nodes!\n"); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) { if (!recurse) return (ENOTEMPTY); error = sysctl_remove_oid(p, del, recurse); if (error) return (error); } if (del) free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID); } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { if (oidp->oid_descr) free((void *)(uintptr_t)(const void *)oidp->oid_descr, M_SYSCTLOID); free((void *)(uintptr_t)(const void *)oidp->oid_name, M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, int arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr) { struct sysctl_oid *oidp; ssize_t len; char *newname; /* You have to hook up somewhere.. */ if (parent == NULL) return(NULL); /* Check if the node already exists, otherwise create it */ oidp = sysctl_find_oidname(name, parent); if (oidp != NULL) { if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); return (oidp); } else { printf("can't re-use a leaf (%s)!\n", name); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; SLIST_NEXT(oidp, oid_link) = NULL; oidp->oid_number = number; oidp->oid_refcnt = 1; len = strlen(name); newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK); bcopy(name, newname, len + 1); newname[len] = '\0'; oidp->oid_name = newname; oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; if ((kind & CTLTYPE) == CTLTYPE_NODE) { /* Allocate space for children */ SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list), M_SYSCTLOID, M_WAITOK)); SLIST_INIT(SYSCTL_CHILDREN(oidp)); } else { oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; } oidp->oid_fmt = fmt; if (descr) { int len = strlen(descr) + 1; oidp->oid_descr = malloc(len, M_SYSCTLOID, M_WAITOK); if (oidp->oid_descr) strcpy((char *)(uintptr_t)(const void *)oidp->oid_descr, descr); } /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); return (oidp); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; if (oid->oid_parent == parent) return (0); oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) return (EEXIST); sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); return (0); } /* * Register the kernel's oids on startup. */ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; SYSCTL_INIT(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); } SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0); /* * "Staff-functions" * * These functions implement a presently undocumented interface * used by the sysctl program to walk the tree, and get the type * so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * * {0,0} printf the entire MIB-tree. * {0,1,...} return the name of the "..." OID. * {0,2,...} return the next OID. * {0,3} return the OID of the name in "new" * {0,4,...} return the kind & format info for the "..." OID. * {0,5,...} return the description the "..." OID. */ #ifdef SYSCTL_DEBUG static void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; struct sysctl_oid *oidp; SLIST_FOREACH(oidp, l, oid_link) { for (k=0; koid_number, oidp->oid_name); printf("%c%c", oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ? 'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( oidp->oid_arg1, i+2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_QUAD: printf(" Quad\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); return (ENOENT); } SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error = 0; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; char buf[10]; while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) return (error); namelen--; name++; continue; } lsp2 = 0; SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number != *name) continue; if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) return (error); namelen--; name++; if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oid->oid_handler) break; lsp2 = (struct sysctl_oid_list *)oid->oid_arg1; break; } lsp = lsp2; } return (SYSCTL_OUT(req, "", 1)); } static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; *len = level; SLIST_FOREACH(oidp, lsp, oid_link) { *next = oidp->oid_number; *oidpp = oidp; if (oidp->oid_kind & CTLFLAG_SKIP) continue; if (!namelen) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) /* We really should call the handler here...*/ return (0); lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, len, level+1, oidpp)) return (0); goto emptynode; } if (oidp->oid_number < *name) continue; if (oidp->oid_number > *name) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) return (0); lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); goto next; } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) continue; if (oidp->oid_handler) continue; lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); next: namelen = 1; emptynode: *len = level; } return (1); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int i, j, error; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; int newoid[CTL_MAXNAME]; i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); if (i) return (ENOENT); error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); static int name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp) { int i; struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; char *p; if (!*name) return (ENOENT); p = name + strlen(name) - 1 ; if (*p == '.') *p = '\0'; *len = 0; for (p = name; *p && *p != '.'; p++) ; i = *p; if (i == '.') *p = '\0'; oidp = SLIST_FIRST(lsp); while (oidp && *len < CTL_MAXNAME) { if (strcmp(name, oidp->oid_name)) { oidp = SLIST_NEXT(oidp, oid_link); continue; } *oid++ = oidp->oid_number; (*len)++; if (!i) { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = (struct sysctl_oid_list *)oidp->oid_arg1; oidp = SLIST_FIRST(lsp); name = p+1; for (p = name; *p && *p != '.'; p++) ; i = *p; if (i == '.') *p = '\0'; } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len; struct sysctl_oid *op = 0; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; error = name2oid(p, oid, &len, &op); free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error; error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) return (error); if (!oid->oid_fmt) return (ENOENT); error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) return (error); error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); return (error); } static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error; error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) return (error); if (!oid->oid_descr) return (ENOENT); error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); return (error); } static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, ""); /* * Default "handler" functions. */ /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on on sysctl_handle_int() convert milliseconds into ticks. */ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)oidp->oid_arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)oidp->oid_arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. arg1 points to it. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (!arg1) return (EINVAL); tmplong = *(long *)arg1; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } else #endif error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { int error=0; char *tmparg; size_t outlen; /* * Attempt to get a coherent snapshot by copying to a * temporary kernel buffer. */ retry: outlen = strlen((char *)arg1)+1; tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK); if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) { free(tmparg, M_SYSCTLTMP); goto retry; } error = SYSCTL_OUT(req, tmparg, outlen); free(tmparg, M_SYSCTLTMP); if (error || !req->newptr) return (error); if ((req->newlen - req->newidx) >= arg2) { error = EINVAL; } else { arg2 = (req->newlen - req->newidx); error = SYSCTL_IN(req, arg1, arg2); ((char *)arg1)[arg2] = '\0'; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. */ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Transfer functions to/from kernel space. * XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_LOCKED; SYSCTL_LOCK(); error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); SYSCTL_UNLOCK(); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; oid[0] = 0; /* sysctl internal magic */ oid[1] = 3; /* name2oid */ oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. */ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { int error = 0; size_t i, len, origidx; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; error = copyout(p, (char *)req->oldptr + origidx, i); } if (error) return (error); if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t i, wiredlen; char *cp, dummy; wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen; ret = 0; if (req->lock == REQ_LOCKED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } /* * Touch all the wired pages to avoid PTE modified * bit emulation traps on Alpha while holding locks * in the sysctl handler. */ for (i = (wiredlen + PAGE_SIZE - 1) / PAGE_SIZE, cp = req->oldptr; i > 0; i--, cp += PAGE_SIZE) { copyin(cp, &dummy, 1); copyout(&dummy, cp, 1); } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid *oid; int indx; oid = SLIST_FIRST(&sysctl__children); indx = 0; while (oid && indx < CTL_MAXNAME) { if (oid->oid_number == name[indx]) { indx++; if (oid->oid_kind & CTLFLAG_NOLOCK) req->lock = REQ_UNLOCKED; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; return (0); } oid = SLIST_FIRST( (struct sysctl_oid_list *)oid->oid_arg1); } else if (indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; return (0); } else { return (ENOTDIR); } } else { oid = SLIST_NEXT(oid, oid_link); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error, indx, lvl; error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) return (error); if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) return (EISDIR); } /* Is this sysctl writable? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) return (EPERM); KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) return (error); } /* Is this sysctl writable by only privileged users? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { if (oid->oid_kind & CTLFLAG_PRISON) error = priv_check_cred(req->td->td_ucred, PRIV_SYSCTL_WRITEJAIL, SUSER_ALLOWJAIL); else error = priv_check(req->td, PRIV_SYSCTL_WRITE); if (error) return (error); } if (!oid->oid_handler) return (EINVAL); if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_check_system_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) return (error); #endif error = oid->oid_handler(oid, arg1, arg2, req); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif - int __sysctl(struct thread *td, struct sysctl_args *uap) { int error, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); mtx_lock(&Giant); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) goto done2; if (uap->oldlenp) { int i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) error = i; } done2: mtx_unlock(&Giant); return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. */ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; if (old) { if (!useracc(old, req.oldlen, VM_PROT_WRITE)) return (EFAULT); req.oldptr= old; } if (new != NULL) { if (!useracc(new, req.newlen, VM_PROT_READ)) return (EFAULT); req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_LOCKED; SYSCTL_LOCK(); do { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); } while (error == EAGAIN); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); SYSCTL_UNLOCK(); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } #ifdef COMPAT_43 #include #include #define KINFO_PROC (0<<8) #define KINFO_RT (1<<8) #define KINFO_VNODE (2<<8) #define KINFO_FILE (3<<8) #define KINFO_METER (4<<8) #define KINFO_LOADAVG (5<<8) #define KINFO_CLOCKRATE (6<<8) /* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ #define KINFO_BSDI_SYSINFO (101<<8) /* * XXX this is bloat, but I hope it's better here than on the potentially * limited kernel stack... -Peter */ static struct { int bsdi_machine; /* "i386" on BSD/386 */ /* ^^^ this is an offset to the string, relative to the struct start */ char *pad0; long pad1; long pad2; long pad3; u_long pad4; u_long pad5; u_long pad6; int bsdi_ostype; /* "BSD/386" on BSD/386 */ int bsdi_osrelease; /* "1.1" on BSD/386 */ long pad7; long pad8; char *pad9; long pad10; long pad11; int pad12; long pad13; quad_t pad14; long pad15; struct timeval pad16; /* we dont set this, because BSDI's uname used gethostname() instead */ int bsdi_hostname; /* hostname on BSD/386 */ /* the actual string data is appended here */ } bsdi_si; + /* * this data is appended to the end of the bsdi_si structure during copyout. * The "char *" offsets are relative to the base of the bsdi_si struct. * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings * should not exceed the length of the buffer here... (or else!! :-) */ static char bsdi_strings[80]; /* It had better be less than this! */ #ifndef _SYS_SYSPROTO_H_ struct getkerninfo_args { int op; char *where; size_t *size; int arg; }; #endif - int ogetkerninfo(struct thread *td, struct getkerninfo_args *uap) { int error, name[6]; size_t size; u_int needed = 0; mtx_lock(&Giant); switch (uap->op & 0xff00) { case KINFO_RT: name[0] = CTL_NET; name[1] = PF_ROUTE; name[2] = 0; name[3] = (uap->op & 0xff0000) >> 16; name[4] = uap->op & 0xff; name[5] = uap->arg; error = userland_sysctl(td, name, 6, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_VNODE: name[0] = CTL_KERN; name[1] = KERN_VNODE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_PROC: name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = uap->op & 0xff; name[3] = uap->arg; error = userland_sysctl(td, name, 4, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_FILE: name[0] = CTL_KERN; name[1] = KERN_FILE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_METER: name[0] = CTL_VM; name[1] = VM_TOTAL; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_LOADAVG: name[0] = CTL_VM; name[1] = VM_LOADAVG; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_CLOCKRATE: name[0] = CTL_KERN; name[1] = KERN_CLOCKRATE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_BSDI_SYSINFO: { /* * this is pretty crude, but it's just enough for uname() * from BSDI's 1.x libc to work. * * *size gives the size of the buffer before the call, and * the amount of data copied after a successful call. * If successful, the return value is the amount of data * available, which can be larger than *size. * * BSDI's 2.x product apparently fails with ENOMEM if *size * is too small. */ u_int left; char *s; bzero((char *)&bsdi_si, sizeof(bsdi_si)); bzero(bsdi_strings, sizeof(bsdi_strings)); s = bsdi_strings; bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, ostype); s += strlen(s) + 1; bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, osrelease); s += strlen(s) + 1; bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, machine); s += strlen(s) + 1; needed = sizeof(bsdi_si) + (s - bsdi_strings); if ((uap->where == NULL) || (uap->size == NULL)) { /* process is asking how much buffer to supply.. */ size = needed; error = 0; break; } if ((error = copyin(uap->size, &size, sizeof(size))) != 0) break; /* if too much buffer supplied, trim it down */ if (size > needed) size = needed; /* how much of the buffer is remaining */ left = size; if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) break; /* is there any point in continuing? */ if (left > sizeof(bsdi_si)) { left -= sizeof(bsdi_si); error = copyout(&bsdi_strings, uap->where + sizeof(bsdi_si), left); } break; } default: error = EOPNOTSUPP; break; } if (error == 0) { td->td_retval[0] = needed ? needed : size; if (uap->size) { error = copyout(&size, uap->size, sizeof(size)); } } mtx_unlock(&Giant); return (error); } #endif /* COMPAT_43 */ Index: head/sys/kern/kern_time.c =================================================================== --- head/sys/kern/kern_time.c (revision 167231) +++ head/sys/kern/kern_time.c (revision 167232) @@ -1,1510 +1,1500 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAX_CLOCKS (CLOCK_MONOTONIC+1) static struct kclock posix_clocks[MAX_CLOCKS]; static uma_zone_t itimer_zone = NULL; /* * Time of day and interval timer support. * * These routines provide the kernel entry points to get and set * the time-of-day and per-process interval timers. Subroutines * here provide support for adding and subtracting timeval structures * and decrementing interval timers, optionally reloading the interval * timers when they expire. */ static int settime(struct thread *, struct timeval *); static void timevalfix(struct timeval *); static void no_lease_updatetime(int); static void itimer_start(void); static int itimer_init(void *, int, int); static void itimer_fini(void *, int); static void itimer_enter(struct itimer *); static void itimer_leave(struct itimer *); static struct itimer *itimer_find(struct proc *, int); static void itimers_alloc(struct proc *); static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp); static void itimers_event_hook_exit(void *arg, struct proc *p); static int realtimer_create(struct itimer *); static int realtimer_gettime(struct itimer *, struct itimerspec *); static int realtimer_settime(struct itimer *, int, struct itimerspec *, struct itimerspec *); static int realtimer_delete(struct itimer *); static void realtimer_clocktime(clockid_t, struct timespec *); static void realtimer_expire(void *); static int kern_timer_create(struct thread *, clockid_t, struct sigevent *, int *, int); static int kern_timer_delete(struct thread *, int); int register_posix_clock(int, struct kclock *); void itimer_fire(struct itimer *it); int itimespecfix(struct timespec *ts); #define CLOCK_CALL(clock, call, arglist) \ ((*posix_clocks[clock].call) arglist) SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL); static void no_lease_updatetime(deltat) int deltat; { } void (*lease_updatetime)(int) = no_lease_updatetime; static int settime(struct thread *td, struct timeval *tv) { struct timeval delta, tv1, tv2; static struct timeval maxtime, laststep; struct timespec ts; int s; s = splclock(); microtime(&tv1); delta = *tv; timevalsub(&delta, &tv1); /* * If the system is secure, we do not allow the time to be * set to a value earlier than 1 second less than the highest * time we have yet seen. The worst a miscreant can do in * this circumstance is "freeze" time. He couldn't go * back to the past. * * We similarly do not allow the clock to be stepped more * than one second, nor more than once per second. This allows * a miscreant to make the clock march double-time, but no worse. */ if (securelevel_gt(td->td_ucred, 1) != 0) { if (delta.tv_sec < 0 || delta.tv_usec < 0) { /* * Update maxtime to latest time we've seen. */ if (tv1.tv_sec > maxtime.tv_sec) maxtime = tv1; tv2 = *tv; timevalsub(&tv2, &maxtime); if (tv2.tv_sec < -1) { tv->tv_sec = maxtime.tv_sec - 1; printf("Time adjustment clamped to -1 second\n"); } } else { if (tv1.tv_sec == laststep.tv_sec) { splx(s); return (EPERM); } if (delta.tv_sec > 1) { tv->tv_sec = tv1.tv_sec + 1; printf("Time adjustment clamped to +1 second\n"); } laststep = *tv; } } ts.tv_sec = tv->tv_sec; ts.tv_nsec = tv->tv_usec * 1000; mtx_lock(&Giant); tc_setclock(&ts); (void) splsoftclock(); lease_updatetime(delta.tv_sec); splx(s); resettodr(); mtx_unlock(&Giant); return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_gettime_args { clockid_t clock_id; struct timespec *tp; }; #endif - /* ARGSUSED */ int clock_gettime(struct thread *td, struct clock_gettime_args *uap) { struct timespec ats; int error; error = kern_clock_gettime(td, uap->clock_id, &ats); if (error == 0) error = copyout(&ats, uap->tp, sizeof(ats)); return (error); } int kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval sys, user; struct proc *p; p = td->td_proc; switch (clock_id) { case CLOCK_REALTIME: /* Default to precise. */ case CLOCK_REALTIME_PRECISE: nanotime(ats); break; case CLOCK_REALTIME_FAST: getnanotime(ats); break; case CLOCK_VIRTUAL: PROC_LOCK(p); calcru(p, &user, &sys); PROC_UNLOCK(p); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_PROF: PROC_LOCK(p); calcru(p, &user, &sys); PROC_UNLOCK(p); timevaladd(&user, &sys); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_MONOTONIC: /* Default to precise. */ case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: nanouptime(ats); break; case CLOCK_UPTIME_FAST: case CLOCK_MONOTONIC_FAST: getnanouptime(ats); break; case CLOCK_SECOND: ats->tv_sec = time_second; ats->tv_nsec = 0; break; default: return (EINVAL); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_settime_args { clockid_t clock_id; const struct timespec *tp; }; #endif - /* ARGSUSED */ int clock_settime(struct thread *td, struct clock_settime_args *uap) { struct timespec ats; int error; if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0) return (error); return (kern_clock_settime(td, uap->clock_id, &ats)); } int kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval atv; int error; #ifdef MAC error = mac_check_system_settime(td->td_ucred); if (error) return (error); #endif if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0) return (error); if (clock_id != CLOCK_REALTIME) return (EINVAL); if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000) return (EINVAL); /* XXX Don't convert nsec->usec and back */ TIMESPEC_TO_TIMEVAL(&atv, ats); error = settime(td, &atv); return (error); } #ifndef _SYS_SYSPROTO_H_ struct clock_getres_args { clockid_t clock_id; struct timespec *tp; }; #endif - int clock_getres(struct thread *td, struct clock_getres_args *uap) { struct timespec ts; int error; if (uap->tp == NULL) return (0); error = kern_clock_getres(td, uap->clock_id, &ts); if (error == 0) error = copyout(&ts, uap->tp, sizeof(ts)); return (error); } int kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts) { ts->tv_sec = 0; switch (clock_id) { case CLOCK_REALTIME: case CLOCK_REALTIME_FAST: case CLOCK_REALTIME_PRECISE: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_FAST: case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_FAST: case CLOCK_UPTIME_PRECISE: /* * Round up the result of the division cheaply by adding 1. * Rounding up is especially important if rounding down * would give 0. Perfect rounding is unimportant. */ ts->tv_nsec = 1000000000 / tc_getfrequency() + 1; break; case CLOCK_VIRTUAL: case CLOCK_PROF: /* Accurately round up here because we can do so cheaply. */ ts->tv_nsec = (1000000000 + hz - 1) / hz; break; case CLOCK_SECOND: ts->tv_sec = 1; ts->tv_nsec = 0; break; default: return (EINVAL); } return (0); } static int nanowait; int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt) { struct timespec ts, ts2, ts3; struct timeval tv; int error; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) return (EINVAL); if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0)) return (0); getnanouptime(&ts); timespecadd(&ts, rqt); TIMESPEC_TO_TIMEVAL(&tv, rqt); for (;;) { error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", tvtohz(&tv)); getnanouptime(&ts2); if (error != EWOULDBLOCK) { if (error == ERESTART) error = EINTR; if (rmt != NULL) { timespecsub(&ts, &ts2); if (ts.tv_sec < 0) timespecclear(&ts); *rmt = ts; } return (error); } if (timespeccmp(&ts2, &ts, >=)) return (0); ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); } } #ifndef _SYS_SYSPROTO_H_ struct nanosleep_args { struct timespec *rqtp; struct timespec *rmtp; }; #endif - /* ARGSUSED */ int nanosleep(struct thread *td, struct nanosleep_args *uap) { struct timespec rmt, rqt; int error; error = copyin(uap->rqtp, &rqt, sizeof(rqt)); if (error) return (error); if (uap->rmtp && !useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE)) return (EFAULT); error = kern_nanosleep(td, &rqt, &rmt); if (error && uap->rmtp) { int error2; error2 = copyout(&rmt, uap->rmtp, sizeof(rmt)); if (error2) error = error2; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct gettimeofday_args { struct timeval *tp; struct timezone *tzp; }; #endif /* ARGSUSED */ int gettimeofday(struct thread *td, struct gettimeofday_args *uap) { struct timeval atv; struct timezone rtz; int error = 0; if (uap->tp) { microtime(&atv); error = copyout(&atv, uap->tp, sizeof (atv)); } if (error == 0 && uap->tzp != NULL) { rtz.tz_minuteswest = tz_minuteswest; rtz.tz_dsttime = tz_dsttime; error = copyout(&rtz, uap->tzp, sizeof (rtz)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct settimeofday_args { struct timeval *tv; struct timezone *tzp; }; #endif /* ARGSUSED */ int settimeofday(struct thread *td, struct settimeofday_args *uap) { struct timeval atv, *tvp; struct timezone atz, *tzp; int error; if (uap->tv) { error = copyin(uap->tv, &atv, sizeof(atv)); if (error) return (error); tvp = &atv; } else tvp = NULL; if (uap->tzp) { error = copyin(uap->tzp, &atz, sizeof(atz)); if (error) return (error); tzp = &atz; } else tzp = NULL; return (kern_settimeofday(td, tvp, tzp)); } int kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp) { int error; #ifdef MAC error = mac_check_system_settime(td->td_ucred); if (error) return (error); #endif error = priv_check(td, PRIV_SETTIMEOFDAY); if (error) return (error); /* Verify all parameters before changing time. */ if (tv) { if (tv->tv_usec < 0 || tv->tv_usec >= 1000000) return (EINVAL); error = settime(td, tv); } if (tzp && error == 0) { tz_minuteswest = tzp->tz_minuteswest; tz_dsttime = tzp->tz_dsttime; } return (error); } /* - * Get value of an interval timer. The process virtual and - * profiling virtual time timers are kept in the p_stats area, since - * they can be swapped out. These are kept internally in the - * way they are specified externally: in time until they expire. + * Get value of an interval timer. The process virtual and profiling virtual + * time timers are kept in the p_stats area, since they can be swapped out. + * These are kept internally in the way they are specified externally: in + * time until they expire. * - * The real time interval timer is kept in the process table slot - * for the process, and its value (it_value) is kept as an - * absolute time rather than as a delta, so that it is easy to keep - * periodic real-time signals from drifting. + * The real time interval timer is kept in the process table slot for the + * process, and its value (it_value) is kept as an absolute time rather than + * as a delta, so that it is easy to keep periodic real-time signals from + * drifting. * * Virtual time timers are processed in the hardclock() routine of - * kern_clock.c. The real time timer is processed by a timeout - * routine, called from the softclock() routine. Since a callout - * may be delayed in real time due to interrupt processing in the system, - * it is possible for the real time timeout routine (realitexpire, given below), - * to be delayed in real time past when it is supposed to occur. It - * does not suffice, therefore, to reload the real timer .it_value from the - * real time timers .it_interval. Rather, we compute the next time in - * absolute time the timer should go off. + * kern_clock.c. The real time timer is processed by a timeout routine, + * called from the softclock() routine. Since a callout may be delayed in + * real time due to interrupt processing in the system, it is possible for + * the real time timeout routine (realitexpire, given below), to be delayed + * in real time past when it is supposed to occur. It does not suffice, + * therefore, to reload the real timer .it_value from the real time timers + * .it_interval. Rather, we compute the next time in absolute time the timer + * should go off. */ #ifndef _SYS_SYSPROTO_H_ struct getitimer_args { u_int which; struct itimerval *itv; }; #endif int getitimer(struct thread *td, struct getitimer_args *uap) { struct itimerval aitv; int error; error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); return (copyout(&aitv, uap->itv, sizeof (struct itimerval))); } int kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv) { struct proc *p = td->td_proc; struct timeval ctv; if (which > ITIMER_PROF) return (EINVAL); if (which == ITIMER_REAL) { /* * Convert from absolute to relative time in .it_value * part of real time timer. If time for real time timer * has passed return 0, else return difference between * current time and time for the timer to go off. */ PROC_LOCK(p); *aitv = p->p_realtimer; PROC_UNLOCK(p); if (timevalisset(&aitv->it_value)) { getmicrouptime(&ctv); if (timevalcmp(&aitv->it_value, &ctv, <)) timevalclear(&aitv->it_value); else timevalsub(&aitv->it_value, &ctv); } } else { mtx_lock_spin(&sched_lock); *aitv = p->p_stats->p_timer[which]; mtx_unlock_spin(&sched_lock); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct setitimer_args { u_int which; struct itimerval *itv, *oitv; }; #endif - int setitimer(struct thread *td, struct setitimer_args *uap) { struct itimerval aitv, oitv; int error; if (uap->itv == NULL) { uap->itv = uap->oitv; return (getitimer(td, (struct getitimer_args *)uap)); } if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval)))) return (error); error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); return (copyout(&oitv, uap->oitv, sizeof(struct itimerval))); } int kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv, struct itimerval *oitv) { struct proc *p = td->td_proc; struct timeval ctv; if (aitv == NULL) return (kern_getitimer(td, which, oitv)); if (which > ITIMER_PROF) return (EINVAL); if (itimerfix(&aitv->it_value)) return (EINVAL); if (!timevalisset(&aitv->it_value)) timevalclear(&aitv->it_interval); else if (itimerfix(&aitv->it_interval)) return (EINVAL); if (which == ITIMER_REAL) { PROC_LOCK(p); if (timevalisset(&p->p_realtimer.it_value)) callout_stop(&p->p_itcallout); getmicrouptime(&ctv); if (timevalisset(&aitv->it_value)) { callout_reset(&p->p_itcallout, tvtohz(&aitv->it_value), realitexpire, p); timevaladd(&aitv->it_value, &ctv); } *oitv = p->p_realtimer; p->p_realtimer = *aitv; PROC_UNLOCK(p); if (timevalisset(&oitv->it_value)) { if (timevalcmp(&oitv->it_value, &ctv, <)) timevalclear(&oitv->it_value); else timevalsub(&oitv->it_value, &ctv); } } else { mtx_lock_spin(&sched_lock); *oitv = p->p_stats->p_timer[which]; p->p_stats->p_timer[which] = *aitv; mtx_unlock_spin(&sched_lock); } return (0); } /* * Real interval timer expired: * send process whose timer expired an alarm signal. * If time is not set up to reload, then just return. * Else compute next time timer should go off which is > current time. * This is where delay in processing this timeout causes multiple * SIGALRM calls to be compressed into one. * tvtohz() always adds 1 to allow for the time until the next clock * interrupt being strictly less than 1 clock tick, but we don't want * that here since we want to appear to be in sync with the clock * interrupt even when we're delayed. */ void realitexpire(void *arg) { struct proc *p; struct timeval ctv, ntv; p = (struct proc *)arg; PROC_LOCK(p); psignal(p, SIGALRM); if (!timevalisset(&p->p_realtimer.it_interval)) { timevalclear(&p->p_realtimer.it_value); if (p->p_flag & P_WEXIT) wakeup(&p->p_itcallout); PROC_UNLOCK(p); return; } for (;;) { timevaladd(&p->p_realtimer.it_value, &p->p_realtimer.it_interval); getmicrouptime(&ctv); if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) { ntv = p->p_realtimer.it_value; timevalsub(&ntv, &ctv); callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1, realitexpire, p); PROC_UNLOCK(p); return; } } /*NOTREACHED*/ } /* * Check that a proposed value to load into the .it_value or * .it_interval part of an interval timer is acceptable, and * fix it to have at least minimal value (i.e. if it is less * than the resolution of the clock, round it up.) */ int itimerfix(struct timeval *tv) { if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) return (EINVAL); if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) tv->tv_usec = tick; return (0); } /* * Decrement an interval timer by a specified number * of microseconds, which must be less than a second, * i.e. < 1000000. If the timer expires, then reload * it. In this case, carry over (usec - old value) to * reduce the value reloaded into the timer so that * the timer does not drift. This routine assumes * that it is called in a context where the timers * on which it is operating cannot change in value. */ int itimerdecr(struct itimerval *itp, int usec) { if (itp->it_value.tv_usec < usec) { if (itp->it_value.tv_sec == 0) { /* expired, and already in next interval */ usec -= itp->it_value.tv_usec; goto expire; } itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } itp->it_value.tv_usec -= usec; usec = 0; if (timevalisset(&itp->it_value)) return (1); /* expired, exactly at end of interval */ expire: if (timevalisset(&itp->it_interval)) { itp->it_value = itp->it_interval; itp->it_value.tv_usec -= usec; if (itp->it_value.tv_usec < 0) { itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } } else itp->it_value.tv_usec = 0; /* sec is already 0 */ return (0); } /* * Add and subtract routines for timevals. * N.B.: subtract routine doesn't deal with * results which are before the beginning, * it just gets very confused in this case. * Caveat emptor. */ void timevaladd(struct timeval *t1, const struct timeval *t2) { t1->tv_sec += t2->tv_sec; t1->tv_usec += t2->tv_usec; timevalfix(t1); } void timevalsub(struct timeval *t1, const struct timeval *t2) { t1->tv_sec -= t2->tv_sec; t1->tv_usec -= t2->tv_usec; timevalfix(t1); } static void timevalfix(struct timeval *t1) { if (t1->tv_usec < 0) { t1->tv_sec--; t1->tv_usec += 1000000; } if (t1->tv_usec >= 1000000) { t1->tv_sec++; t1->tv_usec -= 1000000; } } /* * ratecheck(): simple time-based rate-limit checking. */ int ratecheck(struct timeval *lasttime, const struct timeval *mininterval) { struct timeval tv, delta; int rv = 0; getmicrouptime(&tv); /* NB: 10ms precision */ delta = tv; timevalsub(&delta, lasttime); /* * check for 0,0 is so that the message will be seen at least once, * even if interval is huge. */ if (timevalcmp(&delta, mininterval, >=) || (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { *lasttime = tv; rv = 1; } return (rv); } /* * ppsratecheck(): packets (or events) per second limitation. * * Return 0 if the limit is to be enforced (e.g. the caller * should drop a packet because of the rate limitation). * * maxpps of 0 always causes zero to be returned. maxpps of -1 * always causes 1 to be returned; this effectively defeats rate * limiting. * * Note that we maintain the struct timeval for compatibility * with other bsd systems. We reuse the storage and just monitor * clock ticks for minimal overhead. */ int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) { int now; /* * Reset the last time and counter if this is the first call * or more than a second has passed since the last update of * lasttime. */ now = ticks; if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) { lasttime->tv_sec = now; *curpps = 1; return (maxpps != 0); } else { (*curpps)++; /* NB: ignore potential overflow */ return (maxpps < 0 || *curpps < maxpps); } } static void itimer_start(void) { struct kclock rt_clock = { .timer_create = realtimer_create, .timer_delete = realtimer_delete, .timer_settime = realtimer_settime, .timer_gettime = realtimer_gettime, .event_hook = NULL }; itimer_zone = uma_zcreate("itimer", sizeof(struct itimer), NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0); register_posix_clock(CLOCK_REALTIME, &rt_clock); register_posix_clock(CLOCK_MONOTONIC, &rt_clock); p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L); p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX); p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX); EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit, (void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec, (void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY); } int register_posix_clock(int clockid, struct kclock *clk) { if ((unsigned)clockid >= MAX_CLOCKS) { printf("%s: invalid clockid\n", __func__); return (0); } posix_clocks[clockid] = *clk; return (1); } static int itimer_init(void *mem, int size, int flags) { struct itimer *it; it = (struct itimer *)mem; mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF); return (0); } static void itimer_fini(void *mem, int size) { struct itimer *it; it = (struct itimer *)mem; mtx_destroy(&it->it_mtx); } static void itimer_enter(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); it->it_usecount++; } static void itimer_leave(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); KASSERT(it->it_usecount > 0, ("invalid it_usecount")); if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0) wakeup(it); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_create_args { clockid_t clock_id; struct sigevent * evp; int * timerid; }; #endif - int ktimer_create(struct thread *td, struct ktimer_create_args *uap) { struct sigevent *evp1, ev; int id; int error; if (uap->evp != NULL) { error = copyin(uap->evp, &ev, sizeof(ev)); if (error != 0) return (error); evp1 = &ev; } else evp1 = NULL; error = kern_timer_create(td, uap->clock_id, evp1, &id, -1); if (error == 0) { error = copyout(&id, uap->timerid, sizeof(int)); if (error != 0) kern_timer_delete(td, id); } return (error); } static int kern_timer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp, int *timerid, int preset_id) { struct proc *p = td->td_proc; struct itimer *it; int id; int error; if (clock_id < 0 || clock_id >= MAX_CLOCKS) return (EINVAL); if (posix_clocks[clock_id].timer_create == NULL) return (EINVAL); if (evp != NULL) { if (evp->sigev_notify != SIGEV_NONE && evp->sigev_notify != SIGEV_SIGNAL && evp->sigev_notify != SIGEV_THREAD_ID) return (EINVAL); if ((evp->sigev_notify == SIGEV_SIGNAL || evp->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(evp->sigev_signo)) return (EINVAL); } if (p->p_itimers == NULL) itimers_alloc(p); it = uma_zalloc(itimer_zone, M_WAITOK); it->it_flags = 0; it->it_usecount = 0; it->it_active = 0; timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); it->it_overrun = 0; it->it_overrun_last = 0; it->it_clockid = clock_id; it->it_timerid = -1; it->it_proc = p; ksiginfo_init(&it->it_ksi); it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT; error = CLOCK_CALL(clock_id, timer_create, (it)); if (error != 0) goto out; PROC_LOCK(p); if (preset_id != -1) { KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id")); id = preset_id; if (p->p_itimers->its_timers[id] != NULL) { PROC_UNLOCK(p); error = 0; goto out; } } else { /* * Find a free timer slot, skipping those reserved * for setitimer(). */ for (id = 3; id < TIMER_MAX; id++) if (p->p_itimers->its_timers[id] == NULL) break; if (id == TIMER_MAX) { PROC_UNLOCK(p); error = EAGAIN; goto out; } } it->it_timerid = id; p->p_itimers->its_timers[id] = it; if (evp != NULL) it->it_sigev = *evp; else { it->it_sigev.sigev_notify = SIGEV_SIGNAL; switch (clock_id) { default: case CLOCK_REALTIME: it->it_sigev.sigev_signo = SIGALRM; break; case CLOCK_VIRTUAL: it->it_sigev.sigev_signo = SIGVTALRM; break; case CLOCK_PROF: it->it_sigev.sigev_signo = SIGPROF; break; } it->it_sigev.sigev_value.sival_int = id; } if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { it->it_ksi.ksi_signo = it->it_sigev.sigev_signo; it->it_ksi.ksi_code = SI_TIMER; it->it_ksi.ksi_value = it->it_sigev.sigev_value; it->it_ksi.ksi_timerid = id; } PROC_UNLOCK(p); *timerid = id; return (0); out: ITIMER_LOCK(it); CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); uma_zfree(itimer_zone, it); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_delete_args { int timerid; }; #endif - int ktimer_delete(struct thread *td, struct ktimer_delete_args *uap) { return (kern_timer_delete(td, uap->timerid)); } static struct itimer * itimer_find(struct proc *p, int timerid) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_itimers == NULL) || (timerid >= TIMER_MAX) || (it = p->p_itimers->its_timers[timerid]) == NULL) { return (NULL); } ITIMER_LOCK(it); if ((it->it_flags & ITF_DELETING) != 0) { ITIMER_UNLOCK(it); it = NULL; } return (it); } static int kern_timer_delete(struct thread *td, int timerid) { struct proc *p = td->td_proc; struct itimer *it; PROC_LOCK(p); it = itimer_find(p, timerid); if (it == NULL) { PROC_UNLOCK(p); return (EINVAL); } PROC_UNLOCK(p); it->it_flags |= ITF_DELETING; while (it->it_usecount > 0) { it->it_flags |= ITF_WANTED; msleep(it, &it->it_mtx, PPAUSE, "itimer", 0); } it->it_flags &= ~ITF_WANTED; CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); PROC_LOCK(p); if (KSI_ONQ(&it->it_ksi)) sigqueue_take(&it->it_ksi); p->p_itimers->its_timers[timerid] = NULL; PROC_UNLOCK(p); uma_zfree(itimer_zone, it); return (0); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_settime_args { int timerid; int flags; const struct itimerspec * value; struct itimerspec * ovalue; }; #endif - int ktimer_settime(struct thread *td, struct ktimer_settime_args *uap) { struct proc *p = td->td_proc; struct itimer *it; struct itimerspec val, oval, *ovalp; int error; error = copyin(uap->value, &val, sizeof(val)); if (error != 0) return (error); if (uap->ovalue != NULL) ovalp = &oval; else ovalp = NULL; PROC_LOCK(p); if (uap->timerid < 3 || (it = itimer_find(p, uap->timerid)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_settime, (it, uap->flags, &val, ovalp)); itimer_leave(it); ITIMER_UNLOCK(it); } if (error == 0 && uap->ovalue != NULL) error = copyout(ovalp, uap->ovalue, sizeof(*ovalp)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_gettime_args { int timerid; struct itimerspec * value; }; #endif - int ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap) { struct proc *p = td->td_proc; struct itimer *it; struct itimerspec val; int error; PROC_LOCK(p); if (uap->timerid < 3 || (it = itimer_find(p, uap->timerid)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_gettime, (it, &val)); itimer_leave(it); ITIMER_UNLOCK(it); } if (error == 0) error = copyout(&val, uap->value, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct timer_getoverrun_args { int timerid; }; #endif - int ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap) { struct proc *p = td->td_proc; struct itimer *it; int error ; PROC_LOCK(p); if (uap->timerid < 3 || (it = itimer_find(p, uap->timerid)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { td->td_retval[0] = it->it_overrun_last; ITIMER_UNLOCK(it); PROC_UNLOCK(p); error = 0; } return (error); } static int realtimer_create(struct itimer *it) { callout_init_mtx(&it->it_callout, &it->it_mtx, 0); return (0); } static int realtimer_delete(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); ITIMER_UNLOCK(it); callout_drain(&it->it_callout); ITIMER_LOCK(it); return (0); } static int realtimer_gettime(struct itimer *it, struct itimerspec *ovalue) { struct timespec cts; mtx_assert(&it->it_mtx, MA_OWNED); realtimer_clocktime(it->it_clockid, &cts); *ovalue = it->it_time; if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) { timespecsub(&ovalue->it_value, &cts); if (ovalue->it_value.tv_sec < 0 || (ovalue->it_value.tv_sec == 0 && ovalue->it_value.tv_nsec == 0)) { ovalue->it_value.tv_sec = 0; ovalue->it_value.tv_nsec = 1; } } return (0); } static int realtimer_settime(struct itimer *it, int flags, struct itimerspec *value, struct itimerspec *ovalue) { struct timespec cts, ts; struct timeval tv; struct itimerspec val; mtx_assert(&it->it_mtx, MA_OWNED); val = *value; if (itimespecfix(&val.it_value)) return (EINVAL); if (timespecisset(&val.it_value)) { if (itimespecfix(&val.it_interval)) return (EINVAL); } else { timespecclear(&val.it_interval); } if (ovalue != NULL) realtimer_gettime(it, ovalue); it->it_time = val; if (timespecisset(&val.it_value)) { realtimer_clocktime(it->it_clockid, &cts); ts = val.it_value; if ((flags & TIMER_ABSTIME) == 0) { /* Convert to absolute time. */ timespecadd(&it->it_time.it_value, &cts); } else { timespecsub(&ts, &cts); /* * We don't care if ts is negative, tztohz will * fix it. */ } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } else { callout_stop(&it->it_callout); } return (0); } static void realtimer_clocktime(clockid_t id, struct timespec *ts) { if (id == CLOCK_REALTIME) getnanotime(ts); else /* CLOCK_MONOTONIC */ getnanouptime(ts); } int itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); it = itimer_find(p, timerid); if (it != NULL) { ksi->ksi_overrun = it->it_overrun; it->it_overrun_last = it->it_overrun; it->it_overrun = 0; ITIMER_UNLOCK(it); return (0); } return (EINVAL); } int itimespecfix(struct timespec *ts) { if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000) ts->tv_nsec = tick * 1000; return (0); } /* Timeout callback for realtime timer */ static void realtimer_expire(void *arg) { struct timespec cts, ts; struct timeval tv; struct itimer *it; struct proc *p; it = (struct itimer *)arg; p = it->it_proc; realtimer_clocktime(it->it_clockid, &cts); /* Only fire if time is reached. */ if (timespeccmp(&cts, &it->it_time.it_value, >=)) { if (timespecisset(&it->it_time.it_interval)) { timespecadd(&it->it_time.it_value, &it->it_time.it_interval); while (timespeccmp(&cts, &it->it_time.it_value, >=)) { if (it->it_overrun < INT_MAX) it->it_overrun++; else it->it_ksi.ksi_errno = ERANGE; timespecadd(&it->it_time.it_value, &it->it_time.it_interval); } } else { /* single shot timer ? */ timespecclear(&it->it_time.it_value); } if (timespecisset(&it->it_time.it_value)) { ts = it->it_time.it_value; timespecsub(&ts, &cts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } ITIMER_UNLOCK(it); itimer_fire(it); ITIMER_LOCK(it); } else if (timespecisset(&it->it_time.it_value)) { ts = it->it_time.it_value; timespecsub(&ts, &cts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } } void itimer_fire(struct itimer *it) { struct proc *p = it->it_proc; int ret; if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { PROC_LOCK(p); if (!KSI_ONQ(&it->it_ksi)) { it->it_ksi.ksi_errno = 0; ret = psignal_event(p, &it->it_sigev, &it->it_ksi); if (__predict_false(ret != 0)) { it->it_overrun++; /* * Broken userland code, thread went * away, disarm the timer. */ if (ret == ESRCH) { ITIMER_LOCK(it); timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); callout_stop(&it->it_callout); ITIMER_UNLOCK(it); } } } else { if (it->it_overrun < INT_MAX) it->it_overrun++; else it->it_ksi.ksi_errno = ERANGE; } PROC_UNLOCK(p); } } static void itimers_alloc(struct proc *p) { struct itimers *its; int i; its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO); LIST_INIT(&its->its_virtual); LIST_INIT(&its->its_prof); TAILQ_INIT(&its->its_worklist); for (i = 0; i < TIMER_MAX; i++) its->its_timers[i] = NULL; PROC_LOCK(p); if (p->p_itimers == NULL) { p->p_itimers = its; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); free(its, M_SUBPROC); } } static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { itimers_event_hook_exit(arg, p); } /* Clean up timers when some process events are being triggered. */ static void itimers_event_hook_exit(void *arg, struct proc *p) { struct itimers *its; struct itimer *it; int event = (int)(intptr_t)arg; int i; if (p->p_itimers != NULL) { its = p->p_itimers; for (i = 0; i < MAX_CLOCKS; ++i) { if (posix_clocks[i].event_hook != NULL) CLOCK_CALL(i, event_hook, (p, i, event)); } /* * According to susv3, XSI interval timers should be inherited * by new image. */ if (event == ITIMER_EV_EXEC) i = 3; else if (event == ITIMER_EV_EXIT) i = 0; else panic("unhandled event"); for (; i < TIMER_MAX; ++i) { if ((it = its->its_timers[i]) != NULL) kern_timer_delete(curthread, i); } if (its->its_timers[0] == NULL && its->its_timers[1] == NULL && its->its_timers[2] == NULL) { free(its, M_SUBPROC); p->p_itimers = NULL; } } } Index: head/sys/kern/kern_umtx.c =================================================================== --- head/sys/kern/kern_umtx.c (revision 167231) +++ head/sys/kern/kern_umtx.c (revision 167232) @@ -1,2760 +1,2759 @@ /*- * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_IA32 #include #endif #define TYPE_SIMPLE_LOCK 0 #define TYPE_SIMPLE_WAIT 1 #define TYPE_NORMAL_UMUTEX 2 #define TYPE_PI_UMUTEX 3 #define TYPE_PP_UMUTEX 4 #define TYPE_CV 5 /* Key to represent a unique userland synchronous object */ struct umtx_key { int hash; int type; int shared; union { struct { vm_object_t object; uintptr_t offset; } shared; struct { struct vmspace *vs; uintptr_t addr; } private; struct { void *a; uintptr_t b; } both; } info; }; /* Priority inheritance mutex info. */ struct umtx_pi { /* Owner thread */ struct thread *pi_owner; /* Reference count */ int pi_refcount; /* List entry to link umtx holding by thread */ TAILQ_ENTRY(umtx_pi) pi_link; /* List entry in hash */ TAILQ_ENTRY(umtx_pi) pi_hashlink; /* List for waiters */ TAILQ_HEAD(,umtx_q) pi_blocked; /* Identify a userland lock object */ struct umtx_key pi_key; }; /* A userland synchronous object user. */ struct umtx_q { /* Linked list for the hash. */ TAILQ_ENTRY(umtx_q) uq_link; /* Umtx key. */ struct umtx_key uq_key; /* Umtx flags. */ int uq_flags; #define UQF_UMTXQ 0x0001 /* The thread waits on. */ struct thread *uq_thread; /* * Blocked on PI mutex. read can use chain lock * or sched_lock, write must have both chain lock and * sched_lock being hold. */ struct umtx_pi *uq_pi_blocked; /* On blocked list */ TAILQ_ENTRY(umtx_q) uq_lockq; /* Thread contending with us */ TAILQ_HEAD(,umtx_pi) uq_pi_contested; /* Inherited priority from PP mutex */ u_char uq_inherited_pri; }; TAILQ_HEAD(umtxq_head, umtx_q); /* Userland lock object's wait-queue chain */ struct umtxq_chain { /* Lock for this chain. */ struct mtx uc_lock; /* List of sleep queues. */ struct umtxq_head uc_queue; /* Busy flag */ char uc_busy; /* Chain lock waiters */ int uc_waiters; /* All PI in the list */ TAILQ_HEAD(,umtx_pi) uc_pi_list; }; #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #define UMTX_CHAINS 128 #define UMTX_SHIFTS (__WORD_BIT - 7) #define THREAD_SHARE 0 #define PROCESS_SHARE 1 #define AUTO_SHARE 2 #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE) static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); SYSCTL_DECL(_kern_threads); static int umtx_dflt_spins = 0; SYSCTL_INT(_kern_threads, OID_AUTO, umtx_dflt_spins, CTLFLAG_RW, &umtx_dflt_spins, 0, "default umtx spin count"); static int umtx_max_spins = 3000; SYSCTL_INT(_kern_threads, OID_AUTO, umtx_max_spins, CTLFLAG_RW, &umtx_max_spins, 0, "max umtx spin count"); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static struct umtxq_chain *umtxq_getchain(struct umtx_key *key); static void umtxq_lock(struct umtx_key *key); static void umtxq_unlock(struct umtx_key *key); static void umtxq_busy(struct umtx_key *key); static void umtxq_unbusy(struct umtx_key *key); static void umtxq_insert(struct umtx_q *uq); static void umtxq_remove(struct umtx_q *uq); static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo); static int umtxq_count(struct umtx_key *key); static int umtxq_signal(struct umtx_key *key, int nr_wakeup); static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2); static int umtx_key_get(void *addr, int type, int share, struct umtx_key *key); static void umtx_key_release(struct umtx_key *key); static struct umtx_pi *umtx_pi_alloc(int); static void umtx_pi_free(struct umtx_pi *pi); static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags); static void umtx_thread_cleanup(struct thread *td); static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); static void umtxq_sysinit(void *arg __unused) { int i; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < UMTX_CHAINS; ++i) { mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&umtxq_chains[i].uc_queue); TAILQ_INIT(&umtxq_chains[i].uc_pi_list); umtxq_chains[i].uc_busy = 0; umtxq_chains[i].uc_waiters = 0; } EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); } struct umtx_q * umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { unsigned n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } static inline int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2) { return (k1->type == k2->type && k1->info.both.a == k2->info.both.a && k1->info.both.b == k2->info.both.b); } static inline struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { return (&umtxq_chains[key->hash]); } /* * Set chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ static inline void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); while (uc->uc_busy != 0) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } uc->uc_busy = 1; } /* * Unbusy a chain. */ static inline void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } /* * Lock a chain. */ static inline void umtxq_lock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_lock(&uc->uc_lock); } /* * Unlock a chain. */ static inline void umtxq_unlock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_unlock(&uc->uc_lock); } /* * Insert a thread onto the umtx queue. */ static inline void umtxq_insert(struct umtx_q *uq) { struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link); uq->uq_flags |= UQF_UMTXQ; } /* * Remove thread from the umtx queue. */ static inline void umtxq_remove(struct umtx_q *uq) { struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { TAILQ_REMOVE(&uc->uc_queue, uq, uq_link); uq->uq_flags &= ~UQF_UMTXQ; } } /* * Check if there are multiple waiters */ static int umtxq_count(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_q *uq; int count = 0; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) { if (umtx_key_match(&uq->uq_key, key)) { if (++count > 1) break; } } return (count); } /* * Check if there are multiple PI waiters and returns first * waiter. */ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_chain *uc; struct umtx_q *uq; int count = 0; *first = NULL; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) { if (umtx_key_match(&uq->uq_key, key)) { if (++count > 1) break; *first = uq; } } return (count); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal(struct umtx_key *key, int n_wake) { struct umtxq_chain *uc; struct umtx_q *uq, *next; int ret; ret = 0; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) { if (umtx_key_match(&uq->uq_key, key)) { umtxq_remove(uq); wakeup(uq); if (++ret >= n_wake) break; } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); umtxq_remove(uq); wakeup(uq); } /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ static inline int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo) { struct umtxq_chain *uc; int error; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (!(uq->uq_flags & UQF_UMTXQ)) return (0); error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo); if (error == EWOULDBLOCK) error = ETIMEDOUT; return (error); } /* * Convert userspace address into unique logical address. */ static int umtx_key_get(void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return EFAULT; } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = entry->offset + entry->start - (vm_offset_t)addr; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ static inline void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } /* * Lock a umtx object. */ static int _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo) { struct umtx_q *uq; u_long owner; u_long old; int error = 0; uq = td->td_umtxq; /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id); /* The acquire succeeded. */ if (owner == UMTX_UNOWNED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMTX_CONTESTED) { owner = casuword(&umtx->u_owner, UMTX_CONTESTED, id | UMTX_CONTESTED); if (owner == UMTX_CONTESTED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If this failed the lock has changed, restart. */ continue; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE, &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED); /* The address was invalid. */ if (old == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtx", timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); } return (0); } /* * Lock a umtx object. */ static int do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, struct timespec *timeout) { struct timespec ts, ts2, ts3; struct timeval tv; int error; if (timeout == NULL) { error = _do_lock_umtx(td, umtx, id, 0); /* Mutex locking is restarted if it is interrupted. */ if (error == EINTR) error = ERESTART; } else { getnanouptime(&ts); timespecadd(&ts, timeout); TIMESPEC_TO_TIMEVAL(&tv, timeout); for (;;) { error = _do_lock_umtx(td, umtx, id, tvtohz(&tv)); if (error != ETIMEDOUT) break; getnanouptime(&ts2); if (timespeccmp(&ts2, &ts, >=)) { error = ETIMEDOUT; break; } ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); } /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a umtx object. */ static int do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id) { struct umtx_key key; u_long owner; u_long old; int error; int count; /* * Make sure we own this mtx. */ owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner)); if (owner == -1) return (EFAULT); if ((owner & ~UMTX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMTX_CONTESTED) == 0) { old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED); if (old == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE, &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ old = casuword(&umtx->u_owner, owner, count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (old == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } #ifdef COMPAT_IA32 /* * Lock a umtx object. */ static int _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo) { struct umtx_q *uq; uint32_t owner; uint32_t old; int error = 0; uq = td->td_umtxq; /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ owner = casuword32(m, UMUTEX_UNOWNED, id); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { owner = casuword32(m, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); if (owner == UMUTEX_CONTESTED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If this failed the lock has changed, restart. */ continue; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE, &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ old = casuword32(m, owner, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (old == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtx", timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); } return (0); } /* * Lock a umtx object. */ static int do_lock_umtx32(struct thread *td, void *m, uint32_t id, struct timespec *timeout) { struct timespec ts, ts2, ts3; struct timeval tv; int error; if (timeout == NULL) { error = _do_lock_umtx32(td, m, id, 0); /* Mutex locking is restarted if it is interrupted. */ if (error == EINTR) error = ERESTART; } else { getnanouptime(&ts); timespecadd(&ts, timeout); TIMESPEC_TO_TIMEVAL(&tv, timeout); for (;;) { error = _do_lock_umtx32(td, m, id, tvtohz(&tv)); if (error != ETIMEDOUT) break; getnanouptime(&ts2); if (timespeccmp(&ts2, &ts, >=)) { error = ETIMEDOUT; break; } ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); } /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a umtx object. */ static int do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id) { struct umtx_key key; uint32_t owner; uint32_t old; int error; int count; /* * Make sure we own this mtx. */ owner = fuword32(m); if (owner == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { old = casuword32(m, owner, UMUTEX_UNOWNED); if (old == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE, &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ old = casuword32(m, owner, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (old == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } #endif /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, struct timespec *timeout, int compat32) { struct umtx_q *uq; struct timespec ts, ts2, ts3; struct timeval tv; u_long tmp; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE, &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) tmp = fuword(addr); else tmp = fuword32(addr); if (tmp != id) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } else if (timeout == NULL) { umtxq_lock(&uq->uq_key); error = umtxq_sleep(uq, "uwait", 0); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } else { getnanouptime(&ts); timespecadd(&ts, timeout); TIMESPEC_TO_TIMEVAL(&tv, timeout); umtxq_lock(&uq->uq_key); for (;;) { error = umtxq_sleep(uq, "uwait", tvtohz(&tv)); if (!(uq->uq_flags & UQF_UMTXQ)) break; if (error != ETIMEDOUT) break; umtxq_unlock(&uq->uq_key); getnanouptime(&ts2); if (timespeccmp(&ts2, &ts, >=)) { error = ETIMEDOUT; umtxq_lock(&uq->uq_key); break; } ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); umtxq_lock(&uq->uq_key); } umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); ret = umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo, int try) { struct umtx_q *uq; uint32_t owner, old, id; #ifdef SMP int spincount; #endif int error = 0; id = td->td_tid; uq = td->td_umtxq; #ifdef SMP if (smp_cpus > 1) { spincount = fuword32(&m->m_spincount); if (spincount == 0) spincount = umtx_dflt_spins; if (spincount > umtx_max_spins) spincount = umtx_max_spins; } else spincount = 0; #endif /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { #ifdef SMP try_unowned: #endif /* * Try the uncontested case. This should be done in userland. */ owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { #ifdef SMP try_contested: #endif owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); if (owner == UMUTEX_CONTESTED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If this failed the lock has changed, restart. */ continue; } if ((flags & UMUTEX_ERROR_CHECK) != 0 && (owner & ~UMUTEX_CONTESTED) == id) return (EDEADLK); if (try != 0) return (EBUSY); #ifdef SMP if (spincount > 0 && (owner & ~UMUTEX_CONTESTED) != id) { int i, found = 0; struct pcpu *pcpu = NULL; /* Look for a cpu the owner is running on */ for (i = 0; i < MAXCPU; i++) { if (CPU_ABSENT(i)) continue; pcpu = pcpu_find(i); if ((owner & ~UMUTEX_CONTESTED) == pcpu->pc_curtid) { found = 1; break; } } if (__predict_false(!found)) goto end_spin; while ((owner & ~UMUTEX_CONTESTED) == pcpu->pc_curtid && (owner & ~UMUTEX_CONTESTED) != id) { if (--spincount <= 0) break; if ((td->td_flags & (TDF_NEEDRESCHED|TDF_ASTPENDING|TDF_NEEDSIGCHK)) || P_SHOULDSTOP(td->td_proc)) break; owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner)); if (owner == UMUTEX_UNOWNED) goto try_unowned; if (owner == UMUTEX_CONTESTED) goto try_contested; cpu_spinwait(); } } end_spin: spincount = 0; #endif /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (old == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtxn", timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); } return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old, id; int error; int count; id = td->td_tid; /* * Make sure we own this mtx. */ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner)); if (owner == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED); if (old == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ old = casuword32(&m->m_owner, owner, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (old == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } static inline struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } static inline void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&sched_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } /* * Propagate priority when a thread is blocked on POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&sched_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; for (;;) { td = pi->pi_owner; if (td == NULL) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); if (UPRI(td) <= pri) return; sched_lend_user_prio(td, pri); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; /* Resort td on the list if needed. */ if (!umtx_pi_adjust_thread(pi, td)) break; } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by signal or resumed by others. */ static void umtx_unpropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&sched_lock, MA_OWNED); while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; sched_unlend_user_prio(pi->pi_owner, pri); pi = uq_owner->uq_pi_blocked; } } /* * Insert a PI mutex into owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&sched_lock, MA_OWNED); if (pi->pi_owner != NULL) panic("pi_ower != NULL"); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Claim ownership of a PI mutex. */ static int umtx_pi_claim(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq, *uq_owner; uq_owner = owner->td_umtxq; mtx_lock_spin(&sched_lock); if (pi->pi_owner == owner) { mtx_unlock_spin(&sched_lock); return (0); } if (pi->pi_owner != NULL) { /* * userland may have already messed the mutex, sigh. */ mtx_unlock_spin(&sched_lock); return (EPERM); } umtx_pi_setowner(pi, owner); uq = TAILQ_FIRST(&pi->pi_blocked); if (uq != NULL) { int pri; pri = UPRI(uq->uq_thread); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); } mtx_unlock_spin(&sched_lock); return (0); } /* * Adjust a thread's order position in its blocked PI mutex, * this may result new priority propagating process. */ void umtx_pi_adjust(struct thread *td, u_char oldpri) { struct umtx_q *uq; struct umtx_pi *pi; uq = td->td_umtxq; mtx_assert(&sched_lock, MA_OWNED); MPASS(TD_ON_UPILOCK(td)); /* * Pick up the lock that td is blocked on. */ pi = uq->uq_pi_blocked; MPASS(pi != NULL); /* Resort the turnstile on the list. */ if (!umtx_pi_adjust_thread(pi, td)) return; /* * If our priority was lowered and we are at the head of the * turnstile, then propagate our new priority up the chain. */ if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri) umtx_propagate_priority(td); } /* * Sleep on a PI mutex. */ static int umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner, const char *wmesg, int timo) { struct umtxq_chain *uc; struct thread *td, *td1; struct umtx_q *uq1; int pri; int error = 0; td = uq->uq_thread; KASSERT(td == curthread, ("inconsistent uq_thread")); uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); umtxq_insert(uq); if (pi->pi_owner == NULL) { /* XXX * Current, We only support process private PI-mutex, * non-contended PI-mutexes are locked in userland. * Process shared PI-mutex should always be initialized * by kernel and be registered in kernel, locking should * always be done by kernel to avoid security problems. * For process private PI-mutex, we can find owner * thread and boost its priority safely. */ PROC_LOCK(curproc); td1 = thread_find(curproc, owner); mtx_lock_spin(&sched_lock); if (td1 != NULL && pi->pi_owner == NULL) { uq1 = td1->td_umtxq; umtx_pi_setowner(pi, td1); } PROC_UNLOCK(curproc); } else { mtx_lock_spin(&sched_lock); } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { pri = UPRI(uq1->uq_thread); if (pri > UPRI(td)) break; } if (uq1 != NULL) TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); else TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); uq->uq_pi_blocked = pi; td->td_flags |= TDF_UPIBLOCKED; mtx_unlock_spin(&sched_lock); umtxq_unlock(&uq->uq_key); mtx_lock_spin(&sched_lock); umtx_propagate_priority(td); mtx_unlock_spin(&sched_lock); umtxq_lock(&uq->uq_key); if (uq->uq_flags & UQF_UMTXQ) { error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo); if (error == EWOULDBLOCK) error = ETIMEDOUT; if (uq->uq_flags & UQF_UMTXQ) { umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); } } umtxq_unlock(&uq->uq_key); mtx_lock_spin(&sched_lock); uq->uq_pi_blocked = NULL; td->td_flags &= ~TDF_UPIBLOCKED; TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_unpropagate_priority(pi); mtx_unlock_spin(&sched_lock); umtxq_lock(&uq->uq_key); return (error); } /* * Add reference count for a PI mutex. */ static void umtx_pi_ref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); pi->pi_refcount++; } /* * Decrease reference count for a PI mutex, if the counter * is decreased to zero, its memory space is freed. */ static void umtx_pi_unref(struct umtx_pi *pi) { struct umtxq_chain *uc; int free = 0; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { mtx_lock_spin(&sched_lock); if (pi->pi_owner != NULL) { TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); mtx_unlock_spin(&sched_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); free = 1; } if (free) umtx_pi_free(pi); } /* * Find a PI mutex in hash table. */ static struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ static inline void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Lock a PI mutex. */ static int _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo, int try) { struct umtx_q *uq; struct umtx_pi *pi, *new_pi; uint32_t id, owner, old; int error; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); new_pi->pi_key = uq->uq_key; umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) { error = 0; break; } /* The address was invalid. */ if (owner == -1) { error = EFAULT; break; } /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); if (owner == UMUTEX_CONTESTED) { umtxq_lock(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unlock(&uq->uq_key); break; } /* The address was invalid. */ if (owner == -1) { error = EFAULT; break; } /* If this failed the lock has changed, restart. */ continue; } if ((flags & UMUTEX_ERROR_CHECK) != 0 && (owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (old == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); error = EFAULT; break; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ if (old == owner) error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, "umtxpi", timo); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; uint32_t owner, old, id; int error; int count; int pri; id = td->td_tid; /* * Make sure we own this mtx. */ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner)); if (owner == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED); if (old == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count_pi(&key, &uq_first); if (uq_first != NULL) { pi = uq_first->uq_pi_blocked; if (pi->pi_owner != curthread) { umtxq_unbusy(&key); umtxq_unlock(&key); /* userland messed the mutex */ return (EPERM); } uq_me = curthread->td_umtxq; mtx_lock_spin(&sched_lock); pi->pi_owner = NULL; TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link); uq_first = TAILQ_FIRST(&pi->pi_blocked); pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } sched_unlend_user_prio(curthread, pri); mtx_unlock_spin(&sched_lock); } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ old = casuword32(&m->m_owner, owner, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_lock(&key); if (uq_first != NULL) umtxq_signal_thread(uq_first); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (old == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Lock a PP mutex. */ static int _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo, int try) { struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]); if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } mtx_lock_spin(&sched_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { mtx_unlock_spin(&sched_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); } mtx_unlock_spin(&sched_lock); owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); if (owner == UMUTEX_CONTESTED) { error = 0; break; } /* The address was invalid. */ if (owner == -1) { error = EFAULT; break; } if ((flags & UMUTEX_ERROR_CHECK) != 0 && (owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock_spin(&sched_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; sched_unlend_user_prio(td, pri); mtx_unlock_spin(&sched_lock); } if (error != 0) { mtx_lock_spin(&sched_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; sched_unlend_user_prio(td, pri); mtx_unlock_spin(&sched_lock); } out: umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex. */ static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t owner, id; uint32_t rceiling; int error, pri, new_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. */ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner)); if (owner == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner), UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock_spin(&sched_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; sched_unlend_user_prio(td, pri); mtx_unlock_spin(&sched_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling) { struct umtx_q *uq; uint32_t save_ceiling; uint32_t owner, id; uint32_t flags; int error; flags = fuword32(&m->m_flags); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); for (;;) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); save_ceiling = fuword32(&m->m_ceilings[0]); owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); if (owner == UMUTEX_CONTESTED) { suword32(&m->m_ceilings[0], ceiling); suword32(__DEVOLATILE(uint32_t *, &m->m_owner), UMUTEX_CONTESTED); error = 0; break; } /* The address was invalid. */ if (owner == -1) { error = EFAULT; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { suword32(&m->m_ceilings[0], ceiling); error = 0; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", 0); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0 && old_ceiling != NULL) suword32(old_ceiling, save_ceiling); return (error); } static int _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo, int try) { switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: return (_do_lock_normal(td, m, flags, timo, try)); case UMUTEX_PRIO_INHERIT: return (_do_lock_pi(td, m, flags, timo, try)); case UMUTEX_PRIO_PROTECT: return (_do_lock_pp(td, m, flags, timo, try)); } return (EINVAL); } /* * Lock a userland POSIX mutex. */ static int do_lock_umutex(struct thread *td, struct umutex *m, struct timespec *timeout, int try) { struct timespec ts, ts2, ts3; struct timeval tv; uint32_t flags; int error; flags = fuword32(&m->m_flags); if (flags == -1) return (EFAULT); if (timeout == NULL) { error = _do_lock_umutex(td, m, flags, 0, try); /* Mutex locking is restarted if it is interrupted. */ if (error == EINTR) error = ERESTART; } else { getnanouptime(&ts); timespecadd(&ts, timeout); TIMESPEC_TO_TIMEVAL(&tv, timeout); for (;;) { error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try); if (error != ETIMEDOUT) break; getnanouptime(&ts2); if (timespeccmp(&ts2, &ts, >=)) { error = ETIMEDOUT; break; } ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); } /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a userland POSIX mutex. */ static int do_unlock_umutex(struct thread *td, struct umutex *m) { uint32_t flags; flags = fuword32(&m->m_flags); if (flags == -1) return (EFAULT); switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: return (do_unlock_normal(td, m, flags)); case UMUTEX_PRIO_INHERIT: return (do_unlock_pi(td, m, flags)); case UMUTEX_PRIO_PROTECT: return (do_unlock_pp(td, m, flags)); } return (EINVAL); } static int do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m, struct timespec *timeout, u_long wflags) { struct umtx_q *uq; struct timeval tv; struct timespec cts, ets, tts; uint32_t flags; int error; uq = td->td_umtxq; flags = fuword32(&cv->c_flags); error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * The magic thing is we should set c_has_waiters to 1 before * releasing user mutex. */ suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1); umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); error = do_unlock_umutex(td, m); umtxq_lock(&uq->uq_key); if (error == 0) { if ((wflags & UMTX_CHECK_UNPARKING) && (td->td_pflags & TDP_WAKEUP)) { td->td_pflags &= ~TDP_WAKEUP; error = EINTR; } else if (timeout == NULL) { error = umtxq_sleep(uq, "ucond", 0); } else { getnanouptime(&ets); timespecadd(&ets, timeout); TIMESPEC_TO_TIMEVAL(&tv, timeout); for (;;) { error = umtxq_sleep(uq, "ucond", tvtohz(&tv)); if (error != ETIMEDOUT) break; getnanouptime(&cts); if (timespeccmp(&cts, &ets, >=)) { error = ETIMEDOUT; break; } tts = ets; timespecsub(&tts, &cts); TIMESPEC_TO_TIMEVAL(&tv, &tts); } } } if (error != 0) { if ((uq->uq_flags & UQF_UMTXQ) == 0) { /* * If we concurrently got do_cv_signal()d * and we got an error or UNIX signals or a timeout, * then, perform another umtxq_signal to avoid * consuming the wakeup. This may cause supurious * wakeup for another thread which was just queued, * but SUSV3 explicitly allows supurious wakeup to * occur, and indeed a kernel based implementation * can not avoid it. */ if (!umtxq_signal(&uq->uq_key, 1)) error = 0; } if (error == ERESTART) error = EINTR; } umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland condition variable. */ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; flags = fuword32(&cv->c_flags); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32( __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0); umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; flags = fuword32(&cv->c_flags); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0); umtxq_lock(&key); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } int _umtx_lock(struct thread *td, struct _umtx_lock_args *uap) /* struct umtx *umtx */ { return _do_lock_umtx(td, uap->umtx, td->td_tid, 0); } int _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap) /* struct umtx *umtx */ { return do_unlock_umtx(td, uap->umtx, td->td_tid); } static int __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = copyin(uap->uaddr2, &timeout, sizeof(timeout)); if (error != 0) return (error); if (timeout.tv_nsec >= 1000000000 || timeout.tv_nsec < 0) { return (EINVAL); } ts = &timeout; } return (do_lock_umtx(td, uap->obj, uap->val, ts)); } static int __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap) { return (do_unlock_umtx(td, uap->obj, uap->val)); } static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; if (uap->uaddr2 == NULL) ts = NULL; else { error = copyin(uap->uaddr2, &timeout, sizeof(timeout)); if (error != 0) return (error); if (timeout.tv_nsec >= 1000000000 || timeout.tv_nsec < 0) return (EINVAL); ts = &timeout; } return do_wait(td, uap->obj, uap->val, ts, 0); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = copyin(uap->uaddr2, &timeout, sizeof(timeout)); if (error != 0) return (error); if (timeout.tv_nsec >= 1000000000 || timeout.tv_nsec < 0) { return (EINVAL); } ts = &timeout; } return do_lock_umutex(td, uap->obj, ts, 0); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap) { return do_lock_umutex(td, uap->obj, NULL, 1); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap) { return do_unlock_umutex(td, uap->obj); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap) { return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = copyin(uap->uaddr2, &timeout, sizeof(timeout)); if (error != 0) return (error); if (timeout.tv_nsec >= 1000000000 || timeout.tv_nsec < 0) { return (EINVAL); } ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap) { return do_cv_signal(td, uap->obj); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap) { return do_cv_broadcast(td, uap->obj); } typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap); static _umtx_op_func op_table[] = { __umtx_op_lock_umtx, /* UMTX_OP_LOCK */ __umtx_op_unlock_umtx, /* UMTX_OP_UNLOCK */ __umtx_op_wait, /* UMTX_OP_WAIT */ __umtx_op_wake, /* UMTX_OP_WAKE */ __umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */ __umtx_op_lock_umutex, /* UMTX_OP_MUTEX_LOCK */ __umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */ __umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */ __umtx_op_cv_wait, /* UMTX_OP_CV_WAIT*/ __umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */ __umtx_op_cv_broadcast /* UMTX_OP_CV_BROADCAST */ }; int _umtx_op(struct thread *td, struct _umtx_op_args *uap) { if ((unsigned)uap->op < UMTX_OP_MAX) return (*op_table[uap->op])(td, uap); return (EINVAL); } #ifdef COMPAT_IA32 - int freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap) /* struct umtx *umtx */ { return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL)); } int freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap) /* struct umtx *umtx */ { return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid)); } struct timespec32 { u_int32_t tv_sec; u_int32_t tv_nsec; }; static inline int copyin_timeout32(void *addr, struct timespec *tsp) { struct timespec32 ts32; int error; error = copyin(addr, &ts32, sizeof(struct timespec32)); if (error == 0) { tsp->tv_sec = ts32.tv_sec; tsp->tv_nsec = ts32.tv_nsec; } return (error); } static int __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); if (timeout.tv_nsec >= 1000000000 || timeout.tv_nsec < 0) { return (EINVAL); } ts = &timeout; } return (do_lock_umtx32(td, uap->obj, uap->val, ts)); } static int __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap) { return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val)); } static int __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; if (uap->uaddr2 == NULL) ts = NULL; else { error = copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); if (timeout.tv_nsec >= 1000000000 || timeout.tv_nsec < 0) return (EINVAL); ts = &timeout; } return do_wait(td, uap->obj, uap->val, ts, 1); } static int __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); if (timeout.tv_nsec >= 1000000000 || timeout.tv_nsec < 0) return (EINVAL); ts = &timeout; } return do_lock_umutex(td, uap->obj, ts, 0); } static int __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); if (timeout.tv_nsec >= 1000000000 || timeout.tv_nsec < 0) return (EINVAL); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static _umtx_op_func op_table_compat32[] = { __umtx_op_lock_umtx_compat32, /* UMTX_OP_LOCK */ __umtx_op_unlock_umtx_compat32, /* UMTX_OP_UNLOCK */ __umtx_op_wait_compat32, /* UMTX_OP_WAIT */ __umtx_op_wake, /* UMTX_OP_WAKE */ __umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_LOCK */ __umtx_op_lock_umutex_compat32, /* UMTX_OP_MUTEX_TRYLOCK */ __umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */ __umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */ __umtx_op_cv_wait_compat32, /* UMTX_OP_CV_WAIT*/ __umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */ __umtx_op_cv_broadcast /* UMTX_OP_CV_BROADCAST */ }; int freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap) { if ((unsigned)uap->op < UMTX_OP_MAX) return (*op_table_compat32[uap->op])(td, (struct _umtx_op_args *)uap); return (EINVAL); } #endif void umtx_thread_init(struct thread *td) { td->td_umtxq = umtxq_alloc(); td->td_umtxq->uq_thread = td; } void umtx_thread_fini(struct thread *td) { umtxq_free(td->td_umtxq); } /* * It will be called when new thread is created, e.g fork(). */ void umtx_thread_alloc(struct thread *td) { struct umtx_q *uq; uq = td->td_umtxq; uq->uq_inherited_pri = PRI_MAX; KASSERT(uq->uq_flags == 0, ("uq_flags != 0")); KASSERT(uq->uq_thread == td, ("uq_thread != td")); KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL")); KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty")); } /* * exec() hook. */ static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused) { umtx_thread_cleanup(curthread); } /* * thread_exit() hook. */ void umtx_thread_exit(struct thread *td) { umtx_thread_cleanup(td); } /* * clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; if ((uq = td->td_umtxq) == NULL) return; mtx_lock_spin(&sched_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } td->td_flags &= ~TDF_UBORROWING; mtx_unlock_spin(&sched_lock); } Index: head/sys/kern/kern_uuid.c =================================================================== --- head/sys/kern/kern_uuid.c (revision 167231) +++ head/sys/kern/kern_uuid.c (revision 167232) @@ -1,362 +1,361 @@ /*- * Copyright (c) 2002 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include /* * See also: * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt * http://www.opengroup.org/onlinepubs/009629399/apdxa.htm * * Note that the generator state is itself an UUID, but the time and clock * sequence fields are written in the native byte order. */ CTASSERT(sizeof(struct uuid) == 16); /* We use an alternative, more convenient representation in the generator. */ struct uuid_private { union { uint64_t ll; /* internal. */ struct { uint32_t low; uint16_t mid; uint16_t hi; } x; } time; uint16_t seq; /* Big-endian. */ uint16_t node[UUID_NODE_LEN>>1]; }; CTASSERT(sizeof(struct uuid_private) == 16); static struct uuid_private uuid_last; static struct mtx uuid_mutex; MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF); /* * Return the first MAC address we encounter or, if none was found, * construct a sufficiently random multicast address. We don't try * to return the same MAC address as previously returned. We always * generate a new multicast address if no MAC address exists in the * system. * It would be nice to know if 'ifnet' or any of its sub-structures * has been changed in any way. If not, we could simply skip the * scan and safely return the MAC address we returned before. */ static void uuid_node(uint16_t *node) { struct ifnet *ifp; struct ifaddr *ifa; struct sockaddr_dl *sdl; int i; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &ifnet, if_link) { /* Walk the address list */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (sdl != NULL && sdl->sdl_family == AF_LINK && sdl->sdl_type == IFT_ETHER) { /* Got a MAC address. */ bcopy(LLADDR(sdl), node, UUID_NODE_LEN); IFNET_RUNLOCK(); return; } } } IFNET_RUNLOCK(); for (i = 0; i < (UUID_NODE_LEN>>1); i++) node[i] = (uint16_t)arc4random(); *((uint8_t*)node) |= 0x01; } /* * Get the current time as a 60 bit count of 100-nanosecond intervals * since 00:00:00.00, October 15,1582. We apply a magic offset to convert * the Unix time since 00:00:00.00, Januari 1, 1970 to the date of the * Gregorian reform to the Christian calendar. */ static uint64_t uuid_time(void) { struct bintime bt; uint64_t time = 0x01B21DD213814000LL; bintime(&bt); time += (uint64_t)bt.sec * 10000000LL; time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32; return (time & ((1LL << 60) - 1LL)); } struct uuid * kern_uuidgen(struct uuid *store, size_t count) { struct uuid_private uuid; uint64_t time; size_t n; mtx_lock(&uuid_mutex); uuid_node(uuid.node); time = uuid_time(); if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] || uuid_last.node[1] != uuid.node[1] || uuid_last.node[2] != uuid.node[2]) uuid.seq = (uint16_t)arc4random() & 0x3fff; else if (uuid_last.time.ll >= time) uuid.seq = (uuid_last.seq + 1) & 0x3fff; else uuid.seq = uuid_last.seq; uuid_last = uuid; uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL); mtx_unlock(&uuid_mutex); /* Set sequence and variant and deal with byte order. */ uuid.seq = htobe16(uuid.seq | 0x8000); for (n = 0; n < count; n++) { /* Set time and version (=1). */ uuid.time.x.low = (uint32_t)time; uuid.time.x.mid = (uint16_t)(time >> 32); uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12); store[n] = *(struct uuid *)&uuid; time++; } return (store); } #ifndef _SYS_SYSPROTO_H_ struct uuidgen_args { struct uuid *store; int count; }; #endif - int uuidgen(struct thread *td, struct uuidgen_args *uap) { struct uuid *store; size_t count; int error; /* * Limit the number of UUIDs that can be created at the same time * to some arbitrary number. This isn't really necessary, but I * like to have some sort of upper-bound that's less than 2G :-) * XXX probably needs to be tunable. */ if (uap->count < 1 || uap->count > 2048) return (EINVAL); count = uap->count; store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK); kern_uuidgen(store, count); error = copyout(store, uap->store, count * sizeof(struct uuid)); free(store, M_TEMP); return (error); } int snprintf_uuid(char *buf, size_t sz, struct uuid *uuid) { struct uuid_private *id; int cnt; id = (struct uuid_private *)uuid; cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x", id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq), be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2])); return (cnt); } int printf_uuid(struct uuid *uuid) { char buf[38]; snprintf_uuid(buf, sizeof(buf), uuid); return (printf("%s", buf)); } int sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid) { char buf[38]; snprintf_uuid(buf, sizeof(buf), uuid); return (sbuf_printf(sb, "%s", buf)); } /* * Encode/Decode UUID into byte-stream. * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | time_low | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | time_mid | time_hi_and_version | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |clk_seq_hi_res | clk_seq_low | node (0-1) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | node (2-5) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ void le_uuid_enc(void *buf, struct uuid const *uuid) { u_char *p; int i; p = buf; le32enc(p, uuid->time_low); le16enc(p + 4, uuid->time_mid); le16enc(p + 6, uuid->time_hi_and_version); p[8] = uuid->clock_seq_hi_and_reserved; p[9] = uuid->clock_seq_low; for (i = 0; i < _UUID_NODE_LEN; i++) p[10 + i] = uuid->node[i]; } void le_uuid_dec(void const *buf, struct uuid *uuid) { u_char const *p; int i; p = buf; uuid->time_low = le32dec(p); uuid->time_mid = le16dec(p + 4); uuid->time_hi_and_version = le16dec(p + 6); uuid->clock_seq_hi_and_reserved = p[8]; uuid->clock_seq_low = p[9]; for (i = 0; i < _UUID_NODE_LEN; i++) uuid->node[i] = p[10 + i]; } void be_uuid_enc(void *buf, struct uuid const *uuid) { u_char *p; int i; p = buf; be32enc(p, uuid->time_low); be16enc(p + 4, uuid->time_mid); be16enc(p + 6, uuid->time_hi_and_version); p[8] = uuid->clock_seq_hi_and_reserved; p[9] = uuid->clock_seq_low; for (i = 0; i < _UUID_NODE_LEN; i++) p[10 + i] = uuid->node[i]; } void be_uuid_dec(void const *buf, struct uuid *uuid) { u_char const *p; int i; p = buf; uuid->time_low = be32dec(p); uuid->time_mid = le16dec(p + 4); uuid->time_hi_and_version = be16dec(p + 6); uuid->clock_seq_hi_and_reserved = p[8]; uuid->clock_seq_low = p[9]; for (i = 0; i < _UUID_NODE_LEN; i++) uuid->node[i] = p[10 + i]; } int parse_uuid(const char *str, struct uuid *uuid) { u_int c[11]; int n; /* An empty string represents a nil UUID. */ if (*str == '\0') { bzero(uuid, sizeof(*uuid)); return (0); } /* The UUID string representation has a fixed length. */ if (strlen(str) != 36) return (EINVAL); /* * We only work with "new" UUIDs. New UUIDs have the form: * 01234567-89ab-cdef-0123-456789abcdef * The so called "old" UUIDs, which we don't support, have the form: * 0123456789ab.cd.ef.01.23.45.67.89.ab */ if (str[8] != '-') return (EINVAL); n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1, c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10); /* Make sure we have all conversions. */ if (n != 11) return (EINVAL); /* Successful scan. Build the UUID. */ uuid->time_low = c[0]; uuid->time_mid = c[1]; uuid->time_hi_and_version = c[2]; uuid->clock_seq_hi_and_reserved = c[3]; uuid->clock_seq_low = c[4]; for (n = 0; n < 6; n++) uuid->node[n] = c[n + 5]; /* Check semantics... */ return (((c[3] & 0x80) != 0x00 && /* variant 0? */ (c[3] & 0xc0) != 0x80 && /* variant 1? */ (c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */ } Index: head/sys/kern/kern_xxx.c =================================================================== --- head/sys/kern/kern_xxx.c (revision 167231) +++ head/sys/kern/kern_xxx.c (revision 167232) @@ -1,290 +1,288 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct gethostname_args { char *hostname; u_int len; }; #endif /* ARGSUSED */ int ogethostname(td, uap) struct thread *td; struct gethostname_args *uap; { int name[2]; int error; size_t len = uap->len; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; mtx_lock(&Giant); error = userland_sysctl(td, name, 2, uap->hostname, &len, 1, 0, 0, 0, 0); mtx_unlock(&Giant); return(error); } #ifndef _SYS_SYSPROTO_H_ struct sethostname_args { char *hostname; u_int len; }; #endif /* ARGSUSED */ int osethostname(td, uap) struct thread *td; register struct sethostname_args *uap; { int name[2]; int error; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; mtx_lock(&Giant); error = userland_sysctl(td, name, 2, 0, 0, 0, uap->hostname, uap->len, 0, 0); mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogethostid_args { int dummy; }; #endif /* ARGSUSED */ int ogethostid(td, uap) struct thread *td; struct ogethostid_args *uap; { *(long *)(td->td_retval) = hostid; return (0); } #endif /* COMPAT_43 */ #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct osethostid_args { long hostid; }; #endif /* ARGSUSED */ int osethostid(td, uap) struct thread *td; struct osethostid_args *uap; { int error; error = priv_check(td, PRIV_SETHOSTID); if (error) return (error); mtx_lock(&Giant); hostid = uap->hostid; mtx_unlock(&Giant); return (0); } int oquota(td, uap) struct thread *td; struct oquota_args *uap; { + return (ENOSYS); } #endif /* COMPAT_43 */ /* - * This is the FreeBSD-1.1 compatable uname(2) interface. These - * days it is done in libc as a wrapper around a bunch of sysctl's. - * This must maintain the old 1.1 binary ABI. + * This is the FreeBSD-1.1 compatable uname(2) interface. These days it is + * done in libc as a wrapper around a bunch of sysctl's. This must maintain + * the old 1.1 binary ABI. */ #if SYS_NMLN != 32 #error "FreeBSD-1.1 uname syscall has been broken" #endif #ifndef _SYS_SYSPROTO_H_ struct uname_args { struct utsname *name; }; #endif - /* ARGSUSED */ int uname(td, uap) struct thread *td; struct uname_args *uap; { int name[2], error; size_t len; char *s, *us; name[0] = CTL_KERN; name[1] = KERN_OSTYPE; len = sizeof (uap->name->sysname); mtx_lock(&Giant); error = userland_sysctl(td, name, 2, uap->name->sysname, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0); name[1] = KERN_HOSTNAME; len = sizeof uap->name->nodename; error = userland_sysctl(td, name, 2, uap->name->nodename, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0); name[1] = KERN_OSRELEASE; len = sizeof uap->name->release; error = userland_sysctl(td, name, 2, uap->name->release, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->release + sizeof(uap->name->release) - 1, 0); /* name = KERN_VERSION; len = sizeof uap->name->version; error = userland_sysctl(td, name, 2, uap->name->version, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->version + sizeof(uap->name->version) - 1, 0); */ /* * this stupid hackery to make the version field look like FreeBSD 1.1 */ for(s = version; *s && *s != '#'; s++); for(us = uap->name->version; *s && *s != ':'; s++) { error = subyte( us++, *s); if (error) goto done2; } error = subyte( us++, 0); if (error) goto done2; name[0] = CTL_HW; name[1] = HW_MACHINE; len = sizeof uap->name->machine; error = userland_sysctl(td, name, 2, uap->name->machine, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0); done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdomainname_args { char *domainname; int len; }; #endif - /* ARGSUSED */ int getdomainname(td, uap) struct thread *td; struct getdomainname_args *uap; { int domainnamelen; int error; mtx_lock(&Giant); domainnamelen = strlen(domainname) + 1; if ((u_int)uap->len > domainnamelen) uap->len = domainnamelen; error = copyout(domainname, uap->domainname, uap->len); mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setdomainname_args { char *domainname; int len; }; #endif - /* ARGSUSED */ int setdomainname(td, uap) struct thread *td; struct setdomainname_args *uap; { int error, domainnamelen; error = priv_check(td, PRIV_SETDOMAINNAME); if (error) return (error); mtx_lock(&Giant); if ((u_int)uap->len > sizeof (domainname) - 1) { error = EINVAL; goto done2; } domainnamelen = uap->len; error = copyin(uap->domainname, domainname, uap->len); domainname[domainnamelen] = 0; done2: mtx_unlock(&Giant); return (error); } Index: head/sys/kern/p1003_1b.c =================================================================== --- head/sys/kern/p1003_1b.c (revision 167231) +++ head/sys/kern/p1003_1b.c (revision 167232) @@ -1,320 +1,320 @@ /*- * Copyright (c) 1996, 1997, 1998 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* p1003_1b: Real Time common code. */ #include __FBSDID("$FreeBSD$"); #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B"); -/* The system calls return ENOSYS if an entry is called that is - * not run-time supported. I am also logging since some programs - * start to use this when they shouldn't. That will be removed if annoying. +/* The system calls return ENOSYS if an entry is called that is not run-time + * supported. I am also logging since some programs start to use this when + * they shouldn't. That will be removed if annoying. */ int syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap) { log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", td->td_proc->p_comm, td->td_proc->p_pid, s); /* a " return nosys(p, uap); " here causes a core dump. */ return ENOSYS; } #if !defined(_KPOSIX_PRIORITY_SCHEDULING) /* Not configured but loadable via a module: */ static int sched_attach(void) { return 0; } SYSCALL_NOT_PRESENT_GEN(sched_setparam) SYSCALL_NOT_PRESENT_GEN(sched_getparam) SYSCALL_NOT_PRESENT_GEN(sched_setscheduler) SYSCALL_NOT_PRESENT_GEN(sched_getscheduler) SYSCALL_NOT_PRESENT_GEN(sched_yield) SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max) SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min) SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval) #else /* Configured in kernel version: */ static struct ksched *ksched; static int sched_attach(void) { int ret = ksched_attach(&ksched); if (ret == 0) p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1); return ret; } int sched_setparam(struct thread *td, struct sched_setparam_args *uap) { struct thread *targettd; struct proc *targetp; int e; struct sched_param sched_param; e = copyin(uap->param, &sched_param, sizeof(sched_param)); if (e) return (e); if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) return (ESRCH); targettd = FIRST_THREAD_IN_PROC(targetp); } e = p_cansched(td, targetp); if (e == 0) { e = ksched_setparam(ksched, targettd, (const struct sched_param *)&sched_param); } PROC_UNLOCK(targetp); return (e); } int sched_getparam(struct thread *td, struct sched_getparam_args *uap) { int e; struct sched_param sched_param; struct thread *targettd; struct proc *targetp; if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) { return (ESRCH); } targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ } e = p_cansee(td, targetp); if (e == 0) { e = ksched_getparam(ksched, targettd, &sched_param); } PROC_UNLOCK(targetp); if (e == 0) e = copyout(&sched_param, uap->param, sizeof(sched_param)); return (e); } int sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap) { int e; struct sched_param sched_param; struct thread *targettd; struct proc *targetp; /* Don't allow non root user to set a scheduler policy. */ e = priv_check(td, PRIV_SCHED_SET); if (e) return (e); e = copyin(uap->param, &sched_param, sizeof(sched_param)); if (e) return (e); if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) return (ESRCH); targettd = FIRST_THREAD_IN_PROC(targetp); } e = p_cansched(td, targetp); if (e == 0) { e = ksched_setscheduler(ksched, targettd, uap->policy, (const struct sched_param *)&sched_param); } PROC_UNLOCK(targetp); return (e); } int sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap) { int e, policy; struct thread *targettd; struct proc *targetp; if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) { e = ESRCH; goto done2; } targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ } e = p_cansee(td, targetp); if (e == 0) { e = ksched_getscheduler(ksched, targettd, &policy); td->td_retval[0] = policy; } PROC_UNLOCK(targetp); done2: return (e); } int sched_yield(struct thread *td, struct sched_yield_args *uap) { return (ksched_yield(ksched)); } int sched_get_priority_max(struct thread *td, struct sched_get_priority_max_args *uap) { int error, prio; error = ksched_get_priority_max(ksched, uap->policy, &prio); td->td_retval[0] = prio; return (error); } int sched_get_priority_min(struct thread *td, struct sched_get_priority_min_args *uap) { int error, prio; error = ksched_get_priority_min(ksched, uap->policy, &prio); td->td_retval[0] = prio; return (error); } int sched_rr_get_interval(struct thread *td, struct sched_rr_get_interval_args *uap) { struct timespec timespec; int error; error = kern_sched_rr_get_interval(td, uap->pid, ×pec); if (error == 0) error = copyout(×pec, uap->interval, sizeof(timespec)); return (error); } int kern_sched_rr_get_interval(struct thread *td, pid_t pid, struct timespec *ts) { int e; struct thread *targettd; struct proc *targetp; if (pid == 0) { targettd = td; targetp = td->td_proc; PROC_LOCK(targetp); } else { targetp = td->td_proc; PROC_LOCK(targetp); targettd = thread_find(targetp, pid); if (targettd == NULL) { PROC_UNLOCK(targetp); return (ESRCH); } } e = p_cansee(td, targetp); if (e == 0) e = ksched_rr_get_interval(ksched, targettd, ts); PROC_UNLOCK(targetp); return (e); } #endif static void p31binit(void *notused) { (void) sched_attach(); p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); } SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL); Index: head/sys/kern/sys_generic.c =================================================================== --- head/sys/kern/sys_generic.c (revision 167231) +++ head/sys/kern/sys_generic.c (revision 167232) @@ -1,1147 +1,1132 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollscan(struct thread *, struct pollfd *, u_int); static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); -/* - * Read system call. - */ #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif int read(td, uap) struct thread *td; struct read_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_readv(td, uap->fd, &auio); return(error); } /* * Positioned read system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif int pread(td, uap) struct thread *td; struct pread_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_preadv(td, uap->fd, &auio, uap->offset); return(error); } /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int readv(struct thread *td, struct readv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_readv(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_read(td, fd, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Scatter positioned read system call. */ #ifndef _SYS_SYSPROTO_H_ struct preadv_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int preadv(struct thread *td, struct preadv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_preadv(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_preadv(td, fd, auio, offset) struct thread *td; int fd; struct uio *auio; off_t offset; { struct file *fp; int error; error = fget_read(td, fd, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for readv and preadv that reads data in * from a file using the passed in uio, offset, and flags. */ static int dofileread(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif /* Finish zero length reads right here */ if (auio->uio_resid == 0) { td->td_retval[0] = 0; return(0); } auio->uio_rw = UIO_READ; auio->uio_offset = offset; auio->uio_td = td; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_READ, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } -/* - * Write system call - */ #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif int write(td, uap) struct thread *td; struct write_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_writev(td, uap->fd, &auio); return(error); } /* * Positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif int pwrite(td, uap) struct thread *td; struct pwrite_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_pwritev(td, uap->fd, &auio, uap->offset); return(error); } /* * Gather write system call. */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int writev(struct thread *td, struct writev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_writev(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_write(td, fd, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Gather positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwritev_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int pwritev(struct thread *td, struct pwritev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_pwritev(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_pwritev(td, fd, auio, offset) struct thread *td; struct uio *auio; int fd; off_t offset; { struct file *fp; int error; error = fget_write(td, fd, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for writev and pwritev that writes data to * a file using the passed in uio, offset, and flags. */ static int dofilewrite(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif auio->uio_rw = UIO_WRITE; auio->uio_td = td; auio->uio_offset = offset; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Socket layer is responsible for issuing SIGPIPE. */ if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_WRITE, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } -/* - * Ioctl system call. - */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* ARGSUSED */ int ioctl(struct thread *td, struct ioctl_args *uap) { u_long com; int arg, error; u_int size; caddr_t data; if (uap->com > 0xffffffff) { printf( "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", td->td_proc->p_pid, td->td_proc->p_comm, uap->com); uap->com &= 0xffffffff; } com = uap->com; /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = IOCPARM_LEN(com); if ((size > IOCPARM_MAX) || ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) ((com & IOC_OUT) && size == 0) || #else ((com & (IOC_IN | IOC_OUT)) && size == 0) || #endif ((com & IOC_VOID) && size > 0 && size != sizeof(int))) return (ENOTTY); if (size > 0) { if (!(com & IOC_VOID)) data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); else { /* Integer argument. */ arg = (intptr_t)uap->data; data = (void *)&arg; size = 0; } } else data = (void *)&uap->data; if (com & IOC_IN) { error = copyin(uap->data, data, (u_int)size); if (error) { if (size > 0) free(data, M_IOCTLOPS); return (error); } } else if (com & IOC_OUT) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } error = kern_ioctl(td, uap->fd, com, data); if (error == 0 && (com & IOC_OUT)) error = copyout(data, uap->data, (u_int)size); if (size > 0) free(data, M_IOCTLOPS); return (error); } int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; int error; int tmp; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { fdrop(fp, td); return (EBADF); } fdp = td->td_proc->p_fd; switch (com) { case FIONCLEX: FILEDESC_LOCK_FAST(fdp); fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; FILEDESC_UNLOCK_FAST(fdp); goto out; case FIOCLEX: FILEDESC_LOCK_FAST(fdp); fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_UNLOCK_FAST(fdp); goto out; case FIONBIO: FILE_LOCK(fp); if ((tmp = *(int *)data)) fp->f_flag |= FNONBLOCK; else fp->f_flag &= ~FNONBLOCK; FILE_UNLOCK(fp); data = (void *)&tmp; break; case FIOASYNC: FILE_LOCK(fp); if ((tmp = *(int *)data)) fp->f_flag |= FASYNC; else fp->f_flag &= ~FASYNC; FILE_UNLOCK(fp); data = (void *)&tmp; break; } error = fo_ioctl(fp, com, data, td->td_ucred, td); out: fdrop(fp, td); return (error); } /* * sellock and selwait are initialized in selectinit() via SYSINIT. */ struct mtx sellock; struct cv selwait; u_int nselcoll; /* Select collisions since boot */ SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); -/* - * Select system call. - */ #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif int select(td, uap) register struct thread *td; register struct select_args *uap; { struct timeval tv, *tvp; int error; if (uap->tv != NULL) { error = copyin(uap->tv, &tv, sizeof(tv)); if (error) return (error); tvp = &tv; } else tvp = NULL; return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp)); } int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, fd_set *fd_ex, struct timeval *tvp) { struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval atv, rtv, ttv; int error, timo; u_int ncoll, nbufbytes, ncpbytes, nfdbits; if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; FILEDESC_LOCK_FAST(fdp); if (nd > td->td_proc->p_fd->fd_nfiles) nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ FILEDESC_UNLOCK_FAST(fdp); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. */ nfdbits = roundup(nd, NFDBITS); ncpbytes = nfdbits / NBBY; nbufbytes = 0; if (fd_in != NULL) nbufbytes += 2 * ncpbytes; if (fd_ou != NULL) nbufbytes += 2 * ncpbytes; if (fd_ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. */ sbp = selbits; #define getbits(name, x) \ do { \ if (name == NULL) \ ibits[x] = NULL; \ else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpbytes); \ if (error != 0) \ goto done_nosellock; \ } \ } while (0) getbits(fd_in, 0); getbits(fd_ou, 1); getbits(fd_ex, 2); #undef getbits if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); if (tvp != NULL) { atv = *tvp; if (itimerfix(&atv)) { error = EINVAL; goto done_nosellock; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; TAILQ_INIT(&td->td_selq); mtx_lock(&sellock); retry: ncoll = nselcoll; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); mtx_unlock(&sellock); error = selscan(td, ibits, obits, nd); mtx_lock(&sellock); if (error || td->td_retval[0]) goto done; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } /* * An event of interest may occur while we do not hold * sellock, so check TDF_SELECT and the number of * collisions and rescan the file descriptors if * necessary. */ mtx_lock_spin(&sched_lock); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { mtx_unlock_spin(&sched_lock); goto retry; } mtx_unlock_spin(&sched_lock); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); else error = cv_wait_sig(&selwait, &sellock); if (error == 0) goto retry; done: clear_selinfo_list(td); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); mtx_unlock(&sellock); done_nosellock: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; #define putbits(name, x) \ if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ error = error2; if (error == 0) { int error2; putbits(fd_in, 0); putbits(fd_ou, 1); putbits(fd_ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); return (error); } static int selscan(td, ibits, obits, nfd) struct thread *td; fd_mask **ibits, **obits; int nfd; { int msk, i, fd; fd_mask bits; struct file *fp; int n = 0; /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; for (i = 0; i < nfd; i += NFDBITS) { bits = ibits[msk][i/NFDBITS]; /* ffs(int mask) not portable, fd_mask is long */ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { if (!(bits & 1)) continue; if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (fo_poll(fp, flag[msk], td->td_ucred, td)) { obits[msk][(fd)/NFDBITS] |= ((fd_mask)1 << ((fd) % NFDBITS)); n++; } } } } FILEDESC_UNLOCK(fdp); td->td_retval[0] = n; return (0); } -/* - * Poll system call. - */ #ifndef _SYS_SYSPROTO_H_ struct poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif int poll(td, uap) struct thread *td; struct poll_args *uap; { struct pollfd *bits; struct pollfd smallbits[32]; struct timeval atv, rtv, ttv; int error = 0, timo; u_int ncoll, nfds; size_t ni; nfds = uap->nfds; /* * This is kinda bogus. We have fd limits, but that is not * really related to the size of the pollfd array. Make sure * we let the process use at least FD_SETSIZE entries and at * least enough for the current limits. We want to be reasonably * safe, but not overly restrictive. */ PROC_LOCK(td->td_proc); if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) && (nfds > FD_SETSIZE)) { PROC_UNLOCK(td->td_proc); error = EINVAL; goto done2; } PROC_UNLOCK(td->td_proc); ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallbits)) bits = malloc(ni, M_TEMP, M_WAITOK); else bits = smallbits; error = copyin(uap->fds, bits, ni); if (error) goto done_nosellock; if (uap->timeout != INFTIM) { atv.tv_sec = uap->timeout / 1000; atv.tv_usec = (uap->timeout % 1000) * 1000; if (itimerfix(&atv)) { error = EINVAL; goto done_nosellock; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; TAILQ_INIT(&td->td_selq); mtx_lock(&sellock); retry: ncoll = nselcoll; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); mtx_unlock(&sellock); error = pollscan(td, bits, nfds); mtx_lock(&sellock); if (error || td->td_retval[0]) goto done; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } /* * An event of interest may occur while we do not hold * sellock, so check TDF_SELECT and the number of collisions * and rescan the file descriptors if necessary. */ mtx_lock_spin(&sched_lock); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { mtx_unlock_spin(&sched_lock); goto retry; } mtx_unlock_spin(&sched_lock); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); else error = cv_wait_sig(&selwait, &sellock); if (error == 0) goto retry; done: clear_selinfo_list(td); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); mtx_unlock(&sellock); done_nosellock: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (error == 0) { error = copyout(bits, uap->fds, ni); if (error) goto out; } out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); done2: return (error); } static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { register struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; FILEDESC_LOCK(fdp); for (i = 0; i < nfd; i++, fds++) { if (fds->fd >= fdp->fd_nfiles) { fds->revents = POLLNVAL; n++; } else if (fds->fd < 0) { fds->revents = 0; } else { fp = fdp->fd_ofiles[fds->fd]; if (fp == NULL) { fds->revents = POLLNVAL; n++; } else { /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); if (fds->revents != 0) n++; } } } FILEDESC_UNLOCK(fdp); td->td_retval[0] = n; return (0); } /* * OpenBSD poll system call. * * XXX this isn't quite a true representation.. OpenBSD uses select ops. */ #ifndef _SYS_SYSPROTO_H_ struct openbsd_poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif int openbsd_poll(td, uap) register struct thread *td; register struct openbsd_poll_args *uap; { return (poll(td, (struct poll_args *)uap)); } /* * Remove the references to the thread from all of the objects we were * polling. * * This code assumes that the underlying owner of the selinfo structure will * hold sellock before it changes it, and that it will unlink itself from our * list if it goes away. */ void clear_selinfo_list(td) struct thread *td; { struct selinfo *si; mtx_assert(&sellock, MA_OWNED); TAILQ_FOREACH(si, &td->td_selq, si_thrlist) si->si_thread = NULL; TAILQ_INIT(&td->td_selq); } /* * Record a select request. */ void selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { mtx_lock(&sellock); /* * If the selinfo's thread pointer is NULL then take ownership of it. * * If the thread pointer is not NULL and it points to another * thread, then we have a collision. * * If the thread pointer is not NULL and points back to us then leave * it alone as we've already added pointed it at us and added it to * our list. */ if (sip->si_thread == NULL) { sip->si_thread = selector; TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); } else if (sip->si_thread != selector) { sip->si_flags |= SI_COLL; } mtx_unlock(&sellock); } /* Wake up a selecting thread. */ void selwakeup(sip) struct selinfo *sip; { doselwakeup(sip, -1); } /* Wake up a selecting thread, and set its priority. */ void selwakeuppri(sip, pri) struct selinfo *sip; int pri; { doselwakeup(sip, pri); } /* * Do a wakeup when a selectable event occurs. */ static void doselwakeup(sip, pri) struct selinfo *sip; int pri; { struct thread *td; mtx_lock(&sellock); td = sip->si_thread; if ((sip->si_flags & SI_COLL) != 0) { nselcoll++; sip->si_flags &= ~SI_COLL; cv_broadcastpri(&selwait, pri); } if (td == NULL) { mtx_unlock(&sellock); return; } TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); sip->si_thread = NULL; mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); sleepq_remove(td, &selwait); mtx_unlock(&sellock); } static void selectinit(void *); SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) /* ARGSUSED*/ static void selectinit(dummy) void *dummy; { cv_init(&selwait, "select"); mtx_init(&sellock, "sellck", NULL, MTX_DEF); } Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 167231) +++ head/sys/kern/sys_pipe.c (revision 167232) @@ -1,1631 +1,1630 @@ /*- * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and * the receiving process can copy it directly from the pages in the sending * process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_rfiltops = { 1, NULL, filt_pipedetach, filt_piperead }; static struct filterops pipe_wfiltops = { 1, NULL, filt_pipedetach, filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static int amountpipes; static int amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD, &amountpipes, 0, "Current # of pipes"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe *pipe, int backing); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); static __inline void pipeselwakeup(struct pipe *cpipe); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static void pipe_zone_dtor(void *mem, int size, void *arg); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair), pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); } static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); vfs_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = 1; wpipe->pipe_present = 1; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elswhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; atomic_add_int(&amountpipes, 2); return (0); } static void pipe_zone_dtor(void *mem, int size, void *arg) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size")); pp = (struct pipepair *)mem; atomic_subtract_int(&amountpipes, 2); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } /* - * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, - * let the zone pick up the pieces via pipeclose(). + * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let + * the zone pick up the pieces via pipeclose(). */ - /* ARGSUSED */ int pipe(td, uap) struct thread *td; struct pipe_args /* { int dummy; } */ *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct file *rf, *wf; struct pipepair *pp; struct pipe *rpipe, *wpipe; int fd, error; pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_init_pipe() and mac_create_pipe() are called once * for the pair, and not on the endpoints. */ mac_init_pipe(pp); mac_create_pipe(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; knlist_init(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe), NULL, NULL, NULL); knlist_init(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe), NULL, NULL, NULL); /* Only the forward direction pipe is backed by default */ if ((error = pipe_create(rpipe, 1)) != 0 || (error = pipe_create(wpipe, 0)) != 0) { pipeclose(rpipe); pipeclose(wpipe); return (error); } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; error = falloc(td, &rf, &fd); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc(). */ td->td_retval[0] = fd; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ FILE_LOCK(rf); rf->f_flag = FREAD | FWRITE; rf->f_type = DTYPE_PIPE; rf->f_data = rpipe; rf->f_ops = &pipeops; FILE_UNLOCK(rf); error = falloc(td, &wf, &fd); if (error) { fdclose(fdp, rf, td->td_retval[0], td); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc(). */ FILE_LOCK(wf); wf->f_flag = FREAD | FWRITE; wf->f_type = DTYPE_PIPE; wf->f_data = wpipe; wf->f_ops = &pipeops; FILE_UNLOCK(wf); fdrop(wf, td); td->td_retval[1] = fd; fdrop(rf, td); return (0); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int pipespace_new(cpipe, size) struct pipe *cpipe; int size; { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *) &buffer, size, 1, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { if ((cpipe->pipe_buffer.buffer == NULL) && (size > SMALL_PIPE_SIZE)) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. */ static int pipespace(cpipe, size) struct pipe *cpipe; int size; { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); while (cpipe->pipe_state & PIPE_LOCKFL) { cpipe->pipe_state |= PIPE_LWANT; error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } static __inline void pipeselwakeup(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { cpipe->pipe_state &= ~PIPE_SEL; selwakeuppri(&cpipe->pipe_sel, PSOCK); } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static int pipe_create(pipe, backing) struct pipe *pipe; int backing; { int error; if (backing) { if (amountpipekva > maxpipekva / 2) error = pipespace_new(pipe, SMALL_PIPE_SIZE); else error = pipespace_new(pipe, PIPE_SIZE); } else { /* If we're not backing this pipe, no need to do anything. */ error = 0; } return (error); } /* ARGSUSED */ static int pipe_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct pipe *rpipe = fp->f_data; int error; int nread = 0; u_int size; PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_check_pipe_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if (!(rpipe->pipe_state & PIPE_DIRECTW) && (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_map.ms, rpipe->pipe_map.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~PIPE_DIRECTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(wpipe, uio) struct pipe *wpipe; struct uio *uio; { pmap_t pmap; u_int size; int i, j; vm_offset_t addr, endaddr; PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); KASSERT(wpipe->pipe_state & PIPE_DIRECTW, ("Clone attempt on non-direct write pipe!")); size = (u_int) uio->uio_iov->iov_len; if (size > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; pmap = vmspace_pmap(curproc->p_vmspace); endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { /* * vm_fault_quick() can sleep. Consequently, * vm_page_lock_queue() and vm_page_unlock_queue() * should not be performed outside of this loop. */ race: if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) { vm_page_lock_queues(); for (j = 0; j < i; j++) vm_page_unhold(wpipe->pipe_map.ms[j]); vm_page_unlock_queues(); return (EFAULT); } wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr, VM_PROT_READ); if (wpipe->pipe_map.ms[i] == NULL) goto race; } /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void pipe_destroy_write_buffer(wpipe) struct pipe *wpipe; { int i; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); vm_page_lock_queues(); for (i = 0; i < wpipe->pipe_map.npages; i++) { vm_page_unhold(wpipe->pipe_map.ms[i]); } vm_page_unlock_queues(); wpipe->pipe_map.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(wpipe) struct pipe *wpipe; { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. */ static int pipe_direct_write(wpipe, uio) struct pipe *wpipe; struct uio *uio; { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); error = pipelock(wpipe, 1); if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (error) { pipeunlock(wpipe); goto error1; } while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; else goto retry; } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; else goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); error = pipe_build_write_buffer(wpipe, uio); PIPE_LOCK(wpipe); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; pipeunlock(wpipe); goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); pipeunlock(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); } if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { int error = 0; int desiredsize, orig_resid; struct pipe *wpipe, *rpipe; rpipe = fp->f_data; wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_check_pipe_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if ((amountpipekva > (3 * maxpipekva) / 4) && (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if ((desiredsize != wpipe->pipe_buffer.size) && ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } if (wpipe->pipe_buffer.size == 0) { /* * This can only happen for reverse direction use of pipes * in a complete OOM situation. */ error = ENOMEM; --wpipe->pipe_busy; pipeunlock(wpipe); PIPE_UNLOCK(wpipe); return (error); } pipeunlock(wpipe); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; pipelock(wpipe, 0); if (wpipe->pipe_state & PIPE_EOF) { pipeunlock(wpipe); error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= PIPE_MINDIRECT && wpipe->pipe_buffer.size >= PIPE_MINDIRECT && (fp->f_flag & FNONBLOCK) == 0) { pipeunlock(wpipe); error = pipe_direct_write(wpipe, uio); if (error) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); if (error) break; else continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } pipeunlock(wpipe); if (error != 0) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; pipeunlock(wpipe); break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; } } pipelock(wpipe, 0); --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if I/O was successful */ if ((wpipe->pipe_buffer.cnt == 0) && (uio->uio_resid == 0) && (error == EPIPE)) { error = 0; } if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ static int pipe_ioctl(fp, cmd, data, active_cred, td) struct file *fp; u_long cmd; void *data; struct ucred *active_cred; struct thread *td; { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (mpipe->pipe_state & PIPE_DIRECTW) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct pipe *rpipe = fp->f_data; struct pipe *wpipe; int revents = 0; #ifdef MAC int error; #endif wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); #ifdef MAC error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0) || (rpipe->pipe_state & PIPE_EOF)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) revents |= events & (POLLOUT | POLLWRNORM); if ((rpipe->pipe_state & PIPE_EOF) || (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; if (revents == 0) { if (events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); rpipe->pipe_state |= PIPE_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(fp, ub, active_cred, td) struct file *fp; struct stat *ub; struct ucred *active_cred; struct thread *td; { struct pipe *pipe = fp->f_data; #ifdef MAC int error; PIPE_LOCK(pipe); error = mac_check_pipe_stat(active_cred, pipe->pipe_pair); PIPE_UNLOCK(pipe); if (error) return (error); #endif bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_state & PIPE_DIRECTW) ub->st_size = pipe->pipe_map.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; ub->st_atimespec = pipe->pipe_atime; ub->st_mtimespec = pipe->pipe_mtime; ub->st_ctimespec = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; /* * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. * XXX (st_dev, st_ino) should be unique. */ return (0); } /* ARGSUSED */ static int pipe_close(fp, td) struct file *fp; struct thread *td; { struct pipe *cpipe = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; funsetown(&cpipe->pipe_sigio); pipeclose(cpipe); return (0); } static void pipe_free_kmem(cpipe) struct pipe *cpipe; { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_map.cnt = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(cpipe) struct pipe *cpipe; { struct pipepair *pp; struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); pp = cpipe->pipe_pair; pipeselwakeup(cpipe); /* * If the other side is blocked, wake it up saying that * we want to close it down. */ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present != 0) { pipeselwakeup(ppipe); ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = 0; pipeunlock(cpipe); knlist_clear(&cpipe->pipe_sel.si_note, 1); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. */ if (ppipe->pipe_present == 0) { PIPE_UNLOCK(cpipe); #ifdef MAC mac_destroy_pipe(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; cpipe = kn->kn_fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (!cpipe->pipe_peer->pipe_present) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = cpipe->pipe_peer; break; default: PIPE_UNLOCK(cpipe); return (EINVAL); } knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; PIPE_LOCK(cpipe); if (kn->kn_filter == EVFILT_WRITE) { if (!cpipe->pipe_peer->pipe_present) { PIPE_UNLOCK(cpipe); return; } cpipe = cpipe->pipe_peer; } knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_fp->f_data; struct pipe *wpipe = rpipe->pipe_peer; int ret; PIPE_LOCK(rpipe); kn->kn_data = rpipe->pipe_buffer.cnt; if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; PIPE_UNLOCK(rpipe); return (1); } ret = kn->kn_data > 0; PIPE_UNLOCK(rpipe); return ret; } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_fp->f_data; struct pipe *wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; PIPE_UNLOCK(rpipe); return (1); } kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; PIPE_UNLOCK(rpipe); return (kn->kn_data >= PIPE_BUF); } Index: head/sys/kern/sysv_msg.c =================================================================== --- head/sys/kern/sysv_msg.c (revision 167231) +++ head/sys/kern/sysv_msg.c (revision 167232) @@ -1,1298 +1,1294 @@ /*- * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static void msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); #ifdef MSG_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static void msg_freehdr(struct msg *msghdr); /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *msgcalls[] = { (sy_call_t *)msgctl, (sy_call_t *)msgget, (sy_call_t *)msgsnd, (sy_call_t *)msgrcv }; #ifndef MSGSSZ #define MSGSSZ 8 /* Each segment must be 2^N long */ #endif #ifndef MSGSEG #define MSGSEG 2048 /* must be less than 32767 */ #endif #define MSGMAX (MSGSSZ*MSGSEG) #ifndef MSGMNB #define MSGMNB 2048 /* max # of bytes in a queue */ #endif #ifndef MSGMNI #define MSGMNI 40 #endif #ifndef MSGTQL #define MSGTQL 40 #endif /* * Based on the configuration parameters described in an SVR2 (yes, two) * config(1m) man page. * * Each message is broken up and stored in segments that are msgssz bytes * long. For efficiency reasons, this should be a power of two. Also, * it doesn't make sense if it is less than 8 or greater than about 256. * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of * two between 8 and 1024 inclusive (and panic's if it isn't). */ struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; /* * macros to convert between msqid_ds's and msqid's. * (specific to this implementation) */ #define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) #define MSQID_IX(id) ((id) & 0xffff) #define MSQID_SEQ(id) (((id) >> 16) & 0xffff) /* * The rest of this file is specific to this particular implementation. */ struct msgmap { short next; /* next segment in buffer */ /* -1 -> available */ /* 0..(MSGSEG-1) -> index of next segment */ }; #define MSG_LOCKED 01000 /* Is this msqid_ds locked? */ static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct msg *free_msghdrs;/* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. */ static void msginit() { register int i; TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg); TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz); msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni); TUNABLE_INT_FETCH("kern.ipc.msgmnb", &msginfo.msgmnb); TUNABLE_INT_FETCH("kern.ipc.msgtql", &msginfo.msgtql); msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); if (msgpool == NULL) panic("msgpool is NULL"); msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); if (msgmaps == NULL) panic("msgmaps is NULL"); msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); if (msghdrs == NULL) panic("msghdrs is NULL"); msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG, M_WAITOK); if (msqids == NULL) panic("msqids is NULL"); /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... */ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, msginfo.msgssz)); panic("msginfo.msgssz not a small power of 2"); } if (msginfo.msgseg > 32767) { DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg)); panic("msginfo.msgseg > 32767"); } if (msgmaps == NULL) panic("msgmaps is NULL"); for (i = 0; i < msginfo.msgseg; i++) { if (i > 0) msgmaps[i-1].next = i; msgmaps[i].next = -1; /* implies entry is available */ } free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; if (msghdrs == NULL) panic("msghdrs is NULL"); for (i = 0; i < msginfo.msgtql; i++) { msghdrs[i].msg_type = 0; if (i > 0) msghdrs[i-1].msg_next = &msghdrs[i]; msghdrs[i].msg_next = NULL; #ifdef MAC mac_init_sysv_msgmsg(&msghdrs[i]); #endif } free_msghdrs = &msghdrs[0]; if (msqids == NULL) panic("msqids is NULL"); for (i = 0; i < msginfo.msgmni; i++) { msqids[i].u.msg_qbytes = 0; /* implies entry is available */ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */ msqids[i].u.msg_perm.mode = 0; #ifdef MAC mac_init_sysv_msgqueue(&msqids[i]); #endif } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); } static int msgunload() { struct msqid_kernel *msqkptr; int msqid; #ifdef MAC int i; #endif for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. */ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 || (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) break; } if (msqid != msginfo.msgmni) return (EBUSY); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_destroy_sysv_msgmsg(&msghdrs[i]); for (msqid = 0; msqid < msginfo.msgmni; msqid++) mac_destroy_sysv_msgqueue(&msqids[msqid]); #endif free(msgpool, M_MSG); free(msgmaps, M_MSG); free(msghdrs, M_MSG); free(msqids, M_MSG); mtx_destroy(&msq_mtx); return (0); } static int sysvmsg_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: msginit(); break; case MOD_UNLOAD: error = msgunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvmsg_mod = { "sysvmsg", &sysvmsg_modload, NULL }; SYSCALL_MODULE_HELPER(msgsys); SYSCALL_MODULE_HELPER(msgctl); SYSCALL_MODULE_HELPER(msgget); SYSCALL_MODULE_HELPER(msgsnd); SYSCALL_MODULE_HELPER(msgrcv); DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST); MODULE_VERSION(sysvmsg, 1); /* * Entry point for all MSG calls. */ int msgsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct msgsys_args /* { int which; int a2; int a3; int a4; int a5; int a6; } */ *uap; { int error; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) return (EINVAL); error = (*msgcalls[uap->which])(td, &uap->a2); return (error); } static void msg_freehdr(msghdr) struct msg *msghdr; { while (msghdr->msg_ts > 0) { short next; if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) panic("msghdr->msg_spot out of range"); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } if (msghdr->msg_spot != -1) panic("msghdr->msg_spot != -1"); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; #ifdef MAC mac_cleanup_sysv_msgmsg(msghdr); #endif } #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; int cmd; struct msqid_ds *buf; }; #endif - int msgctl(td, uap) struct thread *td; register struct msgctl_args *uap; { int msqid = uap->msqid; int cmd = uap->cmd; struct msqid_ds msqbuf; int error; DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf)); if (cmd == IPC_SET && (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0) return (error); error = kern_msgctl(td, msqid, cmd, &msqbuf); if (cmd == IPC_STAT && error == 0) error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds)); return (error); } int kern_msgctl(td, msqid, cmd, msqbuf) struct thread *td; int msqid; int cmd; struct msqid_ds *msqbuf; { int rval, error, msqix; register struct msqid_kernel *msqkptr; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such msqid\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_check_sysv_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) goto done2; #endif error = 0; rval = 0; switch (cmd) { case IPC_RMID: { struct msg *msghdr; if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; #ifdef MAC /* * Check that the thread has MAC access permissions to * individual msghdrs. Note: We need to do this in a * separate loop because the actual loop alters the * msq/msghdr info as it progresses, and there is no going * back if half the way through we discover that the * thread cannot free a certain msghdr. The msq will get * into an inconsistent state. */ for (msghdr = msqkptr->u.msg_first; msghdr != NULL; msghdr = msghdr->msg_next) { error = mac_check_sysv_msgrmid(td->td_ucred, msghdr); if (error != 0) goto done2; } #endif /* Free the message headers */ msghdr = msqkptr->u.msg_first; while (msghdr != NULL) { struct msg *msghdr_tmp; /* Free the segments of each message */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } if (msqkptr->u.msg_cbytes != 0) panic("msg_cbytes is screwed up"); if (msqkptr->u.msg_qnum != 0) panic("msg_qnum is screwed up"); msqkptr->u.msg_qbytes = 0; /* Mark it as free */ #ifdef MAC mac_cleanup_sysv_msgqueue(msqkptr); #endif wakeup(msqkptr); } break; case IPC_SET: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) { error = priv_check_cred(td->td_ucred, PRIV_IPC_MSGSIZE, SUSER_ALLOWJAIL); if (error) goto done2; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { DPRINTF(("can't increase msg_qbytes beyond %d" "(truncating)\n", msginfo.msgmnb)); msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ } if (msqbuf->msg_qbytes == 0) { DPRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* non-standard errno! */ goto done2; } msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqkptr->u.msg_qbytes = msqbuf->msg_qbytes; msqkptr->u.msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } *msqbuf = msqkptr->u; break; default: DPRINTF(("invalid command %d\n", cmd)); error = EINVAL; goto done2; } if (error == 0) td->td_retval[0] = rval; done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgget_args { key_t key; int msgflg; }; #endif - int msgget(td, uap) struct thread *td; register struct msgget_args *uap; { int msqid, error = 0; int key = uap->key; int msgflg = uap->msgflg; struct ucred *cred = td->td_ucred; register struct msqid_kernel *msqkptr = NULL; DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&msq_mtx); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->u.msg_perm.key == key) break; } if (msqid < msginfo.msgmni) { DPRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, msgflg & 0700))) { DPRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto done2; } #ifdef MAC error = mac_check_sysv_msqget(cred, msqkptr); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. */ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes == 0 && (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { DPRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto done2; } DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; msqkptr->u.msg_perm.uid = cred->cr_uid; msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; msqkptr->u.msg_cbytes = 0; msqkptr->u.msg_qnum = 0; msqkptr->u.msg_qbytes = msginfo.msgmnb; msqkptr->u.msg_lspid = 0; msqkptr->u.msg_lrpid = 0; msqkptr->u.msg_stime = 0; msqkptr->u.msg_rtime = 0; msqkptr->u.msg_ctime = time_second; #ifdef MAC mac_create_sysv_msgqueue(cred, msqkptr); #endif } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: /* Construct the unique msqid */ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm); done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgsnd_args { int msqid; const void *msgp; size_t msgsz; int msgflg; }; #endif - int kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) struct thread *td; int msqid; const void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; int msgflg; long mtype; { int msqix, segs_needed, error = 0; register struct msqid_kernel *msqkptr; register struct msg *msghdr; short next; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&msq_mtx); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); error = EINVAL; goto done2; } msqkptr = &msqids[msqix]; if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; } #ifdef MAC error = mac_check_sysv_msqsnd(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { DPRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { DPRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { DPRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto done2; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { DPRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ DPRINTF(("we own the msqid_ds\n")); msqkptr->u.msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; if (error == EWOULDBLOCK) { DPRINTF(("msgsnd: timed out\n")); continue; } if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } else { DPRINTF(("got all the resources that we need\n")); break; } } /* * We have the resources that we need. * Make sure! */ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) panic("msg_perm.mode & MSG_LOCKED"); if (segs_needed > nfree_msgmaps) panic("segs_needed > nfree_msgmaps"); if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) panic("msgsz + msg_cbytes > msg_qbytes"); if (free_msghdrs == NULL) panic("no more msghdrs"); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) panic("msqid_ds is already locked"); msqkptr->u.msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; msghdr->msg_type = mtype; #ifdef MAC /* * XXXMAC: Should the mac_check_sysv_msgmsq check follow here * immediately? Or, should it be checked just before the msg is * enqueued in the msgq (as it is done now)? */ mac_create_sysv_msgmsg(td->td_ucred, msqkptr, msghdr); #endif /* * Allocate space for the message */ while (segs_needed > 0) { if (nfree_msgmaps <= 0) panic("not enough msgmaps"); if (free_msgmaps == -1) panic("nil free_msgmaps"); next = free_msgmaps; if (next <= -1) panic("next too low #1"); if (next >= msginfo.msgseg) panic("next out of range #1"); DPRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto done2; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; if (next <= -1) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); mtx_unlock(&msq_mtx); if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz], tlen)) != 0) { mtx_lock(&msq_mtx); DPRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); goto done2; } mtx_lock(&msq_mtx); msgsz -= tlen; msgp = (const char *)msgp + tlen; next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* * We've got the message. Unlock the msqid_ds. */ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. */ if (msqkptr->u.msg_qbytes == 0) { msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; goto done2; } #ifdef MAC /* * Note: Since the task/thread allocates the msghdr and usually * primes it with its own MAC label, for a majority of policies, it * won't be necessary to check whether the msghdr has access * permissions to the msgq. The mac_check_sysv_msqsnd check would * suffice in that case. However, this hook may be required where * individual policies derive a non-identical label for the msghdr * from the current thread label and may want to check the msghdr * enqueue permissions, along with read/write permissions to the * msgq. */ error = mac_check_sysv_msgmsq(td->td_ucred, msghdr, msqkptr); if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } #endif /* * Put the message into the queue */ if (msqkptr->u.msg_first == NULL) { msqkptr->u.msg_first = msghdr; msqkptr->u.msg_last = msghdr; } else { msqkptr->u.msg_last->msg_next = msghdr; msqkptr->u.msg_last = msghdr; } msqkptr->u.msg_last->msg_next = NULL; msqkptr->u.msg_cbytes += msghdr->msg_ts; msqkptr->u.msg_qnum++; msqkptr->u.msg_lspid = td->td_proc->p_pid; msqkptr->u.msg_stime = time_second; wakeup(msqkptr); td->td_retval[0] = 0; done2: mtx_unlock(&msq_mtx); return (error); } int msgsnd(td, uap) struct thread *td; register struct msgsnd_args *uap; { int error; long mtype; DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgflg)); if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) { DPRINTF(("error %d copying the message type\n", error)); return (error); } return (kern_msgsnd(td, uap->msqid, (const char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgflg, mtype)); } #ifndef _SYS_SYSPROTO_H_ struct msgrcv_args { int msqid; void *msgp; size_t msgsz; long msgtyp; int msgflg; }; #endif - int kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype) struct thread *td; int msqid; void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; long msgtyp; int msgflg; long *mtype; { size_t len; register struct msqid_kernel *msqkptr; register struct msg *msghdr; int msqix, error = 0; short next; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } #ifdef MAC error = mac_check_sysv_msqrcv(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqkptr->u.msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("first message on the queue " "is too big (want %zu, got %d)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_check_sysv_msgrcv(td->td_ucred, msghdr); if (error != 0) goto done2; #endif if (msqkptr->u.msg_first == msqkptr->u.msg_last) { msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { msqkptr->u.msg_first = msghdr->msg_next; if (msqkptr->u.msg_first == NULL) panic("msg_first/last screwed up #1"); } } } else { struct msg *previous; struct msg **prev; previous = NULL; prev = &(msqkptr->u.msg_first); while ((msghdr = *prev) != NULL) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? * Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! */ if (msgtyp == msghdr->msg_type || msghdr->msg_type <= -msgtyp) { DPRINTF(("found message type %ld, " "requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("requested message " "on the queue is too big " "(want %zu, got %hu)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_check_sysv_msgrcv( td->td_ucred, msghdr); if (error != 0) goto done2; #endif *prev = msghdr->msg_next; if (msghdr == msqkptr->u.msg_last) { if (previous == NULL) { if (prev != &msqkptr->u.msg_first) panic("msg_first/last screwed up #2"); msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { if (prev == &msqkptr->u.msg_first) panic("msg_first/last screwed up #3"); msqkptr->u.msg_last = previous; } } break; } previous = msghdr; prev = &(msghdr->msg_next); } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); /* The SVID says to return ENOMSG. */ error = ENOMSG; goto done2; } /* * Wait for something to happen */ DPRINTF(("msgrcv: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); if (error != 0) { DPRINTF(("msgrcv: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0 || msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; *mtype = msghdr->msg_type; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; if (next <= -1) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); mtx_unlock(&msq_mtx); error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen); mtx_lock(&msq_mtx); if (error != 0) { DPRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } msgp = (char *)msgp + tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. */ msg_freehdr(msghdr); wakeup(msqkptr); td->td_retval[0] = msgsz; done2: mtx_unlock(&msq_mtx); return (error); } int msgrcv(td, uap) struct thread *td; register struct msgrcv_args *uap; { int error; long mtype; DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg)); if ((error = kern_msgrcv(td, uap->msqid, (char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0) DPRINTF(("error %d copying the message type\n", error)); return (error); } static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, msqids, sizeof(struct msqid_kernel) * msginfo.msgmni)); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "Maximum message size"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0, "Number of message queue identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0, "Maximum number of bytes in a queue"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0, "Maximum number of messages in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLFLAG_RD, NULL, 0, sysctl_msqids, "", "Message queue IDs"); Index: head/sys/kern/sysv_sem.c =================================================================== --- head/sys/kern/sysv_sem.c (revision 167231) +++ head/sys/kern/sysv_sem.c (revision 167232) @@ -1,1364 +1,1361 @@ /*- * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); #ifdef SEM_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static void seminit(void); static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); static int semvalid(int semid, struct semid_kernel *semakptr); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; int __semctl(struct thread *td, struct __semctl_args *uap); struct semget_args; int semget(struct thread *td, struct semget_args *uap); struct semop_args; int semop(struct thread *td, struct semop_args *uap); #endif static struct sem_undo *semu_alloc(struct thread *td); static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semnum, int adjval); static void semundo_clear(int semid, int semnum); /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { (sy_call_t *)__semctl, (sy_call_t *)semget, (sy_call_t *)semop }; static struct mtx sem_mtx; /* semaphore global lock */ static int semtot = 0; static struct semid_kernel *sema; /* semaphore id pool */ static struct mtx *sema_mtx; /* semaphore id pool mutexes*/ static struct sem *sem; /* semaphore pool */ SLIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; #define SEMUNDO_MTX sem_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); #define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX); #define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how)); struct sem { u_short semval; /* semaphore value */ pid_t sempid; /* pid of last operation */ u_short semncnt; /* # awaiting semval > cval */ u_short semzcnt; /* # awaiting semval = 0 */ }; /* * Undo structure (one per process) */ struct sem_undo { SLIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */ struct proc *un_proc; /* owner of this structure */ short un_cnt; /* # of active entries */ struct undo { short un_adjval; /* adjust on exit values */ short un_num; /* semaphore # */ int un_id; /* semid */ } un_ent[1]; /* undo entries */ }; /* * Configuration parameters */ #ifndef SEMMNI #define SEMMNI 10 /* # of semaphore identifiers */ #endif #ifndef SEMMNS #define SEMMNS 60 /* # of semaphores in system */ #endif #ifndef SEMUME #define SEMUME 10 /* max # of undo entries per process */ #endif #ifndef SEMMNU #define SEMMNU 30 /* # of undo structures in system */ #endif /* shouldn't need tuning */ #ifndef SEMMAP #define SEMMAP 30 /* # of entries in semaphore map */ #endif #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM #define SEMOPM 100 /* max # of operations per semop call */ #endif #define SEMVMX 32767 /* semaphore maximum value */ #define SEMAEM 16384 /* adjust on exit max value */ /* * Due to the way semaphore memory is allocated, we have to ensure that * SEMUSZ is properly aligned. */ #define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) /* actual size of an undo structure */ #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) /* * Macro to find a particular sem_undo vector */ #define SEMU(ix) \ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) /* * semaphore info struct */ struct seminfo seminfo = { SEMMAP, /* # of entries in semaphore map */ SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ SEMMSL, /* max # of semaphores per id */ SEMOPM, /* max # of operations per semop call */ SEMUME, /* max # of undo entries per process */ SEMUSZ, /* size in bytes of undo structure */ SEMVMX, /* semaphore maximum value */ SEMAEM /* adjust on exit max value */ }; SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0, "Number of entries in the semaphore map"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, "Maximum number of semaphores in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0, "Maximum number of undo structures in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0, "Max semaphores per id"); SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0, "Max operations per semop call"); SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0, "Max undo entries per process"); SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0, "Size in bytes of undo structure"); SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0, "Adjust on exit max value"); SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLFLAG_RD, NULL, 0, sysctl_sema, "", ""); static void seminit(void) { int i; TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap); TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni); TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns); TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu); TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl); TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm); TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume); TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz); TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx); TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem); sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM, M_WAITOK); sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); for (i = 0; i < seminfo.semmni; i++) { sema[i].u.sem_base = 0; sema[i].u.sem_perm.mode = 0; sema[i].u.sem_perm.seq = 0; #ifdef MAC mac_init_sysv_sem(&sema[i]); #endif } for (i = 0; i < seminfo.semmni; i++) mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF); for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(i); suptr->un_proc = NULL; } SLIST_INIT(&semu_list); mtx_init(&sem_mtx, "sem", NULL, MTX_DEF); semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); } static int semunload(void) { int i; if (semtot != 0) return (EBUSY); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); #ifdef MAC for (i = 0; i < seminfo.semmni; i++) mac_destroy_sysv_sem(&sema[i]); #endif free(sem, M_SEM); free(sema, M_SEM); free(semu, M_SEM); for (i = 0; i < seminfo.semmni; i++) mtx_destroy(&sema_mtx[i]); mtx_destroy(&sem_mtx); return (0); } static int sysvsem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: seminit(); break; case MOD_UNLOAD: error = semunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvsem_mod = { "sysvsem", &sysvsem_modload, NULL }; SYSCALL_MODULE_HELPER(semsys); SYSCALL_MODULE_HELPER(__semctl); SYSCALL_MODULE_HELPER(semget); SYSCALL_MODULE_HELPER(semop); DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sysvsem, 1); /* * Entry point for all SEM calls. */ int semsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct semsys_args /* { int which; int a2; int a3; int a4; int a5; } */ *uap; { int error; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) return (EINVAL); error = (*semcalls[uap->which])(td, &uap->a2); return (error); } /* * Allocate a new sem_undo structure for a process * (returns ptr to structure or NULL if no more room) */ static struct sem_undo * semu_alloc(td) struct thread *td; { int i; struct sem_undo *suptr; struct sem_undo **supptr; int attempt; SEMUNDO_LOCKASSERT(MA_OWNED); /* * Try twice to allocate something. * (we'll purge an empty structure after the first pass so * two passes are always enough) */ for (attempt = 0; attempt < 2; attempt++) { /* * Look for a free structure. * Fill it in and return it if we find one. */ for (i = 0; i < seminfo.semmnu; i++) { suptr = SEMU(i); if (suptr->un_proc == NULL) { SLIST_INSERT_HEAD(&semu_list, suptr, un_next); suptr->un_cnt = 0; suptr->un_proc = td->td_proc; return(suptr); } } /* * We didn't find a free one, if this is the first attempt * then try to free a structure. */ if (attempt == 0) { /* All the structures are in use - try to free one */ int did_something = 0; SLIST_FOREACH_PREVPTR(suptr, supptr, &semu_list, un_next) { if (suptr->un_cnt == 0) { suptr->un_proc = NULL; did_something = 1; *supptr = SLIST_NEXT(suptr, un_next); break; } } /* If we didn't free anything then just give-up */ if (!did_something) return(NULL); } else { /* * The second pass failed even though we freed * something after the first pass! * This is IMPOSSIBLE! */ panic("semu_alloc - second attempt failed"); } } return (NULL); } /* * Adjust a particular entry for a particular proc */ static int semundo_adjust(td, supptr, semid, semnum, adjval) struct thread *td; struct sem_undo **supptr; int semid, semnum; int adjval; { struct proc *p = td->td_proc; struct sem_undo *suptr; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); /* Look for and remember the sem_undo if the caller doesn't provide it */ suptr = *supptr; if (suptr == NULL) { SLIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) { *supptr = suptr; break; } } if (suptr == NULL) { if (adjval == 0) return(0); suptr = semu_alloc(td); if (suptr == NULL) return(ENOSPC); *supptr = suptr; } } /* * Look for the requested entry and adjust it (delete if adjval becomes * 0). */ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; if (adjval != 0) { adjval += sunptr->un_adjval; if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); } sunptr->un_adjval = adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; } return(0); } /* Didn't find the right entry - create it */ if (adjval == 0) return(0); if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); if (suptr->un_cnt != seminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; } else return(EINVAL); return(0); } static void semundo_clear(semid, semnum) int semid, semnum; { struct sem_undo *suptr; SEMUNDO_LOCKASSERT(MA_OWNED); SLIST_FOREACH(suptr, &semu_list, un_next) { struct undo *sunptr = &suptr->un_ent[0]; int i = 0; while (i < suptr->un_cnt) { if (sunptr->un_id == semid) { if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; if (i < suptr->un_cnt) { suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; continue; } } if (semnum != -1) break; } i++, sunptr++; } } } static int semvalid(semid, semakptr) int semid; struct semid_kernel *semakptr; { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0); } /* * Note that the user-mode half of this passes a union, not a pointer. */ #ifndef _SYS_SYSPROTO_H_ struct __semctl_args { int semid; int semnum; int cmd; union semun *arg; }; #endif - int __semctl(td, uap) struct thread *td; struct __semctl_args *uap; { struct semid_ds dsbuf; union semun arg, semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsbuf, sizeof(dsbuf)); if (error) return (error); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: error = copyout(&dsbuf, arg.buf, sizeof(dsbuf)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } int kern_semctl(struct thread *td, int semid, int semnum, int cmd, union semun *arg, register_t *rval) { u_short *array; struct ucred *cred = td->td_ucred; int i, error; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; u_short usval, count; int semidx; DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); array = NULL; switch(cmd) { case SEM_STAT: /* * For this command we assume semid is an array index * rather than an IPC id. */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC error = mac_check_sysv_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); } semidx = IPCID_TO_IX(semid); if (semidx < 0 || semidx >= seminfo.semmni) return (EINVAL); semakptr = &sema[semidx]; sema_mtxp = &sema_mtx[semidx]; mtx_lock(sema_mtxp); #ifdef MAC error = mac_check_sysv_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif error = 0; *rval = 0; switch (cmd) { case IPC_RMID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; semakptr->u.sem_perm.cuid = cred->cr_uid; semakptr->u.sem_perm.uid = cred->cr_uid; semtot -= semakptr->u.sem_nsems; for (i = semakptr->u.sem_base - sem; i < semtot; i++) sem[i] = sem[i + semakptr->u.sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) sema[i].u.sem_base -= semakptr->u.sem_nsems; } semakptr->u.sem_perm.mode = 0; #ifdef MAC mac_cleanup_sysv_sem(semakptr); #endif SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case IPC_SET: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sbuf = arg->buf; semakptr->u.sem_perm.uid = sbuf->sem_perm.uid; semakptr->u.sem_perm.gid = sbuf->sem_perm.gid; semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode & ~0777) | (sbuf->sem_perm.mode & 0777); semakptr->u.sem_ctime = time_second; break; case IPC_STAT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); break; case GETNCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semncnt; break; case GETPID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].sempid; break; case GETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semval; break; case GETALL: /* * Unfortunately, callers of this function don't know * in advance how many semaphores are in this set. * While we could just allocate the maximum size array * and pass the actual size back to the caller, that * won't work for SETALL since we can't copyin() more * data than the user specified as we may return a * spurious EFAULT. * * Note that the number of semaphores in a set is * fixed for the life of that set. The only way that * the 'count' could change while are blocked in * malloc() is if this semaphore set were destroyed * and a new one created with the same index. * However, semvalid() will catch that due to the * sequence number unless exactly 0x8000 (or a * multiple thereof) semaphore sets for the same index * are created and destroyed while we are in malloc! * */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) array[i] = semakptr->u.sem_base[i].semval; mtx_unlock(sema_mtxp); error = copyout(array, arg->array, count * sizeof(*array)); mtx_lock(sema_mtxp); break; case GETZCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semzcnt; break; case SETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } if (arg->val < 0 || arg->val > seminfo.semvmx) { error = ERANGE; goto done2; } semakptr->u.sem_base[semnum].semval = arg->val; SEMUNDO_LOCK(); semundo_clear(semidx, semnum); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case SETALL: /* * See comment on GETALL for why 'count' shouldn't change * and why we require a userland buffer. */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); error = copyin(arg->array, array, count * sizeof(*array)); if (error) break; mtx_lock(sema_mtxp); if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) { usval = array[i]; if (usval > seminfo.semvmx) { error = ERANGE; break; } semakptr->u.sem_base[i].semval = usval; } SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; default: error = EINVAL; break; } done2: mtx_unlock(sema_mtxp); if (array != NULL) free(array, M_TEMP); return(error); } #ifndef _SYS_SYSPROTO_H_ struct semget_args { key_t key; int nsems; int semflg; }; #endif - int semget(td, uap) struct thread *td; struct semget_args *uap; { int semid, error = 0; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && sema[semid].u.sem_perm.key == key) break; } if (semid < seminfo.semmni) { DPRINTF(("found public key\n")); if ((error = ipcperm(td, &sema[semid].u.sem_perm, semflg & 0700))) { goto done2; } if (nsems > 0 && sema[semid].u.sem_nsems < nsems) { DPRINTF(("too small\n")); error = EINVAL; goto done2; } if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } #ifdef MAC error = mac_check_sysv_semget(cred, &sema[semid]); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the semid_kernel\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > seminfo.semmsl) { DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto done2; } if (nsems > seminfo.semmns - semtot) { DPRINTF(( "not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto done2; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { DPRINTF(("no more semid_kernel's available\n")); error = ENOSPC; goto done2; } DPRINTF(("semid %d is available\n", semid)); sema[semid].u.sem_perm.key = key; sema[semid].u.sem_perm.cuid = cred->cr_uid; sema[semid].u.sem_perm.uid = cred->cr_uid; sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; sema[semid].u.sem_otime = 0; sema[semid].u.sem_ctime = time_second; sema[semid].u.sem_base = &sem[semtot]; semtot += nsems; bzero(sema[semid].u.sem_base, sizeof(sema[semid].u.sem_base[0])*nsems); #ifdef MAC mac_create_sysv_sem(cred, &sema[semid]); #endif DPRINTF(("sembase = %p, next = %p\n", sema[semid].u.sem_base, &sem[semtot])); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm); done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct semop_args { int semid; struct sembuf *sops; size_t nsops; }; #endif - int semop(td, uap) struct thread *td; struct semop_args *uap; { #define SMALL_SOPS 8 struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; struct sembuf *sops; struct semid_kernel *semakptr; struct sembuf *sopptr = 0; struct sem *semptr = 0; struct sem_undo *suptr; struct mtx *sema_mtxp; size_t i, j, k; int error; int do_wakeup, do_undos; #ifdef SEM_DEBUG sops = NULL; #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; else if (nsops <= seminfo.semopm) sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); else { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, uap->sops, sops, nsops * sizeof(sops[0]))); if (sops != small_sops) free(sops, M_SEM); return (error); } semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if (semakptr->u.sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { error = EINVAL; goto done2; } /* * Initial pass thru sops to see what permissions are needed. * Also perform any checks that don't need repeating on each * attempt to satisfy the request vector. */ j = 0; /* permission needed */ do_undos = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; if (sopptr->sem_num >= semakptr->u.sem_nsems) { error = EFBIG; goto done2; } if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) do_undos = 1; j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; } if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) { DPRINTF(("error = %d from ipaccess\n", error)); goto done2; } #ifdef MAC error = mac_check_sysv_semop(td->td_ucred, semakptr, j); if (error != 0) goto done2; #endif /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). */ for (;;) { do_wakeup = 0; error = 0; /* error return if necessary */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; DPRINTF(( "semop: semakptr=%p, sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semakptr, semakptr->u.sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if (semptr->semval + sopptr->sem_op < 0) { DPRINTF(("semop: can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } } else if (sopptr->sem_op == 0) { if (semptr->semval != 0) { DPRINTF(("semop: not zero now\n")); break; } } else if (semptr->semval + sopptr->sem_op > seminfo.semvmx) { error = ERANGE; break; } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ DPRINTF(("semop: rollback 0 through %d\n", i-1)); for (j = 0; j < i; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; /* If we detected an error, return it */ if (error != 0) goto done2; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. */ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto done2; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; DPRINTF(("semop: good night!\n")); error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, "semwait", 0); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ /* * Make sure that the semaphore still exists */ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { error = EIDRM; goto done2; } /* * The semaphore is still alive. Readjust the count of * waiting processes. */ if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* * Is it really morning, or was our sleep interrupted? * (Delayed check of msleep() return code because we * need to decrement sem[nz]cnt either way.) */ if (error != 0) { error = EINTR; goto done2; } DPRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { SEMUNDO_LOCK(); suptr = NULL; for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(td, &suptr, semid, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. This guarantees that we won't run * out of space as we roll things back out. */ for (j = 0; j < i; j++) { k = i - j - 1; if ((sops[k].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[k].sem_op; if (adjval == 0) continue; if (semundo_adjust(td, &suptr, semid, sops[k].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (j = 0; j < nsops; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; DPRINTF(("error = %d from semundo_adjust\n", error)); SEMUNDO_UNLOCK(); goto done2; } /* loop through the sops */ SEMUNDO_UNLOCK(); } /* if (do_undos) */ /* We're definitely done - set the sempid's and time */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; semptr->sempid = td->td_proc->p_pid; } semakptr->u.sem_otime = time_second; /* * Do a wakeup if any semaphore was up'd whilst something was * sleeping on it. */ if (do_wakeup) { DPRINTF(("semop: doing wakeup\n")); wakeup(semakptr); DPRINTF(("semop: back from wakeup\n")); } DPRINTF(("semop: done\n")); td->td_retval[0] = 0; done2: mtx_unlock(sema_mtxp); if (sops != small_sops) free(sops, M_SEM); return (error); } /* * Go through the undo structures for this process and apply the adjustments to * semaphores. */ static void semexit_myhook(arg, p) void *arg; struct proc *p; { struct sem_undo *suptr; struct sem_undo **supptr; /* * Go through the chain of undo vectors looking for one * associated with this process. */ SEMUNDO_LOCK(); SLIST_FOREACH_PREVPTR(suptr, supptr, &semu_list, un_next) { if (suptr->un_proc == p) break; } SEMUNDO_UNLOCK(); if (suptr == NULL) return; DPRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { int ix; for (ix = 0; ix < suptr->un_cnt; ix++) { int semid = suptr->un_ent[ix].un_id; int semnum = suptr->un_ent[ix].un_num; int adjval = suptr->un_ent[ix].un_adjval; struct semid_kernel *semakptr; struct mtx *sema_mtxp; semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); SEMUNDO_LOCK(); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) panic("semexit - semid not allocated"); if (semnum >= semakptr->u.sem_nsems) panic("semexit - semnum out of range"); DPRINTF(( "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semakptr->u.sem_base[semnum].semval)); if (adjval < 0) { if (semakptr->u.sem_base[semnum].semval < -adjval) semakptr->u.sem_base[semnum].semval = 0; else semakptr->u.sem_base[semnum].semval += adjval; } else semakptr->u.sem_base[semnum].semval += adjval; wakeup(semakptr); DPRINTF(("semexit: back from wakeup\n")); mtx_unlock(sema_mtxp); SEMUNDO_UNLOCK(); } } /* * Deallocate the undo vector. */ DPRINTF(("removing vector\n")); suptr->un_proc = NULL; *supptr = SLIST_NEXT(suptr, un_next); } static int sysctl_sema(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, sema, sizeof(struct semid_kernel) * seminfo.semmni)); } Index: head/sys/kern/sysv_shm.c =================================================================== --- head/sys/kern/sysv_shm.c (revision 167231) +++ head/sys/kern/sysv_shm.c (revision 167232) @@ -1,1023 +1,1018 @@ /* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ /*- * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Adam Glass and Charles * Hannum. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmctl_args; static int oshmctl(struct thread *td, struct oshmctl_args *uap); #endif static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode); static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum); #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *shmcalls[] = { (sy_call_t *)shmat, (sy_call_t *)oshmctl, (sy_call_t *)shmdt, (sy_call_t *)shmget, (sy_call_t *)shmctl }; #endif #define SHMSEG_FREE 0x0200 #define SHMSEG_REMOVED 0x0400 #define SHMSEG_ALLOCATED 0x0800 #define SHMSEG_WANTED 0x1000 static int shm_last_free, shm_nused, shm_committed, shmalloced; static struct shmid_kernel *shmsegs; struct shmmap_state { vm_offset_t va; int shmid; }; static void shm_deallocate_segment(struct shmid_kernel *); static int shm_find_segment_by_key(key_t); static struct shmid_kernel *shm_find_segment_by_shmid(int); static struct shmid_kernel *shm_find_segment_by_shmidx(int); static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *); static void shmrealloc(void); static void shminit(void); static int sysvshm_modload(struct module *, int, void *); static int shmunload(void); static void shmexit_myhook(struct vmspace *vm); static void shmfork_myhook(struct proc *p1, struct proc *p2); static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); /* * Tuneable values. */ #ifndef SHMMAXPGS #define SHMMAXPGS 8192 /* Note: sysv shared memory is swap backed. */ #endif #ifndef SHMMAX #define SHMMAX (SHMMAXPGS*PAGE_SIZE) #endif #ifndef SHMMIN #define SHMMIN 1 #endif #ifndef SHMMNI #define SHMMNI 192 #endif #ifndef SHMSEG #define SHMSEG 128 #endif #ifndef SHMALL #define SHMALL (SHMMAXPGS) #endif struct shminfo shminfo = { SHMMAX, SHMMIN, SHMMNI, SHMSEG, SHMALL }; static int shm_use_phys; static int shm_allow_removed; SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0, "Maximum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0, "Minimum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0, "Number of shared memory identifiers"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0, "Number of segments per process"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0, "Maximum number of pages available for shared memory"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW, &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RW, &shm_allow_removed, 0, "Enable/Disable attachment to attached segments marked for removal"); SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLFLAG_RD, NULL, 0, sysctl_shmsegs, "", "Current number of shared memory segments allocated"); static int shm_find_segment_by_key(key) key_t key; { int i; for (i = 0; i < shmalloced; i++) if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) && shmsegs[i].u.shm_perm.key == key) return (i); return (-1); } static struct shmid_kernel * shm_find_segment_by_shmid(int shmid) { int segnum; struct shmid_kernel *shmseg; segnum = IPCID_TO_IX(shmid); if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) || shmseg->u.shm_perm.seq != IPCID_TO_SEQ(shmid)) return (NULL); return (shmseg); } static struct shmid_kernel * shm_find_segment_by_shmidx(int segnum) { struct shmid_kernel *shmseg; if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0)) return (NULL); return (shmseg); } static void shm_deallocate_segment(shmseg) struct shmid_kernel *shmseg; { size_t size; GIANT_REQUIRED; vm_object_deallocate(shmseg->u.shm_internal); shmseg->u.shm_internal = NULL; size = round_page(shmseg->u.shm_segsz); shm_committed -= btoc(size); shm_nused--; shmseg->u.shm_perm.mode = SHMSEG_FREE; #ifdef MAC mac_cleanup_sysv_shm(shmseg); #endif } static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s) { struct shmid_kernel *shmseg; int segnum, result; size_t size; GIANT_REQUIRED; segnum = IPCID_TO_IX(shmmap_s->shmid); shmseg = &shmsegs[segnum]; size = round_page(shmseg->u.shm_segsz); result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size); if (result != KERN_SUCCESS) return (EINVAL); shmmap_s->shmid = -1; shmseg->u.shm_dtime = time_second; if ((--shmseg->u.shm_nattch <= 0) && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } return (0); } #ifndef _SYS_SYSPROTO_H_ struct shmdt_args { const void *shmaddr; }; #endif - int shmdt(td, uap) struct thread *td; struct shmdt_args *uap; { struct proc *p = td->td_proc; struct shmmap_state *shmmap_s; #ifdef MAC struct shmid_kernel *shmsegptr; #endif int i; int error = 0; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { error = EINVAL; goto done2; } for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1 && shmmap_s->va == (vm_offset_t)uap->shmaddr) { break; } } if (i == shminfo.shmseg) { error = EINVAL; goto done2; } #ifdef MAC shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)]; error = mac_check_sysv_shmdt(td->td_ucred, shmsegptr); if (error != 0) goto done2; #endif error = shm_delete_mapping(p->p_vmspace, shmmap_s); done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmat_args { int shmid; const void *shmaddr; int shmflg; }; #endif - int kern_shmat(td, shmid, shmaddr, shmflg) struct thread *td; int shmid; const void *shmaddr; int shmflg; { struct proc *p = td->td_proc; int i, flags; struct shmid_kernel *shmseg; struct shmmap_state *shmmap_s = NULL; vm_offset_t attach_va; vm_prot_t prot; vm_size_t size; int rv; int error = 0; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); for (i = 0; i < shminfo.shmseg; i++) shmmap_s[i].shmid = -1; p->p_vmspace->vm_shm = shmmap_s; } shmseg = shm_find_segment_by_shmid(shmid); if (shmseg == NULL) { error = EINVAL; goto done2; } error = ipcperm(td, &shmseg->u.shm_perm, (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); if (error) goto done2; #ifdef MAC error = mac_check_sysv_shmat(td->td_ucred, shmseg, shmflg); if (error != 0) goto done2; #endif for (i = 0; i < shminfo.shmseg; i++) { if (shmmap_s->shmid == -1) break; shmmap_s++; } if (i >= shminfo.shmseg) { error = EMFILE; goto done2; } size = round_page(shmseg->u.shm_segsz); #ifdef VM_PROT_READ_IS_EXEC prot = VM_PROT_READ | VM_PROT_EXECUTE; #else prot = VM_PROT_READ; #endif if ((shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; flags = MAP_ANON | MAP_SHARED; if (shmaddr) { flags |= MAP_FIXED; if (shmflg & SHM_RND) { attach_va = (vm_offset_t)shmaddr & ~(SHMLBA-1); } else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) { attach_va = (vm_offset_t)shmaddr; } else { error = EINVAL; goto done2; } } else { /* * This is just a hint to vm_map_find() about where to * put it. */ PROC_LOCK(p); attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(p, RLIMIT_DATA)); PROC_UNLOCK(p); } vm_object_reference(shmseg->u.shm_internal); rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->u.shm_internal, 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); if (rv != KERN_SUCCESS) { vm_object_deallocate(shmseg->u.shm_internal); error = ENOMEM; goto done2; } vm_map_inherit(&p->p_vmspace->vm_map, attach_va, attach_va + size, VM_INHERIT_SHARE); shmmap_s->va = attach_va; shmmap_s->shmid = shmid; shmseg->u.shm_lpid = p->p_pid; shmseg->u.shm_atime = time_second; shmseg->u.shm_nattch++; td->td_retval[0] = attach_va; done2: mtx_unlock(&Giant); return (error); } int shmat(td, uap) struct thread *td; struct shmat_args *uap; { return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg); } #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmid_ds { struct ipc_perm shm_perm; /* operation perms */ int shm_segsz; /* size of segment (bytes) */ u_short shm_cpid; /* pid, creator */ u_short shm_lpid; /* pid, last operation */ short shm_nattch; /* no. of current attaches */ time_t shm_atime; /* last attach time */ time_t shm_dtime; /* last detach time */ time_t shm_ctime; /* last change time */ void *shm_handle; /* internal handle for shm segment */ }; struct oshmctl_args { int shmid; int cmd; struct oshmid_ds *ubuf; }; - static int oshmctl(td, uap) struct thread *td; struct oshmctl_args *uap; { #ifdef COMPAT_43 int error = 0; struct shmid_kernel *shmseg; struct oshmid_ds outbuf; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); shmseg = shm_find_segment_by_shmid(uap->shmid); if (shmseg == NULL) { error = EINVAL; goto done2; } switch (uap->cmd) { case IPC_STAT: error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error) goto done2; #ifdef MAC error = mac_check_sysv_shmctl(td->td_ucred, shmseg, uap->cmd); if (error != 0) goto done2; #endif outbuf.shm_perm = shmseg->u.shm_perm; outbuf.shm_segsz = shmseg->u.shm_segsz; outbuf.shm_cpid = shmseg->u.shm_cpid; outbuf.shm_lpid = shmseg->u.shm_lpid; outbuf.shm_nattch = shmseg->u.shm_nattch; outbuf.shm_atime = shmseg->u.shm_atime; outbuf.shm_dtime = shmseg->u.shm_dtime; outbuf.shm_ctime = shmseg->u.shm_ctime; outbuf.shm_handle = shmseg->u.shm_internal; error = copyout(&outbuf, uap->ubuf, sizeof(outbuf)); if (error) goto done2; break; default: error = shmctl(td, (struct shmctl_args *)uap); break; } done2: mtx_unlock(&Giant); return (error); #else return (EINVAL); #endif } #endif #ifndef _SYS_SYSPROTO_H_ struct shmctl_args { int shmid; int cmd; struct shmid_ds *buf; }; #endif - int kern_shmctl(td, shmid, cmd, buf, bufsz) struct thread *td; int shmid; int cmd; void *buf; size_t *bufsz; { int error = 0; struct shmid_kernel *shmseg; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); switch (cmd) { case IPC_INFO: memcpy(buf, &shminfo, sizeof(shminfo)); if (bufsz) *bufsz = sizeof(shminfo); td->td_retval[0] = shmalloced; goto done2; case SHM_INFO: { struct shm_info shm_info; shm_info.used_ids = shm_nused; shm_info.shm_rss = 0; /*XXX where to get from ? */ shm_info.shm_tot = 0; /*XXX where to get from ? */ shm_info.shm_swp = 0; /*XXX where to get from ? */ shm_info.swap_attempts = 0; /*XXX where to get from ? */ shm_info.swap_successes = 0; /*XXX where to get from ? */ memcpy(buf, &shm_info, sizeof(shm_info)); if (bufsz) *bufsz = sizeof(shm_info); td->td_retval[0] = shmalloced; goto done2; } } if (cmd == SHM_STAT) shmseg = shm_find_segment_by_shmidx(shmid); else shmseg = shm_find_segment_by_shmid(shmid); if (shmseg == NULL) { error = EINVAL; goto done2; } #ifdef MAC error = mac_check_sysv_shmctl(td->td_ucred, shmseg, cmd); if (error != 0) goto done2; #endif switch (cmd) { case SHM_STAT: case IPC_STAT: error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error) goto done2; memcpy(buf, &shmseg->u, sizeof(struct shmid_ds)); if (bufsz) *bufsz = sizeof(struct shmid_ds); if (cmd == SHM_STAT) td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm); break; case IPC_SET: { struct shmid_ds *shmid; shmid = (struct shmid_ds *)buf; error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error) goto done2; shmseg->u.shm_perm.uid = shmid->shm_perm.uid; shmseg->u.shm_perm.gid = shmid->shm_perm.gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & ~ACCESSPERMS) | (shmid->shm_perm.mode & ACCESSPERMS); shmseg->u.shm_ctime = time_second; break; } case IPC_RMID: error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error) goto done2; shmseg->u.shm_perm.key = IPC_PRIVATE; shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->u.shm_nattch <= 0) { shm_deallocate_segment(shmseg); shm_last_free = IPCID_TO_IX(shmid); } break; #if 0 case SHM_LOCK: case SHM_UNLOCK: #endif default: error = EINVAL; break; } done2: mtx_unlock(&Giant); return (error); } int shmctl(td, uap) struct thread *td; struct shmctl_args *uap; { int error = 0; struct shmid_ds buf; size_t bufsz; /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds)))) goto done; } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: case SHM_INFO: case SHM_STAT: case IPC_STAT: error = copyout(&buf, uap->buf, bufsz); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmget_args { key_t key; size_t size; int shmflg; }; #endif - static int shmget_existing(td, uap, mode, segnum) struct thread *td; struct shmget_args *uap; int mode; int segnum; { struct shmid_kernel *shmseg; int error; shmseg = &shmsegs[segnum]; if (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) { /* * This segment is in the process of being allocated. Wait * until it's done, and look the key up again (in case the * allocation failed or it was freed). */ shmseg->u.shm_perm.mode |= SHMSEG_WANTED; error = tsleep(shmseg, PLOCK | PCATCH, "shmget", 0); if (error) return (error); return (EAGAIN); } if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return (EEXIST); #ifdef MAC error = mac_check_sysv_shmget(td->td_ucred, shmseg, uap->shmflg); if (error != 0) return (error); #endif if (uap->size && uap->size > shmseg->u.shm_segsz) return (EINVAL); td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } static int shmget_allocate_segment(td, uap, mode) struct thread *td; struct shmget_args *uap; int mode; { int i, segnum, shmid, size; struct ucred *cred = td->td_ucred; struct shmid_kernel *shmseg; vm_object_t shm_object; GIANT_REQUIRED; if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) return (EINVAL); if (shm_nused >= shminfo.shmmni) /* Any shmids left? */ return (ENOSPC); size = round_page(uap->size); if (shm_committed + btoc(size) > shminfo.shmall) return (ENOMEM); if (shm_last_free < 0) { shmrealloc(); /* Maybe expand the shmsegs[] array. */ for (i = 0; i < shmalloced; i++) if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE) break; if (i == shmalloced) return (ENOSPC); segnum = i; } else { segnum = shm_last_free; shm_last_free = -1; } shmseg = &shmsegs[segnum]; /* * In case we sleep in malloc(), mark the segment present but deleted * so that noone else tries to create the same key. */ shmseg->u.shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; shmseg->u.shm_perm.key = uap->key; shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff; shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); /* * We make sure that we have allocated a pager before we need * to. */ if (shm_use_phys) { shm_object = vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0); } else { shm_object = vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0); } VM_OBJECT_LOCK(shm_object); vm_object_clear_flag(shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shm_object, OBJ_NOSPLIT); VM_OBJECT_UNLOCK(shm_object); shmseg->u.shm_internal = shm_object; shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid; shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) | (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_segsz = uap->size; shmseg->u.shm_cpid = td->td_proc->p_pid; shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0; shmseg->u.shm_atime = shmseg->u.shm_dtime = 0; #ifdef MAC mac_create_sysv_shm(cred, shmseg); #endif shmseg->u.shm_ctime = time_second; shm_committed += btoc(size); shm_nused++; if (shmseg->u.shm_perm.mode & SHMSEG_WANTED) { /* * Somebody else wanted this key while we were asleep. Wake * them up now. */ shmseg->u.shm_perm.mode &= ~SHMSEG_WANTED; wakeup(shmseg); } td->td_retval[0] = shmid; return (0); } int shmget(td, uap) struct thread *td; struct shmget_args *uap; { int segnum, mode; int error; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); mode = uap->shmflg & ACCESSPERMS; if (uap->key != IPC_PRIVATE) { again: segnum = shm_find_segment_by_key(uap->key); if (segnum >= 0) { error = shmget_existing(td, uap, mode, segnum); if (error == EAGAIN) goto again; goto done2; } if ((uap->shmflg & IPC_CREAT) == 0) { error = ENOENT; goto done2; } } error = shmget_allocate_segment(td, uap, mode); done2: mtx_unlock(&Giant); return (error); } int shmsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct shmsys_args /* { int which; int a2; int a3; int a4; } */ *uap; { #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) int error; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) return (EINVAL); mtx_lock(&Giant); error = (*shmcalls[uap->which])(td, &uap->a2); mtx_unlock(&Giant); return (error); #else return (nosys(td, NULL)); #endif } static void shmfork_myhook(p1, p2) struct proc *p1, *p2; { struct shmmap_state *shmmap_s; size_t size; int i; mtx_lock(&Giant); size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); bcopy(p1->p_vmspace->vm_shm, shmmap_s, size); p2->p_vmspace->vm_shm = shmmap_s; for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) if (shmmap_s->shmid != -1) shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; mtx_unlock(&Giant); } static void shmexit_myhook(struct vmspace *vm) { struct shmmap_state *base, *shm; int i; if ((base = vm->vm_shm) != NULL) { vm->vm_shm = NULL; mtx_lock(&Giant); for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) { if (shm->shmid != -1) shm_delete_mapping(vm, shm); } mtx_unlock(&Giant); free(base, M_SHM); } } static void shmrealloc(void) { int i; struct shmid_kernel *newsegs; if (shmalloced >= shminfo.shmmni) return; newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK); if (newsegs == NULL) return; for (i = 0; i < shmalloced; i++) bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); for (; i < shminfo.shmmni; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_init_sysv_shm(&shmsegs[i]); #endif } free(shmsegs, M_SHM); shmsegs = newsegs; shmalloced = shminfo.shmmni; } static void shminit() { int i; TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall); for (i = PAGE_SIZE; i > 0; i--) { shminfo.shmmax = shminfo.shmall * i; if (shminfo.shmmax >= shminfo.shmall) break; } TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin); TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni); TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg); TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys); shmalloced = shminfo.shmmni; shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK); if (shmsegs == NULL) panic("cannot allocate initial memory for sysvshm"); for (i = 0; i < shmalloced; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_init_sysv_shm(&shmsegs[i]); #endif } shm_last_free = 0; shm_nused = 0; shm_committed = 0; shmexit_hook = &shmexit_myhook; shmfork_hook = &shmfork_myhook; } static int shmunload() { #ifdef MAC int i; #endif if (shm_nused > 0) return (EBUSY); #ifdef MAC for (i = 0; i < shmalloced; i++) mac_destroy_sysv_shm(&shmsegs[i]); #endif free(shmsegs, M_SHM); shmexit_hook = NULL; shmfork_hook = NULL; return (0); } static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0]))); } static int sysvshm_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: shminit(); break; case MOD_UNLOAD: error = shmunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvshm_mod = { "sysvshm", &sysvshm_modload, NULL }; SYSCALL_MODULE_HELPER(shmsys); SYSCALL_MODULE_HELPER(shmat); SYSCALL_MODULE_HELPER(shmctl); SYSCALL_MODULE_HELPER(shmdt); SYSCALL_MODULE_HELPER(shmget); DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST); MODULE_VERSION(sysvshm, 1); Index: head/sys/kern/uipc_mqueue.c =================================================================== --- head/sys/kern/uipc_mqueue.c (revision 167231) +++ head/sys/kern/uipc_mqueue.c (revision 167232) @@ -1,2496 +1,2484 @@ /*- * Copyright (c) 2005 David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; u_int32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { 1, NULL, filt_mqdetach, filt_mqread }; struct filterops mq_wfiltops = { 1, NULL, filt_mqdetach, filt_mqwrite }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_fetchadd_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { int old, exp; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) mqfs_destroy(node); } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_type = nodetype; node->mn_refcount = 1; getnanotime(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp, struct thread *td) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_MPSAFE; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags, struct thread *td) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); sx_xlock(&mqfs->mi_lock); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); sx_xunlock(&mqfs->mi_lock); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vrecycle(vp, curthread); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; int error; LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) break; } if (vd != NULL) { if (vget(vd->mv_vnode, 0, curthread) == 0) { *vpp = vd->mv_vnode; vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); return (0); } /* XXX if this can happen, we're in trouble */ } error = getnewvnode("mqueue", mp, &mqfs_vnodeops, vpp); if (error) return (error); vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len) { struct mqfs_node *pn; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { if (strncmp(pn->mn_name, name, len) == 0) return (pn); } return (NULL); } /* - * Look up a file or directory + * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0, cnp->cn_thread); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); return (error); } /* named node */ pn = mqfs_search(pd, pname, namelen); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); int rc; sx_xlock(&mqfs->mi_lock); rc = mqfs_lookupx(ap); sx_xunlock(&mqfs->mi_lock); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); #if 0 /* named node */ pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen); if (pn != NULL) { mqueue_free(mq); sx_xunlock(&mqfs->mi_lock); return (EEXIST); } #else if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); #endif pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) error = ENOSPC; else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } sx_xunlock(&mqfs->mi_lock); if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); /* * XXXRW: Other instances of the message queue primitive are * allowed in jail? */ if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp, ap->a_td); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_mode, ap->a_cred, NULL); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; VATTR_NULL(vap); vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; vap->va_bytes = 0; vap->va_filerev = 0; vap->va_vaflags = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check_cred(ap->a_td->td_ucred, PRIV_MQ_ADMIN, SUSER_ALLOWJAIL)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check_cred(ap->a_td->td_ucred, PRIV_MQ_ADMIN, SUSER_ALLOWJAIL))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqfs_node *pn; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); pn = VTON(vp); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_name[i] = 0; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); #if 0 /* named node */ pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen); if (pn != NULL) { sx_xunlock(&mqfs->mi_lock); return (EEXIST); } #else if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); #endif pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn == NULL) error = ENOSPC; else error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue", NULL, MTX_DEF); knlist_init(&mq->mq_rsel.si_note, &mq->mq_mutex, NULL, NULL, NULL); knlist_init(&mq->mq_wsel.si_note, &mq->mq_mutex, NULL, NULL, NULL); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); FREE(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; MALLOC(msg, struct mqueue_msg *, len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { FREE(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { FREE(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ets, ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; error = copyin(abs_timeout, &ets, sizeof(ets)); if (error != 0) goto bad; if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { ts2 = ets; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct proc *p; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; PROC_LOCK(p); if (!KSI_ONQ(&nt->nt_ksi)) psignal_event(p, &nt->nt_sigev, &nt->nt_ksi); PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ets, ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); error = copyin(abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { ts2 = ets; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } /* - * Syscall to open a message queue + * Syscall to open a message queue. */ int kmq_open(struct thread *td, struct kmq_open_args *uap) { char path[MQFS_NAMELEN + 1]; struct mq_attr attr, *pattr; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, flags, cmode; if ((uap->flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); fdp = td->td_proc->p_fd; flags = FFLAGS(uap->flags); cmode = (((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) && (uap->attr != NULL)) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); if (attr.mq_maxmsg <= 0 || attr.mq_maxmsg > maxmsg) return (EINVAL); if (attr.mq_msgsize <= 0 || attr.mq_msgsize > maxmsgsize) return (EINVAL); pattr = &attr; } else pattr = NULL; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL) return (EINVAL); error = falloc(td, &fp, &fd); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(pattr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { int acc_mode = 0; if (flags & FREAD) acc_mode |= VREAD; if (flags & FWRITE) acc_mode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, acc_mode, td->td_ucred, NULL); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(fdp, fp, fd, td); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); FILE_LOCK(fp); fp->f_flag = (flags & (FREAD | FWRITE | O_NONBLOCK)); fp->f_type = DTYPE_MQUEUE; fp->f_ops = &mqueueops; fp->f_data = pn; FILE_UNLOCK(fp); FILEDESC_LOCK_FAST(fdp); if (fdp->fd_ofiles[fd] == fp) fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_UNLOCK_FAST(fdp); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* - * Syscall to unlink a message queue + * Syscall to unlink a message queue. */ int kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL) return (EINVAL); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget_write, fpp, ppn, pmq); } -/* - * Syscall - */ int kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mqueue *mq; struct file *fp; struct mq_attr attr, oattr; int error; if (uap->attr) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); if (attr.mq_flags & ~O_NONBLOCK) return (EINVAL); } error = getmq(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); oattr.mq_maxmsg = mq->mq_maxmsg; oattr.mq_msgsize = mq->mq_msgsize; oattr.mq_curmsgs = mq->mq_curmsgs; FILE_LOCK(fp); oattr.mq_flags = (O_NONBLOCK & fp->f_flag); if (uap->attr) { fp->f_flag &= ~O_NONBLOCK; fp->f_flag |= (attr.mq_flags & O_NONBLOCK); } FILE_UNLOCK(fp); fdrop(fp, td); if (uap->oattr) error = copyout(&oattr, uap->oattr, sizeof(oattr)); return (error); } -/* - * Syscall - */ int kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; int error; int waitok; error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, uap->abs_timeout); fdrop(fp, td); return (error); } -/* - * Syscall - */ int kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; int error, waitok; error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, uap->abs_timeout); fdrop(fp, td); return (error); } -/* - * Syscall - */ int kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev; struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp; struct mqueue_notifier *nt, *newnt = NULL; int error; p = td->td_proc; fdp = td->td_proc->p_fd; if (uap->sigev) { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error) return (error); if (ev.sigev_notify != SIGEV_SIGNAL && ev.sigev_notify != SIGEV_THREAD_ID && ev.sigev_notify != SIGEV_NONE) return (EINVAL); if ((ev.sigev_notify == SIGEV_SIGNAL || ev.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(ev.sigev_signo)) return (EINVAL); } error = getmq(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_LOCK_FAST(fdp); if (fget_locked(fdp, uap->mqd) != fp) { FILEDESC_UNLOCK_FAST(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_UNLOCK_FAST(fdp); if (uap->sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, uap->mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = uap->mqd; notifier_insert(p, nt); } nt->nt_sigev = ev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, uap->mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct filedesc *fdp; struct mqueue *mq; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_LOCK_FAST(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_UNLOCK_FAST(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } static int mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } static int mqf_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; FILE_LOCK(fp); fp->f_ops = &badfileops; FILE_UNLOCK(fp); pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); st->st_atimespec = pn->mn_atime; st->st_mtimespec = pn->mn_mtime; st->st_ctimespec = pn->mn_ctime; st->st_birthtimespec = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; return (0); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static struct fileops mqueueops = { .fo_read = mqf_read, .fo_write = mqf_write, .fo_ioctl = mqf_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; SYSCALL_MODULE_HELPER(kmq_open); SYSCALL_MODULE_HELPER(kmq_setattr); SYSCALL_MODULE_HELPER(kmq_timedsend); SYSCALL_MODULE_HELPER(kmq_timedreceive); SYSCALL_MODULE_HELPER(kmq_notify); SYSCALL_MODULE_HELPER(kmq_unlink); VFS_SET(mqfs_vfsops, mqueuefs, VFCF_SYNTHETIC); MODULE_VERSION(mqueuefs, 1); Index: head/sys/kern/uipc_sem.c =================================================================== --- head/sys/kern/uipc_sem.c (revision 167231) +++ head/sys/kern/uipc_sem.c (revision 167232) @@ -1,1011 +1,1008 @@ /*- * Copyright (c) 2002 Alfred Perlstein * Copyright (c) 2003-2005 SPARTA, Inc. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int sem_count_proc(struct proc *p); static struct ksem *sem_lookup_byname(const char *name); static int sem_create(struct thread *td, const char *name, struct ksem **ksret, mode_t mode, unsigned int value); static void sem_free(struct ksem *ksnew); static int sem_perm(struct thread *td, struct ksem *ks); static void sem_enter(struct proc *p, struct ksem *ks); static int sem_leave(struct proc *p, struct ksem *ks); static void sem_exechook(void *arg, struct proc *p, struct image_params *imgp); static void sem_exithook(void *arg, struct proc *p); static void sem_forkhook(void *arg, struct proc *p1, struct proc *p2, int flags); static int sem_hasopen(struct thread *td, struct ksem *ks); static int kern_sem_close(struct thread *td, semid_t id); static int kern_sem_post(struct thread *td, semid_t id); static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime); static int kern_sem_init(struct thread *td, int dir, unsigned int value, semid_t *idp); static int kern_sem_open(struct thread *td, int dir, const char *name, int oflag, mode_t mode, unsigned int value, semid_t *idp); static int kern_sem_unlink(struct thread *td, const char *name); #ifndef SEM_MAX #define SEM_MAX 30 #endif #define SEM_MAX_NAMELEN 14 #define SEM_TO_ID(x) ((intptr_t)(x)) #define ID_TO_SEM(x) id_to_sem(x) /* * available semaphores go here, this includes sem_init and any semaphores * created via sem_open that have not yet been unlinked. */ LIST_HEAD(, ksem) ksem_head = LIST_HEAD_INITIALIZER(&ksem_head); /* * semaphores still in use but have been sem_unlink()'d go here. */ LIST_HEAD(, ksem) ksem_deadhead = LIST_HEAD_INITIALIZER(&ksem_deadhead); static struct mtx sem_lock; static MALLOC_DEFINE(M_SEM, "sems", "semaphore data"); static int nsems = 0; SYSCTL_DECL(_p1003_1b); SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, ""); static eventhandler_tag sem_exit_tag, sem_exec_tag, sem_fork_tag; #ifdef SEM_DEBUG #define DP(x) printf x #else #define DP(x) #endif static __inline void sem_ref(struct ksem *ks) { mtx_assert(&sem_lock, MA_OWNED); ks->ks_ref++; DP(("sem_ref: ks = %p, ref = %d\n", ks, ks->ks_ref)); } static __inline void sem_rel(struct ksem *ks) { mtx_assert(&sem_lock, MA_OWNED); DP(("sem_rel: ks = %p, ref = %d\n", ks, ks->ks_ref - 1)); if (--ks->ks_ref == 0) sem_free(ks); } static __inline struct ksem *id_to_sem(semid_t id); static __inline struct ksem * id_to_sem(semid_t id) { struct ksem *ks; mtx_assert(&sem_lock, MA_OWNED); DP(("id_to_sem: id = %0x,%p\n", id, (struct ksem *)id)); LIST_FOREACH(ks, &ksem_head, ks_entry) { DP(("id_to_sem: ks = %p\n", ks)); if (ks == (struct ksem *)id) return (ks); } return (NULL); } static struct ksem * sem_lookup_byname(const char *name) { struct ksem *ks; mtx_assert(&sem_lock, MA_OWNED); LIST_FOREACH(ks, &ksem_head, ks_entry) if (ks->ks_name != NULL && strcmp(ks->ks_name, name) == 0) return (ks); return (NULL); } static int sem_create(struct thread *td, const char *name, struct ksem **ksret, mode_t mode, unsigned int value) { struct ksem *ret; struct proc *p; struct ucred *uc; size_t len; int error; DP(("sem_create\n")); p = td->td_proc; uc = td->td_ucred; if (value > SEM_VALUE_MAX) return (EINVAL); ret = malloc(sizeof(*ret), M_SEM, M_WAITOK | M_ZERO); if (name != NULL) { len = strlen(name); if (len > SEM_MAX_NAMELEN) { free(ret, M_SEM); return (ENAMETOOLONG); } /* name must start with a '/' but not contain one. */ if (*name != '/' || len < 2 || index(name + 1, '/') != NULL) { free(ret, M_SEM); return (EINVAL); } ret->ks_name = malloc(len + 1, M_SEM, M_WAITOK); strcpy(ret->ks_name, name); } else { ret->ks_name = NULL; } ret->ks_mode = mode; ret->ks_value = value; ret->ks_ref = 1; ret->ks_waiters = 0; ret->ks_uid = uc->cr_uid; ret->ks_gid = uc->cr_gid; ret->ks_onlist = 0; cv_init(&ret->ks_cv, "sem"); LIST_INIT(&ret->ks_users); #ifdef MAC mac_init_posix_sem(ret); mac_create_posix_sem(uc, ret); #endif if (name != NULL) sem_enter(td->td_proc, ret); *ksret = ret; mtx_lock(&sem_lock); if (nsems >= p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX)) { sem_leave(td->td_proc, ret); sem_free(ret); error = ENFILE; } else { nsems++; error = 0; } mtx_unlock(&sem_lock); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_init_args { unsigned int value; semid_t *idp; }; int ksem_init(struct thread *td, struct ksem_init_args *uap); #endif int ksem_init(struct thread *td, struct ksem_init_args *uap) { int error; error = kern_sem_init(td, UIO_USERSPACE, uap->value, uap->idp); return (error); } static int kern_sem_init(struct thread *td, int dir, unsigned int value, semid_t *idp) { struct ksem *ks; semid_t id; int error; error = sem_create(td, NULL, &ks, S_IRWXU | S_IRWXG, value); if (error) return (error); id = SEM_TO_ID(ks); if (dir == UIO_USERSPACE) { error = copyout(&id, idp, sizeof(id)); if (error) { mtx_lock(&sem_lock); sem_rel(ks); mtx_unlock(&sem_lock); return (error); } } else { *idp = id; } mtx_lock(&sem_lock); LIST_INSERT_HEAD(&ksem_head, ks, ks_entry); ks->ks_onlist = 1; mtx_unlock(&sem_lock); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_open_args { char *name; int oflag; mode_t mode; unsigned int value; semid_t *idp; }; int ksem_open(struct thread *td, struct ksem_open_args *uap); #endif int ksem_open(struct thread *td, struct ksem_open_args *uap) { char name[SEM_MAX_NAMELEN + 1]; size_t done; int error; error = copyinstr(uap->name, name, SEM_MAX_NAMELEN + 1, &done); if (error) return (error); DP((">>> sem_open start\n")); error = kern_sem_open(td, UIO_USERSPACE, name, uap->oflag, uap->mode, uap->value, uap->idp); DP(("<<< sem_open end\n")); return (error); } static int kern_sem_open(struct thread *td, int dir, const char *name, int oflag, mode_t mode, unsigned int value, semid_t *idp) { struct ksem *ksnew, *ks; int error; semid_t id; ksnew = NULL; mtx_lock(&sem_lock); ks = sem_lookup_byname(name); /* * If we found it but O_EXCL is set, error. */ if (ks != NULL && (oflag & O_EXCL) != 0) { mtx_unlock(&sem_lock); return (EEXIST); } /* * If we didn't find it... */ if (ks == NULL) { /* * didn't ask for creation? error. */ if ((oflag & O_CREAT) == 0) { mtx_unlock(&sem_lock); return (ENOENT); } /* * We may block during creation, so drop the lock. */ mtx_unlock(&sem_lock); error = sem_create(td, name, &ksnew, mode, value); if (error != 0) return (error); id = SEM_TO_ID(ksnew); if (dir == UIO_USERSPACE) { DP(("about to copyout! %d to %p\n", id, idp)); error = copyout(&id, idp, sizeof(id)); if (error) { mtx_lock(&sem_lock); sem_leave(td->td_proc, ksnew); sem_rel(ksnew); mtx_unlock(&sem_lock); return (error); } } else { DP(("about to set! %d to %p\n", id, idp)); *idp = id; } /* * We need to make sure we haven't lost a race while * allocating during creation. */ mtx_lock(&sem_lock); ks = sem_lookup_byname(name); if (ks != NULL) { /* we lost... */ sem_leave(td->td_proc, ksnew); sem_rel(ksnew); /* we lost and we can't loose... */ if ((oflag & O_EXCL) != 0) { mtx_unlock(&sem_lock); return (EEXIST); } } else { DP(("sem_create: about to add to list...\n")); LIST_INSERT_HEAD(&ksem_head, ksnew, ks_entry); DP(("sem_create: setting list bit...\n")); ksnew->ks_onlist = 1; DP(("sem_create: done, about to unlock...\n")); } } else { #ifdef MAC error = mac_check_posix_sem_open(td->td_ucred, ks); if (error) goto err_open; #endif /* * if we aren't the creator, then enforce permissions. */ error = sem_perm(td, ks); if (error) goto err_open; sem_ref(ks); mtx_unlock(&sem_lock); id = SEM_TO_ID(ks); if (dir == UIO_USERSPACE) { error = copyout(&id, idp, sizeof(id)); if (error) { mtx_lock(&sem_lock); sem_rel(ks); mtx_unlock(&sem_lock); return (error); } } else { *idp = id; } sem_enter(td->td_proc, ks); mtx_lock(&sem_lock); sem_rel(ks); } err_open: mtx_unlock(&sem_lock); return (error); } static int sem_perm(struct thread *td, struct ksem *ks) { struct ucred *uc; /* * XXXRW: This permission routine appears to be incorrect. If the * user matches, we shouldn't go on to the group if the user * permissions don't allow the action? Not changed for now. To fix, * change from a series of if (); if (); to if () else if () else... */ uc = td->td_ucred; DP(("sem_perm: uc(%d,%d) ks(%d,%d,%o)\n", uc->cr_uid, uc->cr_gid, ks->ks_uid, ks->ks_gid, ks->ks_mode)); if ((uc->cr_uid == ks->ks_uid) && (ks->ks_mode & S_IWUSR) != 0) return (0); if ((uc->cr_gid == ks->ks_gid) && (ks->ks_mode & S_IWGRP) != 0) return (0); if ((ks->ks_mode & S_IWOTH) != 0) return (0); return (priv_check(td, PRIV_SEM_WRITE)); } static void sem_free(struct ksem *ks) { nsems--; if (ks->ks_onlist) LIST_REMOVE(ks, ks_entry); if (ks->ks_name != NULL) free(ks->ks_name, M_SEM); cv_destroy(&ks->ks_cv); free(ks, M_SEM); } static __inline struct kuser *sem_getuser(struct proc *p, struct ksem *ks); static __inline struct kuser * sem_getuser(struct proc *p, struct ksem *ks) { struct kuser *k; LIST_FOREACH(k, &ks->ks_users, ku_next) if (k->ku_pid == p->p_pid) return (k); return (NULL); } static int sem_hasopen(struct thread *td, struct ksem *ks) { return ((ks->ks_name == NULL && sem_perm(td, ks) == 0) || sem_getuser(td->td_proc, ks) != NULL); } static int sem_leave(struct proc *p, struct ksem *ks) { struct kuser *k; DP(("sem_leave: ks = %p\n", ks)); k = sem_getuser(p, ks); DP(("sem_leave: ks = %p, k = %p\n", ks, k)); if (k != NULL) { LIST_REMOVE(k, ku_next); sem_rel(ks); DP(("sem_leave: about to free k\n")); free(k, M_SEM); DP(("sem_leave: returning\n")); return (0); } return (EINVAL); } static void sem_enter(p, ks) struct proc *p; struct ksem *ks; { struct kuser *ku, *k; ku = malloc(sizeof(*ku), M_SEM, M_WAITOK); ku->ku_pid = p->p_pid; mtx_lock(&sem_lock); k = sem_getuser(p, ks); if (k != NULL) { mtx_unlock(&sem_lock); free(ku, M_TEMP); return; } LIST_INSERT_HEAD(&ks->ks_users, ku, ku_next); sem_ref(ks); mtx_unlock(&sem_lock); } #ifndef _SYS_SYSPROTO_H_ struct ksem_unlink_args { char *name; }; int ksem_unlink(struct thread *td, struct ksem_unlink_args *uap); #endif - int ksem_unlink(struct thread *td, struct ksem_unlink_args *uap) { char name[SEM_MAX_NAMELEN + 1]; size_t done; int error; error = copyinstr(uap->name, name, SEM_MAX_NAMELEN + 1, &done); return (error ? error : kern_sem_unlink(td, name)); } static int kern_sem_unlink(struct thread *td, const char *name) { struct ksem *ks; int error; mtx_lock(&sem_lock); ks = sem_lookup_byname(name); if (ks != NULL) { #ifdef MAC error = mac_check_posix_sem_unlink(td->td_ucred, ks); if (error) { mtx_unlock(&sem_lock); return (error); } #endif error = sem_perm(td, ks); } else error = ENOENT; DP(("sem_unlink: '%s' ks = %p, error = %d\n", name, ks, error)); if (error == 0) { LIST_REMOVE(ks, ks_entry); LIST_INSERT_HEAD(&ksem_deadhead, ks, ks_entry); sem_rel(ks); } mtx_unlock(&sem_lock); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_close_args { semid_t id; }; int ksem_close(struct thread *td, struct ksem_close_args *uap); #endif - int ksem_close(struct thread *td, struct ksem_close_args *uap) { return (kern_sem_close(td, uap->id)); } static int kern_sem_close(struct thread *td, semid_t id) { struct ksem *ks; int error; error = EINVAL; mtx_lock(&sem_lock); ks = ID_TO_SEM(id); /* this is not a valid operation for unnamed sems */ if (ks != NULL && ks->ks_name != NULL) error = sem_leave(td->td_proc, ks); mtx_unlock(&sem_lock); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_post_args { semid_t id; }; int ksem_post(struct thread *td, struct ksem_post_args *uap); #endif int ksem_post(struct thread *td, struct ksem_post_args *uap) { return (kern_sem_post(td, uap->id)); } static int kern_sem_post(struct thread *td, semid_t id) { struct ksem *ks; int error; mtx_lock(&sem_lock); ks = ID_TO_SEM(id); if (ks == NULL || !sem_hasopen(td, ks)) { error = EINVAL; goto err; } #ifdef MAC error = mac_check_posix_sem_post(td->td_ucred, ks); if (error) goto err; #endif if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto err; } ++ks->ks_value; if (ks->ks_waiters > 0) cv_signal(&ks->ks_cv); error = 0; err: mtx_unlock(&sem_lock); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_wait_args { semid_t id; }; int ksem_wait(struct thread *td, struct ksem_wait_args *uap); #endif - int ksem_wait(struct thread *td, struct ksem_wait_args *uap) { return (kern_sem_wait(td, uap->id, 0, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_timedwait_args { semid_t id; const struct timespec *abstime; }; int ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap); #endif int ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap) { struct timespec abstime; struct timespec *ts; int error; /* We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime, sizeof(abstime)); if (error != 0) return (error); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_trywait_args { semid_t id; }; int ksem_trywait(struct thread *td, struct ksem_trywait_args *uap); #endif int ksem_trywait(struct thread *td, struct ksem_trywait_args *uap) { return (kern_sem_wait(td, uap->id, 1, NULL)); } static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime) { struct timespec ts1, ts2; struct timeval tv; struct ksem *ks; int error; DP((">>> kern_sem_wait entered!\n")); mtx_lock(&sem_lock); ks = ID_TO_SEM(id); if (ks == NULL) { DP(("kern_sem_wait ks == NULL\n")); error = EINVAL; goto err; } sem_ref(ks); if (!sem_hasopen(td, ks)) { DP(("kern_sem_wait hasopen failed\n")); error = EINVAL; goto err; } #ifdef MAC error = mac_check_posix_sem_wait(td->td_ucred, ks); if (error) { DP(("kern_sem_wait mac failed\n")); goto err; } #endif DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag)); if (ks->ks_value == 0) { ks->ks_waiters++; if (tryflag != 0) error = EAGAIN; else if (abstime == NULL) error = cv_wait_sig(&ks->ks_cv, &sem_lock); else { for (;;) { ts1 = *abstime; getnanotime(&ts2); timespecsub(&ts1, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts1); if (tv.tv_sec < 0) { error = ETIMEDOUT; break; } error = cv_timedwait_sig(&ks->ks_cv, &sem_lock, tvtohz(&tv)); if (error != EWOULDBLOCK) break; } } ks->ks_waiters--; if (error) goto err; } ks->ks_value--; error = 0; err: if (ks != NULL) sem_rel(ks); mtx_unlock(&sem_lock); DP(("<<< kern_sem_wait leaving, error = %d\n", error)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_getvalue_args { semid_t id; int *val; }; int ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap); #endif int ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap) { struct ksem *ks; int error, val; mtx_lock(&sem_lock); ks = ID_TO_SEM(uap->id); if (ks == NULL || !sem_hasopen(td, ks)) { mtx_unlock(&sem_lock); return (EINVAL); } #ifdef MAC error = mac_check_posix_sem_getvalue(td->td_ucred, ks); if (error) { mtx_unlock(&sem_lock); return (error); } #endif val = ks->ks_value; mtx_unlock(&sem_lock); error = copyout(&val, uap->val, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_destroy_args { semid_t id; }; int ksem_destroy(struct thread *td, struct ksem_destroy_args *uap); #endif int ksem_destroy(struct thread *td, struct ksem_destroy_args *uap) { struct ksem *ks; int error; mtx_lock(&sem_lock); ks = ID_TO_SEM(uap->id); if (ks == NULL || !sem_hasopen(td, ks) || ks->ks_name != NULL) { error = EINVAL; goto err; } #ifdef MAC error = mac_check_posix_sem_destroy(td->td_ucred, ks); if (error) goto err; #endif if (ks->ks_waiters != 0) { error = EBUSY; goto err; } sem_rel(ks); error = 0; err: mtx_unlock(&sem_lock); return (error); } /* * Count the number of kusers associated with a proc, so as to guess at how * many to allocate when forking. */ static int sem_count_proc(struct proc *p) { struct ksem *ks; struct kuser *ku; int count; mtx_assert(&sem_lock, MA_OWNED); count = 0; LIST_FOREACH(ks, &ksem_head, ks_entry) { LIST_FOREACH(ku, &ks->ks_users, ku_next) { if (ku->ku_pid == p->p_pid) count++; } } LIST_FOREACH(ks, &ksem_deadhead, ks_entry) { LIST_FOREACH(ku, &ks->ks_users, ku_next) { if (ku->ku_pid == p->p_pid) count++; } } return (count); } /* * When a process forks, the child process must gain a reference to each open * semaphore in the parent process, whether it is unlinked or not. This * requires allocating a kuser structure for each semaphore reference in the * new process. Because the set of semaphores in the parent can change while * the fork is in progress, we have to handle races -- first we attempt to * allocate enough storage to acquire references to each of the semaphores, * then we enter the semaphores and release the temporary references. */ static void sem_forkhook(void *arg, struct proc *p1, struct proc *p2, int flags) { struct ksem *ks, **sem_array; int count, i, new_count; struct kuser *ku; mtx_lock(&sem_lock); count = sem_count_proc(p1); if (count == 0) { mtx_unlock(&sem_lock); return; } race_lost: mtx_assert(&sem_lock, MA_OWNED); mtx_unlock(&sem_lock); sem_array = malloc(sizeof(struct ksem *) * count, M_TEMP, M_WAITOK); mtx_lock(&sem_lock); new_count = sem_count_proc(p1); if (count < new_count) { /* Lost race, repeat and allocate more storage. */ free(sem_array, M_TEMP); count = new_count; goto race_lost; } /* * Given an array capable of storing an adequate number of semaphore * references, now walk the list of semaphores and acquire a new * reference for any semaphore opened by p1. */ count = new_count; i = 0; LIST_FOREACH(ks, &ksem_head, ks_entry) { LIST_FOREACH(ku, &ks->ks_users, ku_next) { if (ku->ku_pid == p1->p_pid) { sem_ref(ks); sem_array[i] = ks; i++; break; } } } LIST_FOREACH(ks, &ksem_deadhead, ks_entry) { LIST_FOREACH(ku, &ks->ks_users, ku_next) { if (ku->ku_pid == p1->p_pid) { sem_ref(ks); sem_array[i] = ks; i++; break; } } } mtx_unlock(&sem_lock); KASSERT(i == count, ("sem_forkhook: i != count (%d, %d)", i, count)); /* * Now cause p2 to enter each of the referenced semaphores, then * release our temporary reference. This is pretty inefficient. * Finally, free our temporary array. */ for (i = 0; i < count; i++) { sem_enter(p2, sem_array[i]); mtx_lock(&sem_lock); sem_rel(sem_array[i]); mtx_unlock(&sem_lock); } free(sem_array, M_TEMP); } static void sem_exechook(void *arg, struct proc *p, struct image_params *imgp __unused) { sem_exithook(arg, p); } static void sem_exithook(void *arg, struct proc *p) { struct ksem *ks, *ksnext; mtx_lock(&sem_lock); ks = LIST_FIRST(&ksem_head); while (ks != NULL) { ksnext = LIST_NEXT(ks, ks_entry); sem_leave(p, ks); ks = ksnext; } ks = LIST_FIRST(&ksem_deadhead); while (ks != NULL) { ksnext = LIST_NEXT(ks, ks_entry); sem_leave(p, ks); ks = ksnext; } mtx_unlock(&sem_lock); } static int sem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: mtx_init(&sem_lock, "sem", "semaphore", MTX_DEF); p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX); p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX); sem_exit_tag = EVENTHANDLER_REGISTER(process_exit, sem_exithook, NULL, EVENTHANDLER_PRI_ANY); sem_exec_tag = EVENTHANDLER_REGISTER(process_exec, sem_exechook, NULL, EVENTHANDLER_PRI_ANY); sem_fork_tag = EVENTHANDLER_REGISTER(process_fork, sem_forkhook, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: if (nsems != 0) { error = EOPNOTSUPP; break; } EVENTHANDLER_DEREGISTER(process_exit, sem_exit_tag); EVENTHANDLER_DEREGISTER(process_exec, sem_exec_tag); EVENTHANDLER_DEREGISTER(process_fork, sem_fork_tag); mtx_destroy(&sem_lock); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sem_mod = { "sem", &sem_modload, NULL }; SYSCALL_MODULE_HELPER(ksem_init); SYSCALL_MODULE_HELPER(ksem_open); SYSCALL_MODULE_HELPER(ksem_unlink); SYSCALL_MODULE_HELPER(ksem_close); SYSCALL_MODULE_HELPER(ksem_post); SYSCALL_MODULE_HELPER(ksem_wait); SYSCALL_MODULE_HELPER(ksem_timedwait); SYSCALL_MODULE_HELPER(ksem_trywait); SYSCALL_MODULE_HELPER(ksem_getvalue); SYSCALL_MODULE_HELPER(ksem_destroy); DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sem, 1); Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c (revision 167231) +++ head/sys/kern/uipc_syscalls.c (revision 167232) @@ -1,2689 +1,2687 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * sendfile(2) and related extensions: * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_sctp.h" #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif /* SCTP */ static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, struct accept_args *uap, int compat); static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); /* * NSFBUFS-related variables and associated sysctls */ int nsfbufs; int nsfbufspeak; int nsfbufsused; SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, "Maximum number of sendfile(2) sf_bufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, "Number of sendfile(2) sf_bufs at peak usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, "Number of sendfile(2) sf_bufs in use"); /* * Convert a user file descriptor to a kernel file entry. A reference on the * file entry is held upon returning. This is lighter weight than * fgetsock(), which bumps the socket reference drops the file reference * count instead, as this approach avoids several additional mutex operations * associated with the additional reference count. If requested, return the * open file flags. */ static int getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp) { struct file *fp; int error; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_LOCK_FAST(fdp); fp = fget_locked(fdp, fd); if (fp == NULL) error = EBADF; else if (fp->f_type != DTYPE_SOCKET) { fp = NULL; error = ENOTSOCK; } else { fhold(fp); if (fflagp != NULL) *fflagp = fp->f_flag; error = 0; } FILEDESC_UNLOCK_FAST(fdp); } *fpp = fp; return (error); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int socket(td, uap) struct thread *td; register struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; #ifdef MAC error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type, uap->protocol); if (error) return (error); #endif fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ NET_LOCK_GIANT(); error = socreate(uap->domain, &so, uap->type, uap->protocol, td->td_ucred, td); NET_UNLOCK_GIANT(); if (error) { fdclose(fdp, fp, fd, td); } else { FILEDESC_LOCK_FAST(fdp); fp->f_data = so; /* already has ref count */ fp->f_flag = FREAD|FWRITE; fp->f_ops = &socketops; fp->f_type = DTYPE_SOCKET; FILEDESC_UNLOCK_FAST(fdp); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } /* ARGSUSED */ int bind(td, uap) struct thread *td; register struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) return (error); error = kern_bind(td, uap->s, sa); free(sa, M_SONAME); return (error); } int kern_bind(td, fd, sa) struct thread *td; int fd; struct sockaddr *sa; { struct socket *so; struct file *fp; int error; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) goto done2; so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_bind(td->td_ucred, so, sa); SOCK_UNLOCK(so); if (error) goto done1; #endif error = sobind(so, sa, td); #ifdef MAC done1: #endif fdrop(fp, td); done2: NET_UNLOCK_GIANT(); return (error); } /* ARGSUSED */ int listen(td, uap) struct thread *td; register struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; struct file *fp; int error; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_listen(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto done; #endif error = solisten(so, uap->backlog, td); #ifdef MAC done: #endif fdrop(fp, td); } NET_UNLOCK_GIANT(); return(error); } /* * accept1() */ static int accept1(td, uap, compat) struct thread *td; register struct accept_args /* { int s; struct sockaddr * __restrict name; socklen_t * __restrict anamelen; } */ *uap; int compat; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uap->name == NULL) return (kern_accept(td, uap->s, NULL, NULL, NULL)); error = copyin(uap->anamelen, &namelen, sizeof (namelen)); if (error) return (error); error = kern_accept(td, uap->s, &name, &namelen, &fp); /* * return a namelen of zero for older code which might * ignore the return value from accept. */ if (error) { (void) copyout(&namelen, uap->anamelen, sizeof(*uap->anamelen)); return (error); } if (error == 0 && name != NULL) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uap->name, namelen); } if (error == 0) error = copyout(&namelen, uap->anamelen, sizeof(namelen)); if (error) fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { struct filedesc *fdp; struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; int error; struct socket *head, *so; int fd; u_int fflag; pid_t pgid; int tmp; if (name) { *name = NULL; if (*namelen < 0) return (EINVAL); } fdp = td->td_proc->p_fd; NET_LOCK_GIANT(); error = getsock(fdp, s, &headfp, &fflag); if (error) goto done2; head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC SOCK_LOCK(head); error = mac_check_socket_accept(td->td_ucred, head); SOCK_UNLOCK(head); if (error != 0) goto done; #endif error = falloc(td, &nfp, &fd); if (error) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); FILE_LOCK(nfp); nfp->f_data = so; /* nfp has ref count from falloc */ nfp->f_flag = fflag; nfp->f_ops = &socketops; nfp->f_type = DTYPE_SOCKET; FILE_UNLOCK(nfp); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; error = soaccept(so, &sa); if (error) { /* * return a namelen of zero for older code which might * ignore the return value from accept. */ if (name) *namelen = 0; goto noconnection; } if (sa == NULL) { if (name) *namelen = 0; goto done; } if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; *name = sa; sa = NULL; } noconnection: if (sa) FREE(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); done2: NET_UNLOCK_GIANT(); return (error); } int accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* ARGSUSED */ int connect(td, uap) struct thread *td; register struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error) return (error); error = kern_connect(td, uap->s, sa); free(sa, M_SONAME); return (error); } int kern_connect(td, fd, sa) struct thread *td; int fd; struct sockaddr *sa; { struct socket *so; struct file *fp; int error; int interrupted = 0; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) goto done2; so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_connect(td->td_ucred, so, sa); SOCK_UNLOCK(so); if (error) goto bad; #endif error = soconnect(so, sa, td); if (error) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); if (error) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); done2: NET_UNLOCK_GIANT(); return (error); } int socketpair(td, uap) struct thread *td; register struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type, uap->protocol); if (error) return (error); #endif NET_LOCK_GIANT(); error = socreate(uap->domain, &so1, uap->type, uap->protocol, td->td_ucred, td); if (error) goto done2; error = socreate(uap->domain, &so2, uap->type, uap->protocol, td->td_ucred, td); if (error) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd); if (error) goto free2; sv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd); if (error) goto free3; fp2->f_data = so2; /* so2 already has ref count */ sv[1] = fd; error = soconnect2(so1, so2); if (error) goto free4; if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error) goto free4; } FILE_LOCK(fp1); fp1->f_flag = FREAD|FWRITE; fp1->f_ops = &socketops; fp1->f_type = DTYPE_SOCKET; FILE_UNLOCK(fp1); FILE_LOCK(fp2); fp2->f_flag = FREAD|FWRITE; fp2->f_ops = &socketops; fp2->f_type = DTYPE_SOCKET; FILE_UNLOCK(fp2); error = copyout(sv, uap->rsv, 2 * sizeof (int)); fdrop(fp1, td); fdrop(fp2, td); goto done2; free4: fdclose(fdp, fp2, sv[1], td); fdrop(fp2, td); free3: fdclose(fdp, fp1, sv[0], td); fdrop(fp1, td); free2: (void)soclose(so2); free1: (void)soclose(so1); done2: NET_UNLOCK_GIANT(); return (error); } static int sendit(td, s, mp, flags) register struct thread *td; int s; register struct msghdr *mp; int flags; { struct mbuf *control; struct sockaddr *to; int error; if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { register struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_TRYWAIT); if (control == 0) { error = ENOBUFS; goto bad; } else { cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: if (to) FREE(to, M_SONAME); return (error); } int kern_sendit(td, s, mp, flags, control, segflg) struct thread *td; int s; struct msghdr *mp; int flags; struct mbuf *control; enum uio_seg segflg; { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; int i; int len, error; #ifdef KTRACE struct uio *ktruio = NULL; #endif NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error) goto bad2; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto bad; #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); bad2: NET_UNLOCK_GIANT(); return (error); } int sendto(td, uap) struct thread *td; register struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; error = sendit(td, uap->s, &msg, uap->flags); return (error); } #ifdef COMPAT_OLDSOCK int osend(td, uap) struct thread *td; register struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; error = sendit(td, uap->s, &msg, uap->flags); return (error); } int osendmsg(td, uap) struct thread *td; struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sendmsg(td, uap) struct thread *td; struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(td, s, mp, fromseg, controlp) struct thread *td; int s; struct msghdr *mp; enum uio_seg fromseg; struct mbuf **controlp; { struct uio auio; struct iovec *iov; int i; socklen_t len; int error; struct mbuf *m, *control = 0; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = 0; #ifdef KTRACE struct uio *ktruio = NULL; #endif if(controlp != NULL) *controlp = 0; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error) { NET_UNLOCK_GIANT(); return (error); } so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_receive(td->td_ucred, so); SOCK_UNLOCK(so); if (error) { fdrop(fp, td); NET_UNLOCK_GIANT(); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); NET_UNLOCK_GIANT(); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, (struct mbuf **)0, (mp->msg_control || controlp) ? &control : (struct mbuf **)0, &mp->msg_flags); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = (int)len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error) goto out; td->td_retval[0] = (int)len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); NET_UNLOCK_GIANT(); if (fromsa) FREE(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(td, s, mp, namelenp) struct thread *td; int s; struct msghdr *mp; void *namelenp; { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error) return (error); if (namelenp) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int recvfrom(td, uap) struct thread *td; register struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; struct sockaddr * __restrict from; socklen_t * __restrict fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return(error); } #ifdef COMPAT_OLDSOCK int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (recvfrom(td, uap)); } #endif - #ifdef COMPAT_OLDSOCK int orecv(td, uap) struct thread *td; register struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, NULL); return (error); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(td, uap) struct thread *td; struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int recvmsg(td, uap) struct thread *td; struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } /* ARGSUSED */ int shutdown(td, uap) struct thread *td; register struct shutdown_args /* { int s; int how; } */ *uap; { struct socket *so; struct file *fp; int error; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); fdrop(fp, td); } NET_UNLOCK_GIANT(); return (error); } /* ARGSUSED */ int setsockopt(td, uap) struct thread *td; register struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ *uap; { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t valsize; { int error; struct socket *so; struct file *fp; struct sockopt sopt; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } NET_UNLOCK_GIANT(); return(error); } /* ARGSUSED */ int getsockopt(td, uap) struct thread *td; register struct getsockopt_args /* { int s; int level; int name; void * __restrict val; socklen_t * __restrict avalsize; } */ *uap; { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. */ int kern_getsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t *valsize; { int error; struct socket *so; struct file *fp; struct sockopt sopt; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } NET_UNLOCK_GIANT(); return (error); } /* * getsockname1() - Get socket name. */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; register struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; if (*alen < 0) return (EINVAL); NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) goto done; so = fp->f_data; *sa = NULL; error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); if (error) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; bad: fdrop(fp, td); if (error && *sa) { free(*sa, M_SONAME); *sa = NULL; } done: NET_UNLOCK_GIANT(); return (error); } int getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; register struct getpeername_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; if (*alen < 0) return (EINVAL); NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) goto done2; so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done1; } *sa = NULL; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); if (error) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; bad: if (error && *sa) { free(*sa, M_SONAME); *sa = NULL; } done1: fdrop(fp, td); done2: NET_UNLOCK_GIANT(); return (error); } int getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { register struct sockaddr *sa; register struct mbuf *m; int error; if ((u_int)buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && (u_int)buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if ((u_int)buflen > MCLBYTES) return (EINVAL); } m = m_get(M_TRYWAIT, type); if (m == NULL) return (ENOBUFS); if ((u_int)buflen > MLEN) { MCLGET(m, M_TRYWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (ENOBUFS); } } m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error) { FREE(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } /* * Detach mapped page and release resources back to the system. */ void sf_buf_mext(void *addr, void *args) { vm_page_t m; m = sf_buf_page(args); sf_buf_free(args); vm_page_lock_queues(); vm_page_unwire(m, 0); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. * If so, we're responsible for freeing the page. */ if (m->wire_count == 0 && m->object == NULL) vm_page_free(m); vm_page_unlock_queues(); } /* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == - * 0. Optionally add a header and/or trailer to the socket output. If + * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. - * */ int sendfile(struct thread *td, struct sendfile_args *uap) { return (do_sendfile(td, uap, 0)); } static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; int error; hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error) goto out; } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error) goto out; } } error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat); out: if (hdr_uio) free(hdr_uio, M_IOV); if (trl_uio) free(trl_uio, M_IOV); return (error); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (do_sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ int kern_sendfile(struct thread *td, struct sendfile_args *uap, struct uio *hdr_uio, struct uio *trl_uio, int compat) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj = NULL; struct socket *so = NULL; struct mbuf *m = NULL; struct sf_buf *sf; struct vm_page *pg; off_t off, xfsize, sbytes = 0, rem = 0; int error, mnw = 0; int vfslocked; NET_LOCK_GIANT(); /* * The file descriptor must be a regular file and have a * backing VM object. * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) goto out; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); obj = vp->v_object; if (obj != NULL) { /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ VM_OBJECT_LOCK(obj); if ((obj->flags & OBJ_DEAD) == 0) { vm_object_reference_locked(obj); VM_OBJECT_UNLOCK(obj); } else { VM_OBJECT_UNLOCK(obj); obj = NULL; } } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); if (obj == NULL) { error = EINVAL; goto out; } if (uap->offset < 0) { error = EINVAL; goto out; } /* * The socket must be a stream socket and connected. * Remember if it a blocking or non-blocking socket. */ if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp, NULL)) != 0) goto out; so = sock_fp->f_data; if (so->so_type != SOCK_STREAM) { error = EINVAL; goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto out; } /* * Do not wait on memory allocations but return ENOMEM for * caller to retry later. * XXX: Experimental. */ if (uap->flags & SF_MNOWAIT) mnw = 1; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto out; #endif /* If headers are specified copy them into mbufs. */ if (hdr_uio != NULL) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; if (hdr_uio->uio_resid > 0) { /* * In FBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (compat) { if (uap->nbytes > hdr_uio->uio_resid) uap->nbytes -= hdr_uio->uio_resid; else uap->nbytes = 0; } m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 0, 0, 0); if (m == NULL) { error = mnw ? EAGAIN : ENOBUFS; goto out; } } } /* Protect against multiple writers to the socket. */ SOCKBUF_LOCK(&so->so_snd); (void) sblock(&so->so_snd, M_WAITOK); SOCKBUF_UNLOCK(&so->so_snd); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = uap->offset; ; ) { int loopbytes = 0; int space = 0; int done = 0; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ while(space > loopbytes) { vm_pindex_t pindex; vm_offset_t pgoff; struct mbuf *m0; VM_OBJECT_LOCK(obj); /* * Calculate the amount to transfer. * Not to exceed a page, the EOF, * or the passed in nbytes. */ pgoff = (vm_offset_t)(off & PAGE_MASK); xfsize = omin(PAGE_SIZE - pgoff, obj->un_pager.vnp.vnp_size - off - sbytes - loopbytes); if (uap->nbytes) rem = (uap->nbytes - sbytes - loopbytes); else rem = obj->un_pager.vnp.vnp_size - off - sbytes - loopbytes; xfsize = omin(rem, xfsize); if (xfsize <= 0) { VM_OBJECT_UNLOCK(obj); done = 1; /* all data sent */ break; } /* * Don't overflow the send buffer. * Stop here and send out what we've * already got. */ if (space < loopbytes + xfsize) { VM_OBJECT_UNLOCK(obj); break; } retry_lookup: /* * Attempt to look up the page. * Allocate if not found or * wait and loop if busy. */ pindex = OFF_TO_IDX(off); pg = vm_page_lookup(obj, pindex); if (pg == NULL) { pg = vm_page_alloc(obj, pindex, VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); if (pg == NULL) { VM_OBJECT_UNLOCK(obj); VM_WAIT; VM_OBJECT_LOCK(obj); goto retry_lookup; } } else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy")) goto retry_lookup; else { /* * Wire the page so it does not get * ripped out from under us. */ vm_page_lock_queues(); vm_page_wire(pg); vm_page_unlock_queues(); } /* * Check if page is valid for what we need, * otherwise initiate I/O. * If we already turned some pages into mbufs, * send them off before we come here again and * block. */ if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) VM_OBJECT_UNLOCK(obj); else if (m != NULL) error = EAGAIN; /* send what we already got */ else if (uap->flags & SF_NODISKIO) error = EBUSY; else { int bsize, resid; /* * Ensure that our page is still around * when the I/O completes. */ vm_page_io_start(pg); VM_OBJECT_UNLOCK(obj); /* * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_SHARED | LK_RETRY, td); /* * XXXMAC: Because we don't have fp->f_cred * here, we pass in NOCRED. This is probably * wrong, but is consistent with our original * implementation. */ error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE, trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); VM_OBJECT_LOCK(obj); vm_page_io_finish(pg); if (!error) VM_OBJECT_UNLOCK(obj); mbstat.sf_iocnt++; } if (error) { vm_page_lock_queues(); vm_page_unwire(pg, 0); /* * See if anyone else might know about * this page. If not and it is not valid, * then free it. */ if (pg->wire_count == 0 && pg->valid == 0 && pg->busy == 0 && !(pg->oflags & VPO_BUSY) && pg->hold_count == 0) { vm_page_free(pg); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (error == EAGAIN) error = 0; /* not a real error */ break; } /* * Get a sendfile buf. We usually wait as long * as necessary, but this wait can be interrupted. */ if ((sf = sf_buf_alloc(pg, (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) { mbstat.sf_allocfail++; vm_page_lock_queues(); vm_page_unwire(pg, 0); /* * XXX: Not same check as above!? */ if (pg->wire_count == 0 && pg->object == NULL) vm_page_free(pg); vm_page_unlock_queues(); error = (mnw ? EAGAIN : EINTR); break; } /* * Get an mbuf and set it up as having * external storage. */ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); if (m0 == NULL) { error = (mnw ? EAGAIN : ENOBUFS); sf_buf_mext((void *)sf_buf_kva(sf), sf); break; } MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY, EXT_SFBUF); m0->m_data = (char *)sf_buf_kva(sf) + pgoff; m0->m_len = xfsize; /* Append to mbuf chain. */ if (m != NULL) m_cat(m, m0); else m = m0; /* Keep track of bits processed. */ loopbytes += xfsize; off += xfsize; } /* Add the buffer chain to the socket buffer. */ if (m != NULL) { int mlen; mlen = m_length(m, NULL); SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } SOCKBUF_UNLOCK(&so->so_snd); error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); if (!error) sbytes += mlen; m = NULL; /* pru_send always consumes */ } /* Quit outer loop on error or when we're done. */ if (error || done) goto done; } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { error = kern_writev(td, uap->s, trl_uio); if (error) goto done; sbytes += td->td_retval[0]; } done: SOCKBUF_LOCK(&so->so_snd); sbunlock(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); } if (obj != NULL) vm_object_deallocate(obj); if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } if (so) fdrop(sock_fp, td); if (m) m_freem(m); NET_UNLOCK_GIANT(); if (error == ERESTART) error = EINTR; return (error); } /* * SCTP syscalls. * Functionality only compiled in if SCTP is defined in the kernel Makefile, * otherwise all return EOPNOTSUPP. * XXX: We should make this loadable one day. */ int sctp_peeloff(td, uap) struct thread *td; struct sctp_peeloff_args /* { int sd; caddr_t name; } */ *uap; { #ifdef SCTP struct filedesc *fdp; struct file *nfp = NULL; int error; struct socket *head, *so; int fd; u_int fflag; fdp = td->td_proc->p_fd; error = fgetsock(td, uap->sd, &head, &fflag); if (error) goto done2; error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error) goto done2; /* * At this point we know we do have a assoc to pull * we proceed to get the fd setup. This may block * but that is ok. */ error = falloc(td, &nfp, &fd); if (error) goto done; td->td_retval[0] = fd; so = sonewconn(head, SS_ISCONNECTED); if (so == NULL) goto noconnection; /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); soref(so); /* file descriptor reference */ SOCK_UNLOCK(so); ACCEPT_LOCK(); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_state &= ~SS_NOFDREF; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; ACCEPT_UNLOCK(); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); FILE_LOCK(nfp); nfp->f_data = so; nfp->f_flag = fflag; nfp->f_ops = &socketops; nfp->f_type = DTYPE_SOCKET; FILE_UNLOCK(nfp); noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning. */ done: if (nfp != NULL) fdrop(nfp, td); fputsock(head); done2: return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_sendmsg (td, uap) struct thread *td; struct sctp_generic_sendmsg_args /* { int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #ifdef SCTP struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp; int use_rcvinfo = 1; int error = 0, len; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error) return (error); u_sinfo = &sinfo; } if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } } error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) goto sctp_bad; iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; len = auio.uio_resid = uap->mlen; error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, use_rcvinfo, u_sinfo, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: fdrop(fp, td); sctp_bad2: if (to) free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_sendmsg_iov(td, uap) struct thread *td; struct sctp_generic_sendmsg_iov_args /* { int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #ifdef SCTP struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp; int use_rcvinfo = 1; int error=0, len, i; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec *iov, *tiov; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error) return (error); u_sinfo = &sinfo; } if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } } error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) goto sctp_bad1; error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error) goto sctp_bad1; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto sctp_bad; } } len = auio.uio_resid; error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, use_rcvinfo, u_sinfo, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: free(iov, M_IOV); sctp_bad1: fdrop(fp, td); sctp_bad2: if (to) free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_recvmsg(td, uap) struct thread *td; struct sctp_generic_recvmsg_args /* { int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags } */ *uap; { #ifdef SCTP u_int8_t sockbufstore[256]; struct uio auio; struct iovec *iov, *tiov; struct sctp_sndrcvinfo sinfo; struct socket *so; struct file *fp; struct sockaddr *fromsa; int fromlen; int len, i, msg_flags; int error = 0; #ifdef KTRACE struct uio *ktruio = NULL; #endif error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) { return (error); } error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error) { goto out1; } so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_receive(td->td_ucred, so); SOCK_UNLOCK(so); if (error) { goto out; return (error); } #endif /* MAC */ if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); if (error) { goto out; } } else { fromlen = 0; } if(uap->msg_flags) { error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); if (error) { goto out; } } else { msg_flags = 0; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto out; } } len = auio.uio_resid; fromsa = (struct sockaddr *)sockbufstore; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, fromsa, fromlen, &msg_flags, (struct sctp_sndrcvinfo *)&sinfo, 1); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } else { if (uap->sinfo) error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = (int)len - auio.uio_resid; ktrgenio(uap->sd, UIO_READ, ktruio, error); } #endif /* KTRACE */ if (error) goto out; td->td_retval[0] = (int)len - auio.uio_resid; if (fromlen && uap->from) { len = fromlen; if (len <= 0 || fromsa == 0) len = 0; else { len = MIN(len, fromsa->sa_len); error = copyout(fromsa, uap->from, (unsigned)len); if (error) goto out; } error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t)); if (error) { goto out; } } if (uap->msg_flags) { error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); if (error) { goto out; } } out: free(iov, M_IOV); out1: fdrop(fp, td); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 167231) +++ head/sys/kern/vfs_aio.c (revision 167232) @@ -1,2322 +1,2322 @@ /*- * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vfs_aio.h" /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #define JOBST_NULL 0 #define JOBST_JOBQSOCK 1 #define JOBST_JOBQGLOBAL 2 #define JOBST_JOBRUNNING 3 #define JOBST_JOBFINISHED 4 #define JOBST_JOBQBUF 5 #define JOBST_JOBQSYNC 6 #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef MAX_AIO_PROCS #define MAX_AIO_PROCS 32 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef TARGET_AIO_PROCS #define TARGET_AIO_PROCS 4 #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif #ifndef AIOD_TIMEOUT_DEFAULT #define AIOD_TIMEOUT_DEFAULT (10 * hz) #endif #ifndef AIOD_LIFETIME_DEFAULT #define AIOD_LIFETIME_DEFAULT (30 * hz) #endif static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel threads to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel threads for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. */ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel threads for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); /* Number of async I/O thread in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_timeout; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, "Timeout value for synchronous aio operations"); static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int unloadable = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, "Allow unload of aio (not recommended)"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process (stored in the process)"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process (stored in the process)"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process (stored in the process)"); typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; /* * Below is a key of locks used to protect each member of struct aiocblist * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * Current, there is only two backends: BIO and generic file I/O. * socket I/O is served by generic file I/O, this is not a good idea, since * disk file I/O and any other types without O_NONBLOCK flag can block daemon * threads, if there is no thread to serve socket I/O, the socket I/O will be * delayed too long or starved, we should create some threads dedicated to * sockets to do non-blocking I/O, same for pipe and fifo, for these I/O * systems we really need non-blocking interface, fiddling O_NONBLOCK in file * structure is not safe because there is race between userland and aio * daemons. */ struct aiocblist { TAILQ_ENTRY(aiocblist) list; /* (b) internal list of for backend */ TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */ TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */ int jobflags; /* (a) job flags */ int jobstate; /* (b) job state */ int inputcharge; /* (*) input blockes */ int outputcharge; /* (*) output blockes */ struct buf *bp; /* (*) private to BIO backend, * buffer pointer */ struct proc *userproc; /* (*) user process */ struct ucred *cred; /* (*) active credential when created */ struct file *fd_file; /* (*) pointer to file structure */ struct aioliojob *lio; /* (*) optional lio job */ struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */ struct knlist klist; /* (a) list of knotes */ struct aiocb uaiocb; /* (*) kernel I/O control block */ ksiginfo_t ksi; /* (a) realtime signal info */ struct task biotask; /* (*) private to BIO backend */ uint64_t seqno; /* (*) job number */ int pending; /* (a) number of pending I/O, aio_fsync only */ }; /* jobflags */ #define AIOCBLIST_DONE 0x01 #define AIOCBLIST_BUFDONE 0x02 #define AIOCBLIST_RUNDOWN 0x04 #define AIOCBLIST_CHECKSYNC 0x08 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aiothreadlist { int aiothreadflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */ struct thread *aiothread; /* (*) the AIO thread */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) listio flags */ int lioj_finished_count; /* (a) listio flags */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_maxactive_count; /* (*) maximum number of AIOs */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_qallowed_count; /* (*) maxiumu size of AIO queue */ int kaio_count; /* (a) size of AIO queue */ int kaio_ballowed_count; /* (*) maximum number of buffers */ int kaio_buffer_count; /* (a) number of physio buffers */ TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */ TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */ TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets, * NOT USED YET. */ TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */ struct task kaio_task; /* (*) task to kick aio threads */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static struct mtx aio_sock_mtx; static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; void aio_init_aioinfo(struct proc *p); static void aio_onceonly(void); static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process(struct aiocblist *aiocbe); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lio, int type, int osigev); static void aio_physwakeup(struct buf *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void biohelper(void *, int); static void aio_daemon(void *param); static void aio_swake_cb(struct socket *, struct sockbuf *); static int aio_unload(void); static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type); #define DONE_BUF 1 #define DONE_QUEUE 2 static int do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io thread data * aiocb async io jobs * aiol list io job pointer - internal to aio_suspend XXX * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { 0, filt_aioattach, filt_aiodetach, filt_aio }; static struct filterops lio_filtops = { 0, filt_lioattach, filt_liodetach, filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_bio); /* * Main operations function for use as a kernel module. */ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_UNLOAD: error = aio_unload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; SYSCALL_MODULE_HELPER(aio_cancel); SYSCALL_MODULE_HELPER(aio_error); SYSCALL_MODULE_HELPER(aio_fsync); SYSCALL_MODULE_HELPER(aio_read); SYSCALL_MODULE_HELPER(aio_return); SYSCALL_MODULE_HELPER(aio_suspend); SYSCALL_MODULE_HELPER(aio_waitcomplete); SYSCALL_MODULE_HELPER(aio_write); SYSCALL_MODULE_HELPER(lio_listio); SYSCALL_MODULE_HELPER(oaio_read); SYSCALL_MODULE_HELPER(oaio_write); SYSCALL_MODULE_HELPER(olio_listio); DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static void aio_onceonly(void) { /* XXX: should probably just use so->callback */ aio_swake = &aio_swake_cb; exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; async_io_version = _POSIX_VERSION; p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); } /* * Callback for unload of AIO when used as a module. */ static int aio_unload(void) { int error; /* * XXX: no unloads by default, it's too dangerous. * perhaps we could do it if locked out callers and then * did an aio_proc_rundown() on each process. * * jhb: aio_proc_rundown() needs to run on curproc though, * so I don't think that would fly. */ if (!unloadable) return (EOPNOTSUPP); error = kqueue_del_filteropts(EVFILT_AIO); if (error) return error; error = kqueue_del_filteropts(EVFILT_LIO); if (error) return error; async_io_version = 0; aio_swake = NULL; taskqueue_free(taskqueue_aiod_bio); delete_unrhdr(aiod_unr); uma_zdestroy(kaio_zone); uma_zdestroy(aiop_zone); uma_zdestroy(aiocb_zone); uma_zdestroy(aiol_zone); uma_zdestroy(aiolio_zone); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); EVENTHANDLER_DEREGISTER(process_exec, exec_tag); mtx_destroy(&aio_job_mtx); mtx_destroy(&aio_sock_mtx); sema_destroy(&aio_newproc_sem); p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1); p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF); ki->kaio_flags = 0; ki->kaio_maxactive_count = max_aio_per_proc; ki->kaio_active_count = 0; ki->kaio_qallowed_count = max_aio_queue_per_proc; ki->kaio_count = 0; ki->kaio_ballowed_count = max_buf_aio; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_bufqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_sockqueue); TAILQ_INIT(&ki->kaio_syncqueue); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < target_aio_procs) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { int ret = 0; PROC_LOCK(p); if (!KSI_ONQ(ksi)) { ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= KSI_EXT | KSI_INS; ret = psignal_event(p, sigev, ksi); } PROC_UNLOCK(p); return (ret); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. */ static int aio_free_entry(struct aiocblist *aiocbe) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = aiocbe->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(aiocbe->jobstate == JOBST_JOBFINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist); TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist); lj = aiocbe->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* aiocbe is going away, we need to destroy any knotes */ knlist_delete(&aiocbe->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&aiocbe->ksi); PROC_UNLOCK(p); MPASS(aiocbe->bp == NULL); aiocbe->jobstate = JOBST_NULL; AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * an aiocblist from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ fdrop(aiocbe->fd_file, curthread); crfree(aiocbe->cred); uma_zfree(aiocb_zone, aiocbe); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } /* * Rundown the jobs for a given process. */ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct aiocblist *cbe, *cbn; struct file *fp; struct socket *so; int remove; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { remove = 0; mtx_lock(&aio_job_mtx); if (cbe->jobstate == JOBST_JOBQGLOBAL) { TAILQ_REMOVE(&aio_jobs, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSOCK) { fp = cbe->fd_file; MPASS(fp->f_type == DTYPE_SOCKET); so = fp->f_data; TAILQ_REMOVE(&so->so_aiojobq, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSYNC) { TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); remove = 1; } mtx_unlock(&aio_job_mtx); if (remove) { cbe->jobstate = JOBST_JOBFINISHED; cbe->uaiocb._aiocb_private.status = -1; cbe->uaiocb._aiocb_private.error = ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); aio_bio_done_notify(p, cbe, DONE_QUEUE); } } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_bufqueue) || TAILQ_FIRST(&ki->kaio_jobqueue)) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(cbe); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct aiocblist * aio_selectjob(struct aiothreadlist *aiop) { struct aiocblist *aiocbe; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_FOREACH(aiocbe, &aio_jobs, list) { userp = aiocbe->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < ki->kaio_maxactive_count) { TAILQ_REMOVE(&aio_jobs, aiocbe, list); /* Account for currently active jobs. */ ki->kaio_active_count++; aiocbe->jobstate = JOBST_JOBRUNNING; break; } } return (aiocbe); } /* * Move all data to a permanent storage device, this code * simulates fsync syscall. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp) { struct mount *mp; int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_object != NULL) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_UNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * The AIO processing activity. This is the code that does the I/O request for * the non-physio version of the operations. The normal vn operations are used, * and this code should work in all instances for every type of file, including * pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. */ static void aio_process(struct aiocblist *aiocbe) { struct ucred *td_savedcred; struct thread *td; struct proc *mycp; struct aiocb *cb; struct file *fp; struct socket *so; struct uio auio; struct iovec aiov; int cnt; int error; int oublock_st, oublock_end; int inblock_st, inblock_end; td = curthread; td_savedcred = td->td_ucred; td->td_ucred = aiocbe->cred; mycp = td->td_proc; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; if (cb->aio_lio_opcode == LIO_SYNC) { error = 0; cnt = 0; if (fp->f_vnode != NULL) error = aio_fsync_vnode(td, fp->f_vnode); cb->_aiocb_private.error = error; cb->_aiocb_private.status = 0; td->td_ucred = td_savedcred; return; } aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; inblock_st = mycp->p_stats->p_ru.ru_inblock; oublock_st = mycp->p_stats->p_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } inblock_end = mycp->p_stats->p_ru.ru_inblock; oublock_end = mycp->p_stats->p_ru.ru_oublock; aiocbe->inputcharge = inblock_end - inblock_st; aiocbe->outputcharge = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { int sigpipe = 1; if (fp->f_type == DTYPE_SOCKET) { so = fp->f_data; if (so->so_options & SO_NOSIGPIPE) sigpipe = 0; } if (sigpipe) { PROC_LOCK(aiocbe->userproc); psignal(aiocbe->userproc, SIGPIPE); PROC_UNLOCK(aiocbe->userproc); } } } cnt -= auio.uio_resid; cb->_aiocb_private.error = error; cb->_aiocb_private.status = cnt; td->td_ucred = td_savedcred; } static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type) { struct aioliojob *lj; struct kaioinfo *ki; struct aiocblist *scb, *scbn; int lj_done; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = aiocbe->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } if (type == DONE_QUEUE) { aiocbe->jobflags |= AIOCBLIST_DONE; } else { aiocbe->jobflags |= AIOCBLIST_BUFDONE; } TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist); aiocbe->jobstate = JOBST_JOBFINISHED; if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi); KNOTE_LOCKED(&aiocbe->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) { TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) { if (aiocbe->fd_file == scb->fd_file && aiocbe->seqno < scb->seqno) { if (--scb->pending == 0) { mtx_lock(&aio_job_mtx); scb->jobstate = JOBST_JOBQGLOBAL; TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list); TAILQ_INSERT_TAIL(&aio_jobs, scb, list); aio_kick_nowait(userp); mtx_unlock(&aio_job_mtx); } } } } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } /* * The AIO daemon, most of the actual work is done in aio_process, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct aiocblist *aiocbe; struct aiothreadlist *aiop; struct kaioinfo *ki; struct proc *curcp, *mycp, *userp; struct vmspace *myvm, *tmpvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Local copies of curproc (cp) and vmspace (myvm) */ mycp = td->td_proc; myvm = mycp->p_vmspace; KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aiothread = td; aiop->aiothreadflags = 0; /* The daemon resides in its own pgrp. */ setsid(td, NULL); /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * curcp is the current daemon process context. * userp is the current user process context. */ curcp = mycp; /* * Take daemon off of free queue */ if (aiop->aiothreadflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((aiocbe = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); userp = aiocbe->userproc; /* * Connect to process address space for user program. */ if (userp != curcp) { /* * Save the current address space that we are * connected to. */ tmpvm = mycp->p_vmspace; /* * Point to the new user address space, and * refer to it. */ mycp->p_vmspace = userp->p_vmspace; atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1); /* Activate the new mapping. */ pmap_activate(FIRST_THREAD_IN_PROC(mycp)); /* * If the old address space wasn't the daemons * own address space, then we need to remove the * daemon's reference from the other process * that it was acting on behalf of. */ if (tmpvm != myvm) { vmspace_free(tmpvm); } curcp = userp; } ki = userp->p_aioinfo; /* Do the I/O function. */ aio_process(aiocbe); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; mtx_unlock(&aio_job_mtx); AIO_LOCK(ki); TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); aio_bio_done_notify(userp, aiocbe, DONE_QUEUE); AIO_UNLOCK(ki); mtx_lock(&aio_job_mtx); } /* * Disconnect from user address space. */ if (curcp != mycp) { mtx_unlock(&aio_job_mtx); /* Get the user address space to disconnect from. */ tmpvm = mycp->p_vmspace; /* Get original address space for daemon. */ mycp->p_vmspace = myvm; /* Activate the daemon's address space. */ pmap_activate(FIRST_THREAD_IN_PROC(mycp)); #ifdef DIAGNOSTIC if (tmpvm == myvm) { printf("AIOD: vmspace problem -- %d\n", mycp->p_pid); } #endif /* Remove our vmspace reference. */ vmspace_free(tmpvm); curcp = mycp; mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected, that should be * curcp == mycp. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aiothreadflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. */ if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime)) { if (TAILQ_EMPTY(&aio_jobs)) { if ((aiop->aiothreadflags & AIOP_FREE) && (num_aio_procs > target_aio_procs)) { TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); #ifdef DIAGNOSTIC if (mycp->p_vmspace->vm_refcnt <= 1) { printf("AIOD: bad vm refcnt for" " exiting daemon: %d\n", mycp->p_vmspace->vm_refcnt); } #endif kthread_exit(0); } } } } mtx_unlock(&aio_job_mtx); panic("shouldn't be here\n"); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kthread_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead physio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qphysio(struct proc *p, struct aiocblist *aiocbe) { struct aiocb *cb; struct file *fp; struct buf *bp; struct vnode *vp; struct kaioinfo *ki; struct aioliojob *lj; int error; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; if (fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; /* * If its not a disk, we don't want to return a positive error. * It causes the aio code to not fall through to try the thread * way when you're talking to a regular file. */ if (!vn_isdisk(vp, &error)) { if (error == ENOTBLK) return (-1); else return (error); } if (vp->v_bufobj.bo_bsize == 0) return (-1); if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); if (cb->aio_nbytes > vp->v_rdev->si_iosize_max) return (-1); if (cb->aio_nbytes > MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK)) return (-1); ki = p->p_aioinfo; if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) return (-1); /* Create and build a buffer header for a transfer. */ bp = (struct buf *)getpbuf(NULL); BUF_KERNPROC(bp); AIO_LOCK(ki); ki->kaio_count++; ki->kaio_buffer_count++; lj = aiocbe->lio; if (lj) lj->lioj_count++; AIO_UNLOCK(ki); /* * Get a copy of the kva from the physical buffer. */ error = 0; bp->b_bcount = cb->aio_nbytes; bp->b_bufsize = cb->aio_nbytes; bp->b_iodone = aio_physwakeup; bp->b_saveaddr = bp->b_data; bp->b_data = (void *)(uintptr_t)cb->aio_buf; bp->b_offset = cb->aio_offset; bp->b_iooffset = cb->aio_offset; bp->b_blkno = btodb(cb->aio_offset); bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; /* * Bring buffer into kernel space. */ if (vmapbuf(bp) < 0) { error = EFAULT; goto doerror; } AIO_LOCK(ki); aiocbe->bp = bp; bp->b_caller1 = (void *)aiocbe; TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); aiocbe->jobstate = JOBST_JOBQBUF; cb->_aiocb_private.status = cb->aio_nbytes; AIO_UNLOCK(ki); atomic_add_int(&num_queue_count, 1); atomic_add_int(&num_buf_aio, 1); bp->b_error = 0; TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe); /* Perform transfer. */ dev_strategy(vp->v_rdev, bp); return (0); doerror: AIO_LOCK(ki); ki->kaio_count--; ki->kaio_buffer_count--; if (lj) lj->lioj_count--; aiocbe->bp = NULL; AIO_UNLOCK(ki); relpbuf(bp, NULL); return (error); } /* * Wake up aio requests that may be serviceable now. */ static void aio_swake_cb(struct socket *so, struct sockbuf *sb) { struct aiocblist *cb, *cbn; int opcode; if (sb == &so->so_snd) opcode = LIO_WRITE; else opcode = LIO_READ; SOCKBUF_LOCK(sb); sb->sb_flags &= ~SB_AIO; mtx_lock(&aio_job_mtx); TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) { if (opcode == cb->uaiocb.aio_lio_opcode) { if (cb->jobstate != JOBST_JOBQSOCK) panic("invalid queue value"); /* XXX * We don't have actual sockets backend yet, * so we simply move the requests to the generic * file I/O backend. */ TAILQ_REMOVE(&so->so_aiojobq, cb, list); TAILQ_INSERT_TAIL(&aio_jobs, cb, list); aio_kick_nowait(cb->userproc); } } mtx_unlock(&aio_job_mtx); SOCKBUF_UNLOCK(sb); } /* * Queue a new AIO request. Choosing either the threaded or direct physio VCHR * technique is done in this code. */ int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj, int type, int oldsigev) { struct proc *p = td->td_proc; struct file *fp; struct socket *so; struct aiocblist *aiocbe, *cb; struct kaioinfo *ki; struct kevent kev; struct sockbuf *sb; int opcode; int error; int fd, kqfd; int jid; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; suword(&job->_aiocb_private.status, -1); suword(&job->_aiocb_private.error, 0); suword(&job->_aiocb_private.kernelinfo, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= ki->kaio_qallowed_count) { suword(&job->_aiocb_private.error, EAGAIN); return (EAGAIN); } aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); aiocbe->inputcharge = 0; aiocbe->outputcharge = 0; knlist_init(&aiocbe->klist, AIO_MTX(ki), NULL, NULL, NULL); if (oldsigev) { bzero(&aiocbe->uaiocb, sizeof(struct aiocb)); error = copyin(job, &aiocbe->uaiocb, sizeof(struct oaiocb)); bcopy(&aiocbe->uaiocb.__spare__, &aiocbe->uaiocb.aio_sigevent, sizeof(struct osigevent)); } else { error = copyin(job, &aiocbe->uaiocb, sizeof(struct aiocb)); } if (error) { suword(&job->_aiocb_private.error, error); uma_zfree(aiocb_zone, aiocbe); return (error); } if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { suword(&job->_aiocb_private.error, EINVAL); uma_zfree(aiocb_zone, aiocbe); return (EINVAL); } if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, aiocbe); return (EINVAL); } ksiginfo_init(&aiocbe->ksi); /* Save userspace address of the job info. */ aiocbe->uuaiocb = job; /* Get the opcode. */ if (type != LIO_NOP) aiocbe->uaiocb.aio_lio_opcode = type; opcode = aiocbe->uaiocb.aio_lio_opcode; /* Fetch the file object for the specified file descriptor. */ fd = aiocbe->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: error = fget_write(td, fd, &fp); break; case LIO_READ: error = fget_read(td, fd, &fp); break; default: error = fget(td, fd, &fp); } if (error) { uma_zfree(aiocb_zone, aiocbe); suword(&job->_aiocb_private.error, error); return (error); } if (opcode == LIO_SYNC && fp->f_vnode == NULL) { error = EINVAL; goto aqueue_fail; } if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) { error = EINVAL; goto aqueue_fail; } aiocbe->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; aiocbe->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = suword(&job->_aiocb_private.kernelinfo, jid); if (error) { error = EINVAL; goto aqueue_fail; } aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, aiocbe); return (0); } if ((opcode != LIO_READ) && (opcode != LIO_WRITE) && (opcode != LIO_SYNC)) { error = EINVAL; goto aqueue_fail; } if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; kev.ident = (uintptr_t)aiocbe->uuaiocb; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.data = (intptr_t)aiocbe; kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, 1); aqueue_fail: if (error) { fdrop(fp, td); uma_zfree(aiocb_zone, aiocbe); suword(&job->_aiocb_private.error, error); goto done; } no_kqueue: suword(&job->_aiocb_private.error, EINPROGRESS); aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; aiocbe->userproc = p; aiocbe->cred = crhold(td->td_ucred); aiocbe->jobflags = 0; aiocbe->lio = lj; if (opcode == LIO_SYNC) goto queueit; if (fp->f_type == DTYPE_SOCKET) { /* * Alternate queueing for socket ops: Reach down into the * descriptor to get the socket data. Then check to see if the * socket is ready to be read or written (based on the requested * operation). * * If it is not ready for io, then queue the aiocbe on the * socket, and set the flags so we get a call when sbnotify() * happens. * * Note if opcode is neither LIO_WRITE nor LIO_READ we lock * and unlock the snd sockbuf for no reason. */ so = fp->f_data; sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd; SOCKBUF_LOCK(sb); if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == LIO_WRITE) && (!sowriteable(so)))) { sb->sb_flags |= SB_AIO; mtx_lock(&aio_job_mtx); TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); mtx_unlock(&aio_job_mtx); AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); aiocbe->jobstate = JOBST_JOBQSOCK; ki->kaio_count++; if (lj) lj->lioj_count++; AIO_UNLOCK(ki); SOCKBUF_UNLOCK(sb); atomic_add_int(&num_queue_count, 1); error = 0; goto done; } SOCKBUF_UNLOCK(sb); } if ((error = aio_qphysio(p, aiocbe)) == 0) goto done; #if 0 if (error > 0) { aiocbe->uaiocb._aiocb_private.error = error; suword(&job->_aiocb_private.error, error); goto done; } #endif queueit: /* No buffer for daemon I/O. */ aiocbe->bp = NULL; atomic_add_int(&num_queue_count, 1); AIO_LOCK(ki); ki->kaio_count++; if (lj) lj->lioj_count++; TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); if (opcode == LIO_SYNC) { TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) { if (cb->fd_file == aiocbe->fd_file && cb->uaiocb.aio_lio_opcode != LIO_SYNC && cb->seqno < aiocbe->seqno) { cb->jobflags |= AIOCBLIST_CHECKSYNC; aiocbe->pending++; } } TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) { if (cb->fd_file == aiocbe->fd_file && cb->uaiocb.aio_lio_opcode != LIO_SYNC && cb->seqno < aiocbe->seqno) { cb->jobflags |= AIOCBLIST_CHECKSYNC; aiocbe->pending++; } } if (aiocbe->pending != 0) { TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list); aiocbe->jobstate = JOBST_JOBQSYNC; AIO_UNLOCK(ki); goto done; } } mtx_lock(&aio_job_mtx); TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); aiocbe->jobstate = JOBST_JOBQGLOBAL; aio_kick_nowait(p); mtx_unlock(&aio_job_mtx); AIO_UNLOCK(ki); error = 0; done: return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aiothreadlist *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; wakeup(aiop->aiothread); } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && ((ki->kaio_active_count + num_aio_resv_start) < ki->kaio_maxactive_count)) { taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aiothreadlist *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; wakeup(aiop->aiothread); } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && ((ki->kaio_active_count + num_aio_resv_start) < ki->kaio_maxactive_count)) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ int aio_return(struct thread *td, struct aio_return_args *uap) { struct proc *p = td->td_proc; struct aiocblist *cb; struct aiocb *uaiocb; struct kaioinfo *ki; int status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); uaiocb = uap->aiocbp; AIO_LOCK(ki); TAILQ_FOREACH(cb, &ki->kaio_done, plist) { if (cb->uuaiocb == uaiocb) break; } if (cb != NULL) { MPASS(cb->jobstate == JOBST_JOBFINISHED); status = cb->uaiocb._aiocb_private.status; error = cb->uaiocb._aiocb_private.error; td->td_retval[0] = status; if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { p->p_stats->p_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { p->p_stats->p_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } aio_free_entry(cb); AIO_UNLOCK(ki); suword(&uaiocb->_aiocb_private.error, error); suword(&uaiocb->_aiocb_private.status, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ int aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct proc *p = td->td_proc; struct timeval atv; struct timespec ts; struct aiocb *const *cbptr, *cbp; struct kaioinfo *ki; struct aiocblist *cb, *cbfirst; struct aiocb **ujoblist; int njoblist; int error; int timo; int i; if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) return (EINVAL); timo = 0; if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, &ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); njoblist = 0; ujoblist = uma_zalloc(aiol_zone, M_WAITOK); cbptr = uap->aiocbp; for (i = 0; i < uap->nent; i++) { cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); if (cbp == 0) continue; ujoblist[njoblist] = cbp; njoblist++; } if (njoblist == 0) { uma_zfree(aiol_zone, ujoblist); return (0); } AIO_LOCK(ki); for (;;) { cbfirst = NULL; error = 0; TAILQ_FOREACH(cb, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (cb->uuaiocb == ujoblist[i]) { if (cbfirst == NULL) cbfirst = cb; if (cb->jobstate == JOBST_JOBFINISHED) goto RETURN; } } } /* All tasks were finished. */ if (cbfirst == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); uma_zfree(aiol_zone, ujoblist); return (error); } /* * aio_cancel cancels any non-physio aio operations not currently in * progress. */ int aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct aiocblist *cbe, *cbn; struct file *fp; struct socket *so; int error; int remove; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. */ error = fget(td, uap->fd, &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp, &error)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { if ((uap->fd == cbe->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == cbe->uuaiocb))) { remove = 0; mtx_lock(&aio_job_mtx); if (cbe->jobstate == JOBST_JOBQGLOBAL) { TAILQ_REMOVE(&aio_jobs, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSOCK) { MPASS(fp->f_type == DTYPE_SOCKET); so = fp->f_data; TAILQ_REMOVE(&so->so_aiojobq, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSYNC) { TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); remove = 1; } mtx_unlock(&aio_job_mtx); if (remove) { TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); cbe->uaiocb._aiocb_private.status = -1; cbe->uaiocb._aiocb_private.error = ECANCELED; aio_bio_done_notify(p, cbe, DONE_QUEUE); cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* - * aio_error is implemented in the kernel level for compatibility purposes only. - * For a user mode async implementation, it would be best to do it in a userland - * subroutine. + * aio_error is implemented in the kernel level for compatibility purposes + * only. For a user mode async implementation, it would be best to do it in + * a userland subroutine. */ int aio_error(struct thread *td, struct aio_error_args *uap) { struct proc *p = td->td_proc; struct aiocblist *cb; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(cb, &ki->kaio_all, allist) { if (cb->uuaiocb == uap->aiocbp) { if (cb->jobstate == JOBST_JOBFINISHED) td->td_retval[0] = cb->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = fuword(&uap->aiocbp->_aiocb_private.status); if (status == -1) { td->td_retval[0] = fuword(&uap->aiocbp->_aiocb_private.error); return (0); } td->td_retval[0] = EINVAL; return (0); } /* syscall - asynchronous read from a file (REALTIME) */ int oaio_read(struct thread *td, struct oaio_read_args *uap) { return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 1); } int aio_read(struct thread *td, struct aio_read_args *uap) { return aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, 0); } /* syscall - asynchronous write to a file (REALTIME) */ int oaio_write(struct thread *td, struct oaio_write_args *uap) { return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 1); } int aio_write(struct thread *td, struct aio_write_args *uap) { return aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, 0); } /* syscall - list directed I/O (REALTIME) */ int olio_listio(struct thread *td, struct olio_listio_args *uap) { return do_lio_listio(td, (struct lio_listio_args *)uap, 1); } /* syscall - list directed I/O (REALTIME) */ int lio_listio(struct thread *td, struct lio_listio_args *uap) { return do_lio_listio(td, uap, 0); } static int do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev) { struct proc *p = td->td_proc; struct aiocb *iocb, * const *cbptr; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int nent; int error; int nerror; int i; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > AIO_LISTIO_MAX) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; knlist_init(&lj->klist, AIO_MTX(ki), NULL, NULL, NULL); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (uap->sig && (uap->mode == LIO_NOWAIT)) { bzero(&lj->lioj_signal, sizeof(&lj->lioj_signal)); error = copyin(uap->sig, &lj->lioj_signal, oldsigev ? sizeof(struct osigevent) : sizeof(struct sigevent)); if (error) { uma_zfree(aiolio_zone, lj); return (error); } if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uap->acb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nerror = 0; cbptr = uap->acb_list; for (i = 0; i < uap->nent; i++) { iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]); if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) { error = aio_aqueue(td, iocb, lj, LIO_NOP, oldsigev); if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (uap->mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); return (error); } /* * Called from interrupt thread for physio, we should return as fast * as possible, so we schedule a biohelper task. */ static void aio_physwakeup(struct buf *bp) { struct aiocblist *aiocbe; aiocbe = (struct aiocblist *)bp->b_caller1; taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask); } /* * Task routine to perform heavy tasks, process wakeup, and signals. */ static void biohelper(void *context, int pending) { struct aiocblist *aiocbe = context; struct buf *bp; struct proc *userp; struct kaioinfo *ki; int nblks; bp = aiocbe->bp; userp = aiocbe->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; aiocbe->uaiocb._aiocb_private.error = 0; if (bp->b_ioflags & BIO_ERROR) aiocbe->uaiocb._aiocb_private.error = bp->b_error; nblks = btodb(aiocbe->uaiocb.aio_nbytes); if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE) aiocbe->outputcharge += nblks; else aiocbe->inputcharge += nblks; aiocbe->bp = NULL; TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist); ki->kaio_buffer_count--; aio_bio_done_notify(userp, aiocbe, DONE_BUF); AIO_UNLOCK(ki); /* Release mapping into kernel space. */ vunmapbuf(bp); relpbuf(bp, NULL); atomic_subtract_int(&num_buf_aio, 1); } /* syscall - wait for the next completion of an aio request */ int aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct proc *p = td->td_proc; struct timeval atv; struct timespec ts; struct kaioinfo *ki; struct aiocblist *cb; struct aiocb *uuaiocb; int error, status, timo; suword(uap->aiocbp, (long)NULL); timo = 0; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, &ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; cb = NULL; AIO_LOCK(ki); while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (cb != NULL) { MPASS(cb->jobstate == JOBST_JOBFINISHED); uuaiocb = cb->uuaiocb; status = cb->uaiocb._aiocb_private.status; error = cb->uaiocb._aiocb_private.error; td->td_retval[0] = status; if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { p->p_stats->p_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { p->p_stats->p_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } aio_free_entry(cb); AIO_UNLOCK(ki); suword(uap->aiocbp, (long)uuaiocb); suword(&uuaiocb->_aiocb_private.error, error); suword(&uuaiocb->_aiocb_private.status, status); } else AIO_UNLOCK(ki); return (error); } int aio_fsync(struct thread *td, struct aio_fsync_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; if (uap->op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); ki = p->p_aioinfo; if (ki == NULL) aio_init_aioinfo(p); return aio_aqueue(td, uap->aiocbp, NULL, LIO_SYNC, 0); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; /* * The aiocbe pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_flags &= ~EV_FLAG1; knlist_add(&aiocbe->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; if (!knlist_empty(&aiocbe->klist)) knlist_remove(&aiocbe->klist, kn, 0); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; kn->kn_data = aiocbe->uaiocb._aiocb_private.error; if (aiocbe->jobstate != JOBST_JOBFINISHED) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; if (!knlist_empty(&lj->klist)) knlist_remove(&lj->klist, kn, 0); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } Index: head/sys/kern/vfs_cache.c =================================================================== --- head/sys/kern/vfs_cache.c (revision 167231) +++ head/sys/kern/vfs_cache.c (revision 167232) @@ -1,871 +1,871 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name */ }; /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. */ /* * Structures associated with name cacheing. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, ""); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, ""); static u_long numneg; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, ""); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, ""); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, ""); #if 0 static u_long numcachepl; /* number of cache purge for leaf entries */ SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, ""); #endif struct nchstats nchstats; /* cache effectiveness statistics */ static struct mtx cache_lock; MTX_SYSINIT(vfscache, &cache_lock, "Name Cache", MTX_DEF); #define CACHE_LOCK() mtx_lock(&cache_lock) #define CACHE_UNLOCK() mtx_unlock(&cache_lock) /* * UMA zones for the VFS cache. * * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t cache_zone_small; static uma_zone_t cache_zone_large; #define CACHE_PATH_CUTOFF 32 #define CACHE_ZONE_SMALL (sizeof(struct namecache) + CACHE_PATH_CUTOFF) #define CACHE_ZONE_LARGE (sizeof(struct namecache) + NAME_MAX) #define cache_alloc(len) uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \ cache_zone_small : cache_zone_large, M_WAITOK) #define cache_free(ncp) do { \ if (ncp != NULL) \ uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \ cache_zone_small : cache_zone_large, (ncp)); \ } while (0) static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0, sizeof(struct namecache), ""); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE(mode, name, var) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); STATNODE(CTLFLAG_RD, numneg, &numneg); STATNODE(CTLFLAG_RD, numcache, &numcache); static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls); static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits); static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits); static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks); static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss); static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap); static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps); static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits); static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps); static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits); SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics"); static void cache_zap(struct namecache *ncp); static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); /* * Flags in namecache.nc_flag */ #define NCF_WHITE 1 /* * Grab an atomic snapshot of the name cache hash chain lengths */ SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count; n_nchash = nchash + 1; /* nchash is max index, not count */ if (!req->oldptr) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } error = SYSCTL_OUT(req, &count, sizeof(count)); if (error) return (error); } return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; pct = (used * 100 * 100) / n_nchash; error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths"); /* * cache_zap(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap(ncp) struct namecache *ncp; { struct vnode *vp; mtx_assert(&cache_lock, MA_OWNED); CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp); vp = NULL; LIST_REMOVE(ncp, nc_hash); LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { vp = ncp->nc_dvp; numcachehv--; } if (ncp->nc_vp) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); ncp->nc_vp->v_dd = NULL; } else { TAILQ_REMOVE(&ncneg, ncp, nc_dst); numneg--; } numcache--; cache_free(ncp); if (vp) vdrop(vp); } /* * cache_leaf_test() * * Test whether this (directory) vnode's namei cache entry contains * subdirectories or not. Used to determine whether the directory is * a leaf in the namei cache or not. Note: the directory may still * contain files in the namei cache. * * Returns 0 if the directory is a leaf, -1 if it isn't. */ int cache_leaf_test(struct vnode *vp) { struct namecache *ncpc; int leaf; leaf = 0; CACHE_LOCK(); for (ncpc = LIST_FIRST(&vp->v_cache_src); ncpc != NULL; ncpc = LIST_NEXT(ncpc, nc_src) ) { if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR) { leaf = -1; break; } } CACHE_UNLOCK(); return (leaf); } /* * Lookup an entry in the cache * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. If the lookup * succeeds, the vnode is returned in *vpp, and a status of -1 is * returned. If the lookup determines that the name does not exist * (negative cacheing), a status of ENOENT is returned. If the lookup * fails, a status of zero is returned. * * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is * unlocked. If we're looking up . an extra ref is taken, but the lock is * not recursively acquired. */ int cache_lookup(dvp, vpp, cnp) struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { struct namecache *ncp; u_int32_t hash; int error; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } retry: CACHE_LOCK(); numcalls++; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { *vpp = dvp; CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", dvp, cnp->cn_nameptr); dothits++; goto success; } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { dotdothits++; if (dvp->v_dd == NULL || (cnp->cn_flags & MAKEENTRY) == 0) { CACHE_UNLOCK(); return (0); } *vpp = dvp->v_dd; CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", dvp, cnp->cn_nameptr, *vpp); goto success; } } hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { numchecks++; if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (ncp == 0) { if ((cnp->cn_flags & MAKEENTRY) == 0) { nummisszap++; } else { nummiss++; } nchstats.ncs_miss++; CACHE_UNLOCK(); return (0); } /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { numposzaps++; nchstats.ncs_badhits++; cache_zap(ncp); CACHE_UNLOCK(); return (0); } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp) { numposhits++; nchstats.ncs_goodhits++; *vpp = ncp->nc_vp; CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", dvp, cnp->cn_nameptr, *vpp, ncp); goto success; } /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { numnegzaps++; nchstats.ncs_badhits++; cache_zap(ncp); CACHE_UNLOCK(); return (0); } numneghits++; /* * We found a "negative" match, so we shift it to the end of * the "negative" cache entries queue to satisfy LRU. Also, * check to see if the entry is a whiteout; indicate this to * the componentname, if so. */ TAILQ_REMOVE(&ncneg, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); nchstats.ncs_neghits++; if (ncp->nc_flag & NCF_WHITE) cnp->cn_flags |= ISWHITEOUT; CACHE_UNLOCK(); return (ENOENT); success: /* * On success we return a locked and ref'd vnode as per the lookup * protocol. */ if (dvp == *vpp) { /* lookup on "." */ VREF(*vpp); CACHE_UNLOCK(); return (-1); } if (cnp->cn_flags & ISDOTDOT) VOP_UNLOCK(dvp, 0, cnp->cn_thread); VI_LOCK(*vpp); CACHE_UNLOCK(); error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread); if (cnp->cn_flags & ISDOTDOT) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread); if (error) { *vpp = NULL; goto retry; } return (-1); } /* * Add an entry to the cache. */ void cache_enter(dvp, vp, cnp) struct vnode *dvp; struct vnode *vp; struct componentname *cnp; { struct namecache *ncp; struct nchashhead *ncpp; u_int32_t hash; int hold; int zap; int len; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, ("cahe_enter: Adding a doomed vnode")); if (!doingcache) return; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { return; } /* * For dotdot lookups only cache the v_dd pointer if the * directory has a link back to its parent via v_cache_dst. * Without this an unlinked directory would keep a soft * reference to its parent which could not be NULLd at * cache_purge() time. */ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { CACHE_LOCK(); if (!TAILQ_EMPTY(&dvp->v_cache_dst)) dvp->v_dd = vp; CACHE_UNLOCK(); return; } } hold = 0; zap = 0; ncp = cache_alloc(cnp->cn_namelen); CACHE_LOCK(); numcache++; if (!vp) { numneg++; ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0; } else if (vp->v_type == VDIR) { vp->v_dd = dvp; } else { vp->v_dd = NULL; } /* * Set the rest of the namecache entry elements, calculate it's * hash key and insert it into the appropriate chain within * the cache entries table. */ ncp->nc_vp = vp; ncp->nc_dvp = dvp; len = ncp->nc_nlen = cnp->cn_namelen; hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT); bcopy(cnp->cn_nameptr, ncp->nc_name, len); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); ncpp = NCHHASH(hash); LIST_INSERT_HEAD(ncpp, ncp, nc_hash); if (LIST_EMPTY(&dvp->v_cache_src)) { hold = 1; numcachehv++; } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); } else { TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); } if (numneg * ncnegfactor > numcache) { ncp = TAILQ_FIRST(&ncneg); zap = 1; } if (hold) vhold(dvp); if (zap) cache_zap(ncp); CACHE_UNLOCK(); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { TAILQ_INIT(&ncneg); cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL) /* * Invalidate all entries to a particular vnode. */ void cache_purge(vp) struct vnode *vp; { CTR1(KTR_VFS, "cache_purge(%p)", vp); CACHE_LOCK(); while (!LIST_EMPTY(&vp->v_cache_src)) cache_zap(LIST_FIRST(&vp->v_cache_src)); while (!TAILQ_EMPTY(&vp->v_cache_dst)) cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); vp->v_dd = NULL; CACHE_UNLOCK(); } /* * Flush all entries referencing a particular filesystem. * * Since we need to check it anyway, we will flush all the invalid * entries at the same time. */ void cache_purgevfs(mp) struct mount *mp; { struct nchashhead *ncpp; struct namecache *ncp, *nnp; struct nchashhead mplist; LIST_INIT(&mplist); ncp = NULL; /* Scan hash tables for applicable entries */ CACHE_LOCK(); for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) { nnp = LIST_NEXT(ncp, nc_hash); if (ncp->nc_dvp->v_mount == mp) { LIST_REMOVE(ncp, nc_hash); LIST_INSERT_HEAD(&mplist, ncp, nc_hash); } } } while (!LIST_EMPTY(&mplist)) cache_zap(LIST_FIRST(&mplist)); CACHE_UNLOCK(); } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; struct thread *td = cnp->cn_thread; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = VOP_ACCESS(dvp, VEXEC, cred, td); if (error) return (error); error = cache_lookup(dvp, vpp, cnp); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == ENOENT) return (error); return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getcwd_args { u_char *buf; u_int buflen; }; #endif /* * XXX All of these sysctls would probably be more productive dead. */ static int disablecwd; SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "Disable the getcwd syscall"); -/* Implementation of the getcwd syscall */ +/* Implementation of the getcwd syscall. */ int __getcwd(td, uap) struct thread *td; struct __getcwd_args *uap; { return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen)); } int kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen) { char *bp, *tmpbuf; struct filedesc *fdp; int error; if (disablecwd) return (ENODEV); if (buflen < 2) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; mtx_lock(&Giant); FILEDESC_LOCK(fdp); error = vn_fullpath1(td, fdp->fd_cdir, fdp->fd_rdir, tmpbuf, &bp, buflen); FILEDESC_UNLOCK(fdp); mtx_unlock(&Giant); if (!error) { if (bufseg == UIO_SYSSPACE) bcopy(bp, buf, strlen(bp) + 1); else error = copyout(bp, buf, strlen(bp) + 1); } free(tmpbuf, M_TEMP); return (error); } /* * Thus begins the fullpath magic. */ #undef STATNODE #define STATNODE(name) \ static u_int name; \ SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "") static int disablefullpath; SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, "Disable the vn_fullpath function"); /* These count for kern___getcwd(), too. */ STATNODE(numfullpathcalls); STATNODE(numfullpathfail1); STATNODE(numfullpathfail2); STATNODE(numfullpathfail4); STATNODE(numfullpathfound); /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; struct filedesc *fdp; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); error = vn_fullpath1(td, vn, fdp->fd_rdir, buf, retbuf, MAXPATHLEN); FILEDESC_UNLOCK(fdp); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * The magic behind kern___getcwd() and vn_fullpath(). */ static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen) { char *bp; int error, i, slash_prefixed; struct namecache *ncp; bp = buf + buflen - 1; *bp = '\0'; error = 0; slash_prefixed = 0; CACHE_LOCK(); numfullpathcalls++; if (vp->v_type != VDIR) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!ncp) { numfullpathfail2++; CACHE_UNLOCK(); return (ENOENT); } for (i = ncp->nc_nlen - 1; i >= 0 && bp > buf; i--) *--bp = ncp->nc_name[i]; if (bp == buf) { numfullpathfail4++; CACHE_UNLOCK(); return (ENOMEM); } *--bp = '/'; slash_prefixed = 1; vp = ncp->nc_dvp; } while (vp != rdir && vp != rootvnode) { if (vp->v_vflag & VV_ROOT) { if (vp->v_iflag & VI_DOOMED) { /* forced unmount */ error = EBADF; break; } vp = vp->v_mount->mnt_vnodecovered; continue; } if (vp->v_dd == NULL) { numfullpathfail1++; error = ENOTDIR; break; } ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!ncp) { numfullpathfail2++; error = ENOENT; break; } MPASS(ncp->nc_dvp == vp->v_dd); for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--) *--bp = ncp->nc_name[i]; if (bp == buf) { numfullpathfail4++; error = ENOMEM; break; } *--bp = '/'; slash_prefixed = 1; vp = ncp->nc_dvp; } if (error) { CACHE_UNLOCK(); return (error); } if (!slash_prefixed) { if (bp == buf) { numfullpathfail4++; CACHE_UNLOCK(); return (ENOMEM); } else { *--bp = '/'; } } numfullpathfound++; CACHE_UNLOCK(); *retbuf = bp; return (0); } Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c (revision 167231) +++ head/sys/kern/vfs_mount.c (revision 167232) @@ -1,2192 +1,2189 @@ /*- * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_rootdevname.h" #include "opt_ddb.h" #include "opt_mac.h" #ifdef DDB #include #endif #define ROOTNAME "root_device" #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) static int vfs_domount(struct thread *td, const char *fstype, char *fspath, int fsflags, void *fsdata); static struct mount *vfs_mount_alloc(struct vnode *dvp, struct vfsconf *vfsp, const char *fspath, struct thread *td); static int vfs_mountroot_ask(void); static int vfs_mountroot_try(const char *mountfrom); static int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions); static void free_mntarg(struct mntarg *ma); static void vfs_mount_destroy(struct mount *); static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name); static int usermount = 0; SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "Unprivileged users may mount and unmount file systems"); MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static uma_zone_t mount_zone; /* List of mounted filesystems. */ struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* For any iteration/modification of mountlist */ struct mtx mountlist_mtx; MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); TAILQ_HEAD(vfsoptlist, vfsopt); struct vfsopt { TAILQ_ENTRY(vfsopt) link; char *name; void *value; int len; }; /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store */ /* * Global opts, taken by all filesystems */ static const char *global_opts[] = { "errmsg", "fstype", "fspath", "rdonly", "ro", "rw", "suid", "exec", "update", NULL }; /* * The root specifiers we will try if RB_CDROM is specified. */ static char *cdrom_rootdevnames[] = { "cd9660:cd0", "cd9660:acd0", NULL }; /* legacy find-root code */ char *rootdevnames[2] = {NULL, NULL}; #ifndef ROOTDEVNAME # define ROOTDEVNAME NULL #endif static const char *ctrootdevname = ROOTDEVNAME; /* * --------------------------------------------------------------------- * Functions for building and sanitizing the mount options */ /* Remove one mount option. */ static void vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) { TAILQ_REMOVE(opts, opt, link); free(opt->name, M_MOUNT); if (opt->value != NULL) free(opt->value, M_MOUNT); #ifdef INVARIANTS else if (opt->len != 0) panic("%s: mount option with NULL value but length != 0", __func__); #endif free(opt, M_MOUNT); } /* Release all resources related to the mount options. */ static void vfs_freeopts(struct vfsoptlist *opts) { struct vfsopt *opt; while (!TAILQ_EMPTY(opts)) { opt = TAILQ_FIRST(opts); vfs_freeopt(opts, opt); } free(opts, M_MOUNT); } void vfs_deleteopt(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt, *temp; TAILQ_FOREACH_SAFE(opt, opts, link, temp) { if (strcmp(opt->name, name) == 0) vfs_freeopt(opts, opt); } } /* * Check if options are equal (with or without the "no" prefix). */ static int vfs_equalopts(const char *opt1, const char *opt2) { /* "opt" vs. "opt" or "noopt" vs. "noopt" */ if (strcmp(opt1, opt2) == 0) return (1); /* "noopt" vs. "opt" */ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) return (1); /* "opt" vs. "noopt" */ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) return (1); return (0); } /* * If a mount option is specified several times, * (with or without the "no" prefix) only keep * the last occurence of it. */ static void vfs_sanitizeopts(struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *tmp; TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { opt2 = TAILQ_PREV(opt, vfsoptlist, link); while (opt2 != NULL) { if (vfs_equalopts(opt->name, opt2->name)) { tmp = TAILQ_PREV(opt2, vfsoptlist, link); vfs_freeopt(opts, opt2); opt2 = tmp; } else { opt2 = TAILQ_PREV(opt2, vfsoptlist, link); } } } } /* * Build a linked list of mount options from a struct uio. */ static int vfs_buildopts(struct uio *auio, struct vfsoptlist **options) { struct vfsoptlist *opts; struct vfsopt *opt; size_t memused; unsigned int i, iovcnt; int error, namelen, optlen; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); memused = 0; iovcnt = auio->uio_iovcnt; for (i = 0; i < iovcnt; i += 2) { opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); namelen = auio->uio_iov[i].iov_len; optlen = auio->uio_iov[i + 1].iov_len; opt->name = malloc(namelen, M_MOUNT, M_WAITOK); opt->value = NULL; opt->len = 0; /* * Do this early, so jumps to "bad" will free the current * option. */ TAILQ_INSERT_TAIL(opts, opt, link); memused += sizeof(struct vfsopt) + optlen + namelen; /* * Avoid consuming too much memory, and attempts to overflow * memused. */ if (memused > VFS_MOUNTARG_SIZE_MAX || optlen > VFS_MOUNTARG_SIZE_MAX || namelen > VFS_MOUNTARG_SIZE_MAX) { error = EINVAL; goto bad; } if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); } else { error = copyin(auio->uio_iov[i].iov_base, opt->name, namelen); if (error) goto bad; } /* Ensure names are null-terminated strings. */ if (opt->name[namelen - 1] != '\0') { error = EINVAL; goto bad; } if (optlen != 0) { opt->len = optlen; opt->value = malloc(optlen, M_MOUNT, M_WAITOK); if (auio->uio_segflg == UIO_SYSSPACE) { bcopy(auio->uio_iov[i + 1].iov_base, opt->value, optlen); } else { error = copyin(auio->uio_iov[i + 1].iov_base, opt->value, optlen); if (error) goto bad; } } } vfs_sanitizeopts(opts); *options = opts; return (0); bad: vfs_freeopts(opts); return (error); } /* * Merge the old mount options with the new ones passed * in the MNT_UPDATE case. */ static void vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts) { struct vfsopt *opt, *opt2, *new; TAILQ_FOREACH(opt, opts, link) { /* * Check that this option hasn't been redefined * nor cancelled with a "no" mount option. */ opt2 = TAILQ_FIRST(toopts); while (opt2 != NULL) { if (strcmp(opt2->name, opt->name) == 0) goto next; if (strncmp(opt2->name, "no", 2) == 0 && strcmp(opt2->name + 2, opt->name) == 0) { vfs_freeopt(toopts, opt2); goto next; } opt2 = TAILQ_NEXT(opt2, link); } /* We want this option, duplicate it. */ new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK); strcpy(new->name, opt->name); if (opt->len != 0) { new->value = malloc(opt->len, M_MOUNT, M_WAITOK); bcopy(opt->value, new->value, opt->len); } else { new->value = NULL; } new->len = opt->len; TAILQ_INSERT_TAIL(toopts, new, link); next: continue; } } /* - * --------------------------------------------------------------------- - * Mount a filesystem + * Mount a filesystem. */ int nmount(td, uap) struct thread *td; struct nmount_args /* { struct iovec *iovp; unsigned int iovcnt; int flags; } */ *uap; { struct uio *auio; struct iovec *iov; unsigned int i; int error; u_int iovcnt; AUDIT_ARG(fflags, uap->flags); /* Kick out MNT_ROOTFS early as it is legal internally */ if (uap->flags & MNT_ROOTFS) return (EINVAL); iovcnt = uap->iovcnt; /* * Check that we have an even number of iovec's * and that we have at least two options. */ if ((iovcnt & 1) || (iovcnt < 4)) return (EINVAL); error = copyinuio(uap->iovp, iovcnt, &auio); if (error) return (error); iov = auio->uio_iov; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > MMAXOPTIONLEN) { free(auio, M_IOV); return (EINVAL); } iov++; } error = vfs_donmount(td, uap->flags, auio); free(auio, M_IOV); return (error); } /* * --------------------------------------------------------------------- * Various utility functions */ void vfs_ref(struct mount *mp) { MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); } void vfs_rel(struct mount *mp) { MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); } static int mount_init(void *mem, int size, int flags) { struct mount *mp; mp = (struct mount *)mem; mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); return (0); } static void mount_fini(void *mem, int size) { struct mount *mp; mp = (struct mount *)mem; lockdestroy(&mp->mnt_lock); mtx_destroy(&mp->mnt_mtx); } /* * Allocate and initialize the mount point struct. */ static struct mount * vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, struct thread *td) { struct mount *mp; mp = uma_zalloc(mount_zone, M_WAITOK); bzero(&mp->mnt_startzero, __rangeof(struct mount, mnt_startzero, mnt_endzero)); TAILQ_INIT(&mp->mnt_nvnodelist); mp->mnt_nvnodelistsize = 0; mp->mnt_ref = 0; (void) vfs_busy(mp, LK_NOWAIT, 0, td); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; /* XXX Unlocked */ mp->mnt_stat.f_type = vfsp->vfc_typenum; MNT_ILOCK(mp); mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; MNT_IUNLOCK(mp); mp->mnt_gen++; strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_vnodecovered = vp; mp->mnt_cred = crdup(td->td_ucred); mp->mnt_stat.f_owner = td->td_ucred->cr_uid; strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_iosize_max = DFLTPHYS; #ifdef MAC mac_init_mount(mp); mac_create_mount(td->td_ucred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); return (mp); } /* * Destroy the mount struct previously allocated by vfs_mount_alloc(). */ static void vfs_mount_destroy(struct mount *mp) { int i; MNT_ILOCK(mp); for (i = 0; mp->mnt_ref && i < 3; i++) msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz); /* * This will always cause a 3 second delay in rebooting due to * refs on the root mountpoint that never go away. Most of these * are held by init which never exits. */ if (i == 3 && (!rebooting || bootverbose)) printf("Mount point %s had %d dangling refs\n", mp->mnt_stat.f_mntonname, mp->mnt_ref); if (mp->mnt_holdcnt != 0) { printf("Waiting for mount point to be unheld\n"); while (mp->mnt_holdcnt != 0) { mp->mnt_holdcntwaiters++; msleep(&mp->mnt_holdcnt, MNT_MTX(mp), PZERO, "mntdestroy", 0); mp->mnt_holdcntwaiters--; } printf("mount point unheld\n"); } if (mp->mnt_writeopcount > 0) { printf("Waiting for mount point write ops\n"); while (mp->mnt_writeopcount > 0) { mp->mnt_kern_flag |= MNTK_SUSPEND; msleep(&mp->mnt_writeopcount, MNT_MTX(mp), PZERO, "mntdestroy2", 0); } printf("mount point write ops completed\n"); } if (mp->mnt_secondary_writes > 0) { printf("Waiting for mount point secondary write ops\n"); while (mp->mnt_secondary_writes > 0) { mp->mnt_kern_flag |= MNTK_SUSPEND; msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), PZERO, "mntdestroy3", 0); } printf("mount point secondary write ops completed\n"); } MNT_IUNLOCK(mp); mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { struct vnode *vp; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) vprint("", vp); panic("unmount: dangling vnode"); } MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); if (mp->mnt_writeopcount != 0) panic("vfs_mount_destroy: nonzero writeopcount"); if (mp->mnt_secondary_writes != 0) panic("vfs_mount_destroy: nonzero secondary_writes"); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); mp->mnt_writeopcount = -1000; mp->mnt_nvnodelistsize = -1000; mp->mnt_secondary_writes = -1000; MNT_IUNLOCK(mp); #ifdef MAC mac_destroy_mount(mp); #endif if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); crfree(mp->mnt_cred); uma_zfree(mount_zone, mp); } static int vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions) { struct vfsoptlist *optlist; struct vfsopt *opt, *noro_opt; char *fstype, *fspath, *errmsg; int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; int has_rw, has_noro; errmsg = NULL; errmsg_len = 0; errmsg_pos = -1; has_rw = 0; has_noro = 0; error = vfs_buildopts(fsoptions, &optlist); if (error) return (error); if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); /* * We need these two options before the others, * and they are mandatory for any filesystem. * Ensure they are NUL terminated as well. */ fstypelen = 0; error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); if (error || fstype[fstypelen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fstype", errmsg_len); goto bail; } fspathlen = 0; error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); if (error || fspath[fspathlen - 1] != '\0') { error = EINVAL; if (errmsg != NULL) strncpy(errmsg, "Invalid fspath", errmsg_len); goto bail; } /* * We need to see if we have the "update" option * before we call vfs_domount(), since vfs_domount() has special * logic based on MNT_UPDATE. This is very important * when we want to update the root filesystem. */ TAILQ_FOREACH(opt, optlist, link) { if (strcmp(opt->name, "update") == 0) fsflags |= MNT_UPDATE; else if (strcmp(opt->name, "async") == 0) fsflags |= MNT_ASYNC; else if (strcmp(opt->name, "force") == 0) fsflags |= MNT_FORCE; else if (strcmp(opt->name, "multilabel") == 0) fsflags |= MNT_MULTILABEL; else if (strcmp(opt->name, "noasync") == 0) fsflags &= ~MNT_ASYNC; else if (strcmp(opt->name, "noatime") == 0) fsflags |= MNT_NOATIME; else if (strcmp(opt->name, "noclusterr") == 0) fsflags |= MNT_NOCLUSTERR; else if (strcmp(opt->name, "noclusterw") == 0) fsflags |= MNT_NOCLUSTERW; else if (strcmp(opt->name, "noexec") == 0) fsflags |= MNT_NOEXEC; else if (strcmp(opt->name, "nosuid") == 0) fsflags |= MNT_NOSUID; else if (strcmp(opt->name, "nosymfollow") == 0) fsflags |= MNT_NOSYMFOLLOW; else if (strcmp(opt->name, "noro") == 0) { fsflags &= ~MNT_RDONLY; has_noro = 1; } else if (strcmp(opt->name, "rw") == 0) { fsflags &= ~MNT_RDONLY; has_rw = 1; } else if (strcmp(opt->name, "ro") == 0 || strcmp(opt->name, "rdonly") == 0) fsflags |= MNT_RDONLY; else if (strcmp(opt->name, "snapshot") == 0) fsflags |= MNT_SNAPSHOT; else if (strcmp(opt->name, "suiddir") == 0) fsflags |= MNT_SUIDDIR; else if (strcmp(opt->name, "sync") == 0) fsflags |= MNT_SYNCHRONOUS; else if (strcmp(opt->name, "union") == 0) fsflags |= MNT_UNION; } /* * If "rw" was specified as a mount option, and we * are trying to update a mount-point from "ro" to "rw", * we need a mount option "noro", since in vfs_mergeopts(), * "noro" will cancel "ro", but "rw" will not do anything. */ if (has_rw && !has_noro) { noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); noro_opt->name = strdup("noro", M_MOUNT); noro_opt->value = NULL; noro_opt->len = 0; TAILQ_INSERT_TAIL(optlist, noro_opt, link); } /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) { error = ENAMETOOLONG; goto bail; } mtx_lock(&Giant); error = vfs_domount(td, fstype, fspath, fsflags, optlist); mtx_unlock(&Giant); bail: /* copyout the errmsg */ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) && errmsg_len > 0 && errmsg != NULL) { if (fsoptions->uio_segflg == UIO_SYSSPACE) { bcopy(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } else { copyout(errmsg, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); } } if (error != 0) vfs_freeopts(optlist); return (error); } /* - * --------------------------------------------------------------------- * Old mount API. */ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int mount(td, uap) struct thread *td; struct mount_args /* { char *type; char *path; int flags; caddr_t data; } */ *uap; { char *fstype; struct vfsconf *vfsp = NULL; struct mntarg *ma = NULL; int error; AUDIT_ARG(fflags, uap->flags); /* Kick out MNT_ROOTFS early as it is legal internally */ uap->flags &= ~MNT_ROOTFS; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); if (error) { free(fstype, M_TEMP); return (error); } AUDIT_ARG(text, fstype); mtx_lock(&Giant); vfsp = vfs_byname_kld(fstype, td, &error); free(fstype, M_TEMP); if (vfsp == NULL) { mtx_unlock(&Giant); return (ENOENT); } if (vfsp->vfc_vfsops->vfs_cmount == NULL) { mtx_unlock(&Giant); return (EOPNOTSUPP); } ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN); ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro"); ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid"); ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec"); error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td); mtx_unlock(&Giant); return (error); } /* * vfs_domount(): actually attempt a filesystem mount. */ static int vfs_domount( struct thread *td, /* Calling thread. */ const char *fstype, /* Filesystem type. */ char *fspath, /* Mount path. */ int fsflags, /* Flags common to all filesystems. */ void *fsdata /* Options local to the filesystem. */ ) { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; struct export_args export; int error, flag = 0; struct vattr va; struct nameidata nd; mtx_assert(&Giant, MA_OWNED); /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) return (ENAMETOOLONG); if (jailed(td->td_ucred)) return (EPERM); if (usermount == 0) { if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) return (error); } /* * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. */ if (fsflags & MNT_EXPORTED) { error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); if (error) return (error); } if (fsflags & MNT_SUIDDIR) { error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. */ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) fsflags |= MNT_NOSUID | MNT_USER; } /* Load KLDs before we lock the covered vnode to avoid reversals. */ vfsp = NULL; if ((fsflags & MNT_UPDATE) == 0) { /* Don't try to load KLDs if we're mounting the root. */ if (fsflags & MNT_ROOTFS) vfsp = vfs_byname(fstype); else vfsp = vfs_byname_kld(fstype, td, &error); if (vfsp == NULL) return (ENODEV); } /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_vflag & VV_ROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; MNT_ILOCK(mp); flag = mp->mnt_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { MNT_IUNLOCK(mp); vput(vp); return (EOPNOTSUPP); /* Needs translation */ } MNT_IUNLOCK(mp); /* * Only privileged root, or (if MNT_USER is set) the user that * did the original mount is permitted to update it. */ error = vfs_suser(mp, td); if (error) { vput(vp); return (error); } if (vfs_busy(mp, LK_NOWAIT, 0, td)) { vput(vp); return (EBUSY); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vfs_unbusy(mp, td); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); MNT_ILOCK(mp); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS); MNT_IUNLOCK(mp); VOP_UNLOCK(vp, 0, td); mp->mnt_optnew = fsdata; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); } else { /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_ucred, td); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_ucred->cr_uid) { error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, SUSER_ALLOWJAIL); if (error) { vput(vp); return (error); } } error = vinvalbuf(vp, V_SAVE, td, 0, 0); if (error != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } VI_LOCK(vp); if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { VI_UNLOCK(vp); vput(vp); return (EBUSY); } vp->v_iflag |= VI_MOUNT; VI_UNLOCK(vp); /* * Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td); VOP_UNLOCK(vp, 0, td); /* XXXMAC: pass to vfs_mount_alloc? */ mp->mnt_optnew = fsdata; } /* * Set the mount level flags. */ MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) | (fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS | MNT_RDONLY)); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, td); /* * Process the export option only if we are * updating mount options. */ if (!error && (fsflags & MNT_UPDATE)) { if (vfs_copyopt(mp->mnt_optnew, "export", &export, sizeof(export)) == 0) error = vfs_export(mp, &export); } if (!error) { if (mp->mnt_opt != NULL) vfs_freeopts(mp->mnt_opt); mp->mnt_opt = mp->mnt_optnew; (void)VFS_STATFS(mp, &mp->mnt_stat, td); } /* * Prevent external consumers of mount options from reading * mnt_optnew. */ mp->mnt_optnew = NULL; if (mp->mnt_flag & MNT_UPDATE) { MNT_ILOCK(mp); if (error) mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); else mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp, td); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vrele(vp); return (error); } MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; else mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Put the new filesystem on the mount list after root. */ cache_purge(vp); if (!error) { struct vnode *newdp; VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_event_signal(NULL, VQ_MOUNT, 0); if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td)) panic("mount: lost mount"); mountcheckdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if (error) vrele(vp); } else { VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); vfs_unbusy(mp, td); vfs_mount_destroy(mp); vput(vp); } return (error); } /* - * --------------------------------------------------------------------- * Unmount a filesystem. * - * Note: unmount takes a path to the vnode mounted on as argument, - * not special file (as before). + * Note: unmount takes a path to the vnode mounted on as argument, not + * special file (as before). */ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { char *path; int flags; } */ *uap; { struct mount *mp; char *pathbuf; int error, id0, id1; if (jailed(td->td_ucred)) return (EPERM); if (usermount == 0) { error = priv_check(td, PRIV_VFS_UNMOUNT); if (error) return (error); } pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL); if (error) { free(pathbuf, M_TEMP); return (error); } AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1); mtx_lock(&Giant); if (uap->flags & MNT_BYFSID) { /* Decode the filesystem ID. */ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { mtx_unlock(&Giant); free(pathbuf, M_TEMP); return (EINVAL); } mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == id0 && mp->mnt_stat.f_fsid.val[1] == id1) break; } mtx_unlock(&mountlist_mtx); } else { mtx_lock(&mountlist_mtx); TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) break; } mtx_unlock(&mountlist_mtx); } free(pathbuf, M_TEMP); if (mp == NULL) { /* * Previously we returned ENOENT for a nonexistent path and * EINVAL for a non-mountpoint. We cannot tell these apart * now, so in the !MNT_BYFSID case return the more likely * EINVAL for compatibility. */ mtx_unlock(&Giant); return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL); } /* * Don't allow unmounting the root filesystem. */ if (mp->mnt_flag & MNT_ROOTFS) { mtx_unlock(&Giant); return (EINVAL); } error = dounmount(mp, uap->flags, td); mtx_unlock(&Giant); return (error); } /* * Do the actual filesystem unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; int mnt_gen_r; mtx_assert(&Giant, MA_OWNED); if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); vholdl(coveredvp); error = vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK, td); vdrop(coveredvp); /* * Check for mp being unmounted while waiting for the * covered vnode lock. */ if (error) return (error); if (coveredvp->v_mountedhere != mp || coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { VOP_UNLOCK(coveredvp, 0, td); return (EBUSY); } } /* * Only privileged root, or (if MNT_USER is set) the user that did the * original mount is permitted to unmount this filesystem. */ error = vfs_suser(mp, td); if (error) { if (coveredvp) VOP_UNLOCK(coveredvp, 0, td); return (error); } MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0, td); return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; /* Allow filesystems to detect that a forced unmount is in progress. */ if (flags & MNT_FORCE) mp->mnt_kern_flag |= MNTK_UNMOUNTF; error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td); if (error) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0, td); return (error); } vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* * For forced unmounts, move process cdir/rdir refs on the fs root * vnode to the covered vnode. For non-forced unmounts we want * such references to cause an EBUSY error. */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); if (error) { /* Undo cdir/rdir and rootvnode changes made above. */ if ((flags & MNT_FORCE) && VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) { if (mp->mnt_vnodecovered != NULL) mountcheckdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) (void) vfs_allocate_syncvnode(mp); MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); mp->mnt_flag |= async_flag; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup(mp); MNT_IUNLOCK(mp); if (coveredvp) VOP_UNLOCK(coveredvp, 0, td); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; vput(coveredvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); vfs_mount_destroy(mp); return (0); } /* * --------------------------------------------------------------------- * Mounting of root filesystem * */ struct root_hold_token { const char *who; LIST_ENTRY(root_hold_token) list; }; static LIST_HEAD(, root_hold_token) root_holds = LIST_HEAD_INITIALIZER(&root_holds); struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->who = identifier; mtx_lock(&mountlist_mtx); LIST_INSERT_HEAD(&root_holds, h, list); mtx_unlock(&mountlist_mtx); return (h); } void root_mount_rel(struct root_hold_token *h) { mtx_lock(&mountlist_mtx); LIST_REMOVE(h, list); wakeup(&root_holds); mtx_unlock(&mountlist_mtx); free(h, M_DEVBUF); } static void root_mount_wait(void) { struct root_hold_token *h; for (;;) { DROP_GIANT(); g_waitidle(); PICKUP_GIANT(); mtx_lock(&mountlist_mtx); if (LIST_EMPTY(&root_holds)) { mtx_unlock(&mountlist_mtx); break; } printf("Root mount waiting for:"); LIST_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold", hz); } } static void set_rootvnode(struct thread *td) { struct proc *p; if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td)) panic("Cannot find root vnode"); p = td->td_proc; FILEDESC_LOCK(p->p_fd); if (p->p_fd->fd_cdir != NULL) vrele(p->p_fd->fd_cdir); p->p_fd->fd_cdir = rootvnode; VREF(rootvnode); if (p->p_fd->fd_rdir != NULL) vrele(p->p_fd->fd_rdir); p->p_fd->fd_rdir = rootvnode; VREF(rootvnode); FILEDESC_UNLOCK(p->p_fd); VOP_UNLOCK(rootvnode, 0, td); } /* * Mount /devfs as our root filesystem, but do not put it on the mountlist * yet. Create a /dev -> / symlink so that absolute pathnames will lookup. */ static void devfs_first(void) { struct thread *td = curthread; struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp = NULL; int error; vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return; mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td); error = VFS_MOUNT(mp, td); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return; opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); set_rootvnode(td); error = kern_symlink(td, "/", "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); } /* * Surgically move our devfs to be mounted on /dev. */ static void devfs_fixup(struct thread *td) { struct nameidata nd; int error; struct vnode *vp, *dvp; struct mount *mp; /* Remove our devfs mount from the mountlist and purge the cache */ mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mp); VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td); VI_LOCK(dvp); dvp->v_iflag &= ~VI_MOUNT; dvp->v_mountedhere = NULL; VI_UNLOCK(dvp); /* Set up the real rootvnode, and purge the cache */ TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL; set_rootvnode(td); cache_purgevfs(rootvnode->v_mount); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td); error = namei(&nd); if (error) { printf("Lookup of /dev for devfs, error: %d\n", error); return; } NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VDIR) { vput(vp); } error = vinvalbuf(vp, V_SAVE, td, 0, 0); if (error) { vput(vp); } cache_purge(vp); mp->mnt_vnodecovered = vp; vp->v_mountedhere = mp; mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); VOP_UNLOCK(vp, 0, td); vput(dvp); vfs_unbusy(mp, td); /* Unlink the no longer needed /dev/dev -> / symlink */ kern_unlink(td, "/dev/dev", UIO_SYSSPACE); } /* * Report errors during filesystem mounting. */ void vfs_mount_error(struct mount *mp, const char *fmt, ...) { struct vfsoptlist *moptlist = mp->mnt_optnew; va_list ap; int error, len; char *errmsg; error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); if (error || errmsg == NULL || len <= 0) return; va_start(ap, fmt); vsnprintf(errmsg, (size_t)len, fmt, ap); va_end(ap); } /* * Find and mount the root filesystem */ void vfs_mountroot(void) { char *cp; int error, i, asked = 0; root_mount_wait(); mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL, NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); devfs_first(); /* * We are booted with instructions to prompt for the root filesystem. */ if (boothowto & RB_ASKNAME) { if (!vfs_mountroot_ask()) return; asked = 1; } /* * The root filesystem information is compiled in, and we are * booted with instructions to use it. */ if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) { if (!vfs_mountroot_try(ctrootdevname)) return; ctrootdevname = NULL; } /* * We've been given the generic "use CDROM as root" flag. This is * necessary because one media may be used in many different * devices, so we need to search for them. */ if (boothowto & RB_CDROM) { for (i = 0; cdrom_rootdevnames[i] != NULL; i++) { if (!vfs_mountroot_try(cdrom_rootdevnames[i])) return; } } /* * Try to use the value read by the loader from /etc/fstab, or * supplied via some other means. This is the preferred * mechanism. */ cp = getenv("vfs.root.mountfrom"); if (cp != NULL) { error = vfs_mountroot_try(cp); freeenv(cp); if (!error) return; } /* * Try values that may have been computed by code during boot */ if (!vfs_mountroot_try(rootdevnames[0])) return; if (!vfs_mountroot_try(rootdevnames[1])) return; /* * If we (still) have a compiled-in default, try it. */ if (ctrootdevname != NULL) if (!vfs_mountroot_try(ctrootdevname)) return; /* * Everything so far has failed, prompt on the console if we haven't * already tried that. */ if (!asked) if (!vfs_mountroot_ask()) return; panic("Root mount failed, startup aborted."); } /* * Mount (mountfrom) as the root filesystem. */ static int vfs_mountroot_try(const char *mountfrom) { struct mount *mp; char *vfsname, *path; time_t timebase; int error; char patt[32]; vfsname = NULL; path = NULL; mp = NULL; error = EINVAL; if (mountfrom == NULL) return (error); /* don't complain */ printf("Trying to mount root from %s\n", mountfrom); /* parse vfs name and path */ vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK); path = malloc(MNAMELEN, M_MOUNT, M_WAITOK); vfsname[0] = path[0] = 0; sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN); if (sscanf(mountfrom, patt, vfsname, path) < 1) goto out; if (path[0] == '\0') strcpy(path, ROOTNAME); error = kernel_vmount( MNT_RDONLY | MNT_ROOTFS, "fstype", vfsname, "fspath", "/", "from", path, NULL); if (error == 0) { /* * We mount devfs prior to mounting the / FS, so the first * entry will typically be devfs. */ mp = TAILQ_FIRST(&mountlist); KASSERT(mp != NULL, ("%s: mountlist is empty", __func__)); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Typically devfs has no time stamp and the only other FS * is the actual / FS. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; do { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } while (mp != NULL); inittodr(timebase); devfs_fixup(curthread); } out: free(path, M_MOUNT); free(vfsname, M_MOUNT); return (error); } /* * --------------------------------------------------------------------- * Interactive root filesystem selection code. */ static int vfs_mountroot_ask(void) { char name[128]; for(;;) { printf("\nManual root filesystem specification:\n"); printf(" : Mount using filesystem \n"); #if defined(__amd64__) || defined(__i386__) || defined(__ia64__) printf(" eg. ufs:da0s1a\n"); #else printf(" eg. ufs:/dev/da0a\n"); #endif printf(" ? List valid disk boot devices\n"); printf(" Abort manual input\n"); printf("\nmountroot> "); gets(name, sizeof(name), 1); if (name[0] == '\0') return (1); if (name[0] == '?') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (!vfs_mountroot_try(name)) return (0); } } /* * --------------------------------------------------------------------- * Functions for querying mount options/arguments from filesystems. */ /* * Check that no unknown options are given */ int vfs_filteropt(struct vfsoptlist *opts, const char **legal) { struct vfsopt *opt; const char **t, *p; TAILQ_FOREACH(opt, opts, link) { p = opt->name; if (p[0] == 'n' && p[1] == 'o') p += 2; for(t = global_opts; *t != NULL; t++) if (!strcmp(*t, p)) break; if (*t != NULL) continue; for(t = legal; *t != NULL; t++) if (!strcmp(*t, p)) break; if (*t != NULL) continue; printf("mount option <%s> is unknown\n", p); return (EINVAL); } return (0); } /* * Get a mount option by its name. * * Return 0 if the option was found, ENOENT otherwise. * If len is non-NULL it will be filled with the length * of the option. If buf is non-NULL, it will be filled * with the address of the option. */ int vfs_getopt(opts, name, buf, len) struct vfsoptlist *opts; const char *name; void **buf; int *len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != NULL) *len = opt->len; if (buf != NULL) *buf = opt->value; return (0); } } return (ENOENT); } static int vfs_getopt_pos(struct vfsoptlist *opts, const char *name) { struct vfsopt *opt; int i; if (opts == NULL) return (-1); i = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) return (i); ++i; } return (-1); } char * vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) { struct vfsopt *opt; *error = 0; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (((char *)opt->value)[opt->len - 1] != '\0') { *error = EINVAL; return (NULL); } return (opt->value); } *error = ENOENT; return (NULL); } int vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val) { struct vfsopt *opt; TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (w != NULL) *w |= val; return (1); } } if (w != NULL) *w &= ~val; return (0); } int vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) { va_list ap; struct vfsopt *opt; int ret; KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) != 0) continue; if (((char *)opt->value)[opt->len - 1] != '\0') return (0); va_start(ap, fmt); ret = vsscanf(opt->value, fmt, ap); va_end(ap); return (ret); } return (0); } /* * Find and copy a mount option. * * The size of the buffer has to be specified * in len, if it is not the same length as the * mount option, EINVAL is returned. * Returns ENOENT if the option is not found. */ int vfs_copyopt(opts, name, dest, len) struct vfsoptlist *opts; const char *name; void *dest; int len; { struct vfsopt *opt; KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); TAILQ_FOREACH(opt, opts, link) { if (strcmp(name, opt->name) == 0) { if (len != opt->len) return (EINVAL); bcopy(opt->value, dest, opt->len); return (0); } } return (ENOENT); } /* * This is a helper function for filesystems to traverse their * vnodes. See MNT_VNODE_FOREACH() in sys/mount.h */ struct vnode * __mnt_vnode_next(struct vnode **mvp, struct mount *mp) { struct vnode *vp; mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); vp = TAILQ_NEXT(*mvp, v_nmntvnodes); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { __mnt_vnode_markerfree(mvp, mp); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); return (vp); } struct vnode * __mnt_vnode_first(struct vnode **mvp, struct mount *mp) { struct vnode *vp; mtx_assert(MNT_MTX(mp), MA_OWNED); vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { *mvp = NULL; return (NULL); } mp->mnt_holdcnt++; MNT_IUNLOCK(mp); *mvp = (struct vnode *) malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); MNT_ILOCK(mp); (*mvp)->v_type = VMARKER; vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); /* Check if we are done */ if (vp == NULL) { MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); MNT_ILOCK(mp); *mvp = NULL; mp->mnt_holdcnt--; if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0) wakeup(&mp->mnt_holdcnt); return (NULL); } mp->mnt_markercnt++; (*mvp)->v_mount = mp; TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); return (vp); } void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); free(*mvp, M_VNODE_MARKER); MNT_ILOCK(mp); *mvp = NULL; mp->mnt_markercnt--; mp->mnt_holdcnt--; if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0) wakeup(&mp->mnt_holdcnt); } int __vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { int error; error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td); if (sbp != &mp->mnt_stat) *sbp = mp->mnt_stat; return (error); } void vfs_mountedfrom(struct mount *mp, const char *from) { bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); strlcpy(mp->mnt_stat.f_mntfromname, from, sizeof mp->mnt_stat.f_mntfromname); } /* * --------------------------------------------------------------------- * This is the api for building mount args and mounting filesystems from * inside the kernel. * * The API works by accumulation of individual args. First error is * latched. * * XXX: should be documented in new manpage kernel_mount(9) */ /* A memory allocation which must be freed when we are done */ struct mntaarg { SLIST_ENTRY(mntaarg) next; }; /* The header for the mount arguments */ struct mntarg { struct iovec *v; int len; int error; SLIST_HEAD(, mntaarg) list; }; /* * Add a boolean argument. * * flag is the boolean value. * name must start with "no". */ struct mntarg * mount_argb(struct mntarg *ma, int flag, const char *name) { KASSERT(name[0] == 'n' && name[1] == 'o', ("mount_argb(...,%s): name must start with 'no'", name)); return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); } /* * Add an argument printf style */ struct mntarg * mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) { va_list ap; struct mntaarg *maa; struct sbuf *sb; int len; if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); va_start(ap, fmt); sbuf_vprintf(sb, fmt, ap); va_end(ap); sbuf_finish(sb); len = sbuf_len(sb) + 1; maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); bcopy(sbuf_data(sb), maa + 1, len); sbuf_delete(sb); ma->v[ma->len].iov_base = maa + 1; ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Add an argument which is a userland string. */ struct mntarg * mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) { struct mntaarg *maa; char *tbuf; if (val == NULL) return (ma); if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INSERT_HEAD(&ma->list, maa, next); tbuf = (void *)(maa + 1); ma->error = copyinstr(val, tbuf, len, NULL); return (mount_arg(ma, name, tbuf, -1)); } /* * Plain argument. * * If length is -1, use printf. */ struct mntarg * mount_arg(struct mntarg *ma, const char *name, const void *val, int len) { if (ma == NULL) { ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); SLIST_INIT(&ma->list); } if (ma->error) return (ma); ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), M_MOUNT, M_WAITOK); ma->v[ma->len].iov_base = (void *)(uintptr_t)name; ma->v[ma->len].iov_len = strlen(name) + 1; ma->len++; ma->v[ma->len].iov_base = (void *)(uintptr_t)val; if (len < 0) ma->v[ma->len].iov_len = strlen(val) + 1; else ma->v[ma->len].iov_len = len; ma->len++; return (ma); } /* * Free a mntarg structure */ static void free_mntarg(struct mntarg *ma) { struct mntaarg *maa; while (!SLIST_EMPTY(&ma->list)) { maa = SLIST_FIRST(&ma->list); SLIST_REMOVE_HEAD(&ma->list, next); free(maa, M_MOUNT); } free(ma->v, M_MOUNT); free(ma, M_MOUNT); } /* * Mount a filesystem */ int kernel_mount(struct mntarg *ma, int flags) { struct uio auio; int error; KASSERT(ma != NULL, ("kernel_mount NULL ma")); KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v")); KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); auio.uio_iov = ma->v; auio.uio_iovcnt = ma->len; auio.uio_segflg = UIO_SYSSPACE; error = ma->error; if (!error) error = vfs_donmount(curthread, flags, &auio); free_mntarg(ma); return (error); } /* * A printflike function to mount a filesystem. */ int kernel_vmount(int flags, ...) { struct mntarg *ma = NULL; va_list ap; const char *cp; const void *vp; int error; va_start(ap, flags); for (;;) { cp = va_arg(ap, const char *); if (cp == NULL) break; vp = va_arg(ap, const void *); ma = mount_arg(ma, cp, vp, -1); } va_end(ap); error = kernel_mount(ma, flags); return (error); } Index: head/sys/kern/vfs_syscalls.c =================================================================== --- head/sys/kern/vfs_syscalls.c (revision 167231) +++ head/sys/kern/vfs_syscalls.c (revision 167232) @@ -1,4319 +1,4305 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int chroot_refuse_vdir_fds(struct filedesc *fdp); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); static int setfmode(struct thread *td, struct vnode *, int); static int setfflags(struct thread *td, struct vnode *, int); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); /* * The module initialization routine for POSIX asynchronous I/O will * set this to the version of AIO that it implements. (Zero means * that it is not implemented.) This value is used here by pathconf() * and in kern_descrip.c by fpathconf(). */ int async_io_version; +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif - -#ifdef DEBUG -static int syncprt = 0; -SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); -#endif - /* ARGSUSED */ int sync(td, uap) struct thread *td; struct sync_args *uap; { struct mount *mp, *nmp; int vfslocked; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); } VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); return (0); } /* XXX PRISON: could be per prison flag */ static int prison_quotas; #if 0 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); #endif /* * Change filesystem quotas. - * - * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int quotactl(td, uap) struct thread *td; register struct quotactl_args /* { char *path; int cmd; int uid; caddr_t arg; } */ *uap; { struct mount *mp, *vmp; int vfslocked; int error; struct nameidata nd; AUDIT_ARG(cmd, uap->cmd); AUDIT_ARG(uid, uap->uid); if (jailed(td->td_ucred) && !prison_quotas) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(nd.ni_vp, &vmp, V_WAIT | PCATCH); mp = nd.ni_vp->v_mount; vrele(nd.ni_vp); if (error) goto out; error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td); vn_finished_write(vmp); out: VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int statfs(td, uap) struct thread *td; register struct statfs_args /* { char *path; struct statfs *buf; } */ *uap; { struct statfs sf; int error; error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf); if (error == 0) error = copyout(&sf, uap->buf, sizeof(sf)); return (error); } int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct statfs *sp, sb; int vfslocked; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); mp = nd.ni_vp->v_mount; vfs_ref(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) goto out; if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } *buf = *sp; out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); if (mtx_owned(&Giant)) printf("statfs(%d): %s: %d\n", vfslocked, path, error); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { int fd; struct statfs *buf; } */ *uap; { struct statfs sf; int error; error = kern_fstatfs(td, uap->fd, &sf); if (error == 0) error = copyout(&sf, uap->buf, sizeof(sf)); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct statfs *sp, sb; int vfslocked; struct vnode *vp; int error; AUDIT_ARG(fd, fd); error = getvnode(td->td_proc->p_fd, fd, &fp); if (error) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef AUDIT AUDIT_ARG(vnode, vp, ARG_VNODE1); #endif mp = vp->v_mount; if (mp) vfs_ref(mp); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); if (vp->v_iflag & VI_DOOMED) { error = EBADF; goto out; } #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) goto out; if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } *buf = *sp; out: if (mp) vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { struct statfs *buf; long bufsize; int flags; } */ *uap; { return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE, uap->flags)); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. */ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, enum uio_seg bufseg, int flags) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, sb; size_t count, maxcount; int vfslocked; int error; maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) sfsp = NULL; else if (bufseg == UIO_USERSPACE) sfsp = *buf; else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP, M_WAITOK); } count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_check_mount_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * Set these in case the underlying filesystem * fails to do so. */ sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. */ if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 || (flags & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } if (bufseg == UIO_SYSSPACE) bcopy(sp, sfsp, sizeof(*sp)); else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } } sfsp++; } VFS_UNLOCK_GIANT(vfslocked); count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(td, uap) struct thread *td; struct freebsd4_statfs_args /* { char *path; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; int error; error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(td, uap) struct thread *td; struct freebsd4_fstatfs_args /* { int fd; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; int error; error = kern_fstatfs(td, uap->fd, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int flags; }; #endif int freebsd4_getfsstat(td, uap) struct thread *td; register struct freebsd4_getfsstat_args /* { struct ostatfs *buf; long bufsize; int flags; } */ *uap; { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags); if (size > 0) { count = td->td_retval[0]; sp = buf; while (count > 0 && error == 0) { cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_TEMP); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(td, uap) struct thread *td; struct freebsd4_fhstatfs_args /* { struct fhandle *u_fhp; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); error = kern_fhstatfs(td, fh, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void cvtstatfs(nsp, osp) struct statfs *nsp; struct ostatfs *osp; { bzero(osp, sizeof(*osp)); osp->f_bsize = MIN(nsp->f_bsize, LONG_MAX); osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = MIN(nsp->f_blocks, LONG_MAX); osp->f_bfree = MIN(nsp->f_bfree, LONG_MAX); osp->f_bavail = MIN(nsp->f_bavail, LONG_MAX); osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int fchdir(td, uap) struct thread *td; struct fchdir_args /* { int fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(fdp, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; VREF(vp); fdrop(fp, td); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, vp, ARG_VNODE1); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { int tvfslocked; if (vfs_busy(mp, 0, 0, td)) continue; tvfslocked = VFS_LOCK_GIANT(mp); error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdp, td); vfs_unbusy(mp, td); if (error) { VFS_UNLOCK_GIANT(tvfslocked); break; } vput(vp); VFS_UNLOCK_GIANT(vfslocked); vp = tdp; vfslocked = tvfslocked; } if (error) { vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); FILEDESC_LOCK_FAST(fdp); vpold = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_UNLOCK_FAST(fdp); vfslocked = VFS_LOCK_GIANT(vpold->v_mount); vrele(vpold); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int chdir(td, uap) struct thread *td; struct chdir_args /* { char *path; } */ *uap; { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; struct vnode *vp; int vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } VOP_UNLOCK(nd.ni_vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); FILEDESC_LOCK_FAST(fdp); vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; FILEDESC_UNLOCK_FAST(fdp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int chroot(td, uap) struct thread *td; struct chroot_args /* { char *path; } */ *uap; { int error; struct nameidata nd; int vfslocked; error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, SUSER_ALLOWJAIL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) goto error; vfslocked = NDHASGIANT(&nd); if ((error = change_dir(nd.ni_vp, td)) != 0) goto e_vunlock; #ifdef MAC if ((error = mac_check_vnode_chroot(td->td_ucred, nd.ni_vp))) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp, 0, td); error = change_root(nd.ni_vp, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); e_vunlock: vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); error: NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(vp, td) struct vnode *vp; struct thread *td; { int error; ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_check_vnode_chdir(td->td_ucred, vp); if (error) return (error); #endif error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); return (error); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking priv_check() and mac_check_chroot() to authorize * this operation. */ int change_root(vp, td) struct vnode *vp; struct thread *td; { struct filedesc *fdp; struct vnode *oldvp; int vfslocked; int error; VFS_ASSERT_GIANT(vp->v_mount); fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error) { FILEDESC_UNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; fdp->fd_rdir = vp; VREF(fdp->fd_rdir); if (!fdp->fd_jdir) { fdp->fd_jdir = vp; VREF(fdp->fd_jdir); } FILEDESC_UNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(oldvp->v_mount); vrele(oldvp); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* - * Check permissions, allocate an open file structure, - * and call the device open routine if any. - * - * MP SAFE + * Check permissions, allocate an open file structure, and call the device + * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { char *path; int flags; int mode; } */ *uap; { return kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode); } int kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, flags); AUDIT_ARG(mode, mode); if ((flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(flags); error = falloc(td, &nfp, &indx); if (error) return (error); /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, indx); if (error) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { fdrop(fp, td); td->td_retval[0] = indx; return (0); } /* * release our own reference */ fdrop(fp, td); /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ fdclose(fdp, fp, indx, td); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * There should be 2 references on the file, one from the descriptor * table, and one for us. * * Handle the case where someone closed the file (via its file * descriptor) while we were blocked. The end result should look * like opening the file succeeded but it was immediately closed. * We call vn_close() manually because we haven't yet hooked up * the various 'struct file' fields. */ FILEDESC_LOCK(fdp); FILE_LOCK(fp); if (fp->f_count == 1) { mp = vp->v_mount; KASSERT(fdp->fd_ofiles[indx] != fp, ("Open file descriptor lost all refs")); FILE_UNLOCK(fp); FILEDESC_UNLOCK(fdp); VOP_UNLOCK(vp, 0, td); vn_close(vp, flags & FMASK, fp->f_cred, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); td->td_retval[0] = indx; return (0); } fp->f_vnode = vp; if (fp->f_data == NULL) fp->f_data = vp; fp->f_flag = flags & FMASK; if (fp->f_ops == &badfileops) fp->f_ops = &vnops; fp->f_seqcount = 1; fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); FILE_UNLOCK(fp); FILEDESC_UNLOCK(fdp); VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_SETATTR(vp, &vat, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } VFS_UNLOCK_GIANT(vfslocked); /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: VFS_UNLOCK_GIANT(vfslocked); fdclose(fdp, fp, indx, td); fdrop(fp, td); return (error); } #ifdef COMPAT_43 /* * Create a file. - * - * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { char *path; int mode; } */ *uap; { return (kern_open(td, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif int mknod(td, uap) struct thread *td; register struct mknod_args /* { char *path; int mode; int dev; } */ *uap; { return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode, int dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); AUDIT_ARG(dev, dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); break; case S_IFMT: error = priv_check(td, PRIV_VFS_MKNOD_BAD); break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; default: error = EINVAL; break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } else { VATTR_NULL(&vattr); FILEDESC_LOCK_FAST(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK_FAST(td->td_proc->p_fd); vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { char *path; int mode; } */ *uap; { return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; FILEDESC_LOCK_FAST(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK_FAST(td->td_proc->p_fd); #ifdef MAC error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); #ifdef MAC out: #endif vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int link(td, uap) struct thread *td; register struct link_args /* { char *path; char *link; } */ *uap; { int error; error = kern_link(td, uap->path, uap->link, UIO_USERSPACE); return (error); } static int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred, td); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK, SUSER_ALLOWJAIL); if (error) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK, SUSER_ALLOWJAIL); if (error) return (error); } return (0); } int kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct vnode *vp; struct mount *mp; struct nameidata nd; int vfslocked; int lvfslocked; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, segflg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2, segflg, link, td); if ((error = namei(&nd)) == 0) { lvfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULL) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); error = EEXIST; } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td)) == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = can_hardlink(vp, td, td->td_ucred); if (error == 0) #ifdef MAC error = mac_check_vnode_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error == 0) #endif error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(vp, 0, td); vput(nd.ni_dvp); } NDFREE(&nd, NDF_ONLY_PNBUF); VFS_UNLOCK_GIANT(lvfslocked); } vrele(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int symlink(td, uap) struct thread *td; register struct symlink_args /* { char *path; char *link; } */ *uap; { return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE)); } int kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; char *syspath; int error; struct nameidata nd; int vfslocked; if (segflg == UIO_SYSSPACE) { syspath = path; } else { syspath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0) goto out; } AUDIT_ARG(text, syspath); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, link, td); if ((error = namei(&nd)) != 0) goto out; vfslocked = NDHASGIANT(&nd); if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); FILEDESC_LOCK_FAST(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK_FAST(td->td_proc->p_fd); #ifdef MAC vattr.va_type = VLNK; error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out2; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); if (error == 0) vput(nd.ni_vp); #ifdef MAC out2: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, syspath); return (error); } /* * Delete a whiteout from the filesystem. */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { char *path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int unlink(td, uap) struct thread *td; struct unlink_args /* { char *path; } */ *uap; { int error; error = kern_unlink(td, uap->path, UIO_USERSPACE); return (error); } int kern_unlink(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error == EINVAL ? EPERM : error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ *uap; { struct ucred *cred = td->td_ucred; struct file *fp; struct vnode *vp; struct vattr vattr; off_t offset; int error, noneg; int vfslocked; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { fdrop(fp, td); return (ESPIPE); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); noneg = (vp->v_type != VCHR); offset = uap->offset; switch (uap->whence) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += fp->f_offset; break; case L_XTND: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); if (error) break; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; drop: fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { int fd; long offset; int whence; } */ *uap; { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ nuap; int error; nuap.fd = uap->fd; nuap.offset = uap->offset; nuap.whence = uap->whence; error = lseek(td, &nuap); return (error); } #endif /* COMPAT_43 */ /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; #ifdef MAC error = mac_check_vnode_access(cred, vp, flags); if (error) return (error); #endif if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { char *path; int flags; } */ *uap; { return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_access(struct thread *td, char *path, enum uio_seg pathseg, int flags) { struct ucred *cred, *tmpcred; register struct vnode *vp; struct nameidata nd; int vfslocked; int error; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. */ cred = td->td_ucred; tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) goto out1; vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; error = vn_access(vp, flags, tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); out1: td->td_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { char *path; int flags; } */ *uap; { return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags) { struct nameidata nd; struct vnode *vp; int vfslocked; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; vfslocked = NDHASGIANT(&nd); error = vn_access(vp, flags, td->td_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(td, uap) struct thread *td; register struct ostat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(td, uap) struct thread *td; register struct olstat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. */ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif int stat(td, uap) struct thread *td; register struct stat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } int kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { struct nameidata nd; struct stat sb; int error, vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); if (mtx_owned(&Giant)) printf("stat(%d): %s\n", vfslocked, path); if (error) return (error); *sbp = sb; return (0); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif int lstat(td, uap) struct thread *td; register struct lstat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } int kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { struct vnode *vp; struct stat sb; struct nameidata nd; int error, vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); *sbp = sb; return (0); } /* * Implementation of the NetBSD [l]stat() functions. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { bzero(nsb, sizeof *nsb); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtimespec = sb->st_birthtimespec; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif int nstat(td, uap) struct thread *td; register struct nstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { char *path; int name; } */ *uap; { return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name)); } int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name) { struct nameidata nd; int error, vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); /* If asynchronous I/O is available, it works for all files. */ if (name == _PC_ASYNC_IO) td->td_retval[0] = async_io_version; else error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif int readlink(td, uap) struct thread *td; register struct readlink_args /* { char *path; char *buf; int count; } */ *uap; { return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } int kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, int count) { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; int vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; #ifdef MAC error = mac_check_vnode_readlink(td->td_ucred, vp); if (error) { vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #endif if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); } vput(vp); VFS_UNLOCK_GIANT(vfslocked); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check_cred(td->td_ucred, PRIV_VFS_CHFLAGS_DEV, SUSER_ALLOWJAIL); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_flags = flags; #ifdef MAC error = mac_check_vnode_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif int chflags(td, uap) struct thread *td; register struct chflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, uap->flags); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Same as chflags() but doesn't follow symlinks. */ int lchflags(td, uap) struct thread *td; register struct lchflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, uap->flags); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { int fd; int flags; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(fflags, uap->flags); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfflags(td, fp->f_vnode, uap->flags); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_check_vnode_setmode(td->td_ucred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int chmod(td, uap) struct thread *td; register struct chmod_args /* { char *path; int mode; } */ *uap; { return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, mode); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { char *path; int mode; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, (mode_t)uap->mode); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, uap->mode); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { int fd; int mode; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(mode, uap->mode); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfmode(td, fp->f_vnode, uap->mode); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_check_vnode_setowner(td->td_ucred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int chown(td, uap) struct thread *td; register struct chown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(owner, uid, gid); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int lchown(td, uap) struct thread *td; register struct lchown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(owner, uid, gid); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int fchown(td, uap) struct thread *td; register struct fchown_args /* { int fd; int uid; int gid; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(owner, uap->uid, uap->gid); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfown(td, fp->f_vnode, uap->uid, uap->gid); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(usrtvp, tvpseg, tsp) const struct timeval *usrtvp; enum uio_seg tvpseg; struct timespec *tsp; { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int setutimes(td, vp, ts, numtimes, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int numtimes; int nullflag; { int error, setbirthtime; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); setbirthtime = 0; if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_check_vnode_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int utimes(td, uap) struct thread *td; register struct utimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; int vfslocked; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; int vfslocked; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int futimes(td, uap) struct thread *td; register struct futimes_args /* { int fd; struct timeval *tptr; } */ *uap; { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, fd); if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int truncate(td, uap) struct thread *td; register struct truncate_args /* { char *path; int pad; off_t length; } */ *uap; { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; if (length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_check_vnode_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); } vput(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Truncate a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif int ftruncate(td, uap) struct thread *td; register struct ftruncate_args /* { int fd; int pad; off_t length; } */ *uap; { struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if (uap->length < 0) return(EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FWRITE) == 0) { fdrop(fp, td); return (EINVAL); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, vp, ARG_VNODE1); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_check_vnode_write(td->td_ucred, fp->f_cred, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = uap->length; error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { char *path; long length; } */ *uap; { struct truncate_args /* { char *path; int pad; off_t length; } */ nuap; nuap.path = uap->path; nuap.length = uap->length; return (truncate(td, &nuap)); } /* * Truncate a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif int oftruncate(td, uap) struct thread *td; register struct oftruncate_args /* { int fd; long length; } */ *uap; { struct ftruncate_args /* { int fd; int pad; off_t length; } */ nuap; nuap.fd = uap->fd; nuap.length = uap->length; return (ftruncate(td, &nuap)); } #endif /* COMPAT_43 */ /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int fsync(td, uap) struct thread *td; struct fsync_args /* { int fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, vp, ARG_VNODE1); if (vp->v_object != NULL) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_UNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* - * Rename files. Source and destination must either both be directories, - * or both not be directories. If target is a directory, it must be empty. + * Rename files. Source and destination must either both be directories, or + * both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int rename(td, uap) struct thread *td; register struct rename_args /* { char *from; char *to; } */ *uap; { return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE)); } int kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int tvfslocked; int fvfslocked; int error; bwillwrite(); #ifdef MAC NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE | AUDITVNODE1, pathseg, from, td); #else NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE | AUDITVNODE1, pathseg, from, td); #endif if ((error = namei(&fromnd)) != 0) return (error); fvfslocked = NDHASGIANT(&fromnd); tvfslocked = 0; #ifdef MAC error = mac_check_vnode_rename_from(td->td_ucred, fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd); VOP_UNLOCK(fromnd.ni_dvp, 0, td); if (fromnd.ni_dvp != fromnd.ni_vp) VOP_UNLOCK(fromnd.ni_vp, 0, td); #endif fvp = fromnd.ni_vp; if (error == 0) error = vn_start_write(fvp, &mp, V_WAIT | PCATCH); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | MPSAFE | AUDITVNODE2, pathseg, to, td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); vn_finished_write(mp); goto out1; } tvfslocked = NDHASGIANT(&tond); tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = -1; #ifdef MAC else error = mac_check_vnode_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (!error) { VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); VFS_UNLOCK_GIANT(fvfslocked); VFS_UNLOCK_GIANT(tvfslocked); if (error == -1) return (0); return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { char *path; int mode; } */ *uap; { return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); /* * XXX namei called with LOCKPARENT but not LOCKLEAF has * the strange behaviour of leaving the vnode unlocked * if the target is the same vnode as the parent. */ if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; FILEDESC_LOCK_FAST(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; FILEDESC_UNLOCK_FAST(td->td_proc->p_fd); #ifdef MAC error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int rmdir(td, uap) struct thread *td; struct rmdir_args /* { char *path; } */ *uap; { return (kern_rmdir(td, uap->path, UIO_USERSPACE)); } int kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } #ifdef MAC error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt, vfslocked; long loff; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } #endif # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = uap->count; MALLOC(dirbuf, caddr_t, uap->count, M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = uap->count - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } if (uap->count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); VFS_UNLOCK_GIANT(vfslocked); goto unionread; } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); error = copyout(&loff, uap->basep, sizeof(long)); fdrop(fp, td); td->td_retval[0] = uap->count - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; int vfslocked; long loff; int error, eofflag; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); error = EINVAL; goto fail; } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, vp, ARG_VNODE1); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); goto fail; } if (uap->count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); VFS_UNLOCK_GIANT(vfslocked); goto unionread; } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); if (uap->basep != NULL) { error = copyout(&loff, uap->basep, sizeof(long)); } td->td_retval[0] = uap->count - auio.uio_resid; fail: fdrop(fp, td); return (error); } + #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { int fd; char *buf; u_int count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (getdirentries(td, &ap)); } /* * Set the mode mask for creation of filesystem nodes. - * - * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { int newmask; } */ *uap; { register struct filedesc *fdp; FILEDESC_LOCK_FAST(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = uap->newmask & ALLPERMS; FILEDESC_UNLOCK_FAST(td->td_proc->p_fd); return (0); } /* - * Void all references to file by ripping underlying filesystem - * away from vnode. + * Void all references to file by ripping underlying filesystem away from + * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int revoke(td, uap) struct thread *td; register struct revoke_args /* { char *path; } */ *uap; { struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { error = EINVAL; goto out; } #ifdef MAC error = mac_check_vnode_revoke(td->td_ucred, vp); if (error) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); if (error) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, SUSER_ALLOWJAIL); if (error) goto out; } if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Convert a user file descriptor to a kernel file entry. * A reference on the file entry is held upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { int error; struct file *fp; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_LOCK(fdp); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) error = EBADF; else if (fp->f_vnode == NULL) { fp = NULL; error = EINVAL; } else { fhold(fp); error = 0; } FILEDESC_UNLOCK(fdp); } *fpp = fp; return (error); } /* - * Get (NFS) file handle + * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int lgetfh(td, uap) struct thread *td; register struct lgetfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. - * - * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { const struct fhandle *u_fhp; int flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int vfslocked; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error) return (error); fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) goto out; /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. */ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (fmode & O_APPEND) mode |= VAPPEND; #ifdef MAC error = mac_check_vnode_open(td->td_ucred, vp, mode); if (error) goto bad; #endif if (mode) { error = VOP_ACCESS(vp, mode, td->td_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); goto out; } VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ #ifdef MAC /* * We don't yet have fp->f_cred, so use td->td_ucred, which * should be right. */ error = mac_check_vnode_write(td->td_ucred, td->td_ucred, vp); if (error == 0) { #endif VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, td->td_ucred, td); #ifdef MAC } #endif vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, td->td_ucred, td, -1); if (error) goto bad; if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; nfp->f_vnode = vp; nfp->f_data = vp; nfp->f_flag = fmode & FMASK; nfp->f_ops = &vnops; nfp->f_type = DTYPE_VNODE; if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. */ fdclose(fdp, fp, indx, td); /* * release our private reference */ fdrop(fp, td); goto out; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); fp->f_flag |= FHASLOCK; } VOP_UNLOCK(vp, 0, td); fdrop(fp, td); vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); td->td_retval[0] = indx; return (0); bad: vput(vp); out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Stat an (NFS) file handle. - * - * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { struct fhandle *u_fhp; struct stat *sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error) return (error); error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) { vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); vput(vp); vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. - * - * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { struct fhandle *u_fhp; struct statfs *buf; } */ *uap; { struct statfs sf; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); error = kern_fhstatfs(td, fh, &sf); if (error) return (error); return (copyout(&sf, uap->buf, sizeof(sf))); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct statfs *sp; struct mount *mp; struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); error = VFS_FHTOVP(mp, &fh.fh_fid, &vp); if (error) { VFS_UNLOCK_GIANT(vfslocked); vfs_rel(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error) goto out; #ifdef MAC error = mac_check_mount_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error == 0) *buf = *sp; out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); }