Index: head/sys/cddl/compat/opensolaris/sys/file.h =================================================================== --- head/sys/cddl/compat/opensolaris/sys/file.h (revision 285171) +++ head/sys/cddl/compat/opensolaris/sys/file.h (revision 285172) @@ -1,64 +1,65 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_FILE_H_ #define _OPENSOLARIS_SYS_FILE_H_ #include_next #define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */ #ifdef _KERNEL typedef struct file file_t; #include static __inline file_t * getf(int fd, cap_rights_t *rightsp) { struct file *fp; if (fget(curthread, fd, rightsp, &fp) == 0) return (fp); return (NULL); } static __inline void releasef(int fd) { struct file *fp; + cap_rights_t rights; /* No CAP_ rights required, as we're only releasing. */ - if (fget(curthread, fd, NULL, &fp) == 0) { + if (fget(curthread, fd, cap_rights_init(&rights), &fp) == 0) { fdrop(fp, curthread); fdrop(fp, curthread); } } #endif /* _KERNEL */ #endif /* !_OPENSOLARIS_SYS_FILE_H_ */ Index: head/sys/compat/linux/linux_stats.c =================================================================== --- head/sys/compat/linux/linux_stats.c (revision 285171) +++ head/sys/compat/linux/linux_stats.c (revision 285172) @@ -1,696 +1,697 @@ /*- * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include static void translate_vnhook_major_minor(struct vnode *vp, struct stat *sb) { int major, minor; if (vp->v_type == VCHR && vp->v_rdev != NULL && linux_driver_get_major_minor(devtoname(vp->v_rdev), &major, &minor) == 0) { sb->st_rdev = (major << 8 | minor); } } static int linux_kern_statat(struct thread *td, int flag, int fd, char *path, enum uio_seg pathseg, struct stat *sbp) { return (kern_statat(td, flag, fd, path, pathseg, sbp, translate_vnhook_major_minor)); } static int linux_kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { return (linux_kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp)); } static int linux_kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { return (linux_kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg, sbp)); } /* * XXX: This was removed from newstat_copyout(), and almost identical * XXX: code was in stat64_copyout(). findcdev() needs to be replaced * XXX: with something that does lookup and locking properly. * XXX: When somebody fixes this: please try to avoid duplicating it. */ #if 0 static void disk_foo(struct somestat *tbuf) { struct cdevsw *cdevsw; struct cdev *dev; /* Lie about disk drives which are character devices * in FreeBSD but block devices under Linux. */ if (S_ISCHR(tbuf.st_mode) && (dev = findcdev(buf->st_rdev)) != NULL) { cdevsw = dev_refthread(dev); if (cdevsw != NULL) { if (cdevsw->d_flags & D_DISK) { tbuf.st_mode &= ~S_IFMT; tbuf.st_mode |= S_IFBLK; /* XXX this may not be quite right */ /* Map major number to 0 */ tbuf.st_dev = minor(buf->st_dev) & 0xf; tbuf.st_rdev = buf->st_rdev & 0xff; } dev_relthread(dev); } } } #endif static void translate_fd_major_minor(struct thread *td, int fd, struct stat *buf) { struct file *fp; struct vnode *vp; + cap_rights_t rights; int major, minor; /* * No capability rights required here. */ if ((!S_ISCHR(buf->st_mode) && !S_ISBLK(buf->st_mode)) || - fget(td, fd, 0, &fp) != 0) + fget(td, fd, cap_rights_init(&rights), &fp) != 0) return; vp = fp->f_vnode; if (vp != NULL && vp->v_rdev != NULL && linux_driver_get_major_minor(devtoname(vp->v_rdev), &major, &minor) == 0) { buf->st_rdev = (major << 8 | minor); } else if (fp->f_type == DTYPE_PTS) { struct tty *tp = fp->f_data; /* Convert the numbers for the slave device. */ if (linux_driver_get_major_minor(devtoname(tp->t_dev), &major, &minor) == 0) { buf->st_rdev = (major << 8 | minor); } } fdrop(fp, td); } static int newstat_copyout(struct stat *buf, void *ubuf) { struct l_newstat tbuf; bzero(&tbuf, sizeof(tbuf)); tbuf.st_dev = minor(buf->st_dev) | (major(buf->st_dev) << 8); tbuf.st_ino = buf->st_ino; tbuf.st_mode = buf->st_mode; tbuf.st_nlink = buf->st_nlink; tbuf.st_uid = buf->st_uid; tbuf.st_gid = buf->st_gid; tbuf.st_rdev = buf->st_rdev; tbuf.st_size = buf->st_size; tbuf.st_atim.tv_sec = buf->st_atim.tv_sec; tbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; tbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; tbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; tbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; tbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; tbuf.st_blksize = buf->st_blksize; tbuf.st_blocks = buf->st_blocks; return (copyout(&tbuf, ubuf, sizeof(tbuf))); } int linux_newstat(struct thread *td, struct linux_newstat_args *args) { struct stat buf; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(newstat)) printf(ARGS(newstat, "%s, *"), path); #endif error = linux_kern_stat(td, path, UIO_SYSSPACE, &buf); LFREEPATH(path); if (error) return (error); return (newstat_copyout(&buf, args->buf)); } int linux_newlstat(struct thread *td, struct linux_newlstat_args *args) { struct stat sb; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(newlstat)) printf(ARGS(newlstat, "%s, *"), path); #endif error = linux_kern_lstat(td, path, UIO_SYSSPACE, &sb); LFREEPATH(path); if (error) return (error); return (newstat_copyout(&sb, args->buf)); } int linux_newfstat(struct thread *td, struct linux_newfstat_args *args) { struct stat buf; int error; #ifdef DEBUG if (ldebug(newfstat)) printf(ARGS(newfstat, "%d, *"), args->fd); #endif error = kern_fstat(td, args->fd, &buf); translate_fd_major_minor(td, args->fd, &buf); if (!error) error = newstat_copyout(&buf, args->buf); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int stat_copyout(struct stat *buf, void *ubuf) { struct l_stat lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = buf->st_dev; lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = buf->st_rdev; if (buf->st_size < (quad_t)1 << 32) lbuf.st_size = buf->st_size; else lbuf.st_size = -2; lbuf.st_atim.tv_sec = buf->st_atim.tv_sec; lbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; lbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; lbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; lbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; lbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; lbuf.st_flags = buf->st_flags; lbuf.st_gen = buf->st_gen; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat(struct thread *td, struct linux_stat_args *args) { struct stat buf; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(stat)) printf(ARGS(stat, "%s, *"), path); #endif error = linux_kern_stat(td, path, UIO_SYSSPACE, &buf); if (error) { LFREEPATH(path); return (error); } LFREEPATH(path); return(stat_copyout(&buf, args->up)); } int linux_lstat(struct thread *td, struct linux_lstat_args *args) { struct stat buf; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(lstat)) printf(ARGS(lstat, "%s, *"), path); #endif error = linux_kern_lstat(td, path, UIO_SYSSPACE, &buf); if (error) { LFREEPATH(path); return (error); } LFREEPATH(path); return(stat_copyout(&buf, args->up)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ struct l_statfs { l_long f_type; l_long f_bsize; l_long f_blocks; l_long f_bfree; l_long f_bavail; l_long f_files; l_long f_ffree; l_fsid_t f_fsid; l_long f_namelen; l_long f_spare[6]; }; #define LINUX_CODA_SUPER_MAGIC 0x73757245L #define LINUX_EXT2_SUPER_MAGIC 0xEF53L #define LINUX_HPFS_SUPER_MAGIC 0xf995e849L #define LINUX_ISOFS_SUPER_MAGIC 0x9660L #define LINUX_MSDOS_SUPER_MAGIC 0x4d44L #define LINUX_NCP_SUPER_MAGIC 0x564cL #define LINUX_NFS_SUPER_MAGIC 0x6969L #define LINUX_NTFS_SUPER_MAGIC 0x5346544EL #define LINUX_PROC_SUPER_MAGIC 0x9fa0L #define LINUX_UFS_SUPER_MAGIC 0x00011954L /* XXX - UFS_MAGIC in Linux */ #define LINUX_DEVFS_SUPER_MAGIC 0x1373L #define LINUX_SHMFS_MAGIC 0x01021994 static long bsd_to_linux_ftype(const char *fstypename) { int i; static struct {const char *bsd_name; long linux_type;} b2l_tbl[] = { {"ufs", LINUX_UFS_SUPER_MAGIC}, {"cd9660", LINUX_ISOFS_SUPER_MAGIC}, {"nfs", LINUX_NFS_SUPER_MAGIC}, {"ext2fs", LINUX_EXT2_SUPER_MAGIC}, {"procfs", LINUX_PROC_SUPER_MAGIC}, {"msdosfs", LINUX_MSDOS_SUPER_MAGIC}, {"ntfs", LINUX_NTFS_SUPER_MAGIC}, {"nwfs", LINUX_NCP_SUPER_MAGIC}, {"hpfs", LINUX_HPFS_SUPER_MAGIC}, {"coda", LINUX_CODA_SUPER_MAGIC}, {"devfs", LINUX_DEVFS_SUPER_MAGIC}, {"tmpfs", LINUX_SHMFS_MAGIC}, {NULL, 0L}}; for (i = 0; b2l_tbl[i].bsd_name != NULL; i++) if (strcmp(b2l_tbl[i].bsd_name, fstypename) == 0) return (b2l_tbl[i].linux_type); return (0L); } static void bsd_to_linux_statfs(struct statfs *bsd_statfs, struct l_statfs *linux_statfs) { linux_statfs->f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs->f_bsize = bsd_statfs->f_bsize; linux_statfs->f_blocks = bsd_statfs->f_blocks; linux_statfs->f_bfree = bsd_statfs->f_bfree; linux_statfs->f_bavail = bsd_statfs->f_bavail; linux_statfs->f_ffree = bsd_statfs->f_ffree; linux_statfs->f_files = bsd_statfs->f_files; linux_statfs->f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs->f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs->f_namelen = MAXNAMLEN; } int linux_statfs(struct thread *td, struct linux_statfs_args *args) { struct l_statfs linux_statfs; struct statfs bsd_statfs; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(statfs)) printf(ARGS(statfs, "%s, *"), path); #endif error = kern_statfs(td, path, UIO_SYSSPACE, &bsd_statfs); LFREEPATH(path); if (error) return (error); bsd_to_linux_statfs(&bsd_statfs, &linux_statfs); return copyout(&linux_statfs, args->buf, sizeof(linux_statfs)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static void bsd_to_linux_statfs64(struct statfs *bsd_statfs, struct l_statfs64 *linux_statfs) { linux_statfs->f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs->f_bsize = bsd_statfs->f_bsize; linux_statfs->f_blocks = bsd_statfs->f_blocks; linux_statfs->f_bfree = bsd_statfs->f_bfree; linux_statfs->f_bavail = bsd_statfs->f_bavail; linux_statfs->f_ffree = bsd_statfs->f_ffree; linux_statfs->f_files = bsd_statfs->f_files; linux_statfs->f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs->f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs->f_namelen = MAXNAMLEN; } int linux_statfs64(struct thread *td, struct linux_statfs64_args *args) { struct l_statfs64 linux_statfs; struct statfs bsd_statfs; char *path; int error; if (args->bufsize != sizeof(struct l_statfs64)) return EINVAL; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(statfs64)) printf(ARGS(statfs64, "%s, *"), path); #endif error = kern_statfs(td, path, UIO_SYSSPACE, &bsd_statfs); LFREEPATH(path); if (error) return (error); bsd_to_linux_statfs64(&bsd_statfs, &linux_statfs); return copyout(&linux_statfs, args->buf, sizeof(linux_statfs)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_fstatfs(struct thread *td, struct linux_fstatfs_args *args) { struct l_statfs linux_statfs; struct statfs bsd_statfs; int error; #ifdef DEBUG if (ldebug(fstatfs)) printf(ARGS(fstatfs, "%d, *"), args->fd); #endif error = kern_fstatfs(td, args->fd, &bsd_statfs); if (error) return error; bsd_to_linux_statfs(&bsd_statfs, &linux_statfs); return copyout(&linux_statfs, args->buf, sizeof(linux_statfs)); } struct l_ustat { l_daddr_t f_tfree; l_ino_t f_tinode; char f_fname[6]; char f_fpack[6]; }; int linux_ustat(struct thread *td, struct linux_ustat_args *args) { #ifdef DEBUG if (ldebug(ustat)) printf(ARGS(ustat, "%ju, *"), (uintmax_t)args->dev); #endif return (EOPNOTSUPP); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int stat64_copyout(struct stat *buf, void *ubuf) { struct l_stat64 lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = minor(buf->st_dev) | (major(buf->st_dev) << 8); lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = buf->st_rdev; lbuf.st_size = buf->st_size; lbuf.st_atim.tv_sec = buf->st_atim.tv_sec; lbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; lbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; lbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; lbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; lbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; /* * The __st_ino field makes all the difference. In the Linux kernel * it is conditionally compiled based on STAT64_HAS_BROKEN_ST_INO, * but without the assignment to __st_ino the runtime linker refuses * to mmap(2) any shared libraries. I guess it's broken alright :-) */ lbuf.__st_ino = buf->st_ino; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat64(struct thread *td, struct linux_stat64_args *args) { struct stat buf; char *filename; int error; LCONVPATHEXIST(td, args->filename, &filename); #ifdef DEBUG if (ldebug(stat64)) printf(ARGS(stat64, "%s, *"), filename); #endif error = linux_kern_stat(td, filename, UIO_SYSSPACE, &buf); LFREEPATH(filename); if (error) return (error); return (stat64_copyout(&buf, args->statbuf)); } int linux_lstat64(struct thread *td, struct linux_lstat64_args *args) { struct stat sb; char *filename; int error; LCONVPATHEXIST(td, args->filename, &filename); #ifdef DEBUG if (ldebug(lstat64)) printf(ARGS(lstat64, "%s, *"), args->filename); #endif error = linux_kern_lstat(td, filename, UIO_SYSSPACE, &sb); LFREEPATH(filename); if (error) return (error); return (stat64_copyout(&sb, args->statbuf)); } int linux_fstat64(struct thread *td, struct linux_fstat64_args *args) { struct stat buf; int error; #ifdef DEBUG if (ldebug(fstat64)) printf(ARGS(fstat64, "%d, *"), args->fd); #endif error = kern_fstat(td, args->fd, &buf); translate_fd_major_minor(td, args->fd, &buf); if (!error) error = stat64_copyout(&buf, args->statbuf); return (error); } int linux_fstatat64(struct thread *td, struct linux_fstatat64_args *args) { char *path; int error, dfd, flag; struct stat buf; if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); #ifdef DEBUG if (ldebug(fstatat64)) printf(ARGS(fstatat64, "%i, %s, %i"), args->dfd, path, args->flag); #endif error = linux_kern_statat(td, flag, dfd, path, UIO_SYSSPACE, &buf); if (!error) error = stat64_copyout(&buf, args->statbuf); LFREEPATH(path); return (error); } #else /* __amd64__ && !COMPAT_LINUX32 */ int linux_newfstatat(struct thread *td, struct linux_newfstatat_args *args) { char *path; int error, dfd, flag; struct stat buf; if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); #ifdef DEBUG if (ldebug(newfstatat)) printf(ARGS(newfstatat, "%i, %s, %i"), args->dfd, path, args->flag); #endif error = linux_kern_statat(td, flag, dfd, path, UIO_SYSSPACE, &buf); if (error == 0) error = newstat_copyout(&buf, args->statbuf); LFREEPATH(path); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_syncfs(struct thread *td, struct linux_syncfs_args *args) { cap_rights_t rights; struct mount *mp; struct vnode *vp; int error, save; error = fgetvp(td, args->fd, cap_rights_init(&rights, CAP_FSYNC), &vp); if (error != 0) /* * Linux syncfs() returns only EBADF, however fgetvp() * can return EINVAL in case of file descriptor does * not represent a vnode. XXX. */ return (error); mp = vp->v_mount; mtx_lock(&mountlist_mtx); error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error != 0) { /* See comment above. */ mtx_unlock(&mountlist_mtx); goto out; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } vfs_unbusy(mp); out: vrele(vp); return (error); } Index: head/sys/fs/fdescfs/fdesc_vnops.c =================================================================== --- head/sys/fs/fdescfs/fdesc_vnops.c (revision 285171) +++ head/sys/fs/fdescfs/fdesc_vnops.c (revision 285172) @@ -1,604 +1,605 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include #include /* boottime */ #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #define NFDCACHE 4 #define FD_NHASH(ix) \ (&fdhashtbl[(ix) & fdhash]) static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl; static u_long fdhash; struct mtx fdesc_hashmtx; static vop_getattr_t fdesc_getattr; static vop_lookup_t fdesc_lookup; static vop_open_t fdesc_open; static vop_readdir_t fdesc_readdir; static vop_reclaim_t fdesc_reclaim; static vop_setattr_t fdesc_setattr; static struct vop_vector fdesc_vnodeops = { .vop_default = &default_vnodeops, .vop_access = VOP_NULL, .vop_getattr = fdesc_getattr, .vop_lookup = fdesc_lookup, .vop_open = fdesc_open, .vop_pathconf = vop_stdpathconf, .vop_readdir = fdesc_readdir, .vop_reclaim = fdesc_reclaim, .vop_setattr = fdesc_setattr, }; static void fdesc_insmntque_dtr(struct vnode *, void *); static void fdesc_remove_entry(struct fdescnode *); /* * Initialise cache headers */ int fdesc_init(vfsp) struct vfsconf *vfsp; { mtx_init(&fdesc_hashmtx, "fdescfs_hash", NULL, MTX_DEF); fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash); return (0); } /* * Uninit ready for unload. */ int fdesc_uninit(vfsp) struct vfsconf *vfsp; { hashdestroy(fdhashtbl, M_CACHE, fdhash); mtx_destroy(&fdesc_hashmtx); return (0); } /* * If allocating vnode fails, call this. */ static void fdesc_insmntque_dtr(struct vnode *vp, void *arg) { vgone(vp); vput(vp); } /* * Remove an entry from the hash if it exists. */ static void fdesc_remove_entry(struct fdescnode *fd) { struct fdhashhead *fc; struct fdescnode *fd2; fc = FD_NHASH(fd->fd_ix); mtx_lock(&fdesc_hashmtx); LIST_FOREACH(fd2, fc, fd_hash) { if (fd == fd2) { LIST_REMOVE(fd, fd_hash); break; } } mtx_unlock(&fdesc_hashmtx); } int fdesc_allocvp(ftype, fd_fd, ix, mp, vpp) fdntype ftype; unsigned fd_fd; int ix; struct mount *mp; struct vnode **vpp; { struct fdescmount *fmp; struct fdhashhead *fc; struct fdescnode *fd, *fd2; struct vnode *vp, *vp2; struct thread *td; int error = 0; td = curthread; fc = FD_NHASH(ix); loop: mtx_lock(&fdesc_hashmtx); /* * If a forced unmount is progressing, we need to drop it. The flags are * protected by the hashmtx. */ fmp = (struct fdescmount *)mp->mnt_data; if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) { mtx_unlock(&fdesc_hashmtx); return (-1); } LIST_FOREACH(fd, fc, fd_hash) { if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { /* Get reference to vnode in case it's being free'd */ vp = fd->fd_vnode; VI_LOCK(vp); mtx_unlock(&fdesc_hashmtx); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) goto loop; *vpp = vp; return (0); } } mtx_unlock(&fdesc_hashmtx); fd = malloc(sizeof(struct fdescnode), M_TEMP, M_WAITOK); error = getnewvnode("fdescfs", mp, &fdesc_vnodeops, &vp); if (error) { free(fd, M_TEMP); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_data = fd; fd->fd_vnode = vp; fd->fd_type = ftype; fd->fd_fd = fd_fd; fd->fd_ix = ix; error = insmntque1(vp, mp, fdesc_insmntque_dtr, NULL); if (error != 0) { *vpp = NULLVP; return (error); } /* Make sure that someone didn't beat us when inserting the vnode. */ mtx_lock(&fdesc_hashmtx); /* * If a forced unmount is progressing, we need to drop it. The flags are * protected by the hashmtx. */ fmp = (struct fdescmount *)mp->mnt_data; if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) { mtx_unlock(&fdesc_hashmtx); vgone(vp); vput(vp); *vpp = NULLVP; return (-1); } LIST_FOREACH(fd2, fc, fd_hash) { if (fd2->fd_ix == ix && fd2->fd_vnode->v_mount == mp) { /* Get reference to vnode in case it's being free'd */ vp2 = fd2->fd_vnode; VI_LOCK(vp2); mtx_unlock(&fdesc_hashmtx); error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, td); /* Someone beat us, dec use count and wait for reclaim */ vgone(vp); vput(vp); /* If we didn't get it, return no vnode. */ if (error) vp2 = NULLVP; *vpp = vp2; return (error); } } /* If we came here, we can insert it safely. */ LIST_INSERT_HEAD(fc, fd, fd_hash); mtx_unlock(&fdesc_hashmtx); *vpp = vp; return (0); } struct fdesc_get_ino_args { fdntype ftype; unsigned fd_fd; int ix; struct file *fp; struct thread *td; }; static int fdesc_get_ino_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { struct fdesc_get_ino_args *a; int error; a = arg; error = fdesc_allocvp(a->ftype, a->fd_fd, a->ix, mp, rvp); fdrop(a->fp, a->td); return (error); } /* * vp is the current namei directory * ndp is the name to locate in that directory... */ static int fdesc_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; char *pname = cnp->cn_nameptr; struct thread *td = cnp->cn_thread; struct file *fp; struct fdesc_get_ino_args arg; + cap_rights_t rights; int nlen = cnp->cn_namelen; u_int fd, fd1; int error; struct vnode *fvp; if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad; } if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); return (0); } if (VTOFDESC(dvp)->fd_type != Froot) { error = ENOTDIR; goto bad; } fd = 0; /* the only time a leading 0 is acceptable is if it's "0" */ if (*pname == '0' && nlen != 1) { error = ENOENT; goto bad; } while (nlen--) { if (*pname < '0' || *pname > '9') { error = ENOENT; goto bad; } fd1 = 10 * fd + *pname++ - '0'; if (fd1 < fd) { error = ENOENT; goto bad; } fd = fd1; } /* * No rights to check since 'fp' isn't actually used. */ - if ((error = fget(td, fd, NULL, &fp)) != 0) + if ((error = fget(td, fd, cap_rights_init(&rights), &fp)) != 0) goto bad; /* Check if we're looking up ourselves. */ if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) { /* * In case we're holding the last reference to the file, the dvp * will be re-acquired. */ vhold(dvp); VOP_UNLOCK(dvp, 0); fdrop(fp, td); /* Re-aquire the lock afterwards. */ vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE); vdrop(dvp); fvp = dvp; if ((dvp->v_iflag & VI_DOOMED) != 0) error = ENOENT; } else { /* * Unlock our root node (dvp) when doing this, since we might * deadlock since the vnode might be locked by another thread * and the root vnode lock will be obtained afterwards (in case * we're looking up the fd of the root vnode), which will be the * opposite lock order. Vhold the root vnode first so we don't * lose it. */ arg.ftype = Fdesc; arg.fd_fd = fd; arg.ix = FD_DESC + fd; arg.fp = fp; arg.td = td; error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg, LK_EXCLUSIVE, &fvp); } if (error) goto bad; *vpp = fvp; return (0); bad: *vpp = NULL; return (error); } static int fdesc_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; if (VTOFDESC(vp)->fd_type == Froot) return (0); /* * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the file * descriptor being sought for duplication. The error return ensures * that the vnode for this device will be released by vn_open. Open * will detect this special error and take the actions in dupfdopen. * Other callers of vn_open or VOP_OPEN will simply report the * error. */ ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ return (ENODEV); } static int fdesc_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_uid = 0; vap->va_gid = 0; vap->va_blocksize = DEV_BSIZE; vap->va_atime.tv_sec = boottime.tv_sec; vap->va_atime.tv_nsec = 0; vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_bytes = 0; vap->va_filerev = 0; switch (VTOFDESC(vp)->fd_type) { case Froot: vap->va_type = VDIR; vap->va_nlink = 2; vap->va_size = DEV_BSIZE; vap->va_rdev = NODEV; break; case Fdesc: vap->va_type = VCHR; vap->va_nlink = 1; vap->va_size = 0; vap->va_rdev = makedev(0, vap->va_fileid); break; default: panic("fdesc_getattr"); break; } vp->v_type = vap->va_type; return (0); } static int fdesc_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp; struct mount *mp; struct file *fp; struct thread *td = curthread; cap_rights_t rights; unsigned fd; int error; /* * Can't mess with the root vnode */ if (VTOFDESC(ap->a_vp)->fd_type == Froot) return (EACCES); fd = VTOFDESC(ap->a_vp)->fd_fd; /* * Allow setattr where there is an underlying vnode. */ error = getvnode(td, fd, cap_rights_init(&rights, CAP_EXTATTR_SET), &fp); if (error) { /* * getvnode() returns EINVAL if the file descriptor is not * backed by a vnode. Silently drop all changes except * chflags(2) in this case. */ if (error == EINVAL) { if (vap->va_flags != VNOVAL) error = EOPNOTSUPP; else error = 0; } return (error); } vp = fp->f_vnode; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); } fdrop(fp, td); return (error); } #define UIO_MX 16 static int fdesc_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { struct uio *uio = ap->a_uio; struct filedesc *fdp; struct dirent d; struct dirent *dp = &d; int error, i, off, fcnt; if (VTOFDESC(ap->a_vp)->fd_type != Froot) panic("fdesc_readdir: not dir"); if (ap->a_ncookies != NULL) *ap->a_ncookies = 0; off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || uio->uio_resid < UIO_MX) return (EINVAL); i = (u_int)off / UIO_MX; fdp = uio->uio_td->td_proc->p_fd; error = 0; fcnt = i - 2; /* The first two nodes are `.' and `..' */ FILEDESC_SLOCK(fdp); while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { bzero((caddr_t)dp, UIO_MX); switch (i) { case 0: /* `.' */ case 1: /* `..' */ dp->d_fileno = i + FD_ROOT; dp->d_namlen = i + 1; dp->d_reclen = UIO_MX; bcopy("..", dp->d_name, dp->d_namlen); dp->d_name[i + 1] = '\0'; dp->d_type = DT_DIR; break; default: if (fdp->fd_ofiles[fcnt].fde_file == NULL) break; dp->d_namlen = sprintf(dp->d_name, "%d", fcnt); dp->d_reclen = UIO_MX; dp->d_type = DT_CHR; dp->d_fileno = i + FD_DESC; break; } if (dp->d_namlen != 0) { /* * And ship to userland */ FILEDESC_SUNLOCK(fdp); error = uiomove(dp, UIO_MX, uio); if (error) goto done; FILEDESC_SLOCK(fdp); } i++; fcnt++; } FILEDESC_SUNLOCK(fdp); done: uio->uio_offset = i * UIO_MX; return (error); } static int fdesc_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp; struct fdescnode *fd; vp = ap->a_vp; fd = VTOFDESC(vp); fdesc_remove_entry(fd); free(vp->v_data, M_TEMP); vp->v_data = NULL; return (0); } Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 285171) +++ head/sys/kern/kern_descrip.c (revision 285172) @@ -1,3862 +1,3859 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static uma_zone_t file_zone; static uma_zone_t filedesc0_zone; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders); static int do_dup(struct thread *td, int flags, int old, int new); static int fd_first_free(struct filedesc *fdp, int low, int size); static int fd_last_used(struct filedesc *fdp, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int getmaxfd(struct thread *td); /* Flags for do_dup() */ #define DUP_FIXED 0x1 /* Force fixed allocation. */ #define DUP_FCNTL 0x2 /* fcntl()-style errors. */ #define DUP_CLOEXEC 0x4 /* Atomically set FD_CLOEXEC. */ /* * Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ volatile int openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at 0 and * not exceeding size - 1. Return -1 if not found. */ static int fd_last_used(struct filedesc *fdp, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } #ifdef INVARIANTS static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } #endif /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, fd); } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fdefree_last(fde); fde->fde_file = NULL; fdunused(fdp, fd); #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (do_dup(td, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error; error = 0; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. */ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: cmd = F_GETLK; break; case F_OSETLK: cmd = F_SETLK; break; case F_OSETLKW: cmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, cmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; cap_rights_t rights; int error, flg, tmp; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; switch (cmd) { case F_DUPFD: tmp = arg; error = do_dup(td, DUP_FCNTL, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = do_dup(td, DUP_FIXED, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp); break; case F_GETFD: FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } fde = &fdp->fd_ofiles[fd]; td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; FILEDESC_SUNLOCK(fdp); break; case F_SETFD: FILEDESC_XLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { FILEDESC_XUNLOCK(fdp); error = EBADF; break; } fde = &fdp->fd_ofiles[fd]; fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_fcntl(td, fd, cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_fcntl(td, fd, cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp); if (error != 0) break; do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: error = fget_fcntl(td, fd, cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_fcntl(td, fd, cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: cap_rights_init(&rights, CAP_FLOCK); error = fget_unlocked(fdp, fd, &rights, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: /* * Temporary api for testing remote lock * infrastructure. */ if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. * We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(fdp, fd, &rights, &fp2, NULL); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_FLOCK), &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(fdp, fd, cap_rights_init(&rights), &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; fp->f_seqcount = (arg + bsize - 1) / bsize; atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp, 0); fdrop(fp, td); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ static int do_dup(struct thread *td, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *fp; struct file *delfp; int error, maxfd; p = td->td_proc; fdp = p->p_fd; /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (flags & DUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (flags & DUP_FCNTL ? EINVAL : EBADF); FILEDESC_XLOCK(fdp); if (fget_locked(fdp, old) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } oldfde = &fdp->fd_ofiles[old]; if (flags & DUP_FIXED && old == new) { td->td_retval[0] = new; if (flags & DUP_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); return (0); } fp = oldfde->fde_file; fhold(fp); /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ if (flags & DUP_FIXED) { if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, new + 1); PROC_UNLOCK(p); if (error != 0) { FILEDESC_XUNLOCK(fdp); fdrop(fp, td); return (EMFILE); } } #endif fdgrowtable_exp(fdp, new + 1); oldfde = &fdp->fd_ofiles[old]; } newfde = &fdp->fd_ofiles[new]; if (newfde->fde_file == NULL) fdused(fdp, new); } else { if ((error = fdalloc(td, new, &new)) != 0) { FILEDESC_XUNLOCK(fdp); fdrop(fp, td); return (error); } newfde = &fdp->fd_ofiles[new]; } KASSERT(fp == oldfde->fde_file, ("old fd has been modified")); KASSERT(old != new, ("new fd is same as old")); delfp = newfde->fde_file; /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif filecaps_free(&newfde->fde_caps); memcpy(newfde, oldfde, fde_change_size); filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); if ((flags & DUP_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif td->td_retval[0] = new; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, 1); /* closefp() drops the FILEDESC lock for us. */ } else { FILEDESC_XUNLOCK(fdp); } return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Function drops the filedesc lock on return. */ static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = 0; } } /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. */ if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, 1)); } /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int sys_closefrom(struct thread *td, struct closefrom_args *uap) { struct filedesc *fdp; int fd; fdp = td->td_proc->p_fd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ if (uap->lowfd < 0) uap->lowfd = 0; FILEDESC_SLOCK(fdp); for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) { if (fdp->fd_ofiles[fd].fde_file != NULL) { FILEDESC_SUNLOCK(fdp); (void)kern_close(td, fd); FILEDESC_SLOCK(fdp); } } FILEDESC_SUNLOCK(fdp); return (0); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrstat(sbp); #endif return (error); } /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int sys_nfstat(struct thread *td, struct nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { struct file *fp; struct vnode *vp; cap_rights_t rights; int error; error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp); if (error != 0) return (error); /* If asynchronous I/O is available, it works for all descriptors. */ if (uap->name == _PC_ASYNC_IO) { td->td_retval[0] = async_io_version; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, uap->name, td->td_retval); VOP_UNLOCK(vp, 0); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (uap->name != _PC_PIPE_BUF) { error = EINVAL; } else { td->td_retval[0] = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Initialize filecaps structure. */ void filecaps_init(struct filecaps *fcaps) { bzero(fcaps, sizeof(*fcaps)); fcaps->fc_nioctls = -1; } /* * Copy filecaps structure allocating memory for ioctls array if needed. */ void filecaps_copy(const struct filecaps *src, struct filecaps *dst) { size_t size; *dst = *src; if (src->fc_ioctls != NULL) { KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); bcopy(src->fc_ioctls, dst->fc_ioctls, size); } } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ void filecaps_free(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); bzero(fcaps, sizeof(*fcaps)); } /* * Validate the given filecaps structure. */ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accomodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; /* * If lastfile is -1 this struct filedesc was just allocated and we are * growing it to accomodate for the one we are going to copy from. There * is no need to have a lock on this one as it's not visible to anyone. */ if (fdp->fd_lastfile != -1) FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Do not free the old file table, as some threads may still * reference entries within it. Instead, place it on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd >= fdp->fd_nfiles) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, allocfd); PROC_UNLOCK(p); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file decriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags) { struct file *fp; int error, fd; error = falloc_noinstall(td, &fp); if (error) return (error); /* no reference held on error */ error = finstall(td, fp, &fd, flags, NULL); if (error) { fdrop(fp, td); /* one reference (fp only) */ return (error); } if (resultfp != NULL) *resultfp = fp; /* copy out result */ else fdrop(fp, td); /* release local reference */ if (resultfd != NULL) *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. */ int falloc_noinstall(struct thread *td, struct file **resultfp) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); if ((openfiles >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, " "please see tuning(7).\n", td->td_ucred->cr_ruid); } return (ENFILE); } atomic_add_int(&openfiles, 1); fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); refcount_init(&fp->f_count, 1); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); if ((error = fdalloc(td, 0, fd))) { FILEDESC_XUNLOCK(fdp); return (error); } fhold(fp); _finstall(fdp, fp, *fd, flags, fcaps); FILEDESC_XUNLOCK(fdp); return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(struct filedesc *fdp, bool prepfiles) { struct filedesc0 *newfdp0; struct filedesc *newfdp; newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_cmask = CMASK; newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_lastfile = -1; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; if (fdp == NULL) return (newfdp); if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles) fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_SLOCK(fdp); newfdp->fd_cdir = fdp->fd_cdir; if (newfdp->fd_cdir) VREF(newfdp->fd_cdir); newfdp->fd_rdir = fdp->fd_rdir; if (newfdp->fd_rdir) VREF(newfdp->fd_rdir); newfdp->fd_jdir = fdp->fd_jdir; if (newfdp->fd_jdir) VREF(newfdp->fd_jdir); if (!prepfiles) { FILEDESC_SUNLOCK(fdp); } else { while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_SLOCK(fdp); } } return (newfdp); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = p->p_fd; if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static void fddrop(struct filedesc *fdp) { if (fdp->fd_holdcnt > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (p->p_fd->fd_refcnt == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i; MPASS(fdp != NULL); newfdp = fdinit(fdp, true); /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { ofde = &fdp->fd_ofiles[i]; if (ofde->fde_file == NULL || (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps); fhold(nfde->fde_file); fdused_init(newfdp, i); newfdp->fd_lastfile = i; } if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. * one of processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0; i <= fdp->fd_lastfile; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp == NULL || fp->f_type != DTYPE_VNODE) continue; fhold(fp); FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or do_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ void fdescfree(struct thread *td) { struct proc *p; struct filedesc0 *fdp0; struct filedesc *fdp; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; struct vnode *cdir, *jdir, *rdir; int i; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_set(p, RACCT_NOFILE, 0); PROC_UNLOCK(p); } #endif if (td->td_proc->p_fdtol != NULL) fdclearlocks(td); PROC_LOCK(p); p->p_fd = NULL; PROC_UNLOCK(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; FILEDESC_XLOCK(fdp); cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_XUNLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL) { fdefree_last(fde); (void) closef(fp, td); } } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); if (cdir != NULL) vrele(cdir); if (rdir != NULL) vrele(rdir); if (jdir != NULL) vrele(jdir); fddrop(fdp); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, 0); /* closefp() drops the FILEDESC lock. */ } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = do_dup(td, DUP_FIXED, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop(fp, td)); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. */ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, seq_t *seqp) { #ifdef CAPABILITIES struct filedescent *fde; #endif struct fdescenttbl *fdt; struct file *fp; u_int count; #ifdef CAPABILITIES seq_t seq; cap_rights_t haverights; int error; #endif fdt = fdp->fd_files; if ((u_int)fd >= fdt->fdt_nfiles) return (EBADF); /* * Fetch the descriptor locklessly. We avoid fdrop() races by * never raising a refcount above 0. To accomplish this we have * to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain * that the identity is still correct and we did not lose a race * due to preemption. */ for (;;) { #ifdef CAPABILITIES seq = seq_read(fd_seq(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde(fde); fp = fde->fde_file; if (!seq_consistent(fd_seq(fdt, fd), seq)) { cpu_spinwait(); continue; } #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (fp == NULL) return (EBADF); #ifdef CAPABILITIES error = cap_check(&haverights, needrightsp); if (error != 0) return (error); #endif retry: count = fp->f_count; if (count == 0) { /* * Force a reload. Other thread could reallocate the * table before this fd was closed, so it possible that * there is a stale fp pointer in cached version. */ fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) == 0) goto retry; fdt = fdp->fd_files; #ifdef CAPABILITIES if (seq_consistent_nomb(fd_seq(fdt, fd), seq)) #else if (fp == fdt->fdt_ofiles[fd].fde_file) #endif break; fdrop(fp, curthread); } *fpp = fp; if (seqp != NULL) { #ifdef CAPABILITIES *seqp = seq; #endif } return (0); } /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occured the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp, seq_t *seqp) { struct filedesc *fdp; struct file *fp; - cap_rights_t needrights; int error; *fpp = NULL; fdp = td->td_proc->p_fd; - if (needrightsp == NULL) - needrightsp = cap_rights_init(&needrights); error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp); if (error != 0) return (error); if (fp->f_ops == &badfileops) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. */ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if ((fp->f_flag & (FREAD | FEXEC)) == 0 || ((fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp, NULL)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp, NULL); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; #else struct filedesc *fdp = td->td_proc->p_fd; seq_t seq; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = _fget(td, fd, fpp, 0, rightsp, &seq); if (error != 0) return (error); /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(cap_rights(fdp, fd)); if (!fd_modified(fdp, fd, seq)) break; fdrop(*fpp, td); } #endif return (error); } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp, NULL)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp, NULL)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { struct filedesc *fdp = td->td_proc->p_fd; #ifndef CAPABILITIES return (fget_unlocked(fdp, fd, rightsp, fpp, NULL)); #else int error; seq_t seq; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked(fdp, fd, rightsp, fpp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(*fpp, td); } if (error != 0) { fdrop(*fpp, td); *fpp = NULL; } return (error); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp, NULL); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filedesc *fdp; struct file *fp; #ifdef CAPABILITIES int error; #endif fdp = td->td_proc->p_fd; fp = fget_locked(fdp, fd); if (fp == NULL || fp->f_ops == &badfileops) return (EBADF); #ifdef CAPABILITIES if (needrightsp != NULL) { error = cap_check(cap_rights(fdp, fd), needrightsp); if (error != 0) return (error); } #endif if (fp->f_vnode == NULL) return (EINVAL); *vpp = fp->f_vnode; vref(*vpp); filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps); return (0); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Like fget() but loads the underlying socket, or returns an error if the * descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX lock * in the future. * * Note: fgetsock() and fputsock() are deprecated, as consumers should rely * on their file descriptor reference to prevent the socket from being free'd * during use. */ int fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp, u_int *fflagp) { struct file *fp; int error; *spp = NULL; if (fflagp != NULL) *fflagp = 0; if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; } else { *spp = fp->f_data; if (fflagp) *fflagp = fp->f_flag; SOCK_LOCK(*spp); soref(*spp); SOCK_UNLOCK(*spp); } fdrop(fp, td); return (error); } /* * Drop the reference count on the socket and XXX release the SX lock in the * future. The last reference closes the socket. * * Note: fputsock() is deprecated, see comment for fgetsock(). */ void fputsock(struct socket *so) { ACCEPT_LOCK(); SOCK_LOCK(so); CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); } /* * Handle the last reference to a file being closed. */ int _fdrop(struct file *fp, struct thread *td) { int error; if (fp->f_count != 0) panic("fdrop: count %d", fp->f_count); error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; cap_rights_t rights; int error; error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } fhold(fp); newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == olddp) { vref(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vref(newdp); fdp->fd_rdir = newdp; nrele++; } if (fdp->fd_jdir == olddp) { vref(newdp); fdp->fd_jdir = newdp; nrele++; } FILEDESC_XUNLOCK(fdp); fddrop(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vref(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vref(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vref(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ if (fdp->fd_lastfile > 0) n += fdp->fd_lastfile; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ static void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. */ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = fp->f_count; kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. */ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; pack_kinfo(kif); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; pack_kinfo(kif); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) { /* Terminate export. */ efbuf->remainder = 0; return (0); } efbuf->remainder -= kif->kf_structsize; } return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); if (efbuf->fdp != NULL) FILEDESC_SUNLOCK(efbuf->fdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif); error = export_kinfo_to_sb(efbuf); if (efbuf->fdp != NULL) FILEDESC_SLOCK(efbuf->fdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct file *fp; struct filedesc *fdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; int error, i; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = p->p_tracevp; if (tracevp != NULL) vref(tracevp); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vref(textvp); /* Controlling tty. */ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vref(cttyvp); } fdp = fdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; if (tracevp != NULL) export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (textvp != NULL) export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (cttyvp != NULL) export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); error = 0; if (fdp == NULL) goto fail; efbuf->fdp = fdp; FILEDESC_SLOCK(fdp); /* working directory */ if (fdp->fd_cdir != NULL) { vref(fdp->fd_cdir); export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (fdp->fd_rdir != NULL) { vref(fdp->fd_rdir); export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (fdp->fd_jdir != NULL) { vref(fdp->fd_jdir); export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ cap_rights_init(&rights); #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || efbuf->remainder == 0) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); fail: free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif #ifdef COMPAT_FREEBSD7 static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; okif->kf_vnode_type = kif->kf_vnode_type; okif->kf_sock_domain = kif->kf_sock_domain; okif->kf_sock_type = kif->kf_sock_type; okif->kf_sock_protocol = kif->kf_sock_protocol; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); okif->kf_sa_local = kif->kf_sa_local; okif->kf_sa_peer = kif->kf_sa_peer; } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req) { int error; vref(vp); FILEDESC_SUNLOCK(fdp); export_vnode_to_kinfo(vp, type, 0, kif); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; int error, i, *name; struct file *fp; struct proc *p; name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (ENOENT); kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); FILEDESC_SLOCK(fdp); if (fdp->fd_cdir != NULL) export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, okif, fdp, req); if (fdp->fd_rdir != NULL) export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, okif, fdp, req); if (fdp->fd_jdir != NULL) export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, okif, fdp, req); for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; export_file_to_kinfo(fp, i, NULL, kif, fdp); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct filedesc *fdp; struct export_fd_buf *efbuf; int error; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = fdp; efbuf->sb = sb; efbuf->remainder = maxlen; FILEDESC_SLOCK(fdp); if (fdp->fd_cdir == NULL) error = EINVAL; else { vref(fdp->fd_cdir); error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnod"); case DTYPE_SOCKET: return ("sock"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kque"); case DTYPE_CRYPTO: return ("crpt"); case DTYPE_MQUEUE: return ("mque"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { struct proc *p; if (header) db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File", "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, 0, fp->f_count, 0, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, int kflags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, int kflags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 285171) +++ head/sys/kern/vfs_aio.c (revision 285172) @@ -1,3073 +1,3074 @@ /*- * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_vfs_aio.h" /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #define JOBST_NULL 0 #define JOBST_JOBQSOCK 1 #define JOBST_JOBQGLOBAL 2 #define JOBST_JOBRUNNING 3 #define JOBST_JOBFINISHED 4 #define JOBST_JOBQBUF 5 #define JOBST_JOBQSYNC 6 #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef MAX_AIO_PROCS #define MAX_AIO_PROCS 32 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ #endif #ifndef TARGET_AIO_PROCS #define TARGET_AIO_PROCS 4 #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif #ifndef AIOD_TIMEOUT_DEFAULT #define AIOD_TIMEOUT_DEFAULT (10 * hz) #endif #ifndef AIOD_LIFETIME_DEFAULT #define AIOD_LIFETIME_DEFAULT (30 * hz) #endif FEATURE(aio, "Asynchronous I/O"); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel threads to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel threads for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. */ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel threads for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); /* Number of async I/O thread in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_timeout; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0, "Timeout value for synchronous aio operations"); static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int unloadable = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0, "Allow unload of aio (not recommended)"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process (stored in the process)"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process (stored in the process)"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process (stored in the process)"); typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; /* * Below is a key of locks used to protect each member of struct aiocblist * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * Current, there is only two backends: BIO and generic file I/O. * socket I/O is served by generic file I/O, this is not a good idea, since * disk file I/O and any other types without O_NONBLOCK flag can block daemon * threads, if there is no thread to serve socket I/O, the socket I/O will be * delayed too long or starved, we should create some threads dedicated to * sockets to do non-blocking I/O, same for pipe and fifo, for these I/O * systems we really need non-blocking interface, fiddling O_NONBLOCK in file * structure is not safe because there is race between userland and aio * daemons. */ struct aiocblist { TAILQ_ENTRY(aiocblist) list; /* (b) internal list of for backend */ TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */ TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */ int jobflags; /* (a) job flags */ int jobstate; /* (b) job state */ int inputcharge; /* (*) input blockes */ int outputcharge; /* (*) output blockes */ struct bio *bp; /* (*) BIO backend BIO pointer */ struct buf *pbuf; /* (*) BIO backend buffer pointer */ struct vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */ int npages; /* BIO backend number of pages */ struct proc *userproc; /* (*) user process */ struct ucred *cred; /* (*) active credential when created */ struct file *fd_file; /* (*) pointer to file structure */ struct aioliojob *lio; /* (*) optional lio job */ struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */ struct knlist klist; /* (a) list of knotes */ struct aiocb uaiocb; /* (*) kernel I/O control block */ ksiginfo_t ksi; /* (a) realtime signal info */ uint64_t seqno; /* (*) job number */ int pending; /* (a) number of pending I/O, aio_fsync only */ }; /* jobflags */ #define AIOCBLIST_DONE 0x01 #define AIOCBLIST_BUFDONE 0x02 #define AIOCBLIST_RUNDOWN 0x04 #define AIOCBLIST_CHECKSYNC 0x08 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aiothreadlist { int aiothreadflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */ struct thread *aiothread; /* (*) the AIO thread */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) listio flags */ int lioj_finished_count; /* (a) listio flags */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_maxactive_count; /* (*) maximum number of AIOs */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_qallowed_count; /* (*) maxiumu size of AIO queue */ int kaio_count; /* (a) size of AIO queue */ int kaio_ballowed_count; /* (*) maximum number of buffers */ int kaio_buffer_count; /* (a) number of physio buffers */ TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */ TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */ TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets, * NOT USED YET. */ TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */ struct task kaio_task; /* (*) task to kick aio threads */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations. */ struct aiocb_ops { int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static struct mtx aio_sock_mtx; static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct aiocblist *aiocbe); static void aio_process_rw(struct aiocblist *aiocbe); static void aio_process_sync(struct aiocblist *aiocbe); static void aio_process_mlock(struct aiocblist *aiocbe); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lio, int type, struct aiocb_ops *ops); static void aio_physwakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qphysio(struct proc *p, struct aiocblist *iocb); static void aio_daemon(void *param); static void aio_swake_cb(struct socket *, struct sockbuf *); static int aio_unload(void); static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type); #define DONE_BUF 1 #define DONE_QUEUE 2 static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io thread data * aiocb async io jobs * aiol list io job pointer - internal to aio_suspend XXX * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; static struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_bio); /* * Main operations function for use as a kernel module. */ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_UNLOAD: error = aio_unload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; static struct syscall_helper_data aio_syscalls[] = { SYSCALL_INIT_HELPER(aio_cancel), SYSCALL_INIT_HELPER(aio_error), SYSCALL_INIT_HELPER(aio_fsync), SYSCALL_INIT_HELPER(aio_mlock), SYSCALL_INIT_HELPER(aio_read), SYSCALL_INIT_HELPER(aio_return), SYSCALL_INIT_HELPER(aio_suspend), SYSCALL_INIT_HELPER(aio_waitcomplete), SYSCALL_INIT_HELPER(aio_write), SYSCALL_INIT_HELPER(lio_listio), SYSCALL_INIT_HELPER(oaio_read), SYSCALL_INIT_HELPER(oaio_write), SYSCALL_INIT_HELPER(olio_listio), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include static struct syscall_helper_data aio32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_aio_return), SYSCALL32_INIT_HELPER(freebsd32_aio_suspend), SYSCALL32_INIT_HELPER(freebsd32_aio_cancel), SYSCALL32_INIT_HELPER(freebsd32_aio_error), SYSCALL32_INIT_HELPER(freebsd32_aio_fsync), SYSCALL32_INIT_HELPER(freebsd32_aio_mlock), SYSCALL32_INIT_HELPER(freebsd32_aio_read), SYSCALL32_INIT_HELPER(freebsd32_aio_write), SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete), SYSCALL32_INIT_HELPER(freebsd32_lio_listio), SYSCALL32_INIT_HELPER(freebsd32_oaio_read), SYSCALL32_INIT_HELPER(freebsd32_oaio_write), SYSCALL32_INIT_HELPER(freebsd32_olio_listio), SYSCALL_INIT_LAST }; #endif DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { int error; /* XXX: should probably just use so->callback */ aio_swake = &aio_swake_cb; exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_timeout = AIOD_TIMEOUT_DEFAULT; aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; async_io_version = _POSIX_VERSION; p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); error = syscall_helper_register(aio_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(aio32_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #endif return (0); } /* * Callback for unload of AIO when used as a module. */ static int aio_unload(void) { int error; /* * XXX: no unloads by default, it's too dangerous. * perhaps we could do it if locked out callers and then * did an aio_proc_rundown() on each process. * * jhb: aio_proc_rundown() needs to run on curproc though, * so I don't think that would fly. */ if (!unloadable) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(aio32_syscalls); #endif syscall_helper_unregister(aio_syscalls); error = kqueue_del_filteropts(EVFILT_AIO); if (error) return error; error = kqueue_del_filteropts(EVFILT_LIO); if (error) return error; async_io_version = 0; aio_swake = NULL; taskqueue_free(taskqueue_aiod_bio); delete_unrhdr(aiod_unr); uma_zdestroy(kaio_zone); uma_zdestroy(aiop_zone); uma_zdestroy(aiocb_zone); uma_zdestroy(aiol_zone); uma_zdestroy(aiolio_zone); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); EVENTHANDLER_DEREGISTER(process_exec, exec_tag); mtx_destroy(&aio_job_mtx); mtx_destroy(&aio_sock_mtx); sema_destroy(&aio_newproc_sem); p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1); p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF); ki->kaio_flags = 0; ki->kaio_maxactive_count = max_aio_per_proc; ki->kaio_active_count = 0; ki->kaio_qallowed_count = max_aio_queue_per_proc; ki->kaio_count = 0; ki->kaio_ballowed_count = max_buf_aio; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_bufqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_sockqueue); TAILQ_INIT(&ki->kaio_syncqueue); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= KSI_EXT | KSI_INS; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. */ static int aio_free_entry(struct aiocblist *aiocbe) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = aiocbe->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(aiocbe->jobstate == JOBST_JOBFINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist); TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist); lj = aiocbe->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* aiocbe is going away, we need to destroy any knotes */ knlist_delete(&aiocbe->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&aiocbe->ksi); PROC_UNLOCK(p); MPASS(aiocbe->bp == NULL); aiocbe->jobstate = JOBST_NULL; AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * an aiocblist from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (aiocbe->fd_file) fdrop(aiocbe->fd_file, curthread); crfree(aiocbe->cred); uma_zfree(aiocb_zone, aiocbe); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } /* * Rundown the jobs for a given process. */ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct aiocblist *cbe, *cbn; struct file *fp; struct socket *so; int remove; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { remove = 0; mtx_lock(&aio_job_mtx); if (cbe->jobstate == JOBST_JOBQGLOBAL) { TAILQ_REMOVE(&aio_jobs, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSOCK) { fp = cbe->fd_file; MPASS(fp->f_type == DTYPE_SOCKET); so = fp->f_data; TAILQ_REMOVE(&so->so_aiojobq, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSYNC) { TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); remove = 1; } mtx_unlock(&aio_job_mtx); if (remove) { cbe->jobstate = JOBST_JOBFINISHED; cbe->uaiocb._aiocb_private.status = -1; cbe->uaiocb._aiocb_private.error = ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); aio_bio_done_notify(p, cbe, DONE_QUEUE); } } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_bufqueue) || TAILQ_FIRST(&ki->kaio_jobqueue)) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(cbe); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct aiocblist * aio_selectjob(struct aiothreadlist *aiop) { struct aiocblist *aiocbe; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_FOREACH(aiocbe, &aio_jobs, list) { userp = aiocbe->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < ki->kaio_maxactive_count) { TAILQ_REMOVE(&aio_jobs, aiocbe, list); /* Account for currently active jobs. */ ki->kaio_active_count++; aiocbe->jobstate = JOBST_JOBRUNNING; break; } } return (aiocbe); } /* * Move all data to a permanent storage device, this code * simulates fsync syscall. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); drop: return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-physio version of the operations. The * normal vn operations are used, and this code should work in all instances * for every type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. */ static void aio_process_rw(struct aiocblist *aiocbe) { struct ucred *td_savedcred; struct thread *td; struct aiocb *cb; struct file *fp; struct socket *so; struct uio auio; struct iovec aiov; int cnt; int error; int oublock_st, oublock_end; int inblock_st, inblock_end; KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ || aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE, ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode)); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = aiocbe->cred; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; if (auio.uio_resid == 0) error = 0; else error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; aiocbe->inputcharge = inblock_end - inblock_st; aiocbe->outputcharge = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { int sigpipe = 1; if (fp->f_type == DTYPE_SOCKET) { so = fp->f_data; if (so->so_options & SO_NOSIGPIPE) sigpipe = 0; } if (sigpipe) { PROC_LOCK(aiocbe->userproc); kern_psignal(aiocbe->userproc, SIGPIPE); PROC_UNLOCK(aiocbe->userproc); } } } cnt -= auio.uio_resid; cb->_aiocb_private.error = error; cb->_aiocb_private.status = cnt; td->td_ucred = td_savedcred; } static void aio_process_sync(struct aiocblist *aiocbe) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct aiocb *cb = &aiocbe->uaiocb; struct file *fp = aiocbe->fd_file; int error = 0; KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC, ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode)); td->td_ucred = aiocbe->cred; if (fp->f_vnode != NULL) error = aio_fsync_vnode(td, fp->f_vnode); cb->_aiocb_private.error = error; cb->_aiocb_private.status = 0; td->td_ucred = td_savedcred; } static void aio_process_mlock(struct aiocblist *aiocbe) { struct aiocb *cb = &aiocbe->uaiocb; int error; KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode)); error = vm_mlock(aiocbe->userproc, aiocbe->cred, __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes); cb->_aiocb_private.error = error; cb->_aiocb_private.status = 0; } static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type) { struct aioliojob *lj; struct kaioinfo *ki; struct aiocblist *scb, *scbn; int lj_done; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = aiocbe->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } if (type == DONE_QUEUE) { aiocbe->jobflags |= AIOCBLIST_DONE; } else { aiocbe->jobflags |= AIOCBLIST_BUFDONE; } TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist); aiocbe->jobstate = JOBST_JOBFINISHED; if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi); KNOTE_LOCKED(&aiocbe->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) { TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) { if (aiocbe->fd_file == scb->fd_file && aiocbe->seqno < scb->seqno) { if (--scb->pending == 0) { mtx_lock(&aio_job_mtx); scb->jobstate = JOBST_JOBQGLOBAL; TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list); TAILQ_INSERT_TAIL(&aio_jobs, scb, list); aio_kick_nowait(userp); mtx_unlock(&aio_job_mtx); } } } } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct aiocblist *aiocbe; struct aiothreadlist *aiop; struct kaioinfo *ki; struct proc *curcp, *mycp, *userp; struct vmspace *myvm, *tmpvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Local copies of curproc (cp) and vmspace (myvm) */ mycp = td->td_proc; myvm = mycp->p_vmspace; KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aiothread = td; aiop->aiothreadflags = 0; /* The daemon resides in its own pgrp. */ sys_setsid(td, NULL); /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * curcp is the current daemon process context. * userp is the current user process context. */ curcp = mycp; /* * Take daemon off of free queue */ if (aiop->aiothreadflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((aiocbe = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); userp = aiocbe->userproc; /* * Connect to process address space for user program. */ if (userp != curcp) { /* * Save the current address space that we are * connected to. */ tmpvm = mycp->p_vmspace; /* * Point to the new user address space, and * refer to it. */ mycp->p_vmspace = userp->p_vmspace; atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1); /* Activate the new mapping. */ pmap_activate(FIRST_THREAD_IN_PROC(mycp)); /* * If the old address space wasn't the daemons * own address space, then we need to remove the * daemon's reference from the other process * that it was acting on behalf of. */ if (tmpvm != myvm) { vmspace_free(tmpvm); } curcp = userp; } ki = userp->p_aioinfo; /* Do the I/O function. */ switch(aiocbe->uaiocb.aio_lio_opcode) { case LIO_READ: case LIO_WRITE: aio_process_rw(aiocbe); break; case LIO_SYNC: aio_process_sync(aiocbe); break; case LIO_MLOCK: aio_process_mlock(aiocbe); break; } mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; mtx_unlock(&aio_job_mtx); AIO_LOCK(ki); TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist); aio_bio_done_notify(userp, aiocbe, DONE_QUEUE); AIO_UNLOCK(ki); mtx_lock(&aio_job_mtx); } /* * Disconnect from user address space. */ if (curcp != mycp) { mtx_unlock(&aio_job_mtx); /* Get the user address space to disconnect from. */ tmpvm = mycp->p_vmspace; /* Get original address space for daemon. */ mycp->p_vmspace = myvm; /* Activate the daemon's address space. */ pmap_activate(FIRST_THREAD_IN_PROC(mycp)); #ifdef DIAGNOSTIC if (tmpvm == myvm) { printf("AIOD: vmspace problem -- %d\n", mycp->p_pid); } #endif /* Remove our vmspace reference. */ vmspace_free(tmpvm); curcp = mycp; mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected, that should be * curcp == mycp. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aiothreadflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. */ if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime)) { if (TAILQ_EMPTY(&aio_jobs)) { if ((aiop->aiothreadflags & AIOP_FREE) && (num_aio_procs > target_aio_procs)) { TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); #ifdef DIAGNOSTIC if (mycp->p_vmspace->vm_refcnt <= 1) { printf("AIOD: bad vm refcnt for" " exiting daemon: %d\n", mycp->p_vmspace->vm_refcnt); } #endif kproc_exit(0); } } } } mtx_unlock(&aio_job_mtx); panic("shouldn't be here\n"); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead physio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qphysio(struct proc *p, struct aiocblist *aiocbe) { struct aiocb *cb; struct file *fp; struct bio *bp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; struct aioliojob *lj; int error, ref, unmap, poff; vm_prot_t prot; cb = &aiocbe->uaiocb; fp = aiocbe->fd_file; if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (cb->aio_nbytes > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed); if (unmap) { if (cb->aio_nbytes > MAXPHYS) { error = -1; goto unref; } } else { if (cb->aio_nbytes > MAXPHYS - poff) { error = -1; goto unref; } if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) { error = -1; goto unref; } } aiocbe->bp = bp = g_alloc_bio(); if (!unmap) { aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL); BUF_KERNPROC(pbuf); } AIO_LOCK(ki); ki->kaio_count++; if (!unmap) ki->kaio_buffer_count++; lj = aiocbe->lio; if (lj) lj->lioj_count++; TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); aiocbe->jobstate = JOBST_JOBQBUF; cb->_aiocb_private.status = cb->aio_nbytes; AIO_UNLOCK(ki); bp->bio_length = cb->aio_nbytes; bp->bio_bcount = cb->aio_nbytes; bp->bio_done = aio_physwakeup; bp->bio_data = (void *)(uintptr_t)cb->aio_buf; bp->bio_offset = cb->aio_offset; bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; bp->bio_dev = dev; bp->bio_caller1 = (void *)aiocbe; prot = VM_PROT_READ; if (cb->aio_lio_opcode == LIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((aiocbe->npages = vm_fault_quick_hold_pages( &curproc->p_vmspace->vm_map, (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages, sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) { error = EFAULT; goto doerror; } if (!unmap) { pmap_qenter((vm_offset_t)pbuf->b_data, aiocbe->pages, aiocbe->npages); bp->bio_data = pbuf->b_data + poff; } else { bp->bio_ma = aiocbe->pages; bp->bio_ma_n = aiocbe->npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; } atomic_add_int(&num_queue_count, 1); if (!unmap) atomic_add_int(&num_buf_aio, 1); /* Perform transfer. */ csw->d_strategy(bp); dev_relthread(dev, ref); return (0); doerror: AIO_LOCK(ki); aiocbe->jobstate = JOBST_NULL; TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist); ki->kaio_count--; if (!unmap) ki->kaio_buffer_count--; if (lj) lj->lioj_count--; AIO_UNLOCK(ki); if (pbuf) { relpbuf(pbuf, NULL); aiocbe->pbuf = NULL; } g_destroy_bio(bp); aiocbe->bp = NULL; unref: dev_relthread(dev, ref); return (error); } /* * Wake up aio requests that may be serviceable now. */ static void aio_swake_cb(struct socket *so, struct sockbuf *sb) { struct aiocblist *cb, *cbn; int opcode; SOCKBUF_LOCK_ASSERT(sb); if (sb == &so->so_snd) opcode = LIO_WRITE; else opcode = LIO_READ; sb->sb_flags &= ~SB_AIO; mtx_lock(&aio_job_mtx); TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) { if (opcode == cb->uaiocb.aio_lio_opcode) { if (cb->jobstate != JOBST_JOBQSOCK) panic("invalid queue value"); /* XXX * We don't have actual sockets backend yet, * so we simply move the requests to the generic * file I/O backend. */ TAILQ_REMOVE(&so->so_aiojobq, cb, list); TAILQ_INSERT_TAIL(&aio_jobs, cb, list); aio_kick_nowait(cb->userproc); } } mtx_unlock(&aio_job_mtx); } static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb *ojob; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, kjob, sizeof(struct oaiocb)); if (error) return (error); ojob = (struct oaiocb *)kjob; return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); } static int aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) { return (copyin(ujob, kjob, sizeof(struct aiocb))); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; static struct aiocb_ops aiocb_ops_osigevent = { .copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; /* * Queue a new AIO request. Choosing either the threaded or direct physio VCHR * technique is done in this code. */ int aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; cap_rights_t rights; struct file *fp; struct socket *so; struct aiocblist *aiocbe, *cb; struct kaioinfo *ki; struct kevent kev; struct sockbuf *sb; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(job, -1); ops->store_error(job, 0); ops->store_kernelinfo(job, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= ki->kaio_qallowed_count) { ops->store_error(job, EAGAIN); return (EAGAIN); } aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki)); error = ops->copyin(job, &aiocbe->uaiocb); if (error) { ops->store_error(job, error); uma_zfree(aiocb_zone, aiocbe); return (error); } /* XXX: aio_nbytes is later casted to signed types. */ if (aiocbe->uaiocb.aio_nbytes > INT_MAX) { uma_zfree(aiocb_zone, aiocbe); return (EINVAL); } if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { ops->store_error(job, EINVAL); uma_zfree(aiocb_zone, aiocbe); return (EINVAL); } if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, aiocbe); return (EINVAL); } ksiginfo_init(&aiocbe->ksi); /* Save userspace address of the job info. */ aiocbe->uuaiocb = job; /* Get the opcode. */ if (type != LIO_NOP) aiocbe->uaiocb.aio_lio_opcode = type; opcode = aiocbe->uaiocb.aio_lio_opcode; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capabiltity * should be. */ fd = aiocbe->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp); break; case LIO_READ: error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp); break; case LIO_SYNC: error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp); break; case LIO_MLOCK: fp = NULL; break; case LIO_NOP: error = fget(td, fd, cap_rights_init(&rights), &fp); break; default: error = EINVAL; } if (error) { uma_zfree(aiocb_zone, aiocbe); ops->store_error(job, error); return (error); } if (opcode == LIO_SYNC && fp->f_vnode == NULL) { error = EINVAL; goto aqueue_fail; } if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) { error = EINVAL; goto aqueue_fail; } aiocbe->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; aiocbe->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(job, jid); if (error) { error = EINVAL; goto aqueue_fail; } aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, aiocbe); return (0); } if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto aqueue_fail; } kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue; kev.ident = (uintptr_t)aiocbe->uuaiocb; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)aiocbe; kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, 1); aqueue_fail: if (error) { if (fp) fdrop(fp, td); uma_zfree(aiocb_zone, aiocbe); ops->store_error(job, error); goto done; } no_kqueue: ops->store_error(job, EINPROGRESS); aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; aiocbe->userproc = p; aiocbe->cred = crhold(td->td_ucred); aiocbe->jobflags = 0; aiocbe->lio = lj; if (opcode == LIO_SYNC) goto queueit; if (fp && fp->f_type == DTYPE_SOCKET) { /* * Alternate queueing for socket ops: Reach down into the * descriptor to get the socket data. Then check to see if the * socket is ready to be read or written (based on the requested * operation). * * If it is not ready for io, then queue the aiocbe on the * socket, and set the flags so we get a call when sbnotify() * happens. * * Note if opcode is neither LIO_WRITE nor LIO_READ we lock * and unlock the snd sockbuf for no reason. */ so = fp->f_data; sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd; SOCKBUF_LOCK(sb); if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode == LIO_WRITE) && (!sowriteable(so)))) { sb->sb_flags |= SB_AIO; mtx_lock(&aio_job_mtx); TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list); mtx_unlock(&aio_job_mtx); AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); aiocbe->jobstate = JOBST_JOBQSOCK; ki->kaio_count++; if (lj) lj->lioj_count++; AIO_UNLOCK(ki); SOCKBUF_UNLOCK(sb); atomic_add_int(&num_queue_count, 1); error = 0; goto done; } SOCKBUF_UNLOCK(sb); } if ((error = aio_qphysio(p, aiocbe)) == 0) goto done; #if 0 if (error > 0) { aiocbe->uaiocb._aiocb_private.error = error; ops->store_error(job, error); goto done; } #endif queueit: atomic_add_int(&num_queue_count, 1); AIO_LOCK(ki); ki->kaio_count++; if (lj) lj->lioj_count++; TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist); if (opcode == LIO_SYNC) { TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) { if (cb->fd_file == aiocbe->fd_file && cb->uaiocb.aio_lio_opcode != LIO_SYNC && cb->seqno < aiocbe->seqno) { cb->jobflags |= AIOCBLIST_CHECKSYNC; aiocbe->pending++; } } TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) { if (cb->fd_file == aiocbe->fd_file && cb->uaiocb.aio_lio_opcode != LIO_SYNC && cb->seqno < aiocbe->seqno) { cb->jobflags |= AIOCBLIST_CHECKSYNC; aiocbe->pending++; } } if (aiocbe->pending != 0) { TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list); aiocbe->jobstate = JOBST_JOBQSYNC; AIO_UNLOCK(ki); goto done; } } mtx_lock(&aio_job_mtx); TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); aiocbe->jobstate = JOBST_JOBQGLOBAL; aio_kick_nowait(p); mtx_unlock(&aio_job_mtx); AIO_UNLOCK(ki); error = 0; done: return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aiothreadlist *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; wakeup(aiop->aiothread); } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && ((ki->kaio_active_count + num_aio_resv_start) < ki->kaio_maxactive_count)) { taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aiothreadlist *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aiothreadflags &= ~AIOP_FREE; wakeup(aiop->aiothread); } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && ((ki->kaio_active_count + num_aio_resv_start) < ki->kaio_maxactive_count)) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocblist *cb; struct kaioinfo *ki; int status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(cb, &ki->kaio_done, plist) { if (cb->uuaiocb == uaiocb) break; } if (cb != NULL) { MPASS(cb->jobstate == JOBST_JOBFINISHED); status = cb->uaiocb._aiocb_private.status; error = cb->uaiocb._aiocb_private.error; td->td_retval[0] = status; if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { td->td_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { td->td_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } aio_free_entry(cb); AIO_UNLOCK(ki); ops->store_error(uaiocb, error); ops->store_status(uaiocb, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct aiocblist *cb, *cbfirst; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { cbfirst = NULL; error = 0; TAILQ_FOREACH(cb, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (cb->uuaiocb == ujoblist[i]) { if (cbfirst == NULL) cbfirst = cb; if (cb->jobstate == JOBST_JOBFINISHED) goto RETURN; } } } /* All tasks were finished. */ if (cbfirst == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = uma_zalloc(aiol_zone, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); uma_zfree(aiol_zone, ujoblist); return (error); } /* * aio_cancel cancels any non-physio aio operations not currently in * progress. */ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct aiocblist *cbe, *cbn; struct file *fp; struct socket *so; + cap_rights_t rights; int error; int remove; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. */ - error = fget(td, uap->fd, NULL, &fp); + error = fget(td, uap->fd, cap_rights_init(&rights), &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp, &error)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) { if ((uap->fd == cbe->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == cbe->uuaiocb))) { remove = 0; mtx_lock(&aio_job_mtx); if (cbe->jobstate == JOBST_JOBQGLOBAL) { TAILQ_REMOVE(&aio_jobs, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSOCK) { MPASS(fp->f_type == DTYPE_SOCKET); so = fp->f_data; TAILQ_REMOVE(&so->so_aiojobq, cbe, list); remove = 1; } else if (cbe->jobstate == JOBST_JOBQSYNC) { TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list); remove = 1; } mtx_unlock(&aio_job_mtx); if (remove) { TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist); cbe->uaiocb._aiocb_private.status = -1; cbe->uaiocb._aiocb_private.error = ECANCELED; aio_bio_done_notify(p, cbe, DONE_QUEUE); cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocblist *cb; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(cb, &ki->kaio_all, allist) { if (cb->uuaiocb == aiocbp) { if (cb->jobstate == JOBST_JOBFINISHED) td->td_retval[0] = cb->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = ops->fetch_status(aiocbp); if (status == -1) { td->td_retval[0] = ops->fetch_error(aiocbp); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ int sys_oaio_read(struct thread *td, struct oaio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ int sys_oaio_write(struct thread *td, struct oaio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *iocb; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > AIO_LISTIO_MAX) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nerror = 0; for (i = 0; i < nent; i++) { iocb = acb_list[i]; if (iocb != NULL) { error = aio_aqueue(td, iocb, lj, LIO_NOP, ops); if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); return (error); } /* syscall - list directed I/O (REALTIME) */ int sys_olio_listio(struct thread *td, struct olio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > AIO_LISTIO_MAX) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } /* syscall - list directed I/O (REALTIME) */ int sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > AIO_LISTIO_MAX) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_physwakeup(struct bio *bp) { struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1; struct proc *userp; struct kaioinfo *ki; int nblks; /* Release mapping into kernel space. */ if (aiocbe->pbuf) { pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages); relpbuf(aiocbe->pbuf, NULL); aiocbe->pbuf = NULL; atomic_subtract_int(&num_buf_aio, 1); } vm_page_unhold_pages(aiocbe->pages, aiocbe->npages); bp = aiocbe->bp; aiocbe->bp = NULL; userp = aiocbe->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid; aiocbe->uaiocb._aiocb_private.error = 0; if (bp->bio_flags & BIO_ERROR) aiocbe->uaiocb._aiocb_private.error = bp->bio_error; nblks = btodb(aiocbe->uaiocb.aio_nbytes); if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE) aiocbe->outputcharge += nblks; else aiocbe->inputcharge += nblks; TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist); ki->kaio_buffer_count--; aio_bio_done_notify(userp, aiocbe, DONE_BUF); AIO_UNLOCK(ki); g_destroy_bio(bp); } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct aiocblist *cb; struct aiocb *uuaiocb; int error, status, timo; ops->store_aiocb(aiocbp, NULL); timo = 0; if (ts) { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; cb = NULL; AIO_LOCK(ki); while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (cb != NULL) { MPASS(cb->jobstate == JOBST_JOBFINISHED); uuaiocb = cb->uuaiocb; status = cb->uaiocb._aiocb_private.status; error = cb->uaiocb._aiocb_private.error; td->td_retval[0] = status; if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { td->td_ru.ru_oublock += cb->outputcharge; cb->outputcharge = 0; } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { td->td_ru.ru_inblock += cb->inputcharge; cb->inputcharge = 0; } aio_free_entry(cb); AIO_UNLOCK(ki); ops->store_aiocb(aiocbp, uuaiocb); ops->store_error(uuaiocb, error); ops->store_status(uuaiocb, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaioinfo *ki; if (op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); ki = p->p_aioinfo; if (ki == NULL) aio_init_aioinfo(p); return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata; /* * The aiocbe pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = aiocbe; kn->kn_flags &= ~EV_FLAG1; knlist_add(&aiocbe->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct aiocblist *aiocbe = kn->kn_ptr.p_aio; kn->kn_data = aiocbe->uaiocb._aiocb_private.error; if (aiocbe->jobstate != JOBST_JOBFINISHED) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb32 job32; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } static int aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) { struct aiocb32 job32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; static struct aiocb_ops aiocb32_ops_osigevent = { .copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = uma_zalloc(aiol_zone, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent; i > 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } uma_zfree(aiol_zone, ujoblist); return (error); } int freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap) { return (sys_aio_cancel(td, (struct aio_cancel_args *)uap)); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } int freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > AIO_LISTIO_MAX) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > AIO_LISTIO_MAX) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif Index: head/sys/security/audit/audit_bsm_klib.c =================================================================== --- head/sys/security/audit/audit_bsm_klib.c (revision 285171) +++ head/sys/security/audit/audit_bsm_klib.c (revision 285172) @@ -1,571 +1,573 @@ /* * Copyright (c) 1999-2009 Apple Inc. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Hash table functions for the audit event number to event class mask * mapping. */ #define EVCLASSMAP_HASH_TABLE_SIZE 251 struct evclass_elem { au_event_t event; au_class_t class; LIST_ENTRY(evclass_elem) entry; }; struct evclass_list { LIST_HEAD(, evclass_elem) head; }; static MALLOC_DEFINE(M_AUDITEVCLASS, "audit_evclass", "Audit event class"); static struct rwlock evclass_lock; static struct evclass_list evclass_hash[EVCLASSMAP_HASH_TABLE_SIZE]; #define EVCLASS_LOCK_INIT() rw_init(&evclass_lock, "evclass_lock") #define EVCLASS_RLOCK() rw_rlock(&evclass_lock) #define EVCLASS_RUNLOCK() rw_runlock(&evclass_lock) #define EVCLASS_WLOCK() rw_wlock(&evclass_lock) #define EVCLASS_WUNLOCK() rw_wunlock(&evclass_lock) struct aue_open_event { int aoe_flags; au_event_t aoe_event; }; static const struct aue_open_event aue_open[] = { { O_RDONLY, AUE_OPEN_R }, { (O_RDONLY | O_CREAT), AUE_OPEN_RC }, { (O_RDONLY | O_CREAT | O_TRUNC), AUE_OPEN_RTC }, { (O_RDONLY | O_TRUNC), AUE_OPEN_RT }, { O_RDWR, AUE_OPEN_RW }, { (O_RDWR | O_CREAT), AUE_OPEN_RWC }, { (O_RDWR | O_CREAT | O_TRUNC), AUE_OPEN_RWTC }, { (O_RDWR | O_TRUNC), AUE_OPEN_RWT }, { O_WRONLY, AUE_OPEN_W }, { (O_WRONLY | O_CREAT), AUE_OPEN_WC }, { (O_WRONLY | O_CREAT | O_TRUNC), AUE_OPEN_WTC }, { (O_WRONLY | O_TRUNC), AUE_OPEN_WT }, }; static const int aue_open_count = sizeof(aue_open) / sizeof(aue_open[0]); static const struct aue_open_event aue_openat[] = { { O_RDONLY, AUE_OPENAT_R }, { (O_RDONLY | O_CREAT), AUE_OPENAT_RC }, { (O_RDONLY | O_CREAT | O_TRUNC), AUE_OPENAT_RTC }, { (O_RDONLY | O_TRUNC), AUE_OPENAT_RT }, { O_RDWR, AUE_OPENAT_RW }, { (O_RDWR | O_CREAT), AUE_OPENAT_RWC }, { (O_RDWR | O_CREAT | O_TRUNC), AUE_OPENAT_RWTC }, { (O_RDWR | O_TRUNC), AUE_OPENAT_RWT }, { O_WRONLY, AUE_OPENAT_W }, { (O_WRONLY | O_CREAT), AUE_OPENAT_WC }, { (O_WRONLY | O_CREAT | O_TRUNC), AUE_OPENAT_WTC }, { (O_WRONLY | O_TRUNC), AUE_OPENAT_WT }, }; static const int aue_openat_count = sizeof(aue_openat) / sizeof(aue_openat[0]); /* * Look up the class for an audit event in the class mapping table. */ au_class_t au_event_class(au_event_t event) { struct evclass_list *evcl; struct evclass_elem *evc; au_class_t class; EVCLASS_RLOCK(); evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; class = 0; LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { class = evc->class; goto out; } } out: EVCLASS_RUNLOCK(); return (class); } /* * Insert a event to class mapping. If the event already exists in the * mapping, then replace the mapping with the new one. * * XXX There is currently no constraints placed on the number of mappings. * May want to either limit to a number, or in terms of memory usage. */ void au_evclassmap_insert(au_event_t event, au_class_t class) { struct evclass_list *evcl; struct evclass_elem *evc, *evc_new; /* * Pessimistically, always allocate storage before acquiring mutex. * Free if there is already a mapping for this event. */ evc_new = malloc(sizeof(*evc), M_AUDITEVCLASS, M_WAITOK); EVCLASS_WLOCK(); evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE]; LIST_FOREACH(evc, &evcl->head, entry) { if (evc->event == event) { evc->class = class; EVCLASS_WUNLOCK(); free(evc_new, M_AUDITEVCLASS); return; } } evc = evc_new; evc->event = event; evc->class = class; LIST_INSERT_HEAD(&evcl->head, evc, entry); EVCLASS_WUNLOCK(); } void au_evclassmap_init(void) { int i; EVCLASS_LOCK_INIT(); for (i = 0; i < EVCLASSMAP_HASH_TABLE_SIZE; i++) LIST_INIT(&evclass_hash[i].head); /* * Set up the initial event to class mapping for system calls. * * XXXRW: Really, this should walk all possible audit events, not all * native ABI system calls, as there may be audit events reachable * only through non-native system calls. It also seems a shame to * frob the mutex this early. */ for (i = 0; i < SYS_MAXSYSCALL; i++) { if (sysent[i].sy_auevent != AUE_NULL) au_evclassmap_insert(sysent[i].sy_auevent, 0); } } /* * Check whether an event is aditable by comparing the mask of classes this * event is part of against the given mask. */ int au_preselect(au_event_t event, au_class_t class, au_mask_t *mask_p, int sorf) { au_class_t effmask = 0; if (mask_p == NULL) return (-1); /* * Perform the actual check of the masks against the event. */ if (sorf & AU_PRS_SUCCESS) effmask |= (mask_p->am_success & class); if (sorf & AU_PRS_FAILURE) effmask |= (mask_p->am_failure & class); if (effmask) return (1); else return (0); } /* * Convert sysctl names and present arguments to events. */ au_event_t audit_ctlname_to_sysctlevent(int name[], uint64_t valid_arg) { /* can't parse it - so return the worst case */ if ((valid_arg & (ARG_CTLNAME | ARG_LEN)) != (ARG_CTLNAME | ARG_LEN)) return (AUE_SYSCTL); switch (name[0]) { /* non-admin "lookups" treat them special */ case KERN_OSTYPE: case KERN_OSRELEASE: case KERN_OSREV: case KERN_VERSION: case KERN_ARGMAX: case KERN_CLOCKRATE: case KERN_BOOTTIME: case KERN_POSIX1: case KERN_NGROUPS: case KERN_JOB_CONTROL: case KERN_SAVED_IDS: case KERN_OSRELDATE: case KERN_DUMMY: return (AUE_SYSCTL_NONADMIN); /* only treat the changeable controls as admin */ case KERN_MAXVNODES: case KERN_MAXPROC: case KERN_MAXFILES: case KERN_MAXPROCPERUID: case KERN_MAXFILESPERPROC: case KERN_HOSTID: case KERN_SECURELVL: case KERN_HOSTNAME: case KERN_VNODE: case KERN_PROC: case KERN_FILE: case KERN_PROF: case KERN_NISDOMAINNAME: case KERN_UPDATEINTERVAL: case KERN_NTP_PLL: case KERN_BOOTFILE: case KERN_DUMPDEV: case KERN_IPC: case KERN_PS_STRINGS: case KERN_USRSTACK: case KERN_LOGSIGEXIT: case KERN_IOV_MAX: return ((valid_arg & ARG_VALUE) ? AUE_SYSCTL : AUE_SYSCTL_NONADMIN); default: return (AUE_SYSCTL); } /* NOTREACHED */ } /* * Convert an open flags specifier into a specific type of open event for * auditing purposes. */ au_event_t audit_flags_and_error_to_openevent(int oflags, int error) { int i; /* * Need to check only those flags we care about. */ oflags = oflags & (O_RDONLY | O_CREAT | O_TRUNC | O_RDWR | O_WRONLY); for (i = 0; i < aue_open_count; i++) { if (aue_open[i].aoe_flags == oflags) return (aue_open[i].aoe_event); } return (AUE_OPEN); } au_event_t audit_flags_and_error_to_openatevent(int oflags, int error) { int i; /* * Need to check only those flags we care about. */ oflags = oflags & (O_RDONLY | O_CREAT | O_TRUNC | O_RDWR | O_WRONLY); for (i = 0; i < aue_openat_count; i++) { if (aue_openat[i].aoe_flags == oflags) return (aue_openat[i].aoe_event); } return (AUE_OPENAT); } /* * Convert a MSGCTL command to a specific event. */ au_event_t audit_msgctl_to_event(int cmd) { switch (cmd) { case IPC_RMID: return (AUE_MSGCTL_RMID); case IPC_SET: return (AUE_MSGCTL_SET); case IPC_STAT: return (AUE_MSGCTL_STAT); default: /* We will audit a bad command. */ return (AUE_MSGCTL); } } /* * Convert a SEMCTL command to a specific event. */ au_event_t audit_semctl_to_event(int cmd) { switch (cmd) { case GETALL: return (AUE_SEMCTL_GETALL); case GETNCNT: return (AUE_SEMCTL_GETNCNT); case GETPID: return (AUE_SEMCTL_GETPID); case GETVAL: return (AUE_SEMCTL_GETVAL); case GETZCNT: return (AUE_SEMCTL_GETZCNT); case IPC_RMID: return (AUE_SEMCTL_RMID); case IPC_SET: return (AUE_SEMCTL_SET); case SETALL: return (AUE_SEMCTL_SETALL); case SETVAL: return (AUE_SEMCTL_SETVAL); case IPC_STAT: return (AUE_SEMCTL_STAT); default: /* We will audit a bad command. */ return (AUE_SEMCTL); } } /* * Convert a command for the auditon() system call to a audit event. */ au_event_t auditon_command_event(int cmd) { switch(cmd) { case A_GETPOLICY: return (AUE_AUDITON_GPOLICY); case A_SETPOLICY: return (AUE_AUDITON_SPOLICY); case A_GETKMASK: return (AUE_AUDITON_GETKMASK); case A_SETKMASK: return (AUE_AUDITON_SETKMASK); case A_GETQCTRL: return (AUE_AUDITON_GQCTRL); case A_SETQCTRL: return (AUE_AUDITON_SQCTRL); case A_GETCWD: return (AUE_AUDITON_GETCWD); case A_GETCAR: return (AUE_AUDITON_GETCAR); case A_GETSTAT: return (AUE_AUDITON_GETSTAT); case A_SETSTAT: return (AUE_AUDITON_SETSTAT); case A_SETUMASK: return (AUE_AUDITON_SETUMASK); case A_SETSMASK: return (AUE_AUDITON_SETSMASK); case A_GETCOND: return (AUE_AUDITON_GETCOND); case A_SETCOND: return (AUE_AUDITON_SETCOND); case A_GETCLASS: return (AUE_AUDITON_GETCLASS); case A_SETCLASS: return (AUE_AUDITON_SETCLASS); case A_GETPINFO: case A_SETPMASK: case A_SETFSIZE: case A_GETFSIZE: case A_GETPINFO_ADDR: case A_GETKAUDIT: case A_SETKAUDIT: default: return (AUE_AUDITON); /* No special record */ } } /* * Create a canonical path from given path by prefixing either the root * directory, or the current working directory. If the process working * directory is NULL, we could use 'rootvnode' to obtain the root directory, * but this results in a volfs name written to the audit log. So we will * leave the filename starting with '/' in the audit log in this case. */ void audit_canon_path(struct thread *td, int dirfd, char *path, char *cpath) { struct vnode *cvnp, *rvnp; char *rbuf, *fbuf, *copy; struct filedesc *fdp; struct sbuf sbf; + cap_rights_t rights; int error, needslash; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "%s: at %s:%d", __func__, __FILE__, __LINE__); copy = path; rvnp = cvnp = NULL; fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); /* * Make sure that we handle the chroot(2) case. If there is an * alternate root directory, prepend it to the audited pathname. */ if (fdp->fd_rdir != NULL && fdp->fd_rdir != rootvnode) { rvnp = fdp->fd_rdir; vhold(rvnp); } /* * If the supplied path is relative, make sure we capture the current * working directory so we can prepend it to the supplied relative * path. */ if (*path != '/') { if (dirfd == AT_FDCWD) { cvnp = fdp->fd_cdir; vhold(cvnp); } else { /* XXX: fgetvp() that vhold()s vnode instead of vref()ing it would be better */ - error = fgetvp(td, dirfd, NULL, &cvnp); + error = fgetvp(td, dirfd, cap_rights_init(&rights), &cvnp); if (error) { FILEDESC_SUNLOCK(fdp); cpath[0] = '\0'; if (rvnp != NULL) vdrop(rvnp); return; } vhold(cvnp); vrele(cvnp); } needslash = (fdp->fd_rdir != cvnp); } else { needslash = 1; } FILEDESC_SUNLOCK(fdp); /* * NB: We require that the supplied array be at least MAXPATHLEN bytes * long. If this is not the case, then we can run into serious trouble. */ (void) sbuf_new(&sbf, cpath, MAXPATHLEN, SBUF_FIXEDLEN); /* * Strip leading forward slashes. */ while (*copy == '/') copy++; /* * Make sure we handle chroot(2) and prepend the global path to these * environments. * * NB: vn_fullpath(9) on FreeBSD is less reliable than vn_getpath(9) * on Darwin. As a result, this may need some additional attention * in the future. */ if (rvnp != NULL) { error = vn_fullpath_global(td, rvnp, &rbuf, &fbuf); vdrop(rvnp); if (error) { cpath[0] = '\0'; if (cvnp != NULL) vdrop(cvnp); return; } (void) sbuf_cat(&sbf, rbuf); free(fbuf, M_TEMP); } if (cvnp != NULL) { error = vn_fullpath(td, cvnp, &rbuf, &fbuf); vdrop(cvnp); if (error) { cpath[0] = '\0'; return; } (void) sbuf_cat(&sbf, rbuf); free(fbuf, M_TEMP); } if (needslash) (void) sbuf_putc(&sbf, '/'); /* * Now that we have processed any alternate root and relative path * names, add the supplied pathname. */ (void) sbuf_cat(&sbf, copy); /* * One or more of the previous sbuf operations could have resulted in * the supplied buffer being overflowed. Check to see if this is the * case. */ if (sbuf_error(&sbf) != 0) { cpath[0] = '\0'; return; } sbuf_finish(&sbf); }