Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 334485) +++ head/sys/kern/kern_descrip.c (revision 334486) @@ -1,4249 +1,4248 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static __read_mostly uma_zone_t file_zone; static __read_mostly uma_zone_t filedesc0_zone; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders); static int fd_first_free(struct filedesc *fdp, int low, int size); static int fd_last_used(struct filedesc *fdp, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int getmaxfd(struct thread *td); static u_long *filecaps_copy_prep(const struct filecaps *src); static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls); static u_long *filecaps_free_prep(struct filecaps *fcaps); static void filecaps_free_finish(u_long *ioctls); /* * Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ volatile int __exclusive_cache_line openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at 0 and * not exceeding size - 1. Return -1 if not found. */ static int fd_last_used(struct filedesc *fdp, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, fd); } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fde->fde_file = NULL; #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif fdefree_last(fde); fdunused(fdp, fd); } void pwd_ensure_dirs(void) { struct filedesc *fdp; fdp = curproc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == NULL) { fdp->fd_cdir = rootvnode; vrefact(rootvnode); } if (fdp->fd_rdir == NULL) { fdp->fd_rdir = rootvnode; vrefact(rootvnode); } FILEDESC_XUNLOCK(fdp); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error, newcmd; error = 0; newcmd = cmd; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. */ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: newcmd = F_GETLK; break; case F_OSETLK: newcmd = F_SETLK; break; case F_OSETLKW: newcmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, newcmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; int error, flg, tmp; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; AUDIT_ARG_FD(cmd); AUDIT_ARG_CMD(cmd); switch (cmd) { case F_DUPFD: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_GETFD: error = EBADF; FILEDESC_SLOCK(fdp); fde = fdeget_locked(fdp, fd); if (fde != NULL) { td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; error = 0; } FILEDESC_SUNLOCK(fdp); break; case F_SETFD: error = EBADF; FILEDESC_XLOCK(fdp); fde = fdeget_locked(fdp, fd); if (fde != NULL) { fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); error = 0; } FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); if (error != 0) break; do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: /* * Temporary api for testing remote lock * infrastructure. */ if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. * We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2, NULL); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(fdp, fd, &cap_no_rights, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; fp->f_seqcount = (arg + bsize - 1) / bsize; atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp, 0); fdrop(fp, td); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int kern_dup(struct thread *td, u_int mode, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *delfp; u_long *oioctls, *nioctls; int error, maxfd; p = td->td_proc; fdp = p->p_fd; MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); error = EBADF; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, old) == NULL) goto unlock; if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) { td->td_retval[0] = new; if (flags & FDDUP_FLAG_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; error = 0; goto unlock; } /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ switch (mode) { case FDDUP_NORMAL: case FDDUP_FCNTL: if ((error = fdalloc(td, new, &new)) != 0) goto unlock; break; case FDDUP_MUSTREPLACE: /* Target file descriptor must exist. */ if (fget_locked(fdp, new) == NULL) goto unlock; break; case FDDUP_FIXED: if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, new + 1); PROC_UNLOCK(p); if (error != 0) { error = EMFILE; goto unlock; } } #endif fdgrowtable_exp(fdp, new + 1); } if (!fdisused(fdp, new)) fdused(fdp, new); break; default: KASSERT(0, ("%s unsupported mode %d", __func__, mode)); } KASSERT(old != new, ("new fd is same as old")); oldfde = &fdp->fd_ofiles[old]; fhold(oldfde->fde_file); newfde = &fdp->fd_ofiles[new]; delfp = newfde->fde_file; oioctls = filecaps_free_prep(&newfde->fde_caps); nioctls = filecaps_copy_prep(&oldfde->fde_caps); /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); if ((flags & FDDUP_FLAG_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif filecaps_free_finish(oioctls); td->td_retval[0] = new; error = 0; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, 1); FILEDESC_UNLOCK_ASSERT(fdp); } else { unlock: FILEDESC_XUNLOCK(fdp); } return (error); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; if (*sigiop == NULL) return; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t -fgetown(sigiop) - struct sigio **sigiop; +fgetown(struct sigio **sigiop) { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Function drops the filedesc lock on return. */ static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = 0; } } /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. */ if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, 1)); } /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int sys_closefrom(struct thread *td, struct closefrom_args *uap) { struct filedesc *fdp; int fd; fdp = td->td_proc->p_fd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ if (uap->lowfd < 0) uap->lowfd = 0; FILEDESC_SLOCK(fdp); for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) { if (fdp->fd_ofiles[fd].fde_file != NULL) { FILEDESC_SUNLOCK(fdp); (void)kern_close(td, fd); FILEDESC_SLOCK(fdp); } } FILEDESC_SUNLOCK(fdp); return (0); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) int freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_fstat(td, uap->fd, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_fstat_rights, &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); #ifdef __STAT_TIME_T_EXT if (error == 0) { sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; } #endif #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrstat(sbp); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { freebsd11_cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { long value; int error; error = kern_fpathconf(td, uap->fd, uap->name, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_fpathconf(struct thread *td, int fd, int name, long *valuep) { struct file *fp; struct vnode *vp; int error; error = fget(td, fd, &cap_fpathconf_rights, &fp); if (error != 0) return (error); if (name == _PC_ASYNC_IO) { *valuep = _POSIX_ASYNCHRONOUS_IO; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, name, valuep); VOP_UNLOCK(vp, 0); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (name != _PC_PIPE_BUF) { error = EINVAL; } else { *valuep = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Initialize filecaps structure. */ void filecaps_init(struct filecaps *fcaps) { bzero(fcaps, sizeof(*fcaps)); fcaps->fc_nioctls = -1; } /* * Copy filecaps structure allocating memory for ioctls array if needed. * * The last parameter indicates whether the fdtable is locked. If it is not and * ioctls are encountered, copying fails and the caller must lock the table. * * Note that if the table was not locked, the caller has to check the relevant * sequence counter to determine whether the operation was successful. */ bool filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) { size_t size; if (src->fc_ioctls != NULL && !locked) return (false); memcpy(dst, src, sizeof(*src)); if (src->fc_ioctls == NULL) return (true); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); memcpy(dst->fc_ioctls, src->fc_ioctls, size); return (true); } static u_long * filecaps_copy_prep(const struct filecaps *src) { u_long *ioctls; size_t size; if (src->fc_ioctls == NULL) return (NULL); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; ioctls = malloc(size, M_FILECAPS, M_WAITOK); return (ioctls); } static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls) { size_t size; *dst = *src; if (src->fc_ioctls == NULL) { MPASS(ioctls == NULL); return; } size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = ioctls; bcopy(src->fc_ioctls, dst->fc_ioctls, size); } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ void filecaps_free(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); bzero(fcaps, sizeof(*fcaps)); } static u_long * filecaps_free_prep(struct filecaps *fcaps) { u_long *ioctls; ioctls = fcaps->fc_ioctls; bzero(fcaps, sizeof(*fcaps)); return (ioctls); } static void filecaps_free_finish(u_long *ioctls) { free(ioctls, M_FILECAPS); } /* * Validate the given filecaps structure. */ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accommodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; /* * If lastfile is -1 this struct filedesc was just allocated and we are * growing it to accommodate for the one we are going to copy from. There * is no need to have a lock on this one as it's not visible to anyone. */ if (fdp->fd_lastfile != -1) FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Do not free the old file table, as some threads may still * reference entries within it. Instead, place it on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd >= fdp->fd_nfiles) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, allocfd); PROC_UNLOCK(p); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { struct file *fp; int error, fd; error = falloc_noinstall(td, &fp); if (error) return (error); /* no reference held on error */ error = finstall(td, fp, &fd, flags, fcaps); if (error) { fdrop(fp, td); /* one reference (fp only) */ return (error); } if (resultfp != NULL) *resultfp = fp; /* copy out result */ else fdrop(fp, td); /* release local reference */ if (resultfd != NULL) *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. */ int falloc_noinstall(struct thread *td, struct file **resultfp) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); int openfiles_new; static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; if ((openfiles_new >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles_new >= maxfiles) { atomic_subtract_int(&openfiles, 1); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, (%s) " "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); } return (ENFILE); } fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); refcount_init(&fp->f_count, 1); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); if ((error = fdalloc(td, 0, fd))) { FILEDESC_XUNLOCK(fdp); return (error); } fhold(fp); _finstall(fdp, fp, *fd, flags, fcaps); FILEDESC_XUNLOCK(fdp); return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(struct filedesc *fdp, bool prepfiles) { struct filedesc0 *newfdp0; struct filedesc *newfdp; newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_cmask = CMASK; newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_lastfile = -1; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; if (fdp == NULL) return (newfdp); if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles) fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_SLOCK(fdp); newfdp->fd_cdir = fdp->fd_cdir; if (newfdp->fd_cdir) vrefact(newfdp->fd_cdir); newfdp->fd_rdir = fdp->fd_rdir; if (newfdp->fd_rdir) vrefact(newfdp->fd_rdir); newfdp->fd_jdir = fdp->fd_jdir; if (newfdp->fd_jdir) vrefact(newfdp->fd_jdir); if (!prepfiles) { FILEDESC_SUNLOCK(fdp); } else { while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_SLOCK(fdp); } } return (newfdp); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = p->p_fd; if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static void fddrop(struct filedesc *fdp) { if (fdp->fd_holdcnt > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (p->p_fd->fd_refcnt == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } void fdinstall_remapped(struct thread *td, struct filedesc *fdp) { fdescfree(td); td->td_proc->p_fd = fdp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i; MPASS(fdp != NULL); newfdp = fdinit(fdp, true); /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { ofde = &fdp->fd_ofiles[i]; if (ofde->fde_file == NULL || (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fhold(nfde->fde_file); fdused_init(newfdp, i); newfdp->fd_lastfile = i; } if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Copies a filedesc structure, while remapping all file descriptors * stored inside using a translation table. * * File descriptors are copied over to the new file descriptor table, * regardless of whether the close-on-exec flag is set. */ int fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds, struct filedesc **ret) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int error, i; MPASS(fdp != NULL); newfdp = fdinit(fdp, true); if (nfds > fdp->fd_lastfile + 1) { /* New table cannot be larger than the old one. */ error = E2BIG; goto bad; } /* Copy all passable descriptors (i.e. not kqueue). */ newfdp->fd_freefile = nfds; for (i = 0; i < nfds; ++i) { if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) { /* File descriptor out of bounds. */ error = EBADF; goto bad; } ofde = &fdp->fd_ofiles[fds[i]]; if (ofde->fde_file == NULL) { /* Unused file descriptor. */ error = EBADF; goto bad; } if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { /* File descriptor cannot be passed. */ error = EINVAL; goto bad; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fhold(nfde->fde_file); fdused_init(newfdp, i); newfdp->fd_lastfile = i; } newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); *ret = newfdp; return (0); bad: FILEDESC_SUNLOCK(fdp); fdescfree_remapped(newfdp); return (error); } /* * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. * one of processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0; i <= fdp->fd_lastfile; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp == NULL || fp->f_type != DTYPE_VNODE) continue; fhold(fp); FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or kern_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ static void fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose) { struct filedesc0 *fdp0; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; int i; for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL) { fdefree_last(fde); if (needclose) (void) closef(fp, td); else fdrop(fp, td); } } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); fddrop(fdp); } void fdescfree(struct thread *td) { struct proc *p; struct filedesc *fdp; struct vnode *cdir, *jdir, *rdir; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_set(p, RACCT_NOFILE, 0); PROC_UNLOCK(p); } #endif if (p->p_fdtol != NULL) fdclearlocks(td); PROC_LOCK(p); p->p_fd = NULL; PROC_UNLOCK(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; FILEDESC_XLOCK(fdp); cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_XUNLOCK(fdp); if (cdir != NULL) vrele(cdir); if (rdir != NULL) vrele(rdir); if (jdir != NULL) vrele(jdir); fdescfree_fds(td, fdp, 1); } void fdescfree_remapped(struct filedesc *fdp) { if (fdp->fd_cdir != NULL) vrele(fdp->fd_cdir); if (fdp->fd_rdir != NULL) vrele(fdp->fd_rdir); if (fdp->fd_jdir != NULL) vrele(fdp->fd_jdir); fdescfree_fds(curthread, fdp, 0); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, 0); FILEDESC_UNLOCK_ASSERT(fdp); } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop(fp, td)); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. */ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } int fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedescent *fde; int error; FILEDESC_LOCK_ASSERT(fdp); fde = fdeget_locked(fdp, fd); if (fde == NULL) { error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights_fde(fde), needrightsp); if (error != 0) goto out; #endif if (havecapsp != NULL) filecaps_copy(&fde->fde_caps, havecapsp, true); *fpp = fde->fde_file; error = 0; out: return (error); } int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedesc *fdp = td->td_proc->p_fd; int error; #ifndef CAPABILITIES error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL); if (error == 0 && havecapsp != NULL) filecaps_fill(havecapsp); #else struct file *fp; seq_t seq; for (;;) { error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq); if (error != 0) return (error); if (havecapsp != NULL) { if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecapsp, false)) { fdrop(fp, td); goto get_locked; } } if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } *fpp = fp; return (0); get_locked: FILEDESC_SLOCK(fdp); error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp); if (error == 0) fhold(*fpp); FILEDESC_SUNLOCK(fdp); #endif return (error); } int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, seq_t *seqp) { #ifdef CAPABILITIES const struct filedescent *fde; #endif const struct fdescenttbl *fdt; struct file *fp; u_int count; #ifdef CAPABILITIES seq_t seq; cap_rights_t haverights; int error; #endif fdt = fdp->fd_files; if ((u_int)fd >= fdt->fdt_nfiles) return (EBADF); /* * Fetch the descriptor locklessly. We avoid fdrop() races by * never raising a refcount above 0. To accomplish this we have * to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain * that the identity is still correct and we did not lose a race * due to preemption. */ for (;;) { #ifdef CAPABILITIES seq = seq_read(fd_seq(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde(fde); fp = fde->fde_file; if (!seq_consistent(fd_seq(fdt, fd), seq)) continue; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (fp == NULL) return (EBADF); #ifdef CAPABILITIES error = cap_check(&haverights, needrightsp); if (error != 0) return (error); #endif count = fp->f_count; retry: if (count == 0) { /* * Force a reload. Other thread could reallocate the * table before this fd was closed, so it possible that * there is a stale fp pointer in cached version. */ fdt = *(const struct fdescenttbl * const volatile *)&(fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ if (atomic_fcmpset_acq_int(&fp->f_count, &count, count + 1) == 0) goto retry; fdt = fdp->fd_files; #ifdef CAPABILITIES if (seq_consistent_nomb(fd_seq(fdt, fd), seq)) #else if (fp == fdt->fdt_ofiles[fd].fde_file) #endif break; fdrop(fp, curthread); } *fpp = fp; if (seqp != NULL) { #ifdef CAPABILITIES *seqp = seq; #endif } return (0); } /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occurred the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp, seq_t *seqp) { struct filedesc *fdp; struct file *fp; int error; *fpp = NULL; fdp = td->td_proc->p_fd; error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp); if (error != 0) return (error); if (fp->f_ops == &badfileops) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. */ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if ((fp->f_flag & (FREAD | FEXEC)) == 0 || ((fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp, NULL)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp, NULL); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; #else struct filedesc *fdp = td->td_proc->p_fd; seq_t seq; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = _fget(td, fd, fpp, 0, rightsp, &seq); if (error != 0) return (error); /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(cap_rights(fdp, fd)); if (!fd_modified(fdp, fd, seq)) break; fdrop(*fpp, td); } #endif return (error); } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp, NULL)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp, NULL)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { struct filedesc *fdp = td->td_proc->p_fd; #ifndef CAPABILITIES return (fget_unlocked(fdp, fd, rightsp, fpp, NULL)); #else int error; seq_t seq; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked(fdp, fd, rightsp, fpp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(*fpp, td); } if (error != 0) { fdrop(*fpp, td); *fpp = NULL; } return (error); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp, NULL); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vrefact(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filedesc *fdp; struct filecaps caps; struct file *fp; int error; fdp = td->td_proc->p_fd; error = fget_cap_locked(fdp, fd, needrightsp, &fp, &caps); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto out; } if (fp->f_vnode == NULL) { error = EINVAL; goto out; } *havecaps = caps; *vpp = fp->f_vnode; vrefact(*vpp); return (0); out: filecaps_free(&caps); return (error); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Handle the last reference to a file being closed. */ int _fdrop(struct file *fp, struct thread *td) { int error; if (fp->f_count != 0) panic("fdrop: count %d", fp->f_count); error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; error = fget(td, uap->fd, &cap_flock_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; u_long *ioctls; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } fhold(fp); newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; ioctls = filecaps_copy_prep(&oldfde->fde_caps); #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, ioctls); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, "Allow a process to chroot(2) if it has a directory open"); /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(struct filedesc *fdp) { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp); for (fd = 0; fd <= fdp->fd_lastfile; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking priv_check() and mac_vnode_check_chroot() to * authorize this operation. */ int pwd_chroot(struct thread *td, struct vnode *vp) { struct filedesc *fdp; struct vnode *oldvp; int error; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; vrefact(vp); fdp->fd_rdir = vp; if (fdp->fd_jdir == NULL) { vrefact(vp); fdp->fd_jdir = vp; } FILEDESC_XUNLOCK(fdp); vrele(oldvp); return (0); } void pwd_chdir(struct thread *td, struct vnode *vp) { struct filedesc *fdp; struct vnode *oldvp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); VNASSERT(vp->v_usecount > 0, vp, ("chdir to a vnode with zero usecount")); oldvp = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_XUNLOCK(fdp); vrele(oldvp); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == olddp) { vrefact(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vrefact(newdp); fdp->fd_rdir = newdp; nrele++; } if (fdp->fd_jdir == olddp) { vrefact(newdp); fdp->fd_jdir = newdp; nrele++; } FILEDESC_XUNLOCK(fdp); fddrop(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vrefact(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vrefact(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } static int sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) { struct filedesc *fdp; int i, count, slots; if (*(int *)arg1 != 0) return (EINVAL); fdp = curproc->p_fd; count = 0; FILEDESC_SLOCK(fdp); slots = NDSLOTS(fdp->fd_lastfile + 1); for (i = 0; i < slots; i++) count += bitcountl(fdp->fd_map[i]); FILEDESC_SUNLOCK(fdp); return (SYSCTL_OUT(req, &count, sizeof(count))); } static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, "Number of open file descriptors"); /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ if (fdp->fd_lastfile > 0) n += fdp->fd_lastfile; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ static void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. */ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = fp->f_count; kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. */ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif, int flags) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; int flags; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) { /* Terminate export. */ efbuf->remainder = 0; return (0); } efbuf->remainder -= kif->kf_structsize; } return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, efbuf->flags); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); if (efbuf->fdp != NULL) FILEDESC_SUNLOCK(efbuf->fdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); error = export_kinfo_to_sb(efbuf); if (efbuf->fdp != NULL) FILEDESC_SLOCK(efbuf->fdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { struct file *fp; struct filedesc *fdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; int error, i; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = p->p_tracevp; if (tracevp != NULL) vrefact(tracevp); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vrefact(textvp); /* Controlling tty. */ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vrefact(cttyvp); } fdp = fdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = flags; if (tracevp != NULL) export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (textvp != NULL) export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (cttyvp != NULL) export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); error = 0; if (fdp == NULL) goto fail; efbuf->fdp = fdp; FILEDESC_SLOCK(fdp); /* working directory */ if (fdp->fd_cdir != NULL) { vrefact(fdp->fd_cdir); export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (fdp->fd_rdir != NULL) { vrefact(fdp->fd_rdir); export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (fdp->fd_jdir != NULL) { vrefact(fdp->fd_jdir); export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ rights = cap_no_rights; #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || efbuf->remainder == 0) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); fail: free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen, KERN_FILEDESC_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #ifdef COMPAT_FREEBSD7 #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; if (kif->kf_type == KF_TYPE_VNODE) okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; else okif->kf_vnode_type = KF_VTYPE_VNON; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); if (kif->kf_type == KF_TYPE_SOCKET) { okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; } else { okif->kf_sa_local.ss_family = AF_UNSPEC; okif->kf_sa_peer.ss_family = AF_UNSPEC; } } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req) { int error; vrefact(vp); FILEDESC_SUNLOCK(fdp); export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; int error, i, *name; struct file *fp; struct proc *p; name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (ENOENT); kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); FILEDESC_SLOCK(fdp); if (fdp->fd_cdir != NULL) export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, okif, fdp, req); if (fdp->fd_rdir != NULL) export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, okif, fdp, req); if (fdp->fd_jdir != NULL) export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, okif, fdp, req); for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; export_file_to_kinfo(fp, i, NULL, kif, fdp, KERN_FILEDESC_PACK_KINFO); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct filedesc *fdp; struct export_fd_buf *efbuf; int error; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = fdp; efbuf->sb = sb; efbuf->remainder = maxlen; FILEDESC_SLOCK(fdp); if (fdp->fd_cdir == NULL) error = EINVAL; else { vrefact(fdp->fd_cdir); error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnode"); case DTYPE_SOCKET: return ("socket"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kqueue"); case DTYPE_CRYPTO: return ("crypto"); case DTYPE_MQUEUE: return ("mqueue"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); case DTYPE_PTS: return ("pts"); case DTYPE_DEV: return ("dev"); case DTYPE_PROCDESC: return ("proc"); case DTYPE_LINUXEFD: return ("levent"); case DTYPE_LINUXTFD: return ("ltimer"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) struct proc *p; if (header) db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, fp->f_flag, 0, fp->f_count, 0, XPTRWIDTH, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); #undef XPTRWIDTH } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 334485) +++ head/sys/kern/kern_exec.c (revision 334486) @@ -1,1734 +1,1733 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); EVENTHANDLER_LIST_DECLARE(process_exec); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; } #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */ if (credential_changing) imgp->proc->p_pdeathsig = 0; if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Do the best to calculate the full path to the image file. */ if (args->fname != NULL && args->fname[0] == '/') imgp->execpath = args->fname; else { VOP_UNLOCK(imgp->vp, 0); if (vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); } /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) error = (*p->p_sysent->sv_fixup)(&stack_base, imgp); else error = suword(--stack_base, imgp->args->argc) == 0 ? 0 : EFAULT; if (error != 0) goto exec_fail_dealloc; if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? EJUSTRETURN : error); } int -exec_map_first_page(imgp) - struct image_params *imgp; +exec_map_first_page(struct image_params *imgp) { int rv, i, after, initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (ma[0]->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(ma[0]); if (!vm_pager_has_page(object, 0, NULL, &after)) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } initial_pagein = min(after, VM_INITIAL_PAGEIN); KASSERT(initial_pagein <= object->size, ("%s: initial_pagein %d object->size %ju", __func__, initial_pagein, (uintmax_t )object->size)); for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (!vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { for (i = 0; i < initial_pagein; i++) { vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); } VM_OBJECT_WUNLOCK(object); return (EIO); } vm_page_xunbusy(ma[0]); for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack. * The new stack is only sgrowsiz large because it is grown * automatically on a page fault. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* An exec terminates mlockall(MCL_FUTURE). */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) return (vm_mmap_to_errno(error)); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen > ofdp->fd_lastfile + 1) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* There are no environment variables. */ args->begin_envv = args->endp; /* Create new file descriptor table. */ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; static DPCPU_DEFINE(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; if (argkva->gen != gen) { vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void exec_args_kva_lowmem(void *arg __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(struct image_params *imgp) { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); vectp = (char **)destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has up to AT_COUNT entries. */ vectp -= howmany(AT_COUNT * sizeof(Elf_Auxinfo), sizeof(*vectp)); } /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; u_int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/subr_prof.c =================================================================== --- head/sys/kern/subr_prof.c (revision 334485) +++ head/sys/kern/subr_prof.c (revision 334486) @@ -1,537 +1,536 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #ifdef GPROF #include #include #undef MCOUNT static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer"); static void kmstartup(void *); SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL); struct gmonparam _gmonparam = { GMON_PROF_OFF }; #ifdef GUPROF void nullfunc_loop_profiled() { int i; for (i = 0; i < CALIB_SCALE; i++) nullfunc_profiled(); } #define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */ void nullfunc_profiled() { } #endif /* GUPROF */ /* * Update the histograms to support extending the text region arbitrarily. * This is done slightly naively (no sparse regions), so will waste slight * amounts of memory, but will overall work nicely enough to allow profiling * of KLDs. */ void kmupetext(uintfptr_t nhighpc) { struct gmonparam np; /* slightly large */ struct gmonparam *p = &_gmonparam; char *cp; GIANT_REQUIRED; bcopy(p, &np, sizeof(*p)); np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER)); if (np.highpc <= p->highpc) return; np.textsize = np.highpc - p->lowpc; np.kcountsize = np.textsize / HISTFRACTION; np.hashfraction = HASHFRACTION; np.fromssize = np.textsize / HASHFRACTION; np.tolimit = np.textsize * ARCDENSITY / 100; if (np.tolimit < MINARCS) np.tolimit = MINARCS; else if (np.tolimit > MAXARCS) np.tolimit = MAXARCS; np.tossize = np.tolimit * sizeof(struct tostruct); cp = malloc(np.kcountsize + np.fromssize + np.tossize, M_GPROF, M_WAITOK); /* * Check for something else extending highpc while we slept. */ if (np.highpc <= p->highpc) { free(cp, M_GPROF); return; } np.tos = (struct tostruct *)cp; cp += np.tossize; np.kcount = (HISTCOUNTER *)cp; cp += np.kcountsize; np.froms = (u_short *)cp; #ifdef GUPROF /* Reinitialize pointers to overhead counters. */ np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime)); np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount)); np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount)); #endif critical_enter(); bcopy(p->tos, np.tos, p->tossize); bzero((char *)np.tos + p->tossize, np.tossize - p->tossize); bcopy(p->kcount, np.kcount, p->kcountsize); bzero((char *)np.kcount + p->kcountsize, np.kcountsize - p->kcountsize); bcopy(p->froms, np.froms, p->fromssize); bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize); cp = (char *)p->tos; bcopy(&np, p, sizeof(*p)); critical_exit(); free(cp, M_GPROF); } static void -kmstartup(dummy) - void *dummy; +kmstartup(void *dummy) { char *cp; struct gmonparam *p = &_gmonparam; #ifdef GUPROF int cputime_overhead; int empty_loop_time; int i; int mcount_overhead; int mexitcount_overhead; int nullfunc_loop_overhead; int nullfunc_loop_profiled_time; uintfptr_t tmp_addr; #endif /* * Round lowpc and highpc to multiples of the density we're using * so the rest of the scaling (here and in gprof) stays in ints. */ p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER)); p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); p->textsize = p->highpc - p->lowpc; printf("Profiling kernel, textsize=%lu [%jx..%jx]\n", p->textsize, (uintmax_t)p->lowpc, (uintmax_t)p->highpc); p->kcountsize = p->textsize / HISTFRACTION; p->hashfraction = HASHFRACTION; p->fromssize = p->textsize / HASHFRACTION; p->tolimit = p->textsize * ARCDENSITY / 100; if (p->tolimit < MINARCS) p->tolimit = MINARCS; else if (p->tolimit > MAXARCS) p->tolimit = MAXARCS; p->tossize = p->tolimit * sizeof(struct tostruct); cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize, M_GPROF, M_WAITOK | M_ZERO); p->tos = (struct tostruct *)cp; cp += p->tossize; p->kcount = (HISTCOUNTER *)cp; cp += p->kcountsize; p->froms = (u_short *)cp; p->histcounter_type = FUNCTION_ALIGNMENT / HISTFRACTION * NBBY; #ifdef GUPROF /* Signed counters. */ p->histcounter_type = -p->histcounter_type; /* Initialize pointers to overhead counters. */ p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime)); p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount)); p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount)); /* * Disable interrupts to avoid interference while we calibrate * things. */ critical_enter(); /* * Determine overheads. * XXX this needs to be repeated for each useful timer/counter. */ cputime_overhead = 0; startguprof(p); for (i = 0; i < CALIB_SCALE; i++) cputime_overhead += cputime(); empty_loop(); startguprof(p); empty_loop(); empty_loop_time = cputime(); nullfunc_loop_profiled(); /* * Start profiling. There won't be any normal function calls since * interrupts are disabled, but we will call the profiling routines * directly to determine their overheads. */ p->state = GMON_PROF_HIRES; startguprof(p); nullfunc_loop_profiled(); startguprof(p); for (i = 0; i < CALIB_SCALE; i++) MCOUNT_OVERHEAD(sys_profil); mcount_overhead = KCOUNT(p, PC_TO_I(p, sys_profil)); startguprof(p); for (i = 0; i < CALIB_SCALE; i++) MEXITCOUNT_OVERHEAD(); MEXITCOUNT_OVERHEAD_GETLABEL(tmp_addr); mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr)); p->state = GMON_PROF_OFF; stopguprof(p); critical_exit(); nullfunc_loop_profiled_time = 0; for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled; tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end; tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER)) nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr)); #define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE) #define c2n(count, freq) ((int)((count) * 1000000000LL / freq)) printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n", CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)), CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)), CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)), CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)), CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate))); cputime_overhead -= empty_loop_time; mcount_overhead -= empty_loop_time; mexitcount_overhead -= empty_loop_time; /*- * Profiling overheads are determined by the times between the * following events: * MC1: mcount() is called * MC2: cputime() (called from mcount()) latches the timer * MC3: mcount() completes * ME1: mexitcount() is called * ME2: cputime() (called from mexitcount()) latches the timer * ME3: mexitcount() completes. * The times between the events vary slightly depending on instruction * combination and cache misses, etc. Attempt to determine the * minimum times. These can be subtracted from the profiling times * without much risk of reducing the profiling times below what they * would be when profiling is not configured. Abbreviate: * ab = minimum time between MC1 and MC3 * a = minimum time between MC1 and MC2 * b = minimum time between MC2 and MC3 * cd = minimum time between ME1 and ME3 * c = minimum time between ME1 and ME2 * d = minimum time between ME2 and ME3. * These satisfy the relations: * ab <= mcount_overhead (just measured) * a + b <= ab * cd <= mexitcount_overhead (just measured) * c + d <= cd * a + d <= nullfunc_loop_profiled_time (just measured) * a >= 0, b >= 0, c >= 0, d >= 0. * Assume that ab and cd are equal to the minimums. */ p->cputime_overhead = CALIB_DOSCALE(cputime_overhead); p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead); p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead - cputime_overhead); nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time; p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead - nullfunc_loop_overhead) / 4); p->mexitcount_pre_overhead = p->mexitcount_overhead + p->cputime_overhead - p->mexitcount_post_overhead; p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead) - p->mexitcount_post_overhead; p->mcount_post_overhead = p->mcount_overhead + p->cputime_overhead - p->mcount_pre_overhead; printf( "Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n", c2n(p->cputime_overhead, p->profrate), c2n(p->mcount_overhead, p->profrate), c2n(p->mcount_pre_overhead, p->profrate), c2n(p->mcount_post_overhead, p->profrate), c2n(p->cputime_overhead, p->profrate), c2n(p->mexitcount_overhead, p->profrate), c2n(p->mexitcount_pre_overhead, p->profrate), c2n(p->mexitcount_post_overhead, p->profrate)); printf( "Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n", p->cputime_overhead, p->mcount_overhead, p->mcount_pre_overhead, p->mcount_post_overhead, p->cputime_overhead, p->mexitcount_overhead, p->mexitcount_pre_overhead, p->mexitcount_post_overhead); #endif /* GUPROF */ } /* * Return kernel profiling information. */ static int sysctl_kern_prof(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; struct gmonparam *gp = &_gmonparam; int error; int state; /* all sysctl names at this level are terminal */ if (namelen != 1) return (ENOTDIR); /* overloaded */ switch (name[0]) { case GPROF_STATE: state = gp->state; error = sysctl_handle_int(oidp, &state, 0, req); if (error) return (error); if (!req->newptr) return (0); if (state == GMON_PROF_OFF) { gp->state = state; PROC_LOCK(&proc0); stopprofclock(&proc0); PROC_UNLOCK(&proc0); stopguprof(gp); } else if (state == GMON_PROF_ON) { gp->state = GMON_PROF_OFF; stopguprof(gp); gp->profrate = profhz; PROC_LOCK(&proc0); startprofclock(&proc0); PROC_UNLOCK(&proc0); gp->state = state; #ifdef GUPROF } else if (state == GMON_PROF_HIRES) { gp->state = GMON_PROF_OFF; PROC_LOCK(&proc0); stopprofclock(&proc0); PROC_UNLOCK(&proc0); startguprof(gp); gp->state = state; #endif } else if (state != gp->state) return (EINVAL); return (0); case GPROF_COUNT: return (sysctl_handle_opaque(oidp, gp->kcount, gp->kcountsize, req)); case GPROF_FROMS: return (sysctl_handle_opaque(oidp, gp->froms, gp->fromssize, req)); case GPROF_TOS: return (sysctl_handle_opaque(oidp, gp->tos, gp->tossize, req)); case GPROF_GMONPARAM: return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req)); default: return (EOPNOTSUPP); } /* NOTREACHED */ } static SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, ""); #endif /* GPROF */ /* * Profiling system call. * * The scale factor is a fixed point number with 16 bits of fraction, so that * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. */ #ifndef _SYS_SYSPROTO_H_ struct profil_args { caddr_t samples; size_t size; size_t offset; u_int scale; }; #endif /* ARGSUSED */ int sys_profil(struct thread *td, struct profil_args *uap) { struct uprof *upp; struct proc *p; if (uap->scale > (1 << 16)) return (EINVAL); p = td->td_proc; if (uap->scale == 0) { PROC_LOCK(p); stopprofclock(p); PROC_UNLOCK(p); return (0); } PROC_LOCK(p); upp = &td->td_proc->p_stats->p_prof; PROC_PROFLOCK(p); upp->pr_off = uap->offset; upp->pr_scale = uap->scale; upp->pr_base = uap->samples; upp->pr_size = uap->size; PROC_PROFUNLOCK(p); startprofclock(p); PROC_UNLOCK(p); return (0); } /* * Scale is a fixed-point number with the binary point 16 bits * into the value, and is <= 1.0. pc is at most 32 bits, so the * intermediate result is at most 48 bits. */ #define PC_TO_INDEX(pc, prof) \ ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) /* * Collect user-level profiling statistics; called on a profiling tick, * when a process is running in user-mode. This routine may be called * from an interrupt context. We perform the update with an AST * that will vector us to trap() with a context in which copyin and * copyout will work. Trap will then call addupc_task(). * * Note that we may (rarely) not get around to the AST soon enough, and * lose profile ticks when the next tick overwrites this one, but in this * case the system is overloaded and the profile is probably already * inaccurate. */ void addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks) { struct uprof *prof; if (ticks == 0) return; prof = &td->td_proc->p_stats->p_prof; PROC_PROFLOCK(td->td_proc); if (pc < prof->pr_off || PC_TO_INDEX(pc, prof) >= prof->pr_size) { PROC_PROFUNLOCK(td->td_proc); return; /* out of range; ignore */ } PROC_PROFUNLOCK(td->td_proc); td->td_profil_addr = pc; td->td_profil_ticks = ticks; td->td_pflags |= TDP_OWEUPC; thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } /* * Actually update the profiling statistics. If the update fails, we * simply turn off profiling. */ void addupc_task(struct thread *td, uintfptr_t pc, u_int ticks) { struct proc *p = td->td_proc; struct uprof *prof; caddr_t addr; u_int i; u_short v; int stop = 0; if (ticks == 0) return; PROC_LOCK(p); if (!(p->p_flag & P_PROFIL)) { PROC_UNLOCK(p); return; } p->p_profthreads++; prof = &p->p_stats->p_prof; PROC_PROFLOCK(p); if (pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) { PROC_PROFUNLOCK(p); goto out; } addr = prof->pr_base + i; PROC_PROFUNLOCK(p); PROC_UNLOCK(p); if (copyin(addr, &v, sizeof(v)) == 0) { v += ticks; if (copyout(&v, addr, sizeof(v)) == 0) { PROC_LOCK(p); goto out; } } stop = 1; PROC_LOCK(p); out: if (--p->p_profthreads == 0) { if (p->p_flag & P_STOPPROF) { wakeup(&p->p_profthreads); p->p_flag &= ~P_STOPPROF; stop = 0; } } if (stop) stopprofclock(p); PROC_UNLOCK(p); } Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 334485) +++ head/sys/kern/sys_pipe.c (revision 334486) @@ -1,1841 +1,1796 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1996 John S. Dyson * Copyright (c) 2012 Giovanni Trematerra * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, the sending process pins the underlying pages in * memory, and the receiving process copies directly from these pinned pages * in the sending process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ #define PIPE_PEER(pipe) \ (((pipe)->pipe_state & PIPE_NAMED) ? (pipe) : ((pipe)->pipe_peer)) /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_truncate_t pipe_truncate; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static fo_chmod_t pipe_chmod; static fo_chown_t pipe_chown; static fo_fill_kinfo_t pipe_fill_kinfo; struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_truncate = pipe_truncate, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_chmod = pipe_chmod, .fo_chown = pipe_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = pipe_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static void filt_pipedetach_notsup(struct knote *kn); static int filt_pipenotsup(struct knote *kn, long hint); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, .f_event = filt_pipenotsup }; static struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead }; static struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static long amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static void pipe_create(struct pipe *pipe, int backing); static void pipe_paircreate(struct thread *td, struct pipepair **p_pp); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; static struct unrhdr *pipeino_unr; static dev_t pipedev_ino; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); pipeino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized")); pipedev_ino = devfs_alloc_cdp_inode(); KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); } static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); vfs_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = PIPE_ACTIVE; wpipe->pipe_present = PIPE_ACTIVE; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elswhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } static void pipe_paircreate(struct thread *td, struct pipepair **p_pp) { struct pipepair *pp; struct pipe *rpipe, *wpipe; *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); /* Only the forward direction pipe is backed by default */ pipe_create(rpipe, 1); pipe_create(wpipe, 0); rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; } void pipe_named_ctor(struct pipe **ppipe, struct thread *td) { struct pipepair *pp; pipe_paircreate(td, &pp); pp->pp_rpipe.pipe_state |= PIPE_NAMED; *ppipe = &pp->pp_rpipe; } void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; ino_t ino; ino = dpipe->pipe_ino; peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); if (peer != NULL) { funsetown(&peer->pipe_sigio); pipeclose(peer); } if (ino != 0 && ino != (ino_t)-1) free_unr(pipeino_unr, ino); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose(). */ int kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1, struct filecaps *fcaps2) { struct file *rf, *wf; struct pipe *rpipe, *wpipe; struct pipepair *pp; int fd, fflags, error; pipe_paircreate(td, &pp); rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; error = falloc_caps(td, &rf, &fd, flags, fcaps1); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc_caps(). */ fildes[0] = fd; fflags = FREAD | FWRITE; if ((flags & O_NONBLOCK) != 0) fflags |= FNONBLOCK; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops); error = falloc_caps(td, &wf, &fd, flags, fcaps2); if (error) { fdclose(td, rf, fildes[0]); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc_caps(). */ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); fildes[1] = fd; fdrop(rf, td); return (0); } #ifdef COMPAT_FREEBSD10 /* ARGSUSED */ int freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused) { int error; int fildes[2]; error = kern_pipe(td, fildes, 0, NULL, NULL); if (error) return (error); td->td_retval[0] = fildes[0]; td->td_retval[1] = fildes[1]; return (0); } #endif int sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) return (error); error = copyout(fildes, uap->fildes, 2 * sizeof(int)); if (error) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int -pipespace_new(cpipe, size) - struct pipe *cpipe; - int size; +pipespace_new(struct pipe *cpipe, int size) { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *) &buffer, size, 0, VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { if ((cpipe->pipe_buffer.buffer == NULL) && (size > SMALL_PIPE_SIZE)) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. */ static int -pipespace(cpipe, size) - struct pipe *cpipe; - int size; +pipespace(struct pipe *cpipe, int size) { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int -pipelock(cpipe, catch) - struct pipe *cpipe; - int catch; +pipelock(struct pipe *cpipe, int catch) { int error; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); while (cpipe->pipe_state & PIPE_LOCKFL) { cpipe->pipe_state |= PIPE_LWANT; error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void -pipeunlock(cpipe) - struct pipe *cpipe; +pipeunlock(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } void -pipeselwakeup(cpipe) - struct pipe *cpipe; +pipeselwakeup(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static void -pipe_create(pipe, backing) - struct pipe *pipe; - int backing; +pipe_create(struct pipe *pipe, int backing) { if (backing) { /* * Note that these functions can fail if pipe map is exhausted * (as a result of too many pipes created), but we ignore the * error as it is not fatal and could be provoked by * unprivileged users. The only consequence is worse performance * with given pipe. */ if (amountpipekva > maxpipekva / 2) (void)pipespace_new(pipe, SMALL_PIPE_SIZE); else (void)pipespace_new(pipe, PIPE_SIZE); } pipe->pipe_ino = -1; } /* ARGSUSED */ static int -pipe_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) { struct pipe *rpipe; int error; int nread = 0; int size; rpipe = fp->f_data; PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if (!(rpipe->pipe_state & PIPE_DIRECTW) && (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > uio->uio_resid) size = uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_map.ms, rpipe->pipe_map.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW); wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int -pipe_build_write_buffer(wpipe, uio) - struct pipe *wpipe; - struct uio *uio; +pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) { u_int size; int i; PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); KASSERT(wpipe->pipe_state & PIPE_DIRECTW, ("Clone attempt on non-direct write pipe!")); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, wpipe->pipe_map.ms, PIPENPAGES)) < 0) return (EFAULT); /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void -pipe_destroy_write_buffer(wpipe) - struct pipe *wpipe; +pipe_destroy_write_buffer(struct pipe *wpipe) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); wpipe->pipe_map.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void -pipe_clone_write_buffer(wpipe) - struct pipe *wpipe; +pipe_clone_write_buffer(struct pipe *wpipe) { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. */ static int -pipe_direct_write(wpipe, uio) - struct pipe *wpipe; - struct uio *uio; +pipe_direct_write(struct pipe *wpipe, struct uio *uio) { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); error = pipelock(wpipe, 1); if (error != 0) goto error1; if ((wpipe->pipe_state & PIPE_EOF) != 0) { error = EPIPE; pipeunlock(wpipe); goto error1; } while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; else goto retry; } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; else goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); error = pipe_build_write_buffer(wpipe, uio); PIPE_LOCK(wpipe); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; pipeunlock(wpipe); goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); pipeunlock(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); } if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int -pipe_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - struct thread *td; - int flags; +pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) { int error = 0; int desiredsize; ssize_t orig_resid; struct pipe *wpipe, *rpipe; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if ((amountpipekva > (3 * maxpipekva) / 4) && (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if ((desiredsize != wpipe->pipe_buffer.size) && ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } if (wpipe->pipe_buffer.size == 0) { /* * This can only happen for reverse direction use of pipes * in a complete OOM situation. */ error = ENOMEM; --wpipe->pipe_busy; pipeunlock(wpipe); PIPE_UNLOCK(wpipe); return (error); } pipeunlock(wpipe); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; pipelock(wpipe, 0); if (wpipe->pipe_state & PIPE_EOF) { pipeunlock(wpipe); error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= PIPE_MINDIRECT && wpipe->pipe_buffer.size >= PIPE_MINDIRECT && (fp->f_flag & FNONBLOCK) == 0) { pipeunlock(wpipe); error = pipe_direct_write(wpipe, uio); if (error) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); if (error) break; else continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } pipeunlock(wpipe); if (error != 0) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; pipeunlock(wpipe); break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; } } pipelock(wpipe, 0); --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if any byte was written. * EINTR and other interrupts are handled by generic I/O layer. * Do not pretend that I/O succeeded for obvious user error * like EFAULT. */ if (uio->uio_resid != orig_resid && error == EPIPE) error = 0; if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } /* ARGSUSED */ static int -pipe_truncate(fp, length, active_cred, td) - struct file *fp; - off_t length; - struct ucred *active_cred; - struct thread *td; +pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vnops.fo_truncate(fp, length, active_cred, td); else error = invfo_truncate(fp, length, active_cred, td); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ static int -pipe_ioctl(fp, cmd, data, active_cred, td) - struct file *fp; - u_long cmd; - void *data; - struct ucred *active_cred; - struct thread *td; +pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, + struct thread *td) { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (!(fp->f_flag & FREAD)) { *(int *)data = 0; PIPE_UNLOCK(mpipe); return (0); } if (mpipe->pipe_state & PIPE_DIRECTW) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int -pipe_poll(fp, events, active_cred, td) - struct file *fp; - int events; - struct ucred *active_cred; - struct thread *td; +pipe_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) { struct pipe *rpipe; struct pipe *wpipe; int levents, revents; #ifdef MAC int error; #endif revents = 0; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0)) revents |= events & (POLLIN | POLLRDNORM); if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents && fp->f_seqcount == rpipe->pipe_wgen) events |= POLLINIGNEOF; if ((events & POLLINIGNEOF) == 0) { if (rpipe->pipe_state & PIPE_EOF) { revents |= (events & (POLLIN | POLLRDNORM)); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; } } if (revents == 0) { if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int -pipe_stat(fp, ub, active_cred, td) - struct file *fp; - struct stat *ub; - struct ucred *active_cred; - struct thread *td; +pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, + struct thread *td) { struct pipe *pipe; int new_unr; #ifdef MAC int error; #endif pipe = fp->f_data; PIPE_LOCK(pipe); #ifdef MAC error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); if (error) { PIPE_UNLOCK(pipe); return (error); } #endif /* For named pipes ask the underlying filesystem. */ if (pipe->pipe_state & PIPE_NAMED) { PIPE_UNLOCK(pipe); return (vnops.fo_stat(fp, ub, active_cred, td)); } /* * Lazily allocate an inode number for the pipe. Most pipe * users do not call fstat(2) on the pipe, which means that * postponing the inode allocation until it is must be * returned to userland is useful. If alloc_unr failed, * assign st_ino zero instead of returning an error. * Special pipe_ino values: * -1 - not yet initialized; * 0 - alloc_unr failed, return 0 as st_ino forever. */ if (pipe->pipe_ino == (ino_t)-1) { new_unr = alloc_unr(pipeino_unr); if (new_unr != -1) pipe->pipe_ino = new_unr; else pipe->pipe_ino = 0; } PIPE_UNLOCK(pipe); bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_state & PIPE_DIRECTW) ub->st_size = pipe->pipe_map.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = howmany(ub->st_size, ub->st_blksize); ub->st_atim = pipe->pipe_atime; ub->st_mtim = pipe->pipe_mtime; ub->st_ctim = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; ub->st_dev = pipedev_ino; ub->st_ino = pipe->pipe_ino; /* * Left as 0: st_nlink, st_rdev, st_flags, st_gen. */ return (0); } /* ARGSUSED */ static int -pipe_close(fp, td) - struct file *fp; - struct thread *td; +pipe_close(struct file *fp, struct thread *td) { if (fp->f_vnode != NULL) return vnops.fo_close(fp, td); fp->f_ops = &badfileops; pipe_dtor(fp->f_data); fp->f_data = NULL; return (0); } static int pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vn_chmod(fp, mode, active_cred, td); else error = invfo_chmod(fp, mode, active_cred, td); return (error); } static int -pipe_chown(fp, uid, gid, active_cred, td) - struct file *fp; - uid_t uid; - gid_t gid; - struct ucred *active_cred; - struct thread *td; +pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, + struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_state & PIPE_NAMED) error = vn_chown(fp, uid, gid, active_cred, td); else error = invfo_chown(fp, uid, gid, active_cred, td); return (error); } static int pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct pipe *pi; if (fp->f_type == DTYPE_FIFO) return (vn_fill_kinfo(fp, kif, fdp)); kif->kf_type = KF_TYPE_PIPE; pi = fp->f_data; kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; return (0); } static void -pipe_free_kmem(cpipe) - struct pipe *cpipe; +pipe_free_kmem(struct pipe *cpipe) { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_map.cnt = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; } #endif } /* * shutdown the pipe */ static void -pipeclose(cpipe) - struct pipe *cpipe; +pipeclose(struct pipe *cpipe) { struct pipepair *pp; struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); pp = cpipe->pipe_pair; pipeselwakeup(cpipe); /* * If the other side is blocked, wake it up saying that * we want to close it down. */ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present == PIPE_ACTIVE) { pipeselwakeup(ppipe); ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = PIPE_CLOSING; pipeunlock(cpipe); /* * knlist_clear() may sleep dropping the PIPE_MTX. Set the * PIPE_FINALIZED, that allows other end to free the * pipe_pair, only after the knotes are completely dismantled. */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. */ if (ppipe->pipe_present == PIPE_FINALIZED) { PIPE_UNLOCK(cpipe); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &pipe_nfiltops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &pipe_nfiltops; return (0); } cpipe = fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = PIPE_PEER(cpipe); break; default: PIPE_UNLOCK(cpipe); return (EINVAL); } kn->kn_hook = cpipe; knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = kn->kn_hook; PIPE_LOCK(cpipe); knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_hook; struct pipe *wpipe = rpipe->pipe_peer; int ret; PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; return (1); } ret = kn->kn_data > 0; return ret; } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe; wpipe = kn->kn_hook; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; return (1); } kn->kn_data = (wpipe->pipe_buffer.size > 0) ? (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF; if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; return (kn->kn_data >= PIPE_BUF); } static void filt_pipedetach_notsup(struct knote *kn) { } static int filt_pipenotsup(struct knote *kn, long hint) { return (0); } Index: head/sys/kern/sysv_msg.c =================================================================== --- head/sys/kern/sysv_msg.c (revision 334485) +++ head/sys/kern/sysv_msg.c (revision 334486) @@ -1,1921 +1,1916 @@ /*- * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_msg, "System V message queues support"); static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static int msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); static void msq_remove(struct msqid_kernel *); static struct prison *msg_find_prison(struct ucred *); static int msq_prison_cansee(struct prison *, struct msqid_kernel *); static int msg_prison_check(void *, void *); static int msg_prison_set(void *, void *); static int msg_prison_get(void *, void *); static int msg_prison_remove(void *, void *); static void msg_prison_cleanup(struct prison *); #ifdef MSG_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) (void)0 #endif static void msg_freehdr(struct msg *msghdr); #ifndef MSGSSZ #define MSGSSZ 8 /* Each segment must be 2^N long */ #endif #ifndef MSGSEG #define MSGSEG 2048 /* must be less than 32767 */ #endif #define MSGMAX (MSGSSZ*MSGSEG) #ifndef MSGMNB #define MSGMNB 2048 /* max # of bytes in a queue */ #endif #ifndef MSGMNI #define MSGMNI 40 #endif #ifndef MSGTQL #define MSGTQL 40 #endif /* * Based on the configuration parameters described in an SVR2 (yes, two) * config(1m) man page. * * Each message is broken up and stored in segments that are msgssz bytes * long. For efficiency reasons, this should be a power of two. Also, * it doesn't make sense if it is less than 8 or greater than about 256. * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of * two between 8 and 1024 inclusive (and panic's if it isn't). */ struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; /* * macros to convert between msqid_ds's and msqid's. * (specific to this implementation) */ #define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) #define MSQID_IX(id) ((id) & 0xffff) #define MSQID_SEQ(id) (((id) >> 16) & 0xffff) /* * The rest of this file is specific to this particular implementation. */ struct msgmap { short next; /* next segment in buffer */ /* -1 -> available */ /* 0..(MSGSEG-1) -> index of next segment */ }; #define MSG_LOCKED 01000 /* Is this msqid_ds locked? */ static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct msg *free_msghdrs;/* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. */ static unsigned msg_prison_slot;/* prison OSD slot */ static struct syscall_helper_data msg_syscalls[] = { SYSCALL_INIT_HELPER(msgctl), SYSCALL_INIT_HELPER(msgget), SYSCALL_INIT_HELPER(msgsnd), SYSCALL_INIT_HELPER(msgrcv), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(msgsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data msg32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_msgctl), SYSCALL32_INIT_HELPER(freebsd32_msgsnd), SYSCALL32_INIT_HELPER(freebsd32_msgrcv), SYSCALL32_INIT_HELPER_COMPAT(msgget), SYSCALL32_INIT_HELPER(freebsd32_msgsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl), #endif SYSCALL_INIT_LAST }; #endif static int msginit() { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = msg_prison_check, [PR_METHOD_SET] = msg_prison_set, [PR_METHOD_GET] = msg_prison_get, [PR_METHOD_REMOVE] = msg_prison_remove, }; msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG, M_WAITOK | M_ZERO); /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... */ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, msginfo.msgssz)); panic("msginfo.msgssz not a small power of 2"); } if (msginfo.msgseg > 32767) { DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg)); panic("msginfo.msgseg > 32767"); } for (i = 0; i < msginfo.msgseg; i++) { if (i > 0) msgmaps[i-1].next = i; msgmaps[i].next = -1; /* implies entry is available */ } free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; for (i = 0; i < msginfo.msgtql; i++) { msghdrs[i].msg_type = 0; if (i > 0) msghdrs[i-1].msg_next = &msghdrs[i]; msghdrs[i].msg_next = NULL; #ifdef MAC mac_sysvmsg_init(&msghdrs[i]); #endif } free_msghdrs = &msghdrs[0]; for (i = 0; i < msginfo.msgmni; i++) { msqids[i].u.msg_qbytes = 0; /* implies entry is available */ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */ msqids[i].u.msg_perm.mode = 0; #ifdef MAC mac_sysvmsq_init(&msqids[i]); #endif } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); /* Set current prisons according to their allow.sysvipc. */ msg_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(msg_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, msg_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(msg_prison_slot); prison_lock(pr); if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) { (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(msg_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(msg32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int msgunload() { struct msqid_kernel *msqkptr; int msqid; #ifdef MAC int i; #endif syscall_helper_unregister(msg_syscalls); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(msg32_syscalls); #endif for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 || (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) break; } if (msqid != msginfo.msgmni) return (EBUSY); if (msg_prison_slot != 0) osd_jail_deregister(msg_prison_slot); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_sysvmsg_destroy(&msghdrs[i]); for (msqid = 0; msqid < msginfo.msgmni; msqid++) mac_sysvmsq_destroy(&msqids[msqid]); #endif free(msgpool, M_MSG); free(msgmaps, M_MSG); free(msghdrs, M_MSG); free(msqids, M_MSG); mtx_destroy(&msq_mtx); return (0); } static int sysvmsg_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = msginit(); if (error != 0) msgunload(); break; case MOD_UNLOAD: error = msgunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvmsg_mod = { "sysvmsg", &sysvmsg_modload, NULL }; DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST); MODULE_VERSION(sysvmsg, 1); static void -msg_freehdr(msghdr) - struct msg *msghdr; +msg_freehdr(struct msg *msghdr) { while (msghdr->msg_ts > 0) { short next; if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) panic("msghdr->msg_spot out of range"); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } if (msghdr->msg_spot != -1) panic("msghdr->msg_spot != -1"); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; #ifdef MAC mac_sysvmsg_cleanup(msghdr); #endif } static void msq_remove(struct msqid_kernel *msqkptr) { struct msg *msghdr; racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes); crfree(msqkptr->cred); msqkptr->cred = NULL; /* Free the message headers */ msghdr = msqkptr->u.__msg_first; while (msghdr != NULL) { struct msg *msghdr_tmp; /* Free the segments of each message */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } if (msqkptr->u.msg_cbytes != 0) panic("msg_cbytes is screwed up"); if (msqkptr->u.msg_qnum != 0) panic("msg_qnum is screwed up"); msqkptr->u.msg_qbytes = 0; /* Mark it as free */ #ifdef MAC mac_sysvmsq_cleanup(msqkptr); #endif wakeup(msqkptr); } static struct prison * msg_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); return rpr; } static int msq_prison_cansee(struct prison *rpr, struct msqid_kernel *msqkptr) { if (msqkptr->cred == NULL || !(rpr == msqkptr->cred->cr_prison || prison_ischild(rpr, msqkptr->cred->cr_prison))) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; int cmd; struct msqid_ds *buf; }; #endif int sys_msgctl(struct thread *td, struct msgctl_args *uap) { int msqid = uap->msqid; int cmd = uap->cmd; struct msqid_ds msqbuf; int error; DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf)); if (cmd == IPC_SET && (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0) return (error); error = kern_msgctl(td, msqid, cmd, &msqbuf); if (cmd == IPC_STAT && error == 0) error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds)); return (error); } int -kern_msgctl(td, msqid, cmd, msqbuf) - struct thread *td; - int msqid; - int cmd; - struct msqid_ds *msqbuf; +kern_msgctl(struct thread *td, int msqid, int cmd, struct msqid_ds *msqbuf) { int rval, error, msqix; struct msqid_kernel *msqkptr; struct prison *rpr; rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_CMD(cmd); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such msqid\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } error = msq_prison_cansee(rpr, msqkptr); if (error != 0) { DPRINTF(("requester can't see prison\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) goto done2; #endif error = 0; rval = 0; switch (cmd) { case IPC_RMID: { #ifdef MAC struct msg *msghdr; #endif if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; #ifdef MAC /* * Check that the thread has MAC access permissions to * individual msghdrs. Note: We need to do this in a * separate loop because the actual loop alters the * msq/msghdr info as it progresses, and there is no going * back if half the way through we discover that the * thread cannot free a certain msghdr. The msq will get * into an inconsistent state. */ for (msghdr = msqkptr->u.__msg_first; msghdr != NULL; msghdr = msghdr->msg_next) { error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr); if (error != 0) goto done2; } #endif msq_remove(msqkptr); } break; case IPC_SET: AUDIT_ARG_SVIPC_PERM(&msqbuf->msg_perm); if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) { error = priv_check(td, PRIV_IPC_MSGSIZE); if (error) goto done2; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { DPRINTF(("can't increase msg_qbytes beyond %d" "(truncating)\n", msginfo.msgmnb)); msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ } if (msqbuf->msg_qbytes == 0) { DPRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* non-standard errno! */ goto done2; } msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqkptr->u.msg_qbytes = msqbuf->msg_qbytes; msqkptr->u.msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } *msqbuf = msqkptr->u; if (td->td_ucred->cr_prison != msqkptr->cred->cr_prison) msqbuf->msg_perm.key = IPC_PRIVATE; break; default: DPRINTF(("invalid command %d\n", cmd)); error = EINVAL; goto done2; } if (error == 0) td->td_retval[0] = rval; done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgget_args { key_t key; int msgflg; }; #endif int sys_msgget(struct thread *td, struct msgget_args *uap) { int msqid, error = 0; int key = uap->key; int msgflg = uap->msgflg; struct ucred *cred = td->td_ucred; struct msqid_kernel *msqkptr = NULL; DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (msg_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&msq_mtx); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->cred != NULL && msqkptr->cred->cr_prison == cred->cr_prison && msqkptr->u.msg_perm.key == key) break; } if (msqid < msginfo.msgmni) { DPRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } AUDIT_ARG_SVIPC_ID(IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm)); if ((error = ipcperm(td, &msqkptr->u.msg_perm, msgflg & 0700))) { DPRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqget(cred, msqkptr); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. */ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes == 0 && (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { DPRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NMSGQ, 1); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; msqkptr->u.msg_perm.uid = cred->cr_uid; msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); msqkptr->cred = crhold(cred); /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.__msg_first = NULL; msqkptr->u.__msg_last = NULL; msqkptr->u.msg_cbytes = 0; msqkptr->u.msg_qnum = 0; msqkptr->u.msg_qbytes = msginfo.msgmnb; msqkptr->u.msg_lspid = 0; msqkptr->u.msg_lrpid = 0; msqkptr->u.msg_stime = 0; msqkptr->u.msg_rtime = 0; msqkptr->u.msg_ctime = time_second; #ifdef MAC mac_sysvmsq_create(cred, msqkptr); #endif AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: /* Construct the unique msqid */ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm); done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgsnd_args { int msqid; const void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; int msgflg; }; #endif int kern_msgsnd(struct thread *td, int msqid, const void *msgp, size_t msgsz, int msgflg, long mtype) { int msqix, segs_needed, error = 0; struct msqid_kernel *msqkptr; struct msg *msghdr; struct prison *rpr; short next; #ifdef RACCT size_t saved_msgsz = 0; #endif rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); mtx_lock(&msq_mtx); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); error = EINVAL; goto done2; } msqkptr = &msqids[msqix]; AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = msq_prison_cansee(rpr, msqkptr))) { DPRINTF(("requester can't see prison\n")); goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } saved_msgsz = msgsz; if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } PROC_UNLOCK(td->td_proc); } #endif segs_needed = howmany(msgsz, msginfo.msgssz); DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; goto done3; } if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { DPRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { DPRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { DPRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto done3; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { DPRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ DPRINTF(("we own the msqid_ds\n")); msqkptr->u.msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; if (error == EWOULDBLOCK) { DPRINTF(("msgsnd: timed out\n")); continue; } if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto done3; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done3; } } else { DPRINTF(("got all the resources that we need\n")); break; } } /* * We have the resources that we need. * Make sure! */ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) panic("msg_perm.mode & MSG_LOCKED"); if (segs_needed > nfree_msgmaps) panic("segs_needed > nfree_msgmaps"); if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) panic("msgsz + msg_cbytes > msg_qbytes"); if (free_msghdrs == NULL) panic("no more msghdrs"); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) panic("msqid_ds is already locked"); msqkptr->u.msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; msghdr->msg_type = mtype; #ifdef MAC /* * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here * immediately? Or, should it be checked just before the msg is * enqueued in the msgq (as it is done now)? */ mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr); #endif /* * Allocate space for the message */ while (segs_needed > 0) { if (nfree_msgmaps <= 0) panic("not enough msgmaps"); if (free_msgmaps == -1) panic("nil free_msgmaps"); next = free_msgmaps; if (next <= -1) panic("next too low #1"); if (next >= msginfo.msgseg) panic("next out of range #1"); DPRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto done3; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; if (next <= -1) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); mtx_unlock(&msq_mtx); if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz], tlen)) != 0) { mtx_lock(&msq_mtx); DPRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); goto done3; } mtx_lock(&msq_mtx); msgsz -= tlen; msgp = (const char *)msgp + tlen; next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* * We've got the message. Unlock the msqid_ds. */ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. */ if (msqkptr->u.msg_qbytes == 0) { msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; goto done3; } #ifdef MAC /* * Note: Since the task/thread allocates the msghdr and usually * primes it with its own MAC label, for a majority of policies, it * won't be necessary to check whether the msghdr has access * permissions to the msgq. The mac_sysvmsq_check_msqsnd check would * suffice in that case. However, this hook may be required where * individual policies derive a non-identical label for the msghdr * from the current thread label and may want to check the msghdr * enqueue permissions, along with read/write permissions to the * msgq. */ error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr); if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); goto done3; } #endif /* * Put the message into the queue */ if (msqkptr->u.__msg_first == NULL) { msqkptr->u.__msg_first = msghdr; msqkptr->u.__msg_last = msghdr; } else { msqkptr->u.__msg_last->msg_next = msghdr; msqkptr->u.__msg_last = msghdr; } msqkptr->u.__msg_last->msg_next = NULL; msqkptr->u.msg_cbytes += msghdr->msg_ts; msqkptr->u.msg_qnum++; msqkptr->u.msg_lspid = td->td_proc->p_pid; msqkptr->u.msg_stime = time_second; wakeup(msqkptr); td->td_retval[0] = 0; done3: #ifdef RACCT if (racct_enable && error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); PROC_UNLOCK(td->td_proc); } #endif done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgsnd(struct thread *td, struct msgsnd_args *uap) { int error; long mtype; DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgflg)); if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) { DPRINTF(("error %d copying the message type\n", error)); return (error); } return (kern_msgsnd(td, uap->msqid, (const char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgflg, mtype)); } #ifndef _SYS_SYSPROTO_H_ struct msgrcv_args { int msqid; void *msgp; size_t msgsz; long msgtyp; int msgflg; }; #endif /* XXX msgp is actually mtext. */ int kern_msgrcv(struct thread *td, int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg, long *mtype) { size_t len; struct msqid_kernel *msqkptr; struct msg *msghdr; struct prison *rpr; int msqix, error = 0; short next; rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = msq_prison_cansee(rpr, msqkptr))) { DPRINTF(("requester can't see prison\n")); goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqkptr->u.__msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("first message on the queue " "is too big (want %zu, got %d)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv(td->td_ucred, msghdr); if (error != 0) goto done2; #endif if (msqkptr->u.__msg_first == msqkptr->u.__msg_last) { msqkptr->u.__msg_first = NULL; msqkptr->u.__msg_last = NULL; } else { msqkptr->u.__msg_first = msghdr->msg_next; if (msqkptr->u.__msg_first == NULL) panic("msg_first/last screwed up #1"); } } } else { struct msg *previous; struct msg **prev; previous = NULL; prev = &(msqkptr->u.__msg_first); while ((msghdr = *prev) != NULL) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? * Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! */ if (msgtyp == msghdr->msg_type || msghdr->msg_type <= -msgtyp) { DPRINTF(("found message type %ld, " "requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("requested message " "on the queue is too big " "(want %zu, got %hu)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv( td->td_ucred, msghdr); if (error != 0) goto done2; #endif *prev = msghdr->msg_next; if (msghdr == msqkptr->u.__msg_last) { if (previous == NULL) { if (prev != &msqkptr->u.__msg_first) panic("__msg_first/last screwed up #2"); msqkptr->u.__msg_first = NULL; msqkptr->u.__msg_last = NULL; } else { if (prev == &msqkptr->u.__msg_first) panic("__msg_first/last screwed up #3"); msqkptr->u.__msg_last = previous; } } break; } previous = msghdr; prev = &(msghdr->msg_next); } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); /* The SVID says to return ENOMSG. */ error = ENOMSG; goto done2; } /* * Wait for something to happen */ DPRINTF(("msgrcv: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); if (error != 0) { DPRINTF(("msgrcv: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0 || msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts); /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; *mtype = msghdr->msg_type; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; if (next <= -1) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); mtx_unlock(&msq_mtx); error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen); mtx_lock(&msq_mtx); if (error != 0) { DPRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } msgp = (char *)msgp + tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. */ msg_freehdr(msghdr); wakeup(msqkptr); td->td_retval[0] = msgsz; done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgrcv(struct thread *td, struct msgrcv_args *uap) { int error; long mtype; DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg)); if ((error = kern_msgrcv(td, uap->msqid, (char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0) DPRINTF(("error %d copying the message type\n", error)); return (error); } static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { struct msqid_kernel tmsqk; #ifdef COMPAT_FREEBSD32 struct msqid_kernel32 tmsqk32; #endif struct prison *pr, *rpr; void *outaddr; size_t outsize; int error, i; pr = req->td->td_ucred->cr_prison; rpr = msg_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < msginfo.msgmni; i++) { mtx_lock(&msq_mtx); if (msqids[i].u.msg_qbytes == 0 || rpr == NULL || msq_prison_cansee(rpr, &msqids[i]) != 0) bzero(&tmsqk, sizeof(tmsqk)); else { tmsqk = msqids[i]; if (tmsqk.cred->cr_prison != pr) tmsqk.u.msg_perm.key = IPC_PRIVATE; } mtx_unlock(&msq_mtx); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { bzero(&tmsqk32, sizeof(tmsqk32)); freebsd32_ipcperm_out(&tmsqk.u.msg_perm, &tmsqk32.u.msg_perm); /* Don't copy u.msg_first or u.msg_last */ CP(tmsqk, tmsqk32, u.msg_cbytes); CP(tmsqk, tmsqk32, u.msg_qnum); CP(tmsqk, tmsqk32, u.msg_qbytes); CP(tmsqk, tmsqk32, u.msg_lspid); CP(tmsqk, tmsqk32, u.msg_lrpid); CP(tmsqk, tmsqk32, u.msg_stime); CP(tmsqk, tmsqk32, u.msg_rtime); CP(tmsqk, tmsqk32, u.msg_ctime); /* Don't copy label or cred */ outaddr = &tmsqk32; outsize = sizeof(tmsqk32); } else #endif { /* Don't leak kernel pointers */ tmsqk.u.__msg_first = NULL; tmsqk.u.__msg_last = NULL; tmsqk.label = NULL; tmsqk.cred = NULL; /* * XXX: some padding also exists, but we take care to * allocate our pool of msqid_kernel structs with * zeroed memory so this should be OK. */ outaddr = &tmsqk; outsize = sizeof(tmsqk); } error = SYSCTL_OUT(req, outaddr, outsize); if (error != 0) break; } return (error); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "Maximum message size"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0, "Number of message queue identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0, "Maximum number of bytes in a queue"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0, "Maximum number of messages in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_msqids, "", "Array of struct msqid_kernel for each potential message queue"); static int msg_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvmsg is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, msg_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int msg_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvmsg controls which jail is the root of the associated msgs (this * jail or same as the parent), or if the feature is available at all. */ if (vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, msg_prison_slot); if (orpr != NULL) osd_jail_del(pr, msg_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) msg_prison_cleanup(pr); /* Disable all child jails as well. */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, msg_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, msg_prison_slot); prison_unlock(tpr); if (trpr == tpr) msg_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, msg_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(msg_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, msg_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) msg_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, msg_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, msg_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) msg_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int msg_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvmsg based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvmsg", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int msg_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); if (rpr == pr) msg_prison_cleanup(pr); return (0); } static void msg_prison_cleanup(struct prison *pr) { struct msqid_kernel *msqkptr; int i; /* Remove any msqs that belong to this jail. */ mtx_lock(&msq_mtx); for (i = 0; i < msginfo.msgmni; i++) { msqkptr = &msqids[i]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->cred != NULL && msqkptr->cred->cr_prison == pr) msq_remove(msqkptr); } mtx_unlock(&msq_mtx); } SYSCTL_JAIL_PARAM_SYS_NODE(sysvmsg, CTLFLAG_RW, "SYSV message queues"); #ifdef COMPAT_FREEBSD32 int freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: return (freebsd7_freebsd32_msgctl(td, (struct freebsd7_freebsd32_msgctl_args *)&uap->a2)); case 2: return (freebsd32_msgsnd(td, (struct freebsd32_msgsnd_args *)&uap->a2)); case 3: return (freebsd32_msgrcv(td, (struct freebsd32_msgrcv_args *)&uap->a2)); default: return (sys_msgsys(td, (struct msgsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_msgctl(struct thread *td, struct freebsd7_freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32_old msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, __msg_first); PTRIN_CP(msqbuf32, msqbuf, __msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqbuf32, sizeof(msqbuf32)); freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, __msg_first); PTROUT_CP(msqbuf, msqbuf32, __msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } #endif int freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32 msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, __msg_first); PTRIN_CP(msqbuf32, msqbuf, __msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, __msg_first); PTROUT_CP(msqbuf, msqbuf32, __msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } int freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap) { const void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0) return (error); mtype = mtype32; return (kern_msgsnd(td, uap->msqid, (const char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgflg, mtype)); } int freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap) { void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = kern_msgrcv(td, uap->msqid, (char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); mtype32 = (int32_t)mtype; return (copyout(&mtype32, msgp, sizeof(mtype32))); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *msgcalls[] = { (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget, (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv }; /* * Entry point for all MSG calls. * * XXX actually varargs. * struct msgsys_args { * int which; * int a2; * int a3; * int a4; * int a5; * int a6; * } *uap; */ int sys_msgsys(struct thread *td, struct msgsys_args *uap) { int error; AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(msgcalls)) return (EINVAL); error = (*msgcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_msgctl_args { int msqid; int cmd; struct msqid_ds_old *buf; }; #endif int freebsd7_msgctl(struct thread *td, struct freebsd7_msgctl_args *uap) { struct msqid_ds_old msqold; struct msqid_ds msqbuf; int error; DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd, uap->buf)); if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqold, sizeof(msqold)); if (error) return (error); ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm); CP(msqold, msqbuf, __msg_first); CP(msqold, msqbuf, __msg_last); CP(msqold, msqbuf, msg_cbytes); CP(msqold, msqbuf, msg_qnum); CP(msqold, msqbuf, msg_qbytes); CP(msqold, msqbuf, msg_lspid); CP(msqold, msqbuf, msg_lrpid); CP(msqold, msqbuf, msg_stime); CP(msqold, msqbuf, msg_rtime); CP(msqold, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqold, sizeof(msqold)); ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm); CP(msqbuf, msqold, __msg_first); CP(msqbuf, msqold, __msg_last); CP(msqbuf, msqold, msg_cbytes); CP(msqbuf, msqold, msg_qnum); CP(msqbuf, msqold, msg_qbytes); CP(msqbuf, msqold, msg_lspid); CP(msqbuf, msqold, msg_lrpid); CP(msqbuf, msqold, msg_stime); CP(msqbuf, msqold, msg_rtime); CP(msqbuf, msqold, msg_ctime); error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old)); } return (error); } #undef CP #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ Index: head/sys/kern/vfs_cluster.c =================================================================== --- head/sys/kern/vfs_cluster.c (revision 334485) +++ head/sys/kern/vfs_cluster.c (revision 334486) @@ -1,1069 +1,1068 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_debug_cluster.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CLUSTERDEBUG) static int rcluster= 0; SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "Debug VFS clustering code"); #endif static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer"); static struct cluster_save *cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags); static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp); static void cluster_callback(struct buf *); static int write_behind = 1; SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "Cluster write-behind; 0: disable, 1: enable, 2: backed off"); static int read_max = 64; SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0, "Cluster read-ahead max block count"); static int read_min = 1; SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0, "Cluster read min block count"); /* * Read data to a buf, including read-ahead if we find this to be beneficial. * cluster_read replaces bread. */ int cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size, struct ucred *cred, long totread, int seqcount, int gbflags, struct buf **bpp) { struct buf *bp, *rbp, *reqbp; struct bufobj *bo; struct thread *td; daddr_t blkno, origblkno; int maxra, racluster; int error, ncontig; int i; error = 0; td = curthread; bo = &vp->v_bufobj; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; /* * Try to limit the amount of read-ahead by a few * ad-hoc parameters. This needs work!!! */ racluster = vp->v_mount->mnt_iosize_max / size; maxra = seqcount; maxra = min(read_max, maxra); maxra = min(nbuf/8, maxra); if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize) maxra = (filesize / size) - lblkno; /* * get the requested block */ error = getblkx(vp, lblkno, size, 0, 0, gbflags, &bp); if (error != 0) { *bpp = NULL; return (error); } gbflags &= ~GB_NOSPARSE; origblkno = lblkno; *bpp = reqbp = bp; /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seqcount) { return 0; } else if ((bp->b_flags & B_RAM) == 0) { return 0; } else { bp->b_flags &= ~B_RAM; BO_RLOCK(bo); for (i = 1; i < maxra; i++) { /* * Stop if the buffer does not exist or it * is invalid (about to go away?) */ rbp = gbincore(&vp->v_bufobj, lblkno+i); if (rbp == NULL || (rbp->b_flags & B_INVAL)) break; /* * Set another read-ahead mark so we know * to check again. (If we can lock the * buffer without waiting) */ if ((((i % racluster) == (racluster - 1)) || (i == (maxra - 1))) && (0 == BUF_LOCK(rbp, LK_EXCLUSIVE | LK_NOWAIT, NULL))) { rbp->b_flags |= B_RAM; BUF_UNLOCK(rbp); } } BO_RUNLOCK(bo); if (i >= maxra) { return 0; } lblkno += i; } reqbp = bp = NULL; /* * If it isn't in the cache, then get a chunk from * disk if sequential, otherwise just get the block. */ } else { off_t firstread = bp->b_offset; int nblks; long minread; KASSERT(bp->b_offset != NOOFFSET, ("cluster_read: no buffer offset")); ncontig = 0; /* * Adjust totread if needed */ minread = read_min * size; if (minread > totread) totread = minread; /* * Compute the total number of blocks that we should read * synchronously. */ if (firstread + totread > filesize) totread = filesize - firstread; nblks = howmany(totread, size); if (nblks > racluster) nblks = racluster; /* * Now compute the number of contiguous blocks. */ if (nblks > 1) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); /* * If this failed to map just do the original block. */ if (error || blkno == -1) ncontig = 0; } /* * If we have contiguous data available do a cluster * otherwise just read the requested block. */ if (ncontig) { /* Account for our first block. */ ncontig = min(ncontig + 1, nblks); if (ncontig < nblks) nblks = ncontig; bp = cluster_rbuild(vp, filesize, lblkno, blkno, size, nblks, gbflags, bp); lblkno += (bp->b_bufsize / size); } else { bp->b_flags |= B_RAM; bp->b_iocmd = BIO_READ; lblkno += 1; } } /* * handle the synchronous read so that it is available ASAP. */ if (bp) { if ((bp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(bp, 0); } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; } /* * If we have been doing sequential I/O, then do some read-ahead. */ while (lblkno < (origblkno + maxra)) { error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL); if (error) break; if (blkno == -1) break; /* * We could throttle ncontig here by maxra but we might as * well read the data if it is contiguous. We're throttled * by racluster anyway. */ if (ncontig) { ncontig = min(ncontig + 1, racluster); rbp = cluster_rbuild(vp, filesize, lblkno, blkno, size, ncontig, gbflags, NULL); lblkno += (rbp->b_bufsize / size); if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } } else { rbp = getblk(vp, lblkno, size, 0, 0, gbflags); lblkno += 1; if (rbp->b_flags & B_DELWRI) { bqrelse(rbp); continue; } rbp->b_flags |= B_ASYNC | B_RAM; rbp->b_iocmd = BIO_READ; rbp->b_blkno = blkno; } if (rbp->b_flags & B_CACHE) { rbp->b_flags &= ~B_ASYNC; bqrelse(rbp); continue; } if ((rbp->b_flags & B_CLUSTER) == 0) { vfs_busy_pages(rbp, 0); } rbp->b_flags &= ~B_INVAL; rbp->b_ioflags &= ~BIO_ERROR; if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL) BUF_KERNPROC(rbp); rbp->b_iooffset = dbtob(rbp->b_blkno); bstrategy(rbp); #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, rbp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; } if (reqbp) { /* * Like bread, always brelse() the buffer when * returning an error. */ error = bufwait(reqbp); if (error != 0) { brelse(reqbp); *bpp = NULL; } } return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run, int gbflags, struct buf *fbp) { struct buf *bp, *tbp; daddr_t bn; off_t off; long tinc, tsize; int i, inc, j, k, toff; KASSERT(size == vp->v_mount->mnt_stat.f_iosize, ("cluster_rbuild: size %ld != f_iosize %jd\n", size, (intmax_t)vp->v_mount->mnt_stat.f_iosize)); /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } if (fbp) { tbp = fbp; tbp->b_iocmd = BIO_READ; } else { tbp = getblk(vp, lbn, size, 0, 0, gbflags); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_flags |= B_ASYNC | B_RAM; tbp->b_iocmd = BIO_READ; } tbp->b_blkno = blkno; if( (tbp->b_flags & B_MALLOC) || ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = trypbuf(&cluster_pbuf_freecnt); if (bp == NULL) return tbp; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. */ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO; if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; } else { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } bp->b_iocmd = BIO_READ; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; bp->b_offset = tbp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i == 0) { VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); vfs_drain_busy_pages(tbp); vm_object_pip_add(tbp->b_bufobj->bo_object, tbp->b_npages); for (k = 0; k < tbp->b_npages; k++) vm_page_sbusy(tbp->b_pages[k]); VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); } else { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > vp->v_mount->mnt_iosize_max) { break; } tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT | (gbflags & GB_UNMAPPED)); /* Don't wait around for locked bufs. */ if (tbp == NULL) break; /* * Stop scanning if the buffer is fully valid * (marked B_CACHE), or locked (may be doing a * background write), or if the buffer is not * VMIO backed. The clustering code can only deal * with VMIO-backed buffers. The bo lock is not * required for the BKGRDINPROG check since it * can not be set without the buf lock. */ if ((tbp->b_vflags & BV_BKGRDINPROG) || (tbp->b_flags & B_CACHE) || (tbp->b_flags & B_VMIO) == 0) { bqrelse(tbp); break; } /* * The buffer must be completely invalid in order to * take part in the cluster. If it is partially valid * then we stop. */ off = tbp->b_offset; tsize = size; VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); for (j = 0; tsize > 0; j++) { toff = off & PAGE_MASK; tinc = tsize; if (toff + tinc > PAGE_SIZE) tinc = PAGE_SIZE - toff; VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object); if ((tbp->b_pages[j]->valid & vm_page_bits(toff, tinc)) != 0) break; if (vm_page_xbusied(tbp->b_pages[j])) break; vm_object_pip_add(tbp->b_bufobj->bo_object, 1); vm_page_sbusy(tbp->b_pages[j]); off += tinc; tsize -= tinc; } if (tsize > 0) { clean_sbusy: vm_object_pip_add(tbp->b_bufobj->bo_object, -j); for (k = 0; k < j; k++) vm_page_sunbusy(tbp->b_pages[k]); VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); bqrelse(tbp); break; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); /* * Set a read-ahead mark as appropriate */ if ((fbp && (i == 1)) || (i == (run - 1))) tbp->b_flags |= B_RAM; /* * Set the buffer up for an async read (XXX should * we do this only if we do not wind up brelse()ing?). * Set the block number if it isn't set, otherwise * if it is make sure it matches the block number we * expect. */ tbp->b_flags |= B_ASYNC; tbp->b_iocmd = BIO_READ; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); goto clean_sbusy; } } /* * XXX fbp from caller may not be B_ASYNC, but we are going * to biodone() it in cluster_callback() anyway */ BUF_KERNPROC(tbp); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } if (m->valid == VM_PAGE_BITS_ALL) tbp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); /* * Don't inherit tbp->b_bufsize as it may be larger due to * a non-page-aligned size. Instead just aggregate using * 'size'. */ if (tbp->b_bcount != size) printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size); if (tbp->b_bufsize != size) printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size); bp->b_bcount += size; bp->b_bufsize += size; } /* * Fully valid pages in the cluster are already good and do not need * to be re-read from disk. Replace the page with bogus_page */ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (j = 0; j < bp->b_npages; j++) { VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object); if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL) bp->b_pages[j] = bogus_page; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); if (bp->b_bufsize > bp->b_kvasize) panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ static void -cluster_callback(bp) - struct buf *bp; +cluster_callback(struct buf *bp) { struct buf *nbp, *tbp; int error = 0; /* * Must propagate errors to all the components. */ if (bp->b_ioflags & BIO_ERROR) error = bp->b_error; if (buf_mapped(bp)) { pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); } /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); tbp; tbp = nbp) { nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_ioflags |= BIO_ERROR; tbp->b_error = error; } else { tbp->b_dirtyoff = tbp->b_dirtyend = 0; tbp->b_flags &= ~B_INVAL; tbp->b_ioflags &= ~BIO_ERROR; /* * XXX the bdwrite()/bqrelse() issued during * cluster building clears B_RELBUF (see bqrelse() * comment). If direct I/O was specified, we have * to restore it here to allow the buffer and VM * to be freed. */ if (tbp->b_flags & B_DIRECT) tbp->b_flags |= B_RELBUF; } bufdone(tbp); } pbrelvp(bp); relpbuf(bp, &cluster_pbuf_freecnt); } /* * cluster_wbuild_wb: * * Implement modified write build for cluster. * * write_behind = 0 write behind disabled * write_behind = 1 write behind normal (default) * write_behind = 2 write behind backed-off */ static __inline int cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { int r = 0; switch (write_behind) { case 2: if (start_lbn < len) break; start_lbn -= len; /* FALLTHROUGH */ case 1: r = cluster_wbuild(vp, size, start_lbn, len, gbflags); /* FALLTHROUGH */ default: /* FALLTHROUGH */ break; } return(r); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount, int gbflags) { daddr_t lbn; int maxclen, cursize; int lblocksize; int async; if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; if (vp->v_type == VREG) { async = DOINGASYNC(vp); lblocksize = vp->v_mount->mnt_stat.f_iosize; } else { async = 0; lblocksize = bp->b_bufsize; } lbn = bp->b_lblkno; KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. * * Change to algorithm: only push previous cluster if * it was sequential from the point of view of the * seqcount heuristic, otherwise leave the buffer * intact so we can potentially optimize the I/O * later on in the buf_daemon or update daemon * flush. */ cursize = vp->v_lastw - vp->v_cstart + 1; if (((u_quad_t) bp->b_offset + lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async && seqcount > 0) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize, gbflags); } } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, bp, gbflags); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster * if *really* writing sequentially * in the logical file (seqcount > 1), * otherwise delay it in the hopes that * the low level disk driver can * optimize the write ordering. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, cursize, gbflags); } } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; return; } } } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. */ if ((vp->v_type == VREG) && ((u_quad_t) bp->b_offset + lblocksize) != filesize && (bp->b_blkno == bp->b_lblkno) && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out if seqcount tells us we * are operating sequentially, otherwise let the buf or * update daemon handle it. */ bdwrite(bp); if (seqcount > 1) { cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1, gbflags); } vp->v_clen = 0; vp->v_cstart = lbn + 1; } else if (vm_page_count_severe()) { /* * We are low on memory, get it going NOW */ bawrite(bp); } else { /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); } vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len, int gbflags) { struct buf *bp, *tbp; struct bufobj *bo; int i, j; int totalwritten = 0; int dbsize = btodb(size); if (!unmapped_buf_allowed) gbflags &= ~GB_UNMAPPED; bo = &vp->v_bufobj; while (len > 0) { /* * If the buffer is not delayed-write (i.e. dirty), or it * is delayed-write but either locked or inval, it cannot * partake in the clustered write. */ BO_LOCK(bo); if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); ++start_lbn; --len; continue; } if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) { ++start_lbn; --len; continue; } if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) { BUF_UNLOCK(tbp); ++start_lbn; --len; continue; } bremfree(tbp); tbp->b_flags &= ~B_DONE; /* * Extra memory in the buffer, punt on this buffer. * XXX we could handle this in most cases, but we would * have to push the extra memory down to after our max * possible cluster size and then potentially pull it back * up if the cluster was terminated prematurely--too much * hassle. */ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) != (B_CLUSTEROK | B_VMIO)) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || (len == 1) || ((bp = (vp->v_vflag & VV_MD) != 0 ? trypbuf(&cluster_pbuf_freecnt) : getpbuf(&cluster_pbuf_freecnt)) == NULL)) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } /* * We got a pbuf to make the cluster in. * so initialise it. */ TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; if (tbp->b_wcred != NOCRED) bp->b_wcred = crhold(tbp->b_wcred); bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; bp->b_offset = tbp->b_offset; /* * We are synthesizing a buffer out of vm_page_t's, but * if the block size is not page aligned then the starting * address may not be either. Inherit the b_data offset * from the original buffer. */ if ((gbflags & GB_UNMAPPED) == 0 || (tbp->b_flags & B_VMIO) == 0) { bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); } else { bp->b_data = unmapped_buf; } bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); /* * From this location in the file, scan forward to see * if there are buffers with adjacent data that need to * be written as well. */ for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { /* If not the first buffer */ /* * If the adjacent data is not even in core it * can't need to be written. */ BO_LOCK(bo); if ((tbp = gbincore(bo, start_lbn)) == NULL || (tbp->b_vflags & BV_BKGRDINPROG)) { BO_UNLOCK(bo); break; } /* * If it IS in core, but has different * characteristics, or is locked (which * means it could be undergoing a background * I/O or be in a weird state), then don't * cluster with it. */ if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) break; if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK | B_INVAL | B_DELWRI | B_NEEDCOMMIT)) != (B_DELWRI | B_CLUSTEROK | (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) || tbp->b_wcred != bp->b_wcred) { BUF_UNLOCK(tbp); break; } /* * Check that the combined cluster * would make sense with regard to pages * and would not be too large */ if ((tbp->b_bcount != size) || ((bp->b_blkno + (dbsize * i)) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { BUF_UNLOCK(tbp); break; } /* * Ok, it's passed all the tests, * so remove it from the free list * and mark it busy. We will use it. */ bremfree(tbp); tbp->b_flags &= ~B_DONE; } /* end of code for non-first buffers only */ /* * If the IO is via the VM then we do some * special VM hackery (yuck). Since the buffer's * block size may not be page-aligned it is possible * for a page to be shared between two buffers. We * have to get rid of the duplication when building * the cluster. */ if (tbp->b_flags & B_VMIO) { vm_page_t m; VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object); if (i == 0) { vfs_drain_busy_pages(tbp); } else { /* if not first buffer */ for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; if (vm_page_xbusied(m)) { VM_OBJECT_WUNLOCK( tbp->b_object); bqrelse(tbp); goto finishcluster; } } } for (j = 0; j < tbp->b_npages; j += 1) { m = tbp->b_pages[j]; vm_page_sbusy(m); vm_object_pip_add(m->object, 1); if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages - 1] != m)) { bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object); } bp->b_bcount += size; bp->b_bufsize += size; /* * If any of the clustered buffers have their * B_BARRIER flag set, transfer that request to * the cluster. */ bp->b_flags |= (tbp->b_flags & B_BARRIER); tbp->b_flags &= ~(B_DONE | B_BARRIER); tbp->b_flags |= B_ASYNC; tbp->b_ioflags &= ~BIO_ERROR; tbp->b_iocmd = BIO_WRITE; bundirty(tbp); reassignbuf(tbp); /* put on clean list */ bufobj_wref(tbp->b_bufobj); BUF_KERNPROC(tbp); buf_track(tbp, __func__); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } finishcluster: if (buf_mapped(bp)) { pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); } if (bp->b_bufsize > bp->b_kvasize) panic( "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", bp->b_bufsize, bp->b_kvasize); totalwritten += bp->b_bufsize; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. */ static struct cluster_save * cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags) { struct cluster_save *buflist; struct buf *bp; daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED, gbflags, &bp); buflist->bs_children[i] = bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buflist->bs_children[i] = bp = last_bp; if (bp->b_blkno == bp->b_lblkno) VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); buflist->bs_nchildren = i + 1; return (buflist); } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 334485) +++ head/sys/kern/vfs_vnops.c (revision 334486) @@ -1,2513 +1,2489 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2012 Konstantin Belousov * Copyright (c) 2013, 2014 The FreeBSD Foundation * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_stat_t vn_statfile; static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; struct fileops vnops = { .fo_read = vn_io_fault, .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = vn_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; static const int io_hold_cnt = 16; static int vn_io_fault_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW, &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); static int vn_io_fault_prefault = 0; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW, &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); static u_long vn_io_faults_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); /* * Returns true if vn_io_fault mode of handling the i/o request should * be used. */ static bool do_vn_io_fault(struct vnode *vp, struct uio *uio) { struct mount *mp; return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && (mp = vp->v_mount) != NULL && (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); } /* * Structure used to pass arguments to vn_io_fault1(), to do either * file- or vnode-based I/O calls. */ struct vn_io_fault_args { enum { VN_IO_FAULT_FOP, VN_IO_FAULT_VOP } kind; struct ucred *cred; int flags; union { struct fop_args_tag { struct file *fp; fo_rdwr_t *doio; } fop_args; struct vop_args_tag { struct vnode *vp; } vop_args; } args; }; static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td); int -vn_open(ndp, flagp, cmode, fp) - struct nameidata *ndp; - int *flagp, cmode; - struct file *fp; +vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); } /* * Common code for vnode open operations via a name lookup. * Lookup the vnode and invoke VOP_CREATE if needed. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp) { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int fmode, error; restart: fmode = *flagp; if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | O_EXCL | O_DIRECTORY)) return (EINVAL); else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; /* * Set NOCACHE to avoid flushing the cache when * rolling in many files at once. */ ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; if (!(vn_open_flags & VN_OPEN_NOAUDIT)) ndp->ni_cnd.cn_flags |= AUDITVNODE1; if (vn_open_flags & VN_OPEN_NOCAPCHECK) ndp->ni_cnd.cn_flags |= NOCAPCHECK; bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) #endif error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vput(ndp->ni_dvp); vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); return (error); } fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = ISOPEN | ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF; if (!(fmode & FWRITE)) ndp->ni_cnd.cn_flags |= LOCKSHARED; if (!(vn_open_flags & VN_OPEN_NOAUDIT)) ndp->ni_cnd.cn_flags |= AUDITVNODE1; if (vn_open_flags & VN_OPEN_NOCAPCHECK) ndp->ni_cnd.cn_flags |= NOCAPCHECK; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } error = vn_open_vnode(vp, fmode, cred, td, fp); if (error) goto bad; *flagp = fmode; return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); *flagp = fmode; ndp->ni_vp = NULL; return (error); } /* * Common code for vnode open operations once a vnode is located. * Check permissions, and call the VOP_OPEN routine. */ int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp) { accmode_t accmode; struct flock lf; int error, lock_flags, type; if (vp->v_type == VLNK) return (EMLINK); if (vp->v_type == VSOCK) return (EOPNOTSUPP); if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); accmode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) return (EISDIR); accmode |= VWRITE; } if (fmode & FREAD) accmode |= VREAD; if (fmode & FEXEC) accmode |= VEXEC; if ((fmode & O_APPEND) && (fmode & FWRITE)) accmode |= VAPPEND; #ifdef MAC if (fmode & O_CREAT) accmode |= VCREAT; if (fmode & O_VERIFY) accmode |= VVERIFY; error = mac_vnode_check_open(cred, vp, accmode); if (error) return (error); accmode &= ~(VCREAT | VVERIFY); #endif if ((fmode & O_CREAT) == 0) { if (accmode & VWRITE) { error = vn_writechk(vp); if (error) return (error); } if (accmode) { error = VOP_ACCESS(vp, accmode, cred, td); if (error) return (error); } } if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0) return (error); while ((fmode & (O_EXLOCK | O_SHLOCK)) != 0) { KASSERT(fp != NULL, ("open with flock requires fp")); if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) { error = EOPNOTSUPP; break; } lock_flags = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); if (error == 0) fp->f_flag |= FHASLOCK; vn_lock(vp, lock_flags | LK_RETRY); if (error != 0) break; if ((vp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; break; } /* * Another thread might have used this vnode as an * executable while the vnode lock was dropped. * Ensure the vnode is still able to be opened for * writing after the lock has been obtained. */ if ((accmode & VWRITE) != 0) error = vn_writechk(vp); break; } if (error != 0) { fp->f_flag |= FOPENFAILED; fp->f_vnode = vp; if (fp->f_ops == &badfileops) { fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; } vref(vp); } else if ((fmode & FWRITE) != 0) { VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ int vn_writechk(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (VOP_IS_TEXT(vp)) return (ETXTBSY); return (0); } /* * Vnode close call */ static int vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td, bool keep_ref) { struct mount *mp; int error, lock_flags; if (vp->v_type != VFIFO && (flags & FWRITE) == 0 && MNT_EXTENDED_SHARED(vp->v_mount)) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { VNASSERT(vp->v_writecount > 0, vp, ("vn_close: negative writecount")); VOP_ADD_WRITECOUNT(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } error = VOP_CLOSE(vp, flags, file_cred, td); if (keep_ref) VOP_UNLOCK(vp, 0); else vput(vp); vn_finished_write(mp); return (error); } int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td) { return (vn_close1(vp, flags, file_cred, td, false)); } /* * Heuristic to detect sequential operation. */ static int sequential_heuristic(struct uio *uio, struct file *fp) { ASSERT_VOP_LOCKED(fp->f_vnode, __func__); if (fp->f_flag & FRDAHEAD) return (fp->f_seqcount << IO_SEQSHIFT); /* * Offset 0 is handled specially. open() sets f_seqcount to 1 so * that the first I/O is normally considered to be slightly * sequential. Seeking to offset 0 doesn't change sequentiality * unless previous seeks have reduced f_seqcount to 0, in which * case offset 0 is not special. */ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || uio->uio_offset == fp->f_nextoff) { /* * f_seqcount is in units of fixed-size blocks so that it * depends mainly on the amount of sequential I/O and not * much on the number of sequential I/O's. The fixed size * of 16384 is hard-coded here since it is (not quite) just * a magic size that works well here. This size is more * closely related to the best I/O size for real disks than * to any block size used by software. */ fp->f_seqcount += howmany(uio->uio_resid, 16384); if (fp->f_seqcount > IO_SEQMAX) fp->f_seqcount = IO_SEQMAX; return (fp->f_seqcount << IO_SEQSHIFT); } /* Not sequential. Quickly draw-down sequentiality. */ if (fp->f_seqcount > 1) fp->f_seqcount = 1; else fp->f_seqcount = 0; return (0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; void *rl_cookie; struct vn_io_fault_args args; int error, lock_flags; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; if ((ioflg & IO_NODELOCKED) == 0) { if ((ioflg & IO_RANGELOCKED) == 0) { if (rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, offset, offset + len); } else { rl_cookie = vn_rangelock_wlock(vp, offset, offset + len); } } else rl_cookie = NULL; mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (MNT_SHARED_WRITES(mp) || ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; } else lock_flags = LK_SHARED; vn_lock(vp, lock_flags | LK_RETRY); } else rl_cookie = NULL; ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (do_vn_io_fault(vp, &auio)) { args.kind = VN_IO_FAULT_VOP; args.cred = cred; args.flags = ioflg; args.args.vop_args.vp = vp; error = vn_io_fault1(vp, &auio, &args, td); } else if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else /* if (rw == UIO_WRITE) */ { error = VOP_WRITE(vp, &auio, ioflg, cred); } } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp, 0); if (mp != NULL) vn_finished_write(mp); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int -vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, - file_cred, aresid, td) - enum uio_rw rw; - struct vnode *vp; - void *base; - size_t len; - off_t offset; - enum uio_seg segflg; - int ioflg; - struct ucred *active_cred; - struct ucred *file_cred; - size_t *aresid; - struct thread *td; +vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, + off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, + struct ucred *file_cred, size_t *aresid, struct thread *td) { int error = 0; ssize_t iaresid; do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. */ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; kern_yield(PRI_USER); } while (len); if (aresid) *aresid = len + iaresid; return (error); } off_t foffset_lock(struct file *fp, int flags) { struct mtx *mtxp; off_t res; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); #if OFF_MAX <= LONG_MAX /* * Caller only wants the current f_offset value. Assume that * the long and shorter integer types reads are atomic. */ if ((flags & FOF_NOLOCK) != 0) return (fp->f_offset); #endif /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOLOCK) == 0) { while (fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags, mtxp, PUSER -1, "vofflock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; } res = fp->f_offset; mtx_unlock(mtxp); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { struct mtx *mtxp; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); #if OFF_MAX <= LONG_MAX if ((flags & FOF_NOLOCK) != 0) { if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF) != 0) fp->f_nextoff = val; return; } #endif mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF) != 0) fp->f_nextoff = val; if ((flags & FOF_NOLOCK) == 0) { KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, ("Lost FOFFSET_LOCKED")); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; } mtx_unlock(mtxp); } void foffset_lock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) uio->uio_offset = foffset_lock(fp, flags); } void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) foffset_unlock(fp, uio->uio_offset, flags); } static int get_advice(struct file *fp, struct uio *uio) { struct mtx *mtxp; int ret; ret = POSIX_FADV_NORMAL; if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) return (ret); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (fp->f_advice != NULL && uio->uio_offset >= fp->f_advice->fa_start && uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) ret = fp->f_advice->fa_advice; mtx_unlock(mtxp); return (ret); } /* * File table vnode read routine. */ static int -vn_read(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - int flags; - struct thread *td; +vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; off_t orig_offset; int error, ioflag; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; advice = get_advice(fp, uio); vn_lock(vp, LK_SHARED | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* Disable read-ahead for random I/O. */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * read(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); return (error); } /* * File table vnode write routine. */ static int -vn_write(fp, uio, active_cred, flags, td) - struct file *fp; - struct uio *uio; - struct ucred *active_cred; - int flags; - struct thread *td; +vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, + struct thread *td) { struct vnode *vp; struct mount *mp; off_t orig_offset; int error, ioflag, lock_flags; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; mp = NULL; if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto unlock; advice = get_advice(fp, uio); if (MNT_SHARED_WRITES(mp) || (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) { lock_flags = LK_SHARED; } else { lock_flags = LK_EXCLUSIVE; } vn_lock(vp, lock_flags | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* XXX: Is this correct? */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); if (vp->v_type != VCHR) vn_finished_write(mp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * write(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); unlock: return (error); } /* * The vn_io_fault() is a wrapper around vn_read() and vn_write() to * prevent the following deadlock: * * Assume that the thread A reads from the vnode vp1 into userspace * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is * currently not resident, then system ends up with the call chain * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) * which establishes lock order vp1->vn_lock, then vp2->vn_lock. * If, at the same time, thread B reads from vnode vp2 into buffer buf2 * backed by the pages of vnode vp1, and some page in buf2 is not * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. * * To prevent the lock order reversal and deadlock, vn_io_fault() does * not allow page faults to happen during VOP_READ() or VOP_WRITE(). * Instead, it first tries to do the whole range i/o with pagefaults * disabled. If all pages in the i/o buffer are resident and mapped, * VOP will succeed (ignoring the genuine filesystem errors). * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do * i/o in chunks, with all pages in the chunk prefaulted and held * using vm_fault_quick_hold_pages(). * * Filesystems using this deadlock avoidance scheme should use the * array of the held pages from uio, saved in the curthread->td_ma, * instead of doing uiomove(). A helper function * vn_io_fault_uiomove() converts uiomove request into * uiomove_fromphys() over td_ma array. * * Since vnode locks do not cover the whole i/o anymore, rangelocks * make the current i/o request atomic with respect to other i/os and * truncations. */ /* * Decode vn_io_fault_args and perform the corresponding i/o. */ static int vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, struct thread *td) { int error, save; error = 0; save = vm_fault_disable_pagefaults(); switch (args->kind) { case VN_IO_FAULT_FOP: error = (args->args.fop_args.doio)(args->args.fop_args.fp, uio, args->cred, args->flags, td); break; case VN_IO_FAULT_VOP: if (uio->uio_rw == UIO_READ) { error = VOP_READ(args->args.vop_args.vp, uio, args->flags, args->cred); } else if (uio->uio_rw == UIO_WRITE) { error = VOP_WRITE(args->args.vop_args.vp, uio, args->flags, args->cred); } break; default: panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind, uio->uio_rw); } vm_fault_enable_pagefaults(save); return (error); } static int vn_io_fault_touch(char *base, const struct uio *uio) { int r; r = fubyte(base); if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) return (EFAULT); return (0); } static int vn_io_fault_prefault_user(const struct uio *uio) { char *base; const struct iovec *iov; size_t len; ssize_t resid; int error, i; KASSERT(uio->uio_segflg == UIO_USERSPACE, ("vn_io_fault_prefault userspace")); error = i = 0; iov = uio->uio_iov; resid = uio->uio_resid; base = iov->iov_base; len = iov->iov_len; while (resid > 0) { error = vn_io_fault_touch(base, uio); if (error != 0) break; if (len < PAGE_SIZE) { if (len != 0) { error = vn_io_fault_touch(base + len - 1, uio); if (error != 0) break; resid -= len; } if (++i >= uio->uio_iovcnt) break; iov = uio->uio_iov + i; base = iov->iov_base; len = iov->iov_len; } else { len -= PAGE_SIZE; base += PAGE_SIZE; resid -= PAGE_SIZE; } } return (error); } /* * Common code for vn_io_fault(), agnostic to the kind of i/o request. * Uses vn_io_fault_doio() to make the call to an actual i/o function. * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request * into args and call vn_io_fault1() to handle faults during the user * mode buffer accesses. */ static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td) { vm_page_t ma[io_hold_cnt + 2]; struct uio *uio_clone, short_uio; struct iovec short_iovec[1]; vm_page_t *prev_td_ma; vm_prot_t prot; vm_offset_t addr, end; size_t len, resid; ssize_t adv; int error, cnt, saveheld, prev_td_ma_cnt; if (vn_io_fault_prefault) { error = vn_io_fault_prefault_user(uio); if (error != 0) return (error); /* Or ignore ? */ } prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; /* * The UFS follows IO_UNIT directive and replays back both * uio_offset and uio_resid if an error is encountered during the * operation. But, since the iovec may be already advanced, * uio is still in an inconsistent state. * * Cache a copy of the original uio, which is advanced to the redo * point using UIO_NOCOPY below. */ uio_clone = cloneuio(uio); resid = uio->uio_resid; short_uio.uio_segflg = UIO_USERSPACE; short_uio.uio_rw = uio->uio_rw; short_uio.uio_td = uio->uio_td; error = vn_io_fault_doio(args, uio, td); if (error != EFAULT) goto out; atomic_add_long(&vn_io_faults_cnt, 1); uio_clone->uio_segflg = UIO_NOCOPY; uiomove(NULL, resid - uio->uio_resid, uio_clone); uio_clone->uio_segflg = uio->uio_segflg; saveheld = curthread_pflags_set(TDP_UIOHELD); prev_td_ma = td->td_ma; prev_td_ma_cnt = td->td_ma_cnt; while (uio_clone->uio_resid != 0) { len = uio_clone->uio_iov->iov_len; if (len == 0) { KASSERT(uio_clone->uio_iovcnt >= 1, ("iovcnt underflow")); uio_clone->uio_iov++; uio_clone->uio_iovcnt--; continue; } if (len > io_hold_cnt * PAGE_SIZE) len = io_hold_cnt * PAGE_SIZE; addr = (uintptr_t)uio_clone->uio_iov->iov_base; end = round_page(addr + len); if (end < addr) { error = EFAULT; break; } cnt = atop(end - trunc_page(addr)); /* * A perfectly misaligned address and length could cause * both the start and the end of the chunk to use partial * page. +2 accounts for such a situation. */ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, addr, len, prot, ma, io_hold_cnt + 2); if (cnt == -1) { error = EFAULT; break; } short_uio.uio_iov = &short_iovec[0]; short_iovec[0].iov_base = (void *)addr; short_uio.uio_iovcnt = 1; short_uio.uio_resid = short_iovec[0].iov_len = len; short_uio.uio_offset = uio_clone->uio_offset; td->td_ma = ma; td->td_ma_cnt = cnt; error = vn_io_fault_doio(args, &short_uio, td); vm_page_unhold_pages(ma, cnt); adv = len - short_uio.uio_resid; uio_clone->uio_iov->iov_base = (char *)uio_clone->uio_iov->iov_base + adv; uio_clone->uio_iov->iov_len -= adv; uio_clone->uio_resid -= adv; uio_clone->uio_offset += adv; uio->uio_resid -= adv; uio->uio_offset += adv; if (error != 0 || adv == 0) break; } td->td_ma = prev_td_ma; td->td_ma_cnt = prev_td_ma_cnt; curthread_pflags_restore(saveheld); out: free(uio_clone, M_IOV); return (error); } static int vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { fo_rdwr_t *doio; struct vnode *vp; void *rl_cookie; struct vn_io_fault_args args; int error; doio = uio->uio_rw == UIO_READ ? vn_read : vn_write; vp = fp->f_vnode; foffset_lock_uio(fp, uio, flags); if (do_vn_io_fault(vp, uio)) { args.kind = VN_IO_FAULT_FOP; args.args.fop_args.fp = fp; args.args.fop_args.doio = doio; args.cred = active_cred; args.flags = flags | FOF_OFFSET; if (uio->uio_rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } else if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) { /* For appenders, punt and lock the whole range. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } error = vn_io_fault1(vp, uio, &args, td); vn_rangelock_unlock(vp, rl_cookie); } else { error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); } foffset_unlock_uio(fp, uio, flags); return (error); } /* * Helper function to perform the requested uiomove operation using * the held pages for io->uio_iov[0].iov_base buffer instead of * copyin/copyout. Access to the pages with uiomove_fromphys() * instead of iov_base prevents page faults that could occur due to * pmap_collect() invalidating the mapping created by * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or * object cleanup revoking the write access from page mappings. * * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() * instead of plain uiomove(). */ int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) { struct uio transp_uio; struct iovec transp_iov[1]; struct thread *td; size_t adv; int error, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove(data, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); transp_iov[0].iov_base = data; transp_uio.uio_iov = &transp_iov[0]; transp_uio.uio_iovcnt = 1; if (xfersize > uio->uio_resid) xfersize = uio->uio_resid; transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; transp_uio.uio_offset = 0; transp_uio.uio_segflg = UIO_SYSSPACE; /* * Since transp_iov points to data, and td_ma page array * corresponds to original uio->uio_iov, we need to invert the * direction of the i/o operation as passed to * uiomove_fromphys(). */ switch (uio->uio_rw) { case UIO_WRITE: transp_uio.uio_rw = UIO_READ; break; case UIO_READ: transp_uio.uio_rw = UIO_WRITE; break; } transp_uio.uio_td = uio->uio_td; error = uiomove_fromphys(td->td_ma, ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, xfersize, &transp_uio); adv = xfersize - transp_uio.uio_resid; pgadv = (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; uio->uio_iov->iov_len -= adv; uio->uio_resid -= adv; uio->uio_offset += adv; return (error); } int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio) { struct thread *td; vm_offset_t iov_base; int cnt, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove_fromphys(ma, offset, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; iov_base = (vm_offset_t)uio->uio_iov->iov_base; switch (uio->uio_rw) { case UIO_WRITE: pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, offset, cnt); break; case UIO_READ: pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, cnt); break; } pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)(iov_base + cnt); uio->uio_iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; return (0); } /* * File table truncate routine. */ static int vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct vattr vattr; struct mount *mp; struct vnode *vp; void *rl_cookie; int error; vp = fp->f_vnode; /* * Lock the whole range for truncation. Otherwise split i/o * might happen partly before and partly after the truncation. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error) goto out; #endif error = vn_writechk(vp); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; if ((fp->f_flag & O_FSYNC) != 0) vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, fp->f_cred); } out: VOP_UNLOCK(vp, 0); vn_finished_write(mp); out1: vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * File table vnode stat routine. */ static int -vn_statfile(fp, sb, active_cred, td) - struct file *fp; - struct stat *sb; - struct ucred *active_cred; - struct thread *td; +vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred, + struct thread *td) { struct vnode *vp = fp->f_vnode; int error; vn_lock(vp, LK_SHARED | LK_RETRY); error = vn_stat(vp, sb, active_cred, fp->f_cred, td); VOP_UNLOCK(vp, 0); return (error); } /* * Stat a vnode; implementation for the stat syscall */ int vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred, struct ucred *file_cred, struct thread *td) { struct vattr vattr; struct vattr *vap; int error; u_short mode; AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_stat(active_cred, file_cred, vp); if (error) return (error); #endif vap = &vattr; /* * Initialize defaults for new and unusual fields, so that file * systems which don't support these fields don't need to know * about them. */ vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; vap->va_fsid = VNOVAL; vap->va_rdev = NODEV; error = VOP_GETATTR(vp, vap, active_cred); if (error) return (error); /* * Zero the spare stat fields */ bzero(sb, sizeof *sb); /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); } sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) return (EOVERFLOW); sb->st_size = vap->va_size; sb->st_atim = vap->va_atime; sb->st_mtim = vap->va_mtime; sb->st_ctim = vap->va_ctime; sb->st_birthtim = vap->va_birthtime; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Use miminum/default of PAGE_SIZE (e.g. for VCHR). */ sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize); sb->st_flags = vap->va_flags; if (priv_check(td, PRIV_VFS_GENERATION)) sb->st_gen = 0; else sb->st_gen = vap->va_gen; sb->st_blocks = vap->va_bytes / S_BLKSIZE; return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct vattr vattr; struct vnode *vp; int error; vp = fp->f_vnode; switch (vp->v_type) { case VDIR: case VREG: switch (com) { case FIONREAD: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, active_cred); VOP_UNLOCK(vp, 0); if (error == 0) *(int *)data = vattr.va_size - fp->f_offset; return (error); case FIONBIO: case FIOASYNC: return (0); default: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); } break; case VCHR: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); default: return (ENOTTY); } } /* * File table vnode poll routine. */ static int vn_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct vnode *vp; int error; vp = fp->f_vnode; #ifdef MAC vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp, 0); if (!error) #endif error = VOP_POLL(vp, events, fp->f_cred, td); return (error); } /* * Acquire the requested lock and then check for validity. LK_RETRY * permits vn_lock to return doomed vnodes. */ int _vn_lock(struct vnode *vp, int flags, char *file, int line) { int error; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vn_lock: no locktype")); VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count")); retry: error = VOP_LOCK1(vp, flags, file, line); flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */ KASSERT((flags & LK_RETRY) == 0 || error == 0, ("vn_lock: error %d incompatible with flags %#x", error, flags)); if ((flags & LK_RETRY) == 0) { if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) { VOP_UNLOCK(vp, 0); error = ENOENT; } } else if (error != 0) goto retry; return (error); } /* * File table vnode close routine. */ static int vn_closefile(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; int error; bool ref; vp = fp->f_vnode; fp->f_ops = &badfileops; ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE; error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); if (__predict_false(ref)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); vrele(vp); } return (error); } static bool vn_suspendable(struct mount *mp) { return (mp->mnt_op->vfs_susp_clean != NULL); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ static int vn_start_write_locked(struct mount *mp, int flags) { int error, mflags; mtx_assert(MNT_MTX(mp), MA_OWNED); error = 0; /* * Check on status of suspension. */ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || mp->mnt_susp_owner != curthread) { mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0) | (PUSER - 1); while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if (flags & V_NOWAIT) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); if (error) goto unlock; } } if (flags & V_XSLEEP) goto unlock; mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } int vn_start_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); if (!vn_suspendable(mp)) { if (vp != NULL || (flags & V_MNTREF) != 0) vfs_rel(mp); return (0); } /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL && (flags & V_MNTREF) == 0) MNT_REF(mp); return (vn_start_write_locked(mp, flags)); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); if (!vn_suspendable(mp)) { if (vp != NULL || (flags & V_MNTREF) != 0) vfs_rel(mp); return (0); } /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL && (flags & V_MNTREF) == 0) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP | ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0), "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. */ void vn_finished_write(struct mount *mp) { if (mp == NULL || !vn_suspendable(mp)) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_writeopcount--; if (mp->mnt_writeopcount < 0) panic("vn_finished_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_writeopcount <= 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(struct mount *mp) { if (mp == NULL || !vn_suspendable(mp)) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(struct mount *mp, int flags) { int error; MPASS(vn_suspendable(mp)); MNT_ILOCK(mp); if (mp->mnt_susp_owner == curthread) { MNT_IUNLOCK(mp); return (EALREADY); } while (mp->mnt_kern_flag & MNTK_SUSPEND) msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); /* * Unmount holds a write reference on the mount point. If we * own busy reference and drain for writers, we deadlock with * the reference draining in the unmount path. Callers of * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if * vfs_busy() reference is owned and caller is not in the * unmount context. */ if ((flags & VS_SKIP_UNMOUNT) != 0 && (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = curthread; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) vfs_write_resume(mp, 0); return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(struct mount *mp, int flags) { MPASS(vn_suspendable(mp)); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); mp->mnt_susp_owner = NULL; wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) VFS_SUSP_CLEAN(mp); } else if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); vn_start_write_locked(mp, 0); } else { MNT_IUNLOCK(mp); } } /* * Helper loop around vfs_write_suspend() for filesystem unmount VFS * methods. */ int vfs_write_suspend_umnt(struct mount *mp) { int error; MPASS(vn_suspendable(mp)); KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, ("vfs_write_suspend_umnt: recursed")); /* dounmount() already called vn_start_write(). */ for (;;) { vn_finished_write(mp); error = vfs_write_suspend(mp, 0); if (error != 0) { vn_start_write(NULL, &mp, V_WAIT); return (error); } MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) break; MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); curthread->td_pflags |= TDP_IGNSUSP; return (0); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(fp->f_vnode, kn)); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp, 0); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0); } return (error); } static int vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); } int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) { return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags, rvp)); } int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp) { struct mount *mp; int ltype, error; ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); mp = vp->v_mount; ltype = VOP_ISLOCKED(vp); KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, ("vn_vget_ino: vp not locked")); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, ltype | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (vp->v_iflag & VI_DOOMED) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); error = alloc(mp, alloc_arg, lkflags, rvp); vfs_unbusy(mp); if (*rvp != vp) vn_lock(vp, ltype | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { if (error == 0) { if (*rvp == vp) vunref(vp); else vput(*rvp); } error = ENOENT; } return (error); } int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td) { if (vp->v_type != VREG || td == NULL) return (0); if ((uoff_t)uio->uio_offset + uio->uio_resid > lim_cur(td, RLIMIT_FSIZE)) { PROC_LOCK(td->td_proc); kern_psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } return (0); } int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp, 0); #endif return (setfmode(td, active_cred, vp, mode)); } int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp, 0); #endif return (setfown(td, active_cred, vp, uid, gid)); } void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, 0); VM_OBJECT_WUNLOCK(object); } int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { struct vattr va; daddr_t bn, bnp; uint64_t bsize; off_t noff; int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("Wrong command %lu", cmd)); if (vn_lock(vp, LK_SHARED) != 0) return (EBADF); if (vp->v_type != VREG) { error = ENOTTY; goto unlock; } error = VOP_GETATTR(vp, &va, cred); if (error != 0) goto unlock; noff = *off; if (noff >= va.va_size) { error = ENXIO; goto unlock; } bsize = vp->v_mount->mnt_stat.f_iosize; for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) { error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); if (error == EOPNOTSUPP) { error = ENOTTY; goto unlock; } if ((bnp == -1 && cmd == FIOSEEKHOLE) || (bnp != -1 && cmd == FIOSEEKDATA)) { noff = bn * bsize; if (noff < *off) noff = *off; goto unlock; } } if (noff > va.va_size) noff = va.va_size; /* noff == va.va_size. There is an implicit hole at the end of file. */ if (cmd == FIOSEEKDATA) error = ENXIO; unlock: VOP_UNLOCK(vp, 0); if (error == 0) *off = noff; return (error); } int vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct ucred *cred; struct vnode *vp; struct vattr vattr; off_t foffset, size; int error, noneg; cred = td->td_ucred; vp = fp->f_vnode; foffset = foffset_lock(fp, 0); noneg = (vp->v_type != VCHR); error = 0; switch (whence) { case L_INCR: if (noneg && (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); if (error) break; /* * If the file references a disk device, then fetch * the media size and use that to determine the ending * offset. */ if (vattr.va_size == 0 && vp->v_type == VCHR && fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) vattr.va_size = size; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; VFS_KNOTE_UNLOCKED(vp, 0); td->td_uretoff.tdu_off = offset; drop: foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { int error; /* * Grant permission if the caller is the owner of the file, or * the super-user, or has ACL_WRITE_ATTRIBUTES permission on * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES * will be allowed to set the times [..] to the current * server time. */ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred, td); return (error); } int vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct vnode *vp; int error; if (fp->f_type == DTYPE_FIFO) kif->kf_type = KF_TYPE_FIFO; else kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); FILEDESC_SLOCK(fdp); return (error); } static inline void vn_fill_junk(struct kinfo_file *kif) { size_t len, olen; /* * Simulate vn_fullpath returning changing values for a given * vp during e.g. coredump. */ len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; olen = strlen(kif->kf_path); if (len < olen) strcpy(&kif->kf_path[len - 1], "$"); else for (; olen < len; olen++) strcpy(&kif->kf_path[olen], "A"); } int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) { struct vattr va; char *fullpath, *freepath; int error; kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); freepath = NULL; fullpath = "-"; error = vn_fullpath(curthread, vp, &fullpath, &freepath); if (error == 0) { strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); } if (freepath != NULL) free(freepath, M_TEMP); KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, vn_fill_junk(kif); ); /* * Retrieve vnode attributes. */ va.va_fsid = VNOVAL; va.va_rdev = NODEV; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curthread->td_ucred); VOP_UNLOCK(vp, 0); if (error != 0) return (error); if (va.va_fsid != VNOVAL) kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; else kif->kf_un.kf_file.kf_file_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; kif->kf_un.kf_file.kf_file_fsid_freebsd11 = kif->kf_un.kf_file.kf_file_fsid; /* truncate */ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); kif->kf_un.kf_file.kf_file_size = va.va_size; kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; kif->kf_un.kf_file.kf_file_rdev_freebsd11 = kif->kf_un.kf_file.kf_file_rdev; /* truncate */ return (0); } int vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; boolean_t writecounted; int error; #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if ((fp->f_flag & FPOSIXSHM) != 0) flags |= MAP_NOSYNC; #endif vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } else { maxprot |= VM_PROT_WRITE; cap_maxprot |= VM_PROT_WRITE; } maxprot &= cap_maxprot; /* * For regular files and shared memory, POSIX requires that * the value of foff be a legitimate offset within the data * object. In particular, negative offsets are invalid. * Blocking negative offsets and overflows here avoids * possible wraparound or user-level access into reserved * ranges of the data object later. In contrast, POSIX does * not dictate how offsets are used by device drivers, so in * the case of a device mapping a negative offset is passed * on. */ if ( #ifdef _LP64 size > OFF_MAX || #endif foff < 0 || foff > OFF_MAX - size) return (EINVAL); writecounted = FALSE; error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, &foff, &object, &writecounted); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vnode_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } #ifdef HWPMC_HOOKS /* Inform hwpmc(4) if an executable is being mapped. */ if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { pkm.pm_file = vp; pkm.pm_address = (uintptr_t) *addr; PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); } } #endif return (error); } void vn_fsid(struct vnode *vp, struct vattr *va) { fsid_t *f; f = &vp->v_mount->mnt_stat.f_fsid; va->va_fsid = (uint32_t)f->val[1]; va->va_fsid <<= sizeof(f->val[1]) * NBBY; va->va_fsid += (uint32_t)f->val[0]; }