diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 685222c7893f..590a3875b170 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -1,2714 +1,2725 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); static uma_zone_t file_zone; /* How to treat 'new' parameter when allocating a fd for do_dup(). */ enum dup_type { DUP_VARIABLE, DUP_FIXED }; static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval); static int fd_first_free(struct filedesc *, int, int); static int fd_last_used(struct filedesc *, int, int); static void fdgrowtable(struct filedesc *, int); static int fdrop_locked(struct file *fp, struct thread *td); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); /* * A process is initially started out with NDFILE descriptors stored within * this structure, selected to be enough for typical applications based on * the historical limit of 20 open files (and the usage of descriptors by * shells). If these descriptors are exhausted, a larger descriptor table * may be allocated, up to a process' resource limit; the internal arrays * are then unused. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * Storage required per open file descriptor. */ #define OFILESIZE (sizeof(struct file *) + sizeof(char)) /* * Basic allocation of descriptors: * one of the above, plus arrays for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; /* * These arrays are used when the number of open files is * <= NDFILE, and are then pointed to by the pointers above. */ struct file *fd_dfiles[NDFILE]; char fd_dfileflags[NDFILE]; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ struct filelist filehead; /* head of list of open files */ int openfiles; /* actual number of open files */ struct sx filelist_lock; /* sx to protect filelist */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* A mutex to protect the association between a proc and filedesc. */ static struct mtx fdesc_mtx; /* * Find the first zero bit in the given bitmap, starting at low and not * exceeding size - 1. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at low and * not exceeding size - 1. */ static int fd_last_used(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; if (low >= size) return (-1); off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(low); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (low - 1); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused(struct filedesc *fdp, int fd) { FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(!fdisused(fdp, fd), ("fd already used")); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(fdisused(fdp, fd), ("fd is already unused")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("fd is still in use")); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, 0, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* * MPSAFE */ /* ARGSUSED */ int getdtablesize(struct thread *td, struct getdtablesize_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); return (0); } /* * Duplicate a file descriptor to a particular value. * * note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* * MPSAFE */ /* ARGSUSED */ int dup2(struct thread *td, struct dup2_args *uap) { return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, td->td_retval)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* * MPSAFE */ /* ARGSUSED */ int dup(struct thread *td, struct dup_args *uap) { return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; intptr_t arg; int error; error = 0; switch (uap->cmd) { case F_GETLK: case F_SETLK: case F_SETLKW: error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); arg = (intptr_t)&fl; break; default: arg = uap->arg; break; } if (error) return (error); error = kern_fcntl(td, uap->fd, uap->cmd, arg); if (error) return (error); if (uap->cmd == F_GETLK) error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp; struct proc *p; char *pop; struct vnode *vp; u_int newmin; int error, flg, tmp; int giant_locked; /* * XXXRW: Some fcntl() calls require Giant -- others don't. Try to * avoid grabbing Giant for calls we know don't need it. */ switch (cmd) { case F_DUPFD: case F_GETFD: case F_SETFD: case F_GETFL: giant_locked = 0; break; default: giant_locked = 1; mtx_lock(&Giant); } error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; FILEDESC_LOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } pop = &fdp->fd_ofileflags[fd]; switch (cmd) { case F_DUPFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ FILEDESC_UNLOCK(fdp); newmin = arg; PROC_LOCK(p); if (newmin >= lim_cur(p, RLIMIT_NOFILE) || newmin >= maxfilesperproc) { PROC_UNLOCK(p); error = EINVAL; break; } PROC_UNLOCK(p); error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval); break; case F_GETFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; FILEDESC_UNLOCK(fdp); break; case F_SETFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ *pop = (*pop &~ UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); FILEDESC_UNLOCK(fdp); break; case F_GETFL: /* mtx_assert(&Giant, MA_NOTOWNED); */ FILE_LOCK(fp); td->td_retval[0] = OFLAGS(fp->f_flag); FILE_UNLOCK(fp); FILEDESC_UNLOCK(fdp); break; case F_SETFL: mtx_assert(&Giant, MA_OWNED); FILE_LOCK(fp); fhold_locked(fp); fp->f_flag &= ~FCNTLFLAGS; fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; FILE_UNLOCK(fp); FILEDESC_UNLOCK(fdp); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } FILE_LOCK(fp); fp->f_flag &= ~FNONBLOCK; FILE_UNLOCK(fp); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: mtx_assert(&Giant, MA_OWNED); fhold(fp); FILEDESC_UNLOCK(fdp); error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: mtx_assert(&Giant, MA_OWNED); fhold(fp); FILEDESC_UNLOCK(fdp); tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLKW: mtx_assert(&Giant, MA_OWNED); flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: mtx_assert(&Giant, MA_OWNED); if (fp->f_type != DTYPE_VNODE) { FILEDESC_UNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { if (fp->f_offset < 0 || (flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start)) { FILEDESC_UNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. */ fhold(fp); FILEDESC_UNLOCK(fdp); vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); break; default: error = EINVAL; break; } /* Check for race with close */ FILEDESC_LOCK_FAST(fdp); if ((unsigned) fd >= fdp->fd_nfiles || fp != fdp->fd_ofiles[fd]) { FILEDESC_UNLOCK_FAST(fdp); flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } else FILEDESC_UNLOCK_FAST(fdp); fdrop(fp, td); break; case F_GETLK: mtx_assert(&Giant, MA_OWNED); if (fp->f_type != DTYPE_VNODE) { FILEDESC_UNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { FILEDESC_UNLOCK(fdp); error = EINVAL; break; } if (flp->l_whence == SEEK_CUR) { if ((flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && fp->f_offset < OFF_MIN - flp->l_start)) { FILEDESC_UNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. */ fhold(fp); FILEDESC_UNLOCK(fdp); vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; default: FILEDESC_UNLOCK(fdp); error = EINVAL; break; } done2: if (giant_locked) mtx_unlock(&Giant); return (error); } /* * Common code for dup, dup2, and fcntl(F_DUPFD). */ static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval) { struct filedesc *fdp; struct proc *p; struct file *fp; struct file *delfp; int error, holdleaders, maxfd; KASSERT((type == DUP_VARIABLE || type == DUP_FIXED), ("invalid dup type %d", type)); p = td->td_proc; fdp = p->p_fd; /* * Verify we have a valid descriptor to dup from and possibly to * dup to. */ if (old < 0 || new < 0) return (EBADF); PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if (new >= maxfd) return (EMFILE); FILEDESC_LOCK(fdp); if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (type == DUP_FIXED && old == new) { *retval = new; FILEDESC_UNLOCK(fdp); return (0); } fp = fdp->fd_ofiles[old]; fhold(fp); /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. Since the filedesc * lock may be temporarily dropped in the process, we have to look * out for a race. */ if (type == DUP_FIXED) { if (new >= fdp->fd_nfiles) fdgrowtable(fdp, new + 1); if (fdp->fd_ofiles[new] == NULL) fdused(fdp, new); } else { if ((error = fdalloc(td, new, &new)) != 0) { FILEDESC_UNLOCK(fdp); fdrop(fp, td); return (error); } } /* * If the old file changed out from under us then treat it as a * bad file descriptor. Userland should do its own locking to * avoid this case. */ if (fdp->fd_ofiles[old] != fp) { /* we've allocated a descriptor which we won't use */ if (fdp->fd_ofiles[new] == NULL) fdunused(fdp, new); FILEDESC_UNLOCK(fdp); fdrop(fp, td); return (EBADF); } KASSERT(old != new, ("new fd is same as old")); /* * Save info on the descriptor being overwritten. We cannot close * it without introducing an ownership race for the slot, since we * need to drop the filedesc lock to call closef(). * * XXX this duplicates parts of close(). */ delfp = fdp->fd_ofiles[new]; holdleaders = 0; if (delfp != NULL) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } } /* * Duplicate the source descriptor */ fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; /* * If we dup'd over a valid file, we now own the reference to it * and must dispose of it using closef() semantics (as if a * close() were performed on it). * * XXX this duplicates parts of close(). */ if (delfp != NULL) { knote_fdclose(td, new); if (delfp->f_type == DTYPE_MQUEUE) mq_fdclose(td, new, delfp); FILEDESC_UNLOCK(fdp); (void) closef(delfp, td); if (holdleaders) { FILEDESC_LOCK_FAST(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_UNLOCK_FAST(fdp); } } else { FILEDESC_UNLOCK(fdp); } return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* * MPSAFE */ /* ARGSUSED */ int close(td, uap) struct thread *td; struct close_args *uap; { return (kern_close(td, uap->fd)); } int kern_close(td, fd) struct thread *td; int fd; { struct filedesc *fdp; struct file *fp; int error; int holdleaders; error = 0; holdleaders = 0; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_LOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; fdunused(fdp, fd); if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } /* * We now hold the fp reference that used to be owned by the descriptor * array. * We have to unlock the FILEDESC *AFTER* knote_fdclose to prevent a * race of the fd getting opened, a knote added, and deleteing a knote * for the new fd. */ knote_fdclose(td, fd); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_UNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_LOCK_FAST(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_UNLOCK_FAST(fdp); } return (error); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG(fd, fd); if ((error = fget(td, fd, &fp)) != 0) return (error); AUDIT_ARG(file, td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int nfstat(struct thread *td, struct nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fpathconf(struct thread *td, struct fpathconf_args *uap) { struct file *fp; struct vnode *vp; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); /* If asynchronous I/O is available, it works for all descriptors. */ if (uap->name == _PC_ASYNC_IO) { td->td_retval[0] = async_io_version; goto out; } vp = fp->f_vnode; if (vp != NULL) { int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_PATHCONF(vp, uap->name, td->td_retval); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (uap->name != _PC_PIPE_BUF) { error = EINVAL; } else { td->td_retval[0] = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Grow the file table to accomodate (at least) nfd descriptors. This may * block and drop the filedesc lock, but it will reacquire it before * returning. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct file **ntable; char *nfileflags; int nnfiles, onfiles; NDSLOTTYPE *nmap; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* compute the size of the new table */ onfiles = fdp->fd_nfiles; nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* allocate a new table and (if required) new bitmaps */ FILEDESC_UNLOCK(fdp); MALLOC(ntable, struct file **, nnfiles * OFILESIZE, M_FILEDESC, M_ZERO | M_WAITOK); nfileflags = (char *)&ntable[nnfiles]; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); else nmap = NULL; FILEDESC_LOCK(fdp); /* * We now have new tables ready to go. Since we dropped the * filedesc lock to call malloc(), watch out for a race. */ onfiles = fdp->fd_nfiles; if (onfiles >= nnfiles) { /* we lost the race, but that's OK */ free(ntable, M_FILEDESC); if (nmap != NULL) free(nmap, M_FILEDESC); return; } bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); bcopy(fdp->fd_ofileflags, nfileflags, onfiles); if (onfiles > NDFILE) free(fdp->fd_ofiles, M_FILEDESC); fdp->fd_ofiles = ntable; fdp->fd_ofileflags = nfileflags; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); fdp->fd_map = nmap; } fdp->fd_nfiles = nnfiles; } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd = -1, maxfd; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); /* * Search the bitmap for a free descriptor. If none is found, try * to grow the file table. Keep at it until we either get a file * descriptor or run into process or system limits; fdgrowtable() * may drop the filedesc lock, so we're in a race. */ for (;;) { fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd < fdp->fd_nfiles) break; fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("free descriptor isn't")); fdp->fd_ofileflags[fd] = 0; /* XXX needed? */ fdused(fdp, fd); *result = fd; return (0); } /* * Check to see whether n user file descriptors * are available to the process p. */ int fdavail(struct thread *td, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = td->td_proc->p_fd; struct file **fpp; int i, lim, last; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); PROC_LOCK(p); lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { if (*fpp == NULL && --n <= 0) return (1); } return (0); } /* * Create a new open file structure and allocate * a file decriptor for the process that refers to it. * We add one reference to the file for the descriptor table * and one reference for resultfp. This is to prevent us being * preempted and the entry in the descriptor table closed after * we release the FILEDESC lock. */ int falloc(struct thread *td, struct file **resultfp, int *resultfd) { struct proc *p = td->td_proc; struct file *fp, *fq; int error, i; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); sx_xlock(&filelist_lock); if ((openfiles >= maxuserfiles && priv_check_cred(td->td_ucred, PRIV_MAXFILES, SUSER_RUID) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", td->td_ucred->cr_ruid); } sx_xunlock(&filelist_lock); uma_zfree(file_zone, fp); return (ENFILE); } openfiles++; /* * If the process has file descriptor zero open, add the new file * descriptor to the list of open files at that point, otherwise * put it at the front of the list of open files. */ fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep); fp->f_count = 1; if (resultfp) fp->f_count++; fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; FILEDESC_LOCK(p->p_fd); if ((fq = p->p_fd->fd_ofiles[0])) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); } sx_xunlock(&filelist_lock); if ((error = fdalloc(td, 0, &i))) { FILEDESC_UNLOCK(p->p_fd); fdrop(fp, td); if (resultfp) fdrop(fp, td); return (error); } p->p_fd->fd_ofiles[i] = fp; FILEDESC_UNLOCK(p->p_fd); if (resultfp) *resultfp = fp; if (resultfd) *resultfd = i; return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. */ struct filedesc * fdinit(struct filedesc *fdp) { struct filedesc0 *newfdp; newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); if (fdp != NULL) { FILEDESC_LOCK(fdp); newfdp->fd_fd.fd_cdir = fdp->fd_cdir; if (newfdp->fd_fd.fd_cdir) VREF(newfdp->fd_fd.fd_cdir); newfdp->fd_fd.fd_rdir = fdp->fd_rdir; if (newfdp->fd_fd.fd_rdir) VREF(newfdp->fd_fd.fd_rdir); newfdp->fd_fd.fd_jdir = fdp->fd_jdir; if (newfdp->fd_fd.fd_jdir) VREF(newfdp->fd_fd.fd_jdir); FILEDESC_UNLOCK(fdp); } /* Create the file descriptor table. */ newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_holdcnt = 1; newfdp->fd_fd.fd_cmask = CMASK; newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; newfdp->fd_fd.fd_nfiles = NDFILE; newfdp->fd_fd.fd_map = newfdp->fd_dmap; newfdp->fd_fd.fd_lastfile = -1; return (&newfdp->fd_fd); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; mtx_lock(&fdesc_mtx); fdp = p->p_fd; if (fdp != NULL) fdp->fd_holdcnt++; mtx_unlock(&fdesc_mtx); return (fdp); } static void fddrop(struct filedesc *fdp) { int i; mtx_lock(&fdesc_mtx); i = --fdp->fd_holdcnt; mtx_unlock(&fdesc_mtx); if (i > 0) return; mtx_destroy(&fdp->fd_mtx); FREE(fdp, M_FILEDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { FILEDESC_LOCK_FAST(fdp); fdp->fd_refcnt++; FILEDESC_UNLOCK_FAST(fdp); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct proc *p, struct thread *td) { FILEDESC_LOCK_FAST(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; FILEDESC_UNLOCK_FAST(p->p_fd); tmp = fdcopy(p->p_fd); fdfree(td); p->p_fd = tmp; } else FILEDESC_UNLOCK_FAST(p->p_fd); } /* * Copy a filedesc structure. * A NULL pointer in returns a NULL reference, this is to ease callers, * not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return (NULL); newfdp = fdinit(fdp); FILEDESC_LOCK_FAST(fdp); while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_UNLOCK_FAST(fdp); FILEDESC_LOCK(newfdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_UNLOCK(newfdp); FILEDESC_LOCK_FAST(fdp); } /* copy everything except kqueue descriptors */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { if (fdisused(fdp, i) && fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) { newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; fhold(newfdp->fd_ofiles[i]); newfdp->fd_lastfile = i; } else { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; } } FILEDESC_UNLOCK_FAST(fdp); FILEDESC_LOCK(newfdp); for (i = 0; i <= newfdp->fd_lastfile; ++i) if (newfdp->fd_ofiles[i] != NULL) fdused(newfdp, i); FILEDESC_UNLOCK(newfdp); FILEDESC_LOCK_FAST(fdp); if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_UNLOCK_FAST(fdp); return (newfdp); } /* * Release a filedesc structure. */ void fdfree(struct thread *td) { struct filedesc *fdp; struct file **fpp; int i, locked; struct filedesc_to_leader *fdtol; struct file *fp; struct vnode *cdir, *jdir, *rdir, *vp; struct flock lf; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { FILEDESC_LOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0, fpp = fdp->fd_ofiles; i <= fdp->fd_lastfile; i++, fpp++) { if (*fpp == NULL || (*fpp)->f_type != DTYPE_VNODE) continue; fp = *fpp; fhold(fp); FILEDESC_UNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; locked = VFS_LOCK_GIANT(vp->v_mount); (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc-> p_leader, F_UNLCK, &lf, F_POSIX); VFS_UNLOCK_GIANT(locked); FILEDESC_LOCK(fdp); fdrop(fp, td); fpp = fdp->fd_ofiles + i; } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or do_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx, PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader * remains valid in closef(). */ fdtol->fdl_wakeup = 1; msleep(fdtol, &fdp->fd_mtx, PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; td->td_proc->p_fdtol = NULL; FILEDESC_UNLOCK(fdp); if (fdtol != NULL) FREE(fdtol, M_FILEDESC_TO_LEADER); } FILEDESC_LOCK_FAST(fdp); i = --fdp->fd_refcnt; FILEDESC_UNLOCK_FAST(fdp); if (i > 0) return; /* * We are the last reference to the structure, so we can * safely assume it will not change out from under us. */ fpp = fdp->fd_ofiles; for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { if (*fpp) (void) closef(*fpp, td); } FILEDESC_LOCK(fdp); /* XXX This should happen earlier. */ mtx_lock(&fdesc_mtx); td->td_proc->p_fd = NULL; mtx_unlock(&fdesc_mtx); if (fdp->fd_nfiles > NDFILE) FREE(fdp->fd_ofiles, M_FILEDESC); if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) FREE(fdp->fd_map, M_FILEDESC); fdp->fd_nfiles = 0; cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_UNLOCK(fdp); if (cdir) { locked = VFS_LOCK_GIANT(cdir->v_mount); vrele(cdir); VFS_UNLOCK_GIANT(locked); } if (rdir) { locked = VFS_LOCK_GIANT(rdir->v_mount); vrele(rdir); VFS_UNLOCK_GIANT(locked); } if (jdir) { locked = VFS_LOCK_GIANT(jdir->v_mount); vrele(jdir); VFS_UNLOCK_GIANT(locked); } fddrop(fdp); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static int is_unsafe(struct file *fp) { if (fp->f_type == DTYPE_VNODE) { struct vnode *vp = fp->f_vnode; if ((vp->v_vflag & VV_PROCDEP) != 0) return (1); } return (0); } /* * Make this setguid thing safe, if at all possible. */ void setugidsafety(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* * Note: fdp->fd_ofiles may be reallocated out from under us while * we are blocked in a close. Be careful! */ FILEDESC_LOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); FILEDESC_UNLOCK(fdp); (void) closef(fp, td); FILEDESC_LOCK(fdp); } } FILEDESC_UNLOCK(fdp); } /* * If a specific file object occupies a specific file descriptor, * close the file descriptor entry and drop a reference on the file * object. This is a convenience function to handle a subsequent * error in a function that calls falloc() that handles the race that * another thread might have closed the file descriptor out from under * the thread creating the file object. */ void fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) { FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[idx] == fp) { fdp->fd_ofiles[idx] = NULL; fdunused(fdp, idx); FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else { FILEDESC_UNLOCK(fdp); } } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; FILEDESC_LOCK(fdp); /* * We cannot cache fd_ofiles or fd_ofileflags since operations * may block and rip them out from under us. */ for (i = 0; i <= fdp->fd_lastfile; i++) { if (fdp->fd_ofiles[i] != NULL && (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, i, fp); FILEDESC_UNLOCK(fdp); (void) closef(fp, td); FILEDESC_LOCK(fdp); } } FILEDESC_UNLOCK(fdp); } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct nameidata nd; struct filedesc *fdp; struct file *fp; register_t retval; int fd, i, error, flags, devnull; fdp = td->td_proc->p_fd; if (fdp == NULL) return (0); KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); devnull = -1; error = 0; for (i = 0; i < 3; i++) { if (fdp->fd_ofiles[i] != NULL) continue; if (devnull < 0) { int vfslocked; error = falloc(td, &fp, &fd); if (error != 0) break; /* Note extra ref on `fp' held for us by falloc(). */ KASSERT(fd == i, ("oof, we didn't get our fd")); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, "/dev/null", td); flags = FREAD | FWRITE; error = vn_open(&nd, &flags, 0, fd); if (error != 0) { /* * Someone may have closed the entry in the * file descriptor table, so check it hasn't * changed before dropping the reference count. */ FILEDESC_LOCK(fdp); KASSERT(fdp->fd_ofiles[fd] == fp, ("table not shared, how did it change?")); fdp->fd_ofiles[fd] = NULL; fdunused(fdp, fd); FILEDESC_UNLOCK(fdp); fdrop(fp, td); fdrop(fp, td); break; } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); fp->f_flag = flags; fp->f_vnode = nd.ni_vp; if (fp->f_data == NULL) fp->f_data = nd.ni_vp; if (fp->f_ops == &badfileops) fp->f_ops = &vnops; fp->f_type = DTYPE_VNODE; VOP_UNLOCK(nd.ni_vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); devnull = fd; fdrop(fp, td); } else { error = do_dup(td, DUP_FIXED, devnull, i, &retval); if (error != 0) break; } } return (error); } /* * Internal form of close. * Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { int vfslocked; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table * is shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_UNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_LOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_UNLOCK(fdp); } VFS_UNLOCK_GIANT(vfslocked); } return (fdrop(fp, td)); } /* * Extract the file pointer associated with the specified descriptor for * the current user process. * * If the descriptor doesn't exist, EBADF is returned. * * If the descriptor exists but doesn't match 'flags' then * return EBADF for read attempts and EINVAL for write attempts. * * If 'hold' is set (non-zero) the file's refcount will be bumped on return. * It should be dropped with fdrop(). * If it is not set, then the refcount will not be bumped however the * thread's filedesc struct will be returned locked (for fgetsock). * * If an error occured the non-zero error is returned and *fpp is set to NULL. * Otherwise *fpp is set and zero is returned. */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) { struct filedesc *fdp; struct file *fp; *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); FILEDESC_LOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { FILEDESC_UNLOCK(fdp); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. * * Only one flag, or 0, may be specified. */ if (flags == FREAD && (fp->f_flag & FREAD) == 0) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (hold) { fhold(fp); FILEDESC_UNLOCK(fdp); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, 0, 1)); } int fget_read(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FREAD, 1)); } int fget_write(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FWRITE, 1)); } /* * Like fget() but loads the underlying vnode, or returns an error if * the descriptor does not represent a vnode. Note that pipes use vnodes * but never have VM objects. The returned vnode will be vref()d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) { struct file *fp; int error; *vpp = NULL; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } FILEDESC_UNLOCK(td->td_proc->p_fd); return (error); } int fgetvp(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, 0)); } int fgetvp_read(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FREAD)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FWRITE)); } #endif /* * Like fget() but loads the underlying socket, or returns an error if * the descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX * lock in the future. * * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely * on their file descriptor reference to prevent the socket from being * freed during use. */ int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) { struct file *fp; int error; NET_ASSERT_GIANT(); *spp = NULL; if (fflagp != NULL) *fflagp = 0; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; } else { *spp = fp->f_data; if (fflagp) *fflagp = fp->f_flag; SOCK_LOCK(*spp); soref(*spp); SOCK_UNLOCK(*spp); } FILEDESC_UNLOCK(td->td_proc->p_fd); return (error); } /* * Drop the reference count on the socket and XXX release the SX lock in the * future. The last reference closes the socket. * * XXXRW: fputsock() is deprecated, see comment for fgetsock(). */ void fputsock(struct socket *so) { NET_ASSERT_GIANT(); ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } int fdrop(struct file *fp, struct thread *td) { FILE_LOCK(fp); return (fdrop_locked(fp, td)); } /* * Drop reference on struct file passed in, may call closef if the * reference hits zero. * Expects struct file locked, and will unlock it. */ static int fdrop_locked(struct file *fp, struct thread *td) { int error; FILE_LOCK_ASSERT(fp, MA_OWNED); if (--fp->f_count > 0) { FILE_UNLOCK(fp); return (0); } + + /* + * We might have just dropped the last reference to a file + * object that is for a UNIX domain socket whose message + * buffers are being examined in unp_gc(). If that is the + * case, FWAIT will be set in f_gcflag and we need to wait for + * unp_gc() to finish its scan. + */ + while (fp->f_gcflag & FWAIT) + msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0); + /* We have the last ref so we can proceed without the file lock. */ FILE_UNLOCK(fp); if (fp->f_count < 0) panic("fdrop: count < 0"); if (fp->f_ops != &badfileops) error = fo_close(fp, td); else error = 0; sx_xlock(&filelist_lock); LIST_REMOVE(fp, f_list); openfiles--; sx_xunlock(&filelist_lock); crfree(fp->f_cred); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* * MPSAFE */ /* ARGSUSED */ int flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } mtx_lock(&Giant); vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; FILE_LOCK(fp); fp->f_flag &= ~FHASLOCK; FILE_UNLOCK(fp); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } FILE_LOCK(fp); fp->f_flag |= FHASLOCK; FILE_UNLOCK(fp); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); mtx_unlock(&Giant); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error) { struct file *wfp; struct file *fp; /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_LOCK(fdp); if (dfd < 0 || dfd >= fdp->fd_nfiles || (wfp = fdp->fd_ofiles[dfd]) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor * (indx) and return. * * For ENXIO steal away the file structure from (dfd) and * store it in (indx). (dfd) is effectively closed by * this operation. * * Any other error code is just returned. */ switch (error) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ FILE_LOCK(wfp); if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { FILE_UNLOCK(wfp); FILEDESC_UNLOCK(fdp); return (EACCES); } fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = wfp; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; if (fp == NULL) fdused(fdp, indx); fhold_locked(wfp); FILE_UNLOCK(wfp); FILEDESC_UNLOCK(fdp); if (fp != NULL) { /* * We now own the reference to fp that the ofiles[] * array used to own. Release it. */ FILE_LOCK(fp); fdrop_locked(fp, td); } return (0); case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; fdp->fd_ofiles[dfd] = NULL; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; fdp->fd_ofileflags[dfd] = 0; fdunused(fdp, dfd); if (fp == NULL) fdused(fdp, indx); if (fp != NULL) FILE_LOCK(fp); /* * We now own the reference to fp that the ofiles[] array * used to own. Release it. */ if (fp != NULL) fdrop_locked(fp, td); FILEDESC_UNLOCK(fdp); return (0); default: FILEDESC_UNLOCK(fdp); return (error); } /* NOTREACHED */ } /* * Scan all active processes to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new * mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { fdp = fdhold(p); if (fdp == NULL) continue; nrele = 0; FILEDESC_LOCK_FAST(fdp); if (fdp->fd_cdir == olddp) { vref(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vref(newdp); fdp->fd_rdir = newdp; nrele++; } FILEDESC_UNLOCK_FAST(fdp); fddrop(fdp); while (nrele--) vrele(olddp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); vref(newdp); rootvnode = newdp; } } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; MALLOC(fdtol, struct filedesc_to_leader *, sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_LOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_UNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } /* * Get file structures. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; /* * Note: because the number of file descriptors is calculated * in different ways for sizing vs returning the data, * there is information leakage from the first loop. However, * it is of a similar order of magnitude to the leakage from * global system statistics such as kern.openfiles. */ error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 16; /* A slight overestimate. */ sx_slock(&filelist_lock); LIST_FOREACH(fp, &filehead, f_list) { /* * We should grab the lock, but this is an * estimate, so does it really matter? */ /* mtx_lock(fp->f_mtxp); */ n += fp->f_count; /* mtx_unlock(f->f_mtxp); */ } sx_sunlock(&filelist_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; PROC_UNLOCK(p); fdp = fdhold(p); if (fdp == NULL) continue; FILEDESC_LOCK_FAST(fdp); for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n]) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = fp->f_msgcount; xf.xf_offset = fp->f_offset; xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_UNLOCK_FAST(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnod"); case DTYPE_SOCKET: return ("sock"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_CRYPTO: return ("crpt"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; LIST_FOREACH(p, &allproc, p_list) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n]) return (p); } } return (NULL); } DB_SHOW_COMMAND(files, db_show_files) { struct file *fp; struct proc *p; db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File", "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode", "FPID", "FCmd"); LIST_FOREACH(fp, &filehead, f_list) { p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &openfiles, 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); sx_init(&filelist_lock, "filelist lock"); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (EBADF); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, }; /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL) diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index 33a6ec2e589e..9645c8143060 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -1,1903 +1,1938 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. * Copyright 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 */ /* * UNIX Domain (Local) Sockets * * This is an implementation of UNIX (local) domain sockets. Each socket has * an associated struct unpcb (UNIX protocol control block). Stream sockets * may be connected to 0 or 1 other socket. Datagram sockets may be * connected to 0, 1, or many other sockets. Sockets may be created and * connected in pairs (socketpair(2)), or bound/connected to using the file * system name space. For most purposes, only the receive socket buffer is * used, as sending on one socket delivers directly to the receive socket * buffer of a second socket. The implementation is substantially * complicated by the fact that "ancillary data", such as file descriptors or * credentials, may be passed across UNIX domain sockets. The potential for * passing UNIX domain sockets over other UNIX domain sockets requires the * implementation of a simple garbage collector to find and tear down cycles * of disconnected sockets. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include /* XXX must be before */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t unp_zone; static unp_gen_t unp_gencnt; static u_int unp_count; static struct unp_head unp_shead, unp_dhead; /* * Unix communications domain. * * TODO: * SEQPACKET, RDM * rethink name space problems * need a proper out-of-band * lock pushdown */ static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; static ino_t unp_ino; /* prototype for fake inode numbers */ struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering for * stream sockets, although the total for sender and receiver is actually * only PIPSIZ. * * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should be * large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; static int unp_rights; /* file descriptors in flight */ SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, ""); SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, ""); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_sendspace, 0, ""); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, ""); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); /* * Currently, UNIX domain sockets are protected by a single subsystem lock, * which covers global data structures and variables, the contents of each * per-socket unpcb structure, and the so_pcb field in sockets attached to * the UNIX domain. This provides for a moderate degree of paralellism, as * receive operations on UNIX domain sockets do not need to acquire the * subsystem lock. Finer grained locking to permit send() without acquiring * a global lock would be a logical next step. * * The UNIX domain socket lock preceds all socket layer locks, including the * socket lock and socket buffer lock, permitting UNIX domain socket code to * call into socket support routines without releasing its locks. * * Some caution is required in areas where the UNIX domain socket code enters * VFS in order to create or find rendezvous points. This results in * dropping of the UNIX domain socket subsystem lock, acquisition of the * Giant lock, and potential sleeping. This increases the chances of races, * and exposes weaknesses in the socket->protocol API by offering poor * failure modes. */ static struct mtx unp_mtx; #define UNP_LOCK_INIT() \ mtx_init(&unp_mtx, "unp", NULL, MTX_DEF) #define UNP_LOCK() mtx_lock(&unp_mtx) #define UNP_UNLOCK() mtx_unlock(&unp_mtx) #define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED) #define UNP_UNLOCK_ASSERT() mtx_assert(&unp_mtx, MA_NOTOWNED) /* * Garbage collection of cyclic file descriptor/socket references occurs * asynchronously in a taskqueue context in order to avoid recursion and * reentrance in the UNIX domain socket, file descriptor, and socket layer * code. See unp_gc() for a full description. */ static struct task unp_gc_task; static int unp_connect(struct socket *,struct sockaddr *, struct thread *); static int unp_connect2(struct socket *so, struct socket *so2, int); static void unp_disconnect(struct unpcb *); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *, int); static void unp_gc(__unused void *, int); static void unp_scan(struct mbuf *, void (*)(struct file *)); static void unp_mark(struct file *); static void unp_discard(struct file *); static void unp_freerights(struct file **, int); static int unp_internalize(struct mbuf **, struct thread *); static int unp_listen(struct socket *, struct unpcb *, int, struct thread *); /* * Definitions of protocols supported in the LOCAL domain. */ static struct domain localdomain; static struct protosw localsw[] = { { .pr_type = SOCK_STREAM, .pr_domain = &localdomain, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, .pr_ctloutput = &uipc_ctloutput, .pr_usrreqs = &uipc_usrreqs }, { .pr_type = SOCK_DGRAM, .pr_domain = &localdomain, .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS, .pr_usrreqs = &uipc_usrreqs }, }; static struct domain localdomain = { .dom_family = AF_LOCAL, .dom_name = "local", .dom_init = unp_init, .dom_externalize = unp_externalize, .dom_dispose = unp_dispose, .dom_protosw = localsw, .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])] }; DOMAIN_SET(local); static void uipc_abort(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); UNP_LOCK(); unp_drop(unp, ECONNABORTED); UNP_UNLOCK(); } static int uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp; const struct sockaddr *sa; /* * Pass back name of connected socket, if it was bound and we are * still connected (our peer may have closed already!). */ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LOCK(); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_conn->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_UNLOCK(); return (0); } static int uipc_attach(struct socket *so, int proto, struct thread *td) { struct unpcb *unp; int error; KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { case SOCK_STREAM: error = soreserve(so, unpst_sendspace, unpst_recvspace); break; case SOCK_DGRAM: error = soreserve(so, unpdg_sendspace, unpdg_recvspace); break; default: panic("unp_attach"); } if (error) return (error); } unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO); if (unp == NULL) return (ENOBUFS); LIST_INIT(&unp->unp_refs); unp->unp_socket = so; so->so_pcb = unp; + unp->unp_refcount = 1; UNP_LOCK(); unp->unp_gencnt = ++unp_gencnt; unp_count++; LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, unp, unp_link); UNP_UNLOCK(); return (0); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vattr vattr; int error, namelen; struct nameidata nd; struct unpcb *unp; struct vnode *vp; struct mount *mp; char *buf; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); if (namelen <= 0) return (EINVAL); /* * We don't allow simultaneous bind() calls on a single UNIX domain * socket, so flag in-progress operations, and return an error if an * operation is already in progress. * * Historically, we have not allowed a socket to be rebound, so this * also returns an error. Not allowing re-binding certainly * simplifies the implementation and avoids a great many possible * failure modes. */ UNP_LOCK(); if (unp->unp_vnode != NULL) { UNP_UNLOCK(); return (EINVAL); } if (unp->unp_flags & UNP_BINDING) { UNP_UNLOCK(); return (EALREADY); } unp->unp_flags |= UNP_BINDING; UNP_UNLOCK(); buf = malloc(namelen + 1, M_TEMP, M_WAITOK); strlcpy(buf, soun->sun_path, namelen + 1); mtx_lock(&Giant); restart: mtx_assert(&Giant, MA_OWNED); NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, buf, td); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); if (error) goto error; vp = nd.ni_vp; if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (vp != NULL) { vrele(vp); error = EADDRINUSE; goto error; } error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error) goto error; goto restart; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); #ifdef MAC error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (error) { vn_finished_write(mp); goto error; } vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "uipc_bind"); soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); UNP_LOCK(); vp->v_socket = unp->unp_socket; unp->unp_vnode = vp; unp->unp_addr = soun; unp->unp_flags &= ~UNP_BINDING; UNP_UNLOCK(); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); mtx_unlock(&Giant); free(buf, M_TEMP); return (0); error: UNP_LOCK(); unp->unp_flags &= ~UNP_BINDING; UNP_UNLOCK(); mtx_unlock(&Giant); free(buf, M_TEMP); return (error); } static int uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; KASSERT(td == curthread, ("uipc_connect: td != curthread")); UNP_LOCK(); error = unp_connect(so, nam, td); UNP_UNLOCK(); return (error); } /* * XXXRW: Should also unbind? */ static void uipc_close(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); UNP_LOCK(); unp_disconnect(unp); UNP_UNLOCK(); } int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp; int error; unp = sotounpcb(so1); KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); UNP_LOCK(); error = unp_connect2(so1, so2, PRU_CONNECT2); UNP_UNLOCK(); return (error); } /* control is EOPNOTSUPP */ static void uipc_detach(struct socket *so) { - int local_unp_rights; + struct sockaddr_un *saved_unp_addr; struct unpcb *unp; struct vnode *vp; + int freeunp, local_unp_rights; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); UNP_LOCK(); LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; if ((vp = unp->unp_vnode) != NULL) { /* * XXXRW: should v_socket be frobbed only while holding * Giant? */ unp->unp_vnode->v_socket = NULL; unp->unp_vnode = NULL; } if (unp->unp_conn != NULL) unp_disconnect(unp); while (!LIST_EMPTY(&unp->unp_refs)) { struct unpcb *ref = LIST_FIRST(&unp->unp_refs); unp_drop(ref, ECONNRESET); } unp->unp_socket->so_pcb = NULL; local_unp_rights = unp_rights; + saved_unp_addr = unp->unp_addr; + unp->unp_addr = NULL; + unp->unp_refcount--; + freeunp = (unp->unp_refcount == 0); UNP_UNLOCK(); - if (unp->unp_addr != NULL) - FREE(unp->unp_addr, M_SONAME); - uma_zfree(unp_zone, unp); + if (saved_unp_addr != NULL) + FREE(saved_unp_addr, M_SONAME); + if (freeunp) + uma_zfree(unp_zone, unp); if (vp) { int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } if (local_unp_rights) taskqueue_enqueue(taskqueue_thread, &unp_gc_task); } static int uipc_disconnect(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); UNP_LOCK(); unp_disconnect(unp); UNP_UNLOCK(); return (0); } static int uipc_listen(struct socket *so, int backlog, struct thread *td) { struct unpcb *unp; int error; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); UNP_LOCK(); if (unp->unp_vnode == NULL) { UNP_UNLOCK(); return (EINVAL); } error = unp_listen(so, unp, backlog, td); UNP_UNLOCK(); return (error); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LOCK(); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL) sa = (struct sockaddr *) unp->unp_conn->unp_addr; else { /* * XXX: It seems that this test always fails even when * connection is established. So, this else clause is * added as workaround to return PF_LOCAL sockaddr. */ sa = &sun_noname; } bcopy(sa, *nam, sa->sa_len); UNP_UNLOCK(); return (0); } static int uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp; struct socket *so2; u_int mbcnt, sbcc; u_long newhiwat; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL")); switch (so->so_type) { case SOCK_DGRAM: panic("uipc_rcvd DGRAM?"); /*NOTREACHED*/ case SOCK_STREAM: /* * Adjust backpressure on sender and wakeup any waiting to * write. */ SOCKBUF_LOCK(&so->so_rcv); mbcnt = so->so_rcv.sb_mbcnt; sbcc = so->so_rcv.sb_cc; SOCKBUF_UNLOCK(&so->so_rcv); UNP_LOCK(); if (unp->unp_conn == NULL) { UNP_UNLOCK(); break; } so2 = unp->unp_conn->unp_socket; SOCKBUF_LOCK(&so2->so_snd); so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt; newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc; (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); sowwakeup_locked(so2); unp->unp_mbcnt = mbcnt; unp->unp_cc = sbcc; UNP_UNLOCK(); break; default: panic("uipc_rcvd unknown socktype"); } return (0); } /* pru_rcvoob is EOPNOTSUPP */ static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; u_long newhiwat; int error = 0; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_send: unp == NULL")); if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control != NULL && (error = unp_internalize(&control, td))) goto release; UNP_LOCK(); switch (so->so_type) { case SOCK_DGRAM: { const struct sockaddr *from; if (nam != NULL) { if (unp->unp_conn != NULL) { error = EISCONN; break; } error = unp_connect(so, nam, td); if (error) break; } /* * Because connect() and send() are non-atomic in a sendto() * with a target address, it's possible that the socket will * have disconnected before the send() can run. In that case * return the slightly counter-intuitive but otherwise * correct error that the socket is not connected. */ unp2 = unp->unp_conn; if (unp2 == NULL) { error = ENOTCONN; break; } so2 = unp2->unp_socket; if (unp->unp_addr != NULL) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; if (unp2->unp_flags & UNP_WANTCRED) control = unp_addsockcred(td, control); SOCKBUF_LOCK(&so2->so_rcv); if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { sorwakeup_locked(so2); m = NULL; control = NULL; } else { SOCKBUF_UNLOCK(&so2->so_rcv); error = ENOBUFS; } if (nam != NULL) unp_disconnect(unp); break; } case SOCK_STREAM: /* * Connect if not connected yet. * * Note: A better implementation would complain if not equal * to the peer's address. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { error = unp_connect(so, nam, td); if (error) break; /* XXX */ } else { error = ENOTCONN; break; } } /* Lockless read. */ if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; break; } /* * Because connect() and send() are non-atomic in a sendto() * with a target address, it's possible that the socket will * have disconnected before the send() can run. In that case * return the slightly counter-intuitive but otherwise * correct error that the socket is not connected. */ unp2 = unp->unp_conn; if (unp2 == NULL) { error = ENOTCONN; break; } so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); if (unp2->unp_flags & UNP_WANTCRED) { /* * Credentials are passed only once on * SOCK_STREAM. */ unp2->unp_flags &= ~UNP_WANTCRED; control = unp_addsockcred(td, control); } /* * Send to paired receive port, and then reduce send buffer * hiwater marks to maintain backpressure. Wake up readers. */ if (control != NULL) { if (sbappendcontrol_locked(&so2->so_rcv, m, control)) control = NULL; } else { sbappend_locked(&so2->so_rcv, m); } mbcnt = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt; unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt; sbcc = so2->so_rcv.sb_cc; sorwakeup_locked(so2); SOCKBUF_LOCK(&so->so_snd); newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc); (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); so->so_snd.sb_mbmax -= mbcnt; SOCKBUF_UNLOCK(&so->so_snd); unp2->unp_cc = sbcc; m = NULL; break; default: panic("uipc_send unknown socktype"); } /* * SEND_EOF is equivalent to a SEND followed by * a SHUTDOWN. */ if (flags & PRUS_EOF) { socantsendmore(so); unp_shutdown(unp); } UNP_UNLOCK(); if (control != NULL && error != 0) unp_dispose(control); release: if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int uipc_sense(struct socket *so, struct stat *sb) { struct unpcb *unp; struct socket *so2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); UNP_LOCK(); sb->st_blksize = so->so_snd.sb_hiwat; if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) { so2 = unp->unp_conn->unp_socket; sb->st_blksize += so2->so_rcv.sb_cc; } sb->st_dev = NODEV; if (unp->unp_ino == 0) unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; sb->st_ino = unp->unp_ino; UNP_UNLOCK(); return (0); } static int uipc_shutdown(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); UNP_LOCK(); socantsendmore(so); unp_shutdown(unp); UNP_UNLOCK(); return (0); } static int uipc_sockaddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_LOCK(); if (unp->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_UNLOCK(); return (0); } struct pr_usrreqs uipc_usrreqs = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_connect = uipc_connect, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_close = uipc_close, }; int uipc_ctloutput(struct socket *so, struct sockopt *sopt) { struct unpcb *unp; struct xucred xu; int error, optval; if (sopt->sopt_level != 0) return (EINVAL); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); UNP_LOCK(); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } if (error == 0) error = sooptcopyout(sopt, &xu, sizeof(xu)); break; case LOCAL_CREDS: optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CONNWAIT: optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: switch (sopt->sopt_name) { case LOCAL_CREDS: case LOCAL_CONNWAIT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; #define OPTSET(bit) \ if (optval) \ unp->unp_flags |= bit; \ else \ unp->unp_flags &= ~bit; switch (sopt->sopt_name) { case LOCAL_CREDS: OPTSET(UNP_WANTCRED); break; case LOCAL_CONNWAIT: OPTSET(UNP_CONNWAIT); break; default: break; } break; #undef OPTSET default: error = ENOPROTOOPT; break; } break; default: error = EOPNOTSUPP; break; } UNP_UNLOCK(); return (error); } static int unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vnode *vp; struct socket *so2, *so3; struct unpcb *unp, *unp2, *unp3; int error, len; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; struct sockaddr *sa; UNP_LOCK_ASSERT(); unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return (EINVAL); strlcpy(buf, soun->sun_path, len + 1); if (unp->unp_flags & UNP_CONNECTING) { UNP_UNLOCK(); return (EALREADY); } UNP_UNLOCK(); sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error) vp = NULL; else vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "unp_connect"); NDFREE(&nd, NDF_ONLY_PNBUF); if (error) goto bad; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); if (error) goto bad; mtx_unlock(&Giant); UNP_LOCK(); unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); so2 = vp->v_socket; if (so2 == NULL) { error = ECONNREFUSED; goto bad2; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { /* * NB: drop locks here so unp_attach is entered w/o * locks; this avoids a recursive lock of the head * and holding sleep locks across a (potentially) * blocking malloc. */ UNP_UNLOCK(); so3 = sonewconn(so2, 0); UNP_LOCK(); } else so3 = NULL; if (so3 == NULL) { error = ECONNREFUSED; goto bad2; } unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); unp3->unp_addr = (struct sockaddr_un *) sa; sa = NULL; } /* * unp_peercred management: * * The connecter's (client's) credentials are copied from its * process structure at the time of connect() (which is now). */ cru2x(td->td_ucred, &unp3->unp_peercred); unp3->unp_flags |= UNP_HAVEPC; /* * The receiver's (server's) credentials are copied from the * unp_peercred member of socket on which the former called * listen(); unp_listen() cached that process's credentials * at that time so we can use them now. */ KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; #ifdef MAC SOCK_LOCK(so); mac_set_socket_peer_from_socket(so, so3); mac_set_socket_peer_from_socket(so3, so); SOCK_UNLOCK(so); #endif so2 = so3; } error = unp_connect2(so, so2, PRU_CONNECT); bad2: UNP_UNLOCK(); mtx_lock(&Giant); bad: mtx_assert(&Giant, MA_OWNED); if (vp != NULL) vput(vp); mtx_unlock(&Giant); free(sa, M_SONAME); UNP_LOCK(); unp->unp_flags &= ~UNP_CONNECTING; return (error); } static int unp_connect2(struct socket *so, struct socket *so2, int req) { struct unpcb *unp = sotounpcb(so); struct unpcb *unp2; UNP_LOCK_ASSERT(); if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2 = sotounpcb(so2); KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); unp->unp_conn = unp2; switch (so->so_type) { case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); soisconnected(so); break; case SOCK_STREAM: unp2->unp_conn = unp; if (req == PRU_CONNECT && ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) soisconnecting(so); else soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } return (0); } static void unp_disconnect(struct unpcb *unp) { struct unpcb *unp2 = unp->unp_conn; struct socket *so; UNP_LOCK_ASSERT(); if (unp2 == NULL) return; unp->unp_conn = NULL; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); so = unp->unp_socket; SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); break; case SOCK_STREAM: soisdisconnected(unp->unp_socket); unp2->unp_conn = NULL; soisdisconnected(unp2->unp_socket); break; } } /* * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed by * the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers are * safe to reference. It first scans the list of struct unpcb's to generate * a pointer list, then it rescans its list one entry at a time to * externalize and copyout. It checks the generation number to see if a * struct unpcb has been reused, and will skip it if so. */ static int unp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; + int freeunp; struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); UNP_LOCK(); gencnt = unp_gencnt; n = unp_count; UNP_UNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return (error); } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); UNP_LOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_ucred, unp->unp_socket->so_cred)) continue; unp_list[i++] = unp; + unp->unp_refcount++; } } UNP_UNLOCK(); n = i; /* In case we lost some during malloc. */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); for (i = 0; i < n; i++) { unp = unp_list[i]; - if (unp->unp_gencnt <= gencnt) { + UNP_LOCK(); + unp->unp_refcount--; + if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr != NULL) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); bcopy(unp, &xu->xu_unp, sizeof *unp); sotoxsocket(unp->unp_socket, &xu->xu_socket); + UNP_UNLOCK(); error = SYSCTL_OUT(req, xu, sizeof *xu); + } else { + freeunp = (unp->unp_refcount == 0); + UNP_UNLOCK(); + if (freeunp) + uma_zfree(unp_zone, unp); } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return (error); } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); static void unp_shutdown(struct unpcb *unp) { struct socket *so; UNP_LOCK_ASSERT(); if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && (so = unp->unp_conn->unp_socket)) socantrcvmore(so); } static void unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; UNP_LOCK_ASSERT(); so->so_error = errno; unp_disconnect(unp); } static void unp_freerights(struct file **rp, int fdcount) { int i; struct file *fp; for (i = 0; i < fdcount; i++) { fp = *rp; /* * Zero the pointer before calling unp_discard since it may * end up in unp_gc().. * * XXXRW: This is less true than it used to be. */ *rp++ = 0; unp_discard(fp); } } int unp_externalize(struct mbuf *control, struct mbuf **controlp) { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct file **rp; struct file *fp; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; int f; u_int newlen; UNP_UNLOCK_ASSERT(); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) { error = EINVAL; break; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(struct file *); rp = data; /* If we're not outputting the descriptors free them. */ if (error || controlp == NULL) { unp_freerights(rp, newfds); goto next; } FILEDESC_LOCK(td->td_proc->p_fd); /* if the new FD's will not fit free them. */ if (!fdavail(td, newfds)) { FILEDESC_UNLOCK(td->td_proc->p_fd); error = EMSGSIZE; unp_freerights(rp, newfds); goto next; } /* * Now change each pointer to an fd in the global * table to an integer that is the index to the local * fd table entry that we set up to point to the * global one we are transferring. */ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_UNLOCK(td->td_proc->p_fd); error = E2BIG; unp_freerights(rp, newfds); goto next; } fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < newfds; i++) { if (fdalloc(td, 0, &f)) panic("unp_externalize fdalloc failed"); fp = *rp++; td->td_proc->p_fd->fd_ofiles[f] = fp; FILE_LOCK(fp); fp->f_msgcount--; FILE_UNLOCK(fp); unp_rights--; *fdp++ = f; } FILEDESC_UNLOCK(td->td_proc->p_fd); } else { /* We can just copy anything else across. */ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level); if (*controlp == NULL) { error = ENOBUFS; goto next; } bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } m_freem(control); return (error); } static void unp_zone_change(void *tag) { uma_zone_set_max(unp_zone, maxsockets); } void unp_init(void) { unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, UMA_ALIGN_PTR, 0); if (unp_zone == NULL) panic("unp_init"); uma_zone_set_max(unp_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, NULL, EVENTHANDLER_PRI_ANY); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); TASK_INIT(&unp_gc_task, 0, unp_gc, NULL); UNP_LOCK_INIT(); } static int unp_internalize(struct mbuf **controlp, struct thread *td) { struct mbuf *control = *controlp; struct proc *p = td->td_proc; struct filedesc *fdescp = p->p_fd; struct cmsghdr *cm = mtod(control, struct cmsghdr *); struct cmsgcred *cmcred; struct file **rp; struct file *fp; struct timeval *tv; int i, fd, *fdp; void *data; socklen_t clen = control->m_len, datalen; int error, oldfds; u_int newlen; UNP_UNLOCK_ASSERT(); error = 0; *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen) { error = EINVAL; goto out; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { /* * Fill in credential information. */ case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = td->td_ucred->cr_ruid; cmcred->cmcred_gid = td->td_ucred->cr_rgid; cmcred->cmcred_euid = td->td_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = td->td_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); /* * Check that all the FDs passed in refer to legal * files. If not, reject the entire operation. */ fdp = data; FILEDESC_LOCK(fdescp); for (i = 0; i < oldfds; i++) { fd = *fdp++; if ((unsigned)fd >= fdescp->fd_nfiles || fdescp->fd_ofiles[fd] == NULL) { FILEDESC_UNLOCK(fdescp); error = EBADF; goto out; } fp = fdescp->fd_ofiles[fd]; if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { FILEDESC_UNLOCK(fdescp); error = EOPNOTSUPP; goto out; } } /* * Now replace the integer FDs with pointers to the * associated global file table entry.. */ newlen = oldfds * sizeof(struct file *); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_UNLOCK(fdescp); error = E2BIG; goto out; } fdp = data; rp = (struct file **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < oldfds; i++) { fp = fdescp->fd_ofiles[*fdp++]; *rp++ = fp; FILE_LOCK(fp); fp->f_count++; fp->f_msgcount++; FILE_UNLOCK(fp); unp_rights++; } FILEDESC_UNLOCK(fdescp); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; default: error = EINVAL; goto out; } controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } out: m_freem(control); return (error); } struct mbuf * unp_addsockcred(struct thread *td, struct mbuf *control) { struct mbuf *m, *n, *n_prev; struct sockcred *sc; const struct cmsghdr *cm; int ngroups; int i; ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET); if (m == NULL) return (control); sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; /* * Unlink SCM_CREDS control messages (struct cmsgcred), since just * created SCM_CREDS control message (struct sockcred) has another * format. */ if (control != NULL) for (n = control, n_prev = NULL; n != NULL;) { cm = mtod(n, struct cmsghdr *); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_CREDS) { if (n_prev == NULL) control = n->m_next; else n_prev->m_next = n->m_next; n = m_free(n); } else { n_prev = n; n = n->m_next; } } /* Prepend it to the head. */ m->m_next = control; return (m); } /* * unp_defer indicates whether additional work has been defered for a future * pass through unp_gc(). It is thread local and does not require explicit * synchronization. */ static int unp_defer; static int unp_taskcount; SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); static int unp_recycled; SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, ""); static void unp_gc(__unused void *arg, int pending) { struct file *fp, *nextfp; struct socket *so; struct file **extra_ref, **fpp; int nunref, i; int nfiles_snap; int nfiles_slack = 20; unp_taskcount++; unp_defer = 0; /* - * Before going through all this, set all FDs to be NOT defered and + * Before going through all this, set all FDs to be NOT deferred and * NOT externally accessible. */ sx_slock(&filelist_lock); LIST_FOREACH(fp, &filehead, f_list) fp->f_gcflag &= ~(FMARK|FDEFER); do { KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer)); LIST_FOREACH(fp, &filehead, f_list) { FILE_LOCK(fp); /* * If the file is not open, skip it -- could be a * file in the process of being opened, or in the * process of being closed. If the file is * "closing", it may have been marked for deferred * consideration. Clear the flag now if so. */ if (fp->f_count == 0) { if (fp->f_gcflag & FDEFER) unp_defer--; fp->f_gcflag &= ~(FMARK|FDEFER); FILE_UNLOCK(fp); continue; } /* - * If we already marked it as 'defer' in a previous - * pass, then try process it this time and un-mark - * it. + * If we already marked it as 'defer' in a + * previous pass, then try to process it this + * time and un-mark it. */ if (fp->f_gcflag & FDEFER) { fp->f_gcflag &= ~FDEFER; unp_defer--; } else { /* - * if it's not defered, then check if it's + * if it's not deferred, then check if it's * already marked.. if so skip it */ if (fp->f_gcflag & FMARK) { FILE_UNLOCK(fp); continue; } /* * If all references are from messages in * transit, then skip it. it's not externally * accessible. */ if (fp->f_count == fp->f_msgcount) { FILE_UNLOCK(fp); continue; } /* * If it got this far then it must be * externally accessible. */ fp->f_gcflag |= FMARK; } /* - * Either it was defered, or it is externally + * Either it was deferred, or it is externally * accessible and not already marked so. Now check * if it is possibly one of OUR sockets. */ if (fp->f_type != DTYPE_SOCKET || (so = fp->f_data) == NULL) { FILE_UNLOCK(fp); continue; } - FILE_UNLOCK(fp); if (so->so_proto->pr_domain != &localdomain || - (so->so_proto->pr_flags&PR_RIGHTS) == 0) + (so->so_proto->pr_flags & PR_RIGHTS) == 0) { + FILE_UNLOCK(fp); continue; + } + + /* + * Tell any other threads that do a subsequent + * fdrop() that we are scanning the message + * buffers. + */ + fp->f_gcflag |= FWAIT; + FILE_UNLOCK(fp); + /* * So, Ok, it's one of our sockets and it IS - * externally accessible (or was defered). Now we + * externally accessible (or was deferred). Now we * look to see if we hold any file descriptors in its * message buffers. Follow those links and mark them * as accessible too. */ SOCKBUF_LOCK(&so->so_rcv); unp_scan(so->so_rcv.sb_mb, unp_mark); SOCKBUF_UNLOCK(&so->so_rcv); + + /* + * Wake up any threads waiting in fdrop(). + */ + FILE_LOCK(fp); + fp->f_gcflag &= ~FWAIT; + wakeup(&fp->f_gcflag); + FILE_UNLOCK(fp); } } while (unp_defer); sx_sunlock(&filelist_lock); /* * XXXRW: The following comments need updating for a post-SMPng and * deferred unp_gc() world, but are still generally accurate. * * We grab an extra reference to each of the file table entries that * are not otherwise accessible and then free the rights that are * stored in messages on them. * * The bug in the orginal code is a little tricky, so I'll describe * what's wrong with it here. * * It is incorrect to simply unp_discard each entry for f_msgcount * times -- consider the case of sockets A and B that contain * references to each other. On a last close of some other socket, * we trigger a gc since the number of outstanding rights (unp_rights) * is non-zero. If during the sweep phase the gc code unp_discards, * we end up doing a (full) closef on the descriptor. A closef on A * results in the following chain. Closef calls soo_close, which * calls soclose. Soclose calls first (through the switch * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply * returns because the previous instance had set unp_gcing, and we * return all the way back to soclose, which marks the socket with * SS_NOFDREF, and then calls sofree. Sofree calls sorflush to free * up the rights that are queued in messages on the socket A, i.e., * the reference on B. The sorflush calls via the dom_dispose switch * unp_dispose, which unp_scans with unp_discard. This second * instance of unp_discard just calls closef on B. * * Well, a similar chain occurs on B, resulting in a sorflush on B, * which results in another closef on A. Unfortunately, A is already * being closed, and the descriptor has already been marked with * SS_NOFDREF, and soclose panics at this point. * * Here, we first take an extra reference to each inaccessible * descriptor. Then, we call sorflush ourself, since we know it is a * Unix domain socket anyhow. After we destroy all the rights * carried in messages, we do a last closef to get rid of our extra * reference. This is the last close, and the unp_detach etc will * shut down the socket. * * 91/09/19, bsy@cs.cmu.edu */ again: nfiles_snap = openfiles + nfiles_slack; /* some slack */ extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP, M_WAITOK); sx_slock(&filelist_lock); if (nfiles_snap < openfiles) { sx_sunlock(&filelist_lock); free(extra_ref, M_TEMP); nfiles_slack += 20; goto again; } for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != NULL; fp = nextfp) { nextfp = LIST_NEXT(fp, f_list); FILE_LOCK(fp); /* * If it's not open, skip it */ if (fp->f_count == 0) { FILE_UNLOCK(fp); continue; } /* * If all refs are from msgs, and it's not marked accessible * then it must be referenced from some unreachable cycle of * (shut-down) FDs, so include it in our list of FDs to * remove. */ if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { *fpp++ = fp; nunref++; fp->f_count++; } FILE_UNLOCK(fp); } sx_sunlock(&filelist_lock); /* * For each FD on our hit list, do the following two things: */ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { struct file *tfp = *fpp; FILE_LOCK(tfp); if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) { FILE_UNLOCK(tfp); sorflush(tfp->f_data); } else { FILE_UNLOCK(tfp); } } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { closef(*fpp, (struct thread *) NULL); unp_recycled++; } free(extra_ref, M_TEMP); } void unp_dispose(struct mbuf *m) { if (m) unp_scan(m, unp_discard); } static int unp_listen(struct socket *so, struct unpcb *unp, int backlog, struct thread *td) { int error; UNP_LOCK_ASSERT(); SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0) { cru2x(td->td_ucred, &unp->unp_peercred); unp->unp_flags |= UNP_HAVEPCCACHED; solisten_proto(so, backlog); } SOCK_UNLOCK(so); return (error); } static void unp_scan(struct mbuf *m0, void (*op)(struct file *)) { struct mbuf *m; struct file **rp; struct cmsghdr *cm; void *data; int i; socklen_t clen, datalen; int qfds; while (m0 != NULL) { for (m = m0; m; m = m->m_next) { if (m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { qfds = datalen / sizeof (struct file *); rp = data; for (i = 0; i < qfds; i++) (*op)(*rp++); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_act; } } static void unp_mark(struct file *fp) { if (fp->f_gcflag & FMARK) return; unp_defer++; fp->f_gcflag |= (FMARK|FDEFER); } static void unp_discard(struct file *fp) { UNP_LOCK(); FILE_LOCK(fp); fp->f_msgcount--; unp_rights--; FILE_UNLOCK(fp); UNP_UNLOCK(); (void) closef(fp, (struct thread *)NULL); } diff --git a/sys/sys/file.h b/sys/sys/file.h index 16b0e35a09f7..58501fa83f37 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -1,310 +1,311 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)file.h 8.3 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include /* XXX */ #include #include #else #include #include #include struct stat; struct thread; struct uio; struct knote; struct vnode; struct socket; #endif /* _KERNEL */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ #ifdef _KERNEL struct file; struct ucred; typedef int fo_rdwr_t(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); #define FOF_OFFSET 1 /* Use the offset in uio argument */ typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td); typedef int fo_poll_t(struct file *fp, int events, struct ucred *active_cred, struct thread *td); typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); typedef int fo_stat_t(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td); typedef int fo_close_t(struct file *fp, struct thread *td); typedef int fo_flags_t; struct fileops { fo_rdwr_t *fo_read; fo_rdwr_t *fo_write; fo_ioctl_t *fo_ioctl; fo_poll_t *fo_poll; fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. * * Below is the list of locks that protects members in struct file. * * (fl) filelist_lock * (f) f_mtx in struct file * none not locked */ struct file { LIST_ENTRY(file) f_list;/* (fl) list of active files */ short f_type; /* descriptor type */ void *f_data; /* file descriptor specific data */ u_int f_flag; /* see fcntl.h */ struct mtx *f_mtxp; /* mutex to protect data */ struct fileops *f_ops; /* File operations */ struct ucred *f_cred; /* credentials associated with descriptor */ int f_count; /* (f) reference count */ struct vnode *f_vnode; /* NULL or applicable vnode */ /* DFLAG_SEEKABLE specific fields */ off_t f_offset; short f_vnread_flags; /* * (f) home grown sleep lock for f_offset * Used only for shared vnode locking in * vnread() */ #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 /* DTYPE_SOCKET specific fields */ short f_gcflag; /* used by thread doing fd garbage collection */ #define FMARK 0x1 /* mark during gc() */ #define FDEFER 0x2 /* defer for next gc pass */ +#define FWAIT 0x4 /* gc is scanning message buffers */ int f_msgcount; /* (f) references from message queue */ /* DTYPE_VNODE specific fields */ int f_seqcount; /* * count of sequential accesses -- cleared * by most seek operations. */ off_t f_nextoff; /* * offset of next expected read or write */ void *f_label; /* Place-holder for struct label pointer. */ }; #endif /* _KERNEL */ /* * Userland version of struct file, for sysctl */ struct xfile { size_t xf_size; /* size of struct xfile */ pid_t xf_pid; /* owning process */ uid_t xf_uid; /* effective uid of owning process */ int xf_fd; /* descriptor number */ void *xf_file; /* address of struct file */ short xf_type; /* descriptor type */ int xf_count; /* reference count */ int xf_msgcount; /* references from message queue */ off_t xf_offset; /* file offset */ void *xf_data; /* file descriptor specific data */ void *xf_vnode; /* vnode pointer */ u_int xf_flag; /* flags (see fcntl.h) */ }; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_FILE); #endif LIST_HEAD(filelist, file); extern struct filelist filehead; /* (fl) head of list of open files */ extern struct fileops vnops; extern struct fileops badfileops; extern struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ extern int openfiles; /* (fl) actual number of open files */ extern struct sx filelist_lock; /* sx to protect filelist and openfiles */ int fget(struct thread *td, int fd, struct file **fpp); int fget_read(struct thread *td, int fd, struct file **fpp); int fget_write(struct thread *td, int fd, struct file **fpp); int fdrop(struct file *fp, struct thread *td); /* * The socket operations are used a couple of places. * XXX: This is wrong, they should go through the operations vector for * XXX: sockets instead of going directly for the individual functions. /phk */ fo_rdwr_t soo_read; fo_rdwr_t soo_write; fo_ioctl_t soo_ioctl; fo_poll_t soo_poll; fo_kqfilter_t soo_kqfilter; fo_stat_t soo_stat; fo_close_t soo_close; /* Lock a file. */ #define FILE_LOCK(f) mtx_lock((f)->f_mtxp) #define FILE_UNLOCK(f) mtx_unlock((f)->f_mtxp) #define FILE_LOCKED(f) mtx_owned((f)->f_mtxp) #define FILE_LOCK_ASSERT(f, type) mtx_assert((f)->f_mtxp, (type)) int fgetvp(struct thread *td, int fd, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, struct vnode **vpp); int fgetvp_write(struct thread *td, int fd, struct vnode **vpp); int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp); void fputsock(struct socket *sp); #define fhold_locked(fp) \ do { \ FILE_LOCK_ASSERT(fp, MA_OWNED); \ (fp)->f_count++; \ } while (0) #define fhold(fp) \ do { \ FILE_LOCK(fp); \ (fp)->f_count++; \ FILE_UNLOCK(fp); \ } while (0) static __inline fo_rdwr_t fo_read; static __inline fo_rdwr_t fo_write; static __inline fo_ioctl_t fo_ioctl; static __inline fo_poll_t fo_poll; static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; static __inline int fo_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; int flags; struct thread *td; { return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); } static __inline int fo_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; int flags; struct thread *td; { return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); } static __inline int fo_ioctl(fp, com, data, active_cred, td) struct file *fp; u_long com; void *data; struct ucred *active_cred; struct thread *td; { return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); } static __inline int fo_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); } static __inline int fo_stat(fp, sb, active_cred, td) struct file *fp; struct stat *sb; struct ucred *active_cred; struct thread *td; { return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td)); } static __inline int fo_close(fp, td) struct file *fp; struct thread *td; { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(fp, kn) struct file *fp; struct knote *kn; { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ diff --git a/sys/sys/unpcb.h b/sys/sys/unpcb.h index b910f035acd9..129583dbe804 100644 --- a/sys/sys/unpcb.h +++ b/sys/sys/unpcb.h @@ -1,139 +1,140 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)unpcb.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_UNPCB_H_ #define _SYS_UNPCB_H_ #include #include /* * Protocol control block for an active * instance of a UNIX internal protocol. * * A socket may be associated with a vnode in the * filesystem. If so, the unp_vnode pointer holds * a reference count to this vnode, which should be irele'd * when the socket goes away. * * A socket may be connected to another socket, in which * case the control block of the socket to which it is connected * is given by unp_conn. * * A socket may be referenced by a number of sockets (e.g. several * sockets may be connected to a datagram socket.) These sockets * are in a linked list starting with unp_refs, linked through * unp_nextref and null-terminated. Note that a socket may be referenced * by a number of other sockets and may also reference a socket (not * necessarily one which is referencing it). This generates * the need for unp_refs and unp_nextref to be separate fields. * * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt * so that changes in the sockbuf may be computed to modify * back pressure on the sender accordingly. */ typedef u_quad_t unp_gen_t; LIST_HEAD(unp_head, unpcb); struct unpcb { LIST_ENTRY(unpcb) unp_link; /* glue on list of all PCBs */ struct socket *unp_socket; /* pointer back to socket */ struct vnode *unp_vnode; /* if associated with file */ ino_t unp_ino; /* fake inode number */ struct unpcb *unp_conn; /* control block of connected socket */ struct unp_head unp_refs; /* referencing socket linked list */ LIST_ENTRY(unpcb) unp_reflink; /* link in unp_refs list */ struct sockaddr_un *unp_addr; /* bound address of socket */ int unp_cc; /* copy of rcv.sb_cc */ int unp_mbcnt; /* copy of rcv.sb_mbcnt */ unp_gen_t unp_gencnt; /* generation count of this instance */ int unp_flags; /* flags */ struct xucred unp_peercred; /* peer credentials, if applicable */ + u_int unp_refcount; }; /* * Flags in unp_flags. * * UNP_HAVEPC - indicates that the unp_peercred member is filled in * and is really the credentials of the connected peer. This is used * to determine whether the contents should be sent to the user or * not. * * UNP_HAVEPCCACHED - indicates that the unp_peercred member is filled * in, but does *not* contain the credentials of the connected peer * (there may not even be a peer). This is set in unp_listen() when * it fills in unp_peercred for later consumption by unp_connect(). */ #define UNP_HAVEPC 0x001 #define UNP_HAVEPCCACHED 0x002 #define UNP_WANTCRED 0x004 /* credentials wanted */ #define UNP_CONNWAIT 0x008 /* connect blocks until accepted */ /* * These flags are used to handle non-atomicity in connect() and bind() * operations on a socket: in particular, to avoid races between multiple * threads or processes operating simultaneously on the same socket. */ #define UNP_CONNECTING 0x010 /* Currently connecting. */ #define UNP_BINDING 0x020 /* Currently binding. */ #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb)) /* Hack alert -- this structure depends on . */ #ifdef _SYS_SOCKETVAR_H_ struct xunpcb { size_t xu_len; /* length of this structure */ struct unpcb *xu_unpp; /* to help netstat, fstat */ struct unpcb xu_unp; /* our information */ union { struct sockaddr_un xuu_addr; /* our bound address */ char xu_dummy1[256]; } xu_au; #define xu_addr xu_au.xuu_addr union { struct sockaddr_un xuu_caddr; /* their bound address */ char xu_dummy2[256]; } xu_cau; #define xu_caddr xu_cau.xuu_caddr struct xsocket xu_socket; u_quad_t xu_alignment_hack; }; struct xunpgen { size_t xug_len; u_int xug_count; unp_gen_t xug_gen; so_gen_t xug_sogen; }; #endif /* _SYS_SOCKETVAR_H_ */ #endif /* _SYS_UNPCB_H_ */