Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 101986) +++ head/sys/kern/kern_event.c (revision 101987) @@ -1,1085 +1,1085 @@ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); static int kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, const struct timespec *timeout, struct thread *td); static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); static int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); static int kqueue_ioctl(struct file *fp, u_long com, void *data, struct thread *td); static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td); static int kqueue_kqfilter(struct file *fp, struct knote *kn); static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td); static int kqueue_close(struct file *fp, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct fileops kqueueops = { kqueue_read, kqueue_write, kqueue_ioctl, kqueue_poll, kqueue_kqfilter, kqueue_stat, kqueue_close }; static void knote_attach(struct knote *kn, struct filedesc *fdp); static void knote_drop(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(void); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static struct filterops file_filtops = { 1, filt_fileattach, NULL, NULL }; static struct filterops kqread_filtops = { 1, NULL, filt_kqdetach, filt_kqueue }; static struct filterops proc_filtops = { 0, filt_procattach, filt_procdetach, filt_proc }; static struct filterops timer_filtops = { 0, filt_timerattach, filt_timerdetach, filt_timer }; static uma_zone_t knote_zone; static int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); #define KNOTE_ACTIVATE(kn) do { \ kn->kn_status |= KN_ACTIVE; \ if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue(kn); \ } while(0) #define KN_HASHSIZE 64 /* XXX should be tunable */ #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { 0, filt_nullattach, NULL, NULL }; extern struct filterops sig_filtops; /* * Table for for all system-defined filters. */ static struct filterops *sysfilt_ops[] = { &file_filtops, /* EVFILT_READ */ &file_filtops, /* EVFILT_WRITE */ &null_filtops, /* EVFILT_AIO */ &file_filtops, /* EVFILT_VNODE */ &proc_filtops, /* EVFILT_PROC */ &sig_filtops, /* EVFILT_SIGNAL */ &timer_filtops, /* EVFILT_TIMER */ &file_filtops, /* EVFILT_NETDEV */ }; static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (1); kn->kn_fop = &kqread_filtops; SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } static int filt_procattach(struct knote *kn) { struct proc *p; int error; p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ static void filt_procdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; if (kn->kn_status & KN_DETACHED) return; PROC_LOCK(p); SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); PROC_UNLOCK(p); } static int filt_proc(struct knote *kn, long hint) { u_int event; /* * mask off extra data */ event = (u_int)hint & NOTE_PCTRLMASK; /* * if the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* * process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_status |= KN_DETACHED; kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } /* * process forked, and user wants to track the new process, * so attach a new knote to it, and immediately report an * event with the parent's pid. */ if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { struct kevent kev; int error; /* * register knote with new process. */ kev.ident = hint & NOTE_PDATAMASK; /* pid */ kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata; /* preserve udata */ error = kqueue_register(kn->kn_kq, &kev, NULL); if (error) kn->kn_fflags |= NOTE_TRACKERR; } return (kn->kn_fflags != 0); } static void filt_timerexpire(void *knx) { struct knote *kn = knx; struct callout *calloutp; struct timeval tv; int tticks; kn->kn_data++; KNOTE_ACTIVATE(kn); if ((kn->kn_flags & EV_ONESHOT) == 0) { tv.tv_sec = kn->kn_sdata / 1000; tv.tv_usec = (kn->kn_sdata % 1000) * 1000; tticks = tvtohz(&tv); calloutp = (struct callout *)kn->kn_hook; callout_reset(calloutp, tticks, filt_timerexpire, kn); } } /* * data contains amount of time to sleep, in milliseconds */ static int filt_timerattach(struct knote *kn) { struct callout *calloutp; struct timeval tv; int tticks; if (kq_ncallouts >= kq_calloutmax) return (ENOMEM); kq_ncallouts++; tv.tv_sec = kn->kn_sdata / 1000; tv.tv_usec = (kn->kn_sdata % 1000) * 1000; tticks = tvtohz(&tv); kn->kn_flags |= EV_CLEAR; /* automatically set */ MALLOC(calloutp, struct callout *, sizeof(*calloutp), M_KQUEUE, M_WAITOK); callout_init(calloutp, 0); callout_reset(calloutp, tticks, filt_timerexpire, kn); kn->kn_hook = calloutp; return (0); } static void filt_timerdetach(struct knote *kn) { struct callout *calloutp; calloutp = (struct callout *)kn->kn_hook; callout_stop(calloutp); FREE(calloutp, M_KQUEUE); kq_ncallouts--; } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } /* * MPSAFE */ int kqueue(struct thread *td, struct kqueue_args *uap) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; int fd, error; mtx_lock(&Giant); fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) goto done2; kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); TAILQ_INIT(&kq->kq_head); FILE_LOCK(fp); fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; TAILQ_INIT(&kq->kq_head); fp->f_data = kq; FILE_UNLOCK(fp); FILEDESC_LOCK(fdp); td->td_retval[0] = fd; if (fdp->fd_knlistsize < 0) fdp->fd_knlistsize = 0; /* this process has a kq */ FILEDESC_UNLOCK(fdp); kq->kq_fdp = fdp; done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct kevent_args { int fd; const struct kevent *changelist; int nchanges; struct kevent *eventlist; int nevents; const struct timespec *timeout; }; #endif /* * MPSAFE */ int kevent(struct thread *td, struct kevent_args *uap) { struct kevent *kevp; struct kqueue *kq; struct file *fp; struct timespec ts; int i, n, nerrors, error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (fp->f_type != DTYPE_KQUEUE) { fdrop(fp, td); return (EBADF); } if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) goto done_nogiant; uap->timeout = &ts; } mtx_lock(&Giant); kq = (struct kqueue *)fp->f_data; nerrors = 0; while (uap->nchanges > 0) { n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges; error = copyin(uap->changelist, kq->kq_kev, n * sizeof(struct kevent)); if (error) goto done; for (i = 0; i < n; i++) { kevp = &kq->kq_kev[i]; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td); if (error) { if (uap->nevents != 0) { kevp->flags = EV_ERROR; kevp->data = error; (void) copyout(kevp, uap->eventlist, sizeof(*kevp)); uap->eventlist++; uap->nevents--; nerrors++; } else { goto done; } } } uap->nchanges -= n; uap->changelist += n; } if (nerrors) { td->td_retval[0] = nerrors; error = 0; goto done; } error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td); done: mtx_unlock(&Giant); done_nogiant: if (fp != NULL) fdrop(fp, td); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { if (filt > 0) panic("filt(%d) > 0", filt); if (filt + EVFILT_SYSCOUNT < 0) panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); if (sysfilt_ops[~filt] != &null_filtops) panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); sysfilt_ops[~filt] = filtops; return (0); } int kqueue_del_filteropts(int filt) { if (filt > 0) panic("filt(%d) > 0", filt); if (filt + EVFILT_SYSCOUNT < 0) panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); if (sysfilt_ops[~filt] == &null_filtops) panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); sysfilt_ops[~filt] = &null_filtops; return (0); } int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td) { struct filedesc *fdp = kq->kq_fdp; struct filterops *fops; struct file *fp = NULL; struct knote *kn = NULL; int s, error = 0; if (kev->filter < 0) { if (kev->filter + EVFILT_SYSCOUNT < 0) return (EINVAL); fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ } else { /* * XXX * filter attach routine is responsible for insuring that * the identifier can be attached to it. */ printf("unknown filter: %d\n", kev->filter); return (EINVAL); } FILEDESC_LOCK(fdp); if (fops->f_isfd) { /* validate descriptor */ if ((u_int)kev->ident >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[kev->ident]) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } fhold(fp); if (kev->ident < fdp->fd_knlistsize) { SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link) if (kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } else { if (fdp->fd_knhashmask != 0) { struct klist *list; list = &fdp->fd_knhash[ KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } FILEDESC_UNLOCK(fdp); if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { error = ENOENT; goto done; } /* * kn now contains the matching knote, or NULL if no match */ if (kev->flags & EV_ADD) { if (kn == NULL) { kn = knote_alloc(); if (kn == NULL) { error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference count to knote structure, and * do not release it at the end of this routine. */ fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; knote_attach(kn, fdp); if ((error = fops->f_attach(kn)) != 0) { knote_drop(kn, td); goto done; } } else { /* * The user may change some filter values after the * initial EV_ADD, but doing so will not reset any * filter which have already been triggered. */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kn->kn_kevent.udata = kev->udata; } s = splhigh(); if (kn->kn_fop->f_event(kn, 0)) KNOTE_ACTIVATE(kn); splx(s); } else if (kev->flags & EV_DELETE) { kn->kn_fop->f_detach(kn); knote_drop(kn, td); goto done; } if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) { s = splhigh(); kn->kn_status |= KN_DISABLED; splx(s); } if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { s = splhigh(); kn->kn_status &= ~KN_DISABLED; if ((kn->kn_status & KN_ACTIVE) && ((kn->kn_status & KN_QUEUED) == 0)) knote_enqueue(kn); splx(s); } done: if (fp != NULL) fdrop(fp, td); return (error); } static int kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, const struct timespec *tsp, struct thread *td) { struct kqueue *kq; struct kevent *kevp; struct timeval atv, rtv, ttv; struct knote *kn, marker; int s, count, timeout, nkev = 0, error = 0; FILE_LOCK_ASSERT(fp, MA_NOTOWNED); kq = (struct kqueue *)fp->f_data; count = maxevents; if (count == 0) goto done; if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&atv, tsp); if (itimerfix(&atv)) { error = EINVAL; goto done; } if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) timeout = -1; else timeout = atv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&atv); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; timeout = 0; } goto start; retry: if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timeout = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } start: kevp = kq->kq_kev; s = splhigh(); if (kq->kq_count == 0) { if (timeout < 0) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); } splx(s); if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); while (count) { kn = TAILQ_FIRST(&kq->kq_head); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if (kn == &marker) { splx(s); if (count == maxevents) goto retry; goto done; } if (kn->kn_status & KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if ((kn->kn_flags & EV_ONESHOT) == 0 && kn->kn_fop->f_event(kn, 0) == 0) { kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; continue; } *kevp = kn->kn_kevent; kevp++; nkev++; if (kn->kn_flags & EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; splx(s); kn->kn_fop->f_detach(kn); knote_drop(kn, td); s = splhigh(); } else if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else { TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); } count--; if (nkev == KQ_NEVENTS) { splx(s); error = copyout(&kq->kq_kev, ulistp, sizeof(struct kevent) * nkev); ulistp += nkev; nkev = 0; kevp = kq->kq_kev; s = splhigh(); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); splx(s); done: if (nkev != 0) error = copyout(&kq->kq_kev, ulistp, sizeof(struct kevent) * nkev); td->td_retval[0] = maxevents - count; return (error); } /* * XXX * This could be expanded to call kqueue_scan, if desired. */ /*ARGSUSED*/ static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long com, void *data, struct thread *td) { return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, - struct thread *td) + struct thread *td) { struct kqueue *kq; int revents = 0; int s = splnet(); kq = (struct kqueue *)fp->f_data; if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); kq->kq_state |= KQ_SEL; } } splx(s); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, - struct thread *td) + struct thread *td) { struct kqueue *kq; kq = (struct kqueue *)fp->f_data; bzero((void *)st, sizeof(*st)); st->st_size = kq->kq_count; st->st_blksize = sizeof(struct kevent); st->st_mode = S_IFIFO; return (0); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = (struct kqueue *)fp->f_data; struct filedesc *fdp = td->td_proc->p_fd; struct knote **knp, *kn, *kn0; int i; FILEDESC_LOCK(fdp); for (i = 0; i < fdp->fd_knlistsize; i++) { knp = &SLIST_FIRST(&fdp->fd_knlist[i]); kn = *knp; while (kn != NULL) { kn0 = SLIST_NEXT(kn, kn_link); if (kq == kn->kn_kq) { kn->kn_fop->f_detach(kn); *knp = kn0; FILE_LOCK(kn->kn_fp); FILEDESC_UNLOCK(fdp); fdrop_locked(kn->kn_fp, td); knote_free(kn); FILEDESC_LOCK(fdp); } else { knp = &SLIST_NEXT(kn, kn_link); } kn = kn0; } } if (fdp->fd_knhashmask != 0) { for (i = 0; i < fdp->fd_knhashmask + 1; i++) { knp = &SLIST_FIRST(&fdp->fd_knhash[i]); kn = *knp; while (kn != NULL) { kn0 = SLIST_NEXT(kn, kn_link); if (kq == kn->kn_kq) { kn->kn_fop->f_detach(kn); *knp = kn0; /* XXX non-fd release of kn->kn_ptr */ FILEDESC_UNLOCK(fdp); knote_free(kn); FILEDESC_LOCK(fdp); } else { knp = &SLIST_NEXT(kn, kn_link); } kn = kn0; } } } FILEDESC_UNLOCK(fdp); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static void kqueue_wakeup(struct kqueue *kq) { if (kq->kq_state & KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if (kq->kq_state & KQ_SEL) { kq->kq_state &= ~KQ_SEL; selwakeup(&kq->kq_sel); } KNOTE(&kq->kq_sel.si_note, 0); } /* * walk down a list of knotes, activating them if their event has triggered. */ void knote(struct klist *list, long hint) { struct knote *kn; SLIST_FOREACH(kn, list, kn_selnext) if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn); } /* * remove all knotes from a specified klist */ void knote_remove(struct thread *td, struct klist *list) { struct knote *kn; while ((kn = SLIST_FIRST(list)) != NULL) { kn->kn_fop->f_detach(kn); knote_drop(kn, td); } } /* * remove all knotes referencing a specified fd */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct klist *list; FILEDESC_LOCK(fdp); list = &fdp->fd_knlist[fd]; FILEDESC_UNLOCK(fdp); knote_remove(td, list); } static void knote_attach(struct knote *kn, struct filedesc *fdp) { struct klist *list, *oldlist; int size, newsize; FILEDESC_LOCK(fdp); if (! kn->kn_fop->f_isfd) { if (fdp->fd_knhashmask == 0) fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &fdp->fd_knhashmask); list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; goto done; } if (fdp->fd_knlistsize <= kn->kn_id) { retry: size = fdp->fd_knlistsize; while (size <= kn->kn_id) size += KQEXTENT; FILEDESC_UNLOCK(fdp); MALLOC(list, struct klist *, size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); FILEDESC_LOCK(fdp); newsize = fdp->fd_knlistsize; while (newsize <= kn->kn_id) newsize += KQEXTENT; if (newsize != size) { FILEDESC_UNLOCK(fdp); free(list, M_TEMP); FILEDESC_LOCK(fdp); goto retry; } bcopy(fdp->fd_knlist, list, fdp->fd_knlistsize * sizeof(struct klist *)); bzero((caddr_t)list + fdp->fd_knlistsize * sizeof(struct klist *), (size - fdp->fd_knlistsize) * sizeof(struct klist *)); if (fdp->fd_knlist != NULL) oldlist = fdp->fd_knlist; else oldlist = NULL; fdp->fd_knlistsize = size; fdp->fd_knlist = list; FILEDESC_UNLOCK(fdp); if (oldlist != NULL) FREE(oldlist, M_KQUEUE); FILEDESC_LOCK(fdp); } list = &fdp->fd_knlist[kn->kn_id]; done: FILEDESC_UNLOCK(fdp); SLIST_INSERT_HEAD(list, kn, kn_link); kn->kn_status = 0; } /* * should be called at spl == 0, since we don't want to hold spl * while calling fdrop and free. */ static void knote_drop(struct knote *kn, struct thread *td) { struct filedesc *fdp = td->td_proc->p_fd; struct klist *list; FILEDESC_LOCK(fdp); if (kn->kn_fop->f_isfd) list = &fdp->fd_knlist[kn->kn_id]; else list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; if (kn->kn_fop->f_isfd) FILE_LOCK(kn->kn_fp); FILEDESC_UNLOCK(fdp); SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); if (kn->kn_fop->f_isfd) fdrop_locked(kn->kn_fp, td); knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; int s = splhigh(); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; splx(s); kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; int s = splhigh(); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; splx(s); } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) static struct knote * knote_alloc(void) { return ((struct knote *)uma_zalloc(knote_zone, M_WAITOK)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 101986) +++ head/sys/kern/sys_pipe.c (revision 101987) @@ -1,1504 +1,1504 @@ /* * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. * * $FreeBSD$ */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and * the receiving process can copy it directly from the pages in the sending * process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. PIPE_SIZE is constrained by the * amount of kernel virtual memory. */ #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ /* * interfaces to the outside world */ static int pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); static int pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); static int pipe_close(struct file *fp, struct thread *td); static int pipe_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td); static int pipe_kqfilter(struct file *fp, struct knote *kn); static int pipe_stat(struct file *fp, struct stat *sb, - struct ucred *active_cred, struct thread *td); + struct ucred *active_cred, struct thread *td); static int pipe_ioctl(struct file *fp, u_long cmd, void *data, struct thread *td); static struct fileops pipeops = { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter, pipe_stat, pipe_close }; static void filt_pipedetach(struct knote *kn); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_rfiltops = { 1, NULL, filt_pipedetach, filt_piperead }; static struct filterops pipe_wfiltops = { 1, NULL, filt_pipedetach, filt_pipewrite }; #define PIPE_GET_GIANT(pipe) \ do { \ KASSERT(((pipe)->pipe_state & PIPE_LOCKFL) != 0, \ ("%s:%d PIPE_GET_GIANT: line pipe not locked", \ __FILE__, __LINE__)); \ PIPE_UNLOCK(pipe); \ mtx_lock(&Giant); \ } while (0) #define PIPE_DROP_GIANT(pipe) \ do { \ mtx_unlock(&Giant); \ PIPE_LOCK(pipe); \ } while (0) /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) /* * Maximum amount of kva for pipes -- this is kind-of a soft limit, but * is there so that on large systems, we don't exhaust it. */ #define MAXPIPEKVA (8*1024*1024) /* * Limit for direct transfers, we cannot, of course limit * the amount of kva for pipes in general though. */ #define LIMITPIPEKVA (16*1024*1024) /* * Limit the number of "big" pipes */ #define LIMITBIGPIPES 32 static int nbigpipe; static int amountpipekva; static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe **cpipep); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); static __inline void pipeselwakeup(struct pipe *cpipe); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static uma_zone_t pipe_zone; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } /* * The pipe system call for the DTYPE_PIPE type of pipes */ /* ARGSUSED */ int pipe(td, uap) struct thread *td; struct pipe_args /* { int dummy; } */ *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct file *rf, *wf; struct pipe *rpipe, *wpipe; struct mtx *pmtx; int fd, error; KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO); rpipe = wpipe = NULL; if (pipe_create(&rpipe) || pipe_create(&wpipe)) { pipeclose(rpipe); pipeclose(wpipe); free(pmtx, M_TEMP); return (ENFILE); } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; error = falloc(td, &rf, &fd); if (error) { pipeclose(rpipe); pipeclose(wpipe); free(pmtx, M_TEMP); return (error); } fhold(rf); td->td_retval[0] = fd; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ FILE_LOCK(rf); rf->f_flag = FREAD | FWRITE; rf->f_type = DTYPE_PIPE; rf->f_data = rpipe; rf->f_ops = &pipeops; FILE_UNLOCK(rf); error = falloc(td, &wf, &fd); if (error) { FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[td->td_retval[0]] == rf) { fdp->fd_ofiles[td->td_retval[0]] = NULL; FILEDESC_UNLOCK(fdp); fdrop(rf, td); } else FILEDESC_UNLOCK(fdp); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); free(pmtx, M_TEMP); return (error); } FILE_LOCK(wf); wf->f_flag = FREAD | FWRITE; wf->f_type = DTYPE_PIPE; wf->f_data = wpipe; wf->f_ops = &pipeops; FILE_UNLOCK(wf); td->td_retval[1] = fd; rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; #ifdef MAC /* * struct pipe represents a pipe endpoint. The MAC label is shared * between the connected endpoints. As a result mac_init_pipe() and * mac_create_pipe() should only be called on one of the endpoints * after they have been connected. */ mac_init_pipe(rpipe); mac_create_pipe(td->td_ucred, rpipe); #endif mtx_init(pmtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE); rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx; fdrop(rf, td); return (0); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int pipespace(cpipe, size) struct pipe *cpipe; int size; { struct vm_object *object; caddr_t buffer; int npages, error; GIANT_REQUIRED; KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); npages = round_page(size)/PAGE_SIZE; /* * Create an object, I don't like the idea of paging to/from * kernel_object. * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. */ object = vm_object_allocate(OBJT_DEFAULT, npages); buffer = (caddr_t) vm_map_min(kernel_map); /* * Insert the object into the kernel map, and allocate kva for it. * The map entry is, by default, pageable. * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. */ error = vm_map_find(kernel_map, object, 0, (vm_offset_t *) &buffer, size, 1, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { vm_object_deallocate(object); return (ENOMEM); } /* free old resources if we're resizing */ pipe_free_kmem(cpipe); cpipe->pipe_buffer.object = object; cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = 0; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = 0; amountpipekva += cpipe->pipe_buffer.size; return (0); } /* * initialize and allocate VM and memory for pipe */ static int pipe_create(cpipep) struct pipe **cpipep; { struct pipe *cpipe; int error; *cpipep = uma_zalloc(pipe_zone, M_WAITOK); if (*cpipep == NULL) return (ENOMEM); cpipe = *cpipep; /* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */ cpipe->pipe_buffer.object = NULL; #ifndef PIPE_NODIRECT cpipe->pipe_map.kva = NULL; #endif /* * protect so pipeclose() doesn't follow a junk pointer * if pipespace() fails. */ bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel)); cpipe->pipe_state = 0; cpipe->pipe_peer = NULL; cpipe->pipe_busy = 0; #ifndef PIPE_NODIRECT /* * pipe data structure initializations to support direct pipe I/O */ cpipe->pipe_map.cnt = 0; cpipe->pipe_map.kva = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; /* cpipe->pipe_map.ms[] = invalid */ #endif cpipe->pipe_mtxp = NULL; /* avoid pipespace assertion */ error = pipespace(cpipe, PIPE_SIZE); if (error) return (error); vfs_timestamp(&cpipe->pipe_ctime); cpipe->pipe_atime = cpipe->pipe_ctime; cpipe->pipe_mtime = cpipe->pipe_ctime; return (0); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); while (cpipe->pipe_state & PIPE_LOCKFL) { cpipe->pipe_state |= PIPE_LWANT; error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } static __inline void pipeselwakeup(cpipe) struct pipe *cpipe; { if (cpipe->pipe_state & PIPE_SEL) { cpipe->pipe_state &= ~PIPE_SEL; selwakeup(&cpipe->pipe_sel); } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE(&cpipe->pipe_sel.si_note, 0); } /* ARGSUSED */ static int pipe_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct pipe *rpipe = (struct pipe *) fp->f_data; int error; int nread = 0; u_int size; PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_check_pipe_op(active_cred, rpipe, MAC_OP_PIPE_READ); if (error) goto locked_error; #endif while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { caddr_t va; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; PIPE_UNLOCK(rpipe); error = uiomove(va, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~PIPE_DIRECTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. We * will either break out with an error or we will sleep and * relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(wpipe, uio) struct pipe *wpipe; struct uio *uio; { u_int size; int i; vm_offset_t addr, endaddr, paddr; GIANT_REQUIRED; PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); size = (u_int) uio->uio_iov->iov_len; if (size > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { vm_page_t m; /* * vm_fault_quick() can sleep. Consequently, * vm_page_lock_queue() and vm_page_unlock_queue() * should not be performed outside of this loop. */ if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 || (paddr = pmap_extract(vmspace_pmap(curproc->p_vmspace), addr)) == 0) { int j; vm_page_lock_queues(); for (j = 0; j < i; j++) vm_page_unwire(wpipe->pipe_map.ms[j], 1); vm_page_unlock_queues(); return (EFAULT); } m = PHYS_TO_VM_PAGE(paddr); vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); wpipe->pipe_map.ms[i] = m; } /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and map the buffer */ if (wpipe->pipe_map.kva == 0) { /* * We need to allocate space for an extra page because the * address range might (will) span pages at times. */ wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, wpipe->pipe_buffer.size + PAGE_SIZE); amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; } pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, wpipe->pipe_map.npages); /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base += size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void pipe_destroy_write_buffer(wpipe) struct pipe *wpipe; { int i; GIANT_REQUIRED; PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); if (wpipe->pipe_map.kva) { pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); if (amountpipekva > MAXPIPEKVA) { vm_offset_t kva = wpipe->pipe_map.kva; wpipe->pipe_map.kva = 0; kmem_free(kernel_map, kva, wpipe->pipe_buffer.size + PAGE_SIZE); amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; } } vm_page_lock_queues(); for (i = 0; i < wpipe->pipe_map.npages; i++) vm_page_unwire(wpipe->pipe_map.ms[i], 1); vm_page_unlock_queues(); wpipe->pipe_map.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(wpipe) struct pipe *wpipe; { int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_GET_GIANT(wpipe); bcopy((caddr_t) wpipe->pipe_map.kva + pos, wpipe->pipe_buffer.buffer, size); pipe_destroy_write_buffer(wpipe); PIPE_DROP_GIANT(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. */ static int pipe_direct_write(wpipe, uio) struct pipe *wpipe; struct uio *uio; { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } wpipe->pipe_state |= PIPE_WANTW; error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; goto error1; } } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } wpipe->pipe_state |= PIPE_WANTW; error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; goto error1; } goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; pipelock(wpipe, 0); PIPE_GET_GIANT(wpipe); error = pipe_build_write_buffer(wpipe, uio); PIPE_DROP_GIANT(wpipe); pipeunlock(wpipe); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipelock(wpipe, 0); PIPE_GET_GIANT(wpipe); pipe_destroy_write_buffer(wpipe); PIPE_DROP_GIANT(wpipe); pipeunlock(wpipe); pipeselwakeup(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); } pipelock(wpipe,0); if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { PIPE_GET_GIANT(wpipe); pipe_destroy_write_buffer(wpipe); PIPE_DROP_GIANT(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { int error = 0; int orig_resid; struct pipe *wpipe, *rpipe; rpipe = (struct pipe *) fp->f_data; wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_check_pipe_op(active_cred, wpipe, MAC_OP_PIPE_WRITE); if (error) { PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* * If it is advantageous to resize the pipe buffer, do * so. */ if ((uio->uio_resid > PIPE_SIZE) && (nbigpipe < LIMITBIGPIPES) && (wpipe->pipe_state & PIPE_DIRECTW) == 0 && (wpipe->pipe_buffer.size <= PIPE_SIZE) && (wpipe->pipe_buffer.cnt == 0)) { if ((error = pipelock(wpipe,1)) == 0) { PIPE_GET_GIANT(wpipe); if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) nbigpipe++; PIPE_DROP_GIANT(wpipe); pipeunlock(wpipe); } } /* * If an early error occured unbusy and return, waking up any pending * readers. */ if (error) { --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } PIPE_UNLOCK(rpipe); return(error); } KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone")); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && (fp->f_flag & FNONBLOCK) == 0 && (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { error = pipe_direct_write( wpipe, uio); if (error) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ retrywrite: while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); if (wpipe->pipe_state & PIPE_EOF) break; if (error) break; } if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { if ((error = pipelock(wpipe,1)) == 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * It is possible for a direct write to * slip in on us... handle it here... */ if (wpipe->pipe_state & PIPE_DIRECTW) { pipeunlock(wpipe); goto retrywrite; } /* * If a process blocked in uiomove, our * value for space might be bad. * * XXX will we be ok if the reader has gone * away here? */ if (space > wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) { pipeunlock(wpipe); goto retrywrite; } /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ if (wpipe->pipe_buffer.in + segsize != wpipe->pipe_buffer.size) panic("Expected pipe buffer wraparound disappeared"); PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size) panic("Expected wraparound bad"); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size) panic("Pipe buffer overflow"); } pipeunlock(wpipe); } if (error) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; /* * If read side wants to go away, we just issue a signal * to ourselves. */ if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } } } --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if I/O was successful */ if ((wpipe->pipe_buffer.cnt == 0) && (uio->uio_resid == 0) && (error == EPIPE)) { error = 0; } if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); PIPE_UNLOCK(rpipe); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ int pipe_ioctl(fp, cmd, data, td) struct file *fp; u_long cmd; void *data; struct thread *td; { struct pipe *mpipe = (struct pipe *)fp->f_data; #ifdef MAC int error; /* XXXMAC: Pipe should be locked for this check. */ error = mac_check_pipe_ioctl(td->td_ucred, mpipe, cmd, data); if (error) return (error); #endif switch (cmd) { case FIONBIO: return (0); case FIOASYNC: PIPE_LOCK(mpipe); if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } PIPE_UNLOCK(mpipe); return (0); case FIONREAD: PIPE_LOCK(mpipe); if (mpipe->pipe_state & PIPE_DIRECTW) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; PIPE_UNLOCK(mpipe); return (0); case FIOSETOWN: return (fsetown(*(int *)data, &mpipe->pipe_sigio)); case FIOGETOWN: *(int *)data = fgetown(mpipe->pipe_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &mpipe->pipe_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(mpipe->pipe_sigio); return (0); } return (ENOTTY); } int pipe_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct pipe *rpipe = (struct pipe *)fp->f_data; struct pipe *wpipe; int revents = 0; #ifdef MAC int error; #endif wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); #ifdef MAC error = mac_check_pipe_op(active_cred, rpipe, MAC_OP_PIPE_POLL); if (error) goto locked_error; #endif if (events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0) || (rpipe->pipe_state & PIPE_EOF)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) revents |= events & (POLLOUT | POLLWRNORM); if ((rpipe->pipe_state & PIPE_EOF) || (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; if (revents == 0) { if (events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); rpipe->pipe_state |= PIPE_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(fp, ub, active_cred, td) struct file *fp; struct stat *ub; struct ucred *active_cred; struct thread *td; { struct pipe *pipe = (struct pipe *)fp->f_data; #ifdef MAC int error; /* XXXMAC: Pipe should be locked for this check. */ error = mac_check_pipe_op(active_cred, pipe, MAC_OP_PIPE_STAT); if (error) return (error); #endif bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = pipe->pipe_buffer.size; ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; ub->st_atimespec = pipe->pipe_atime; ub->st_mtimespec = pipe->pipe_mtime; ub->st_ctimespec = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; /* * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. * XXX (st_dev, st_ino) should be unique. */ return (0); } /* ARGSUSED */ static int pipe_close(fp, td) struct file *fp; struct thread *td; { struct pipe *cpipe = (struct pipe *)fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; funsetown(&cpipe->pipe_sigio); pipeclose(cpipe); return (0); } static void pipe_free_kmem(cpipe) struct pipe *cpipe; { GIANT_REQUIRED; KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { if (cpipe->pipe_buffer.size > PIPE_SIZE) --nbigpipe; amountpipekva -= cpipe->pipe_buffer.size; kmem_free(kernel_map, (vm_offset_t)cpipe->pipe_buffer.buffer, cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT if (cpipe->pipe_map.kva != NULL) { amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; kmem_free(kernel_map, cpipe->pipe_map.kva, cpipe->pipe_buffer.size + PAGE_SIZE); cpipe->pipe_map.cnt = 0; cpipe->pipe_map.kva = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(cpipe) struct pipe *cpipe; { struct pipe *ppipe; int hadpeer; if (cpipe == NULL) return; hadpeer = 0; /* partially created pipes won't have a valid mutex. */ if (PIPE_MTX(cpipe) != NULL) PIPE_LOCK(cpipe); pipeselwakeup(cpipe); /* * If the other side is blocked, wake it up saying that * we want to close it down. */ while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT | PIPE_EOF; msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); } #ifdef MAC if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL) mac_destroy_pipe(cpipe); #endif /* * Disconnect from peer */ if ((ppipe = cpipe->pipe_peer) != NULL) { hadpeer++; pipeselwakeup(ppipe); ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); KNOTE(&ppipe->pipe_sel.si_note, 0); ppipe->pipe_peer = NULL; } /* * free resources */ if (PIPE_MTX(cpipe) != NULL) { PIPE_UNLOCK(cpipe); if (!hadpeer) { mtx_destroy(PIPE_MTX(cpipe)); free(PIPE_MTX(cpipe), M_TEMP); } } mtx_lock(&Giant); pipe_free_kmem(cpipe); uma_zfree(pipe_zone, cpipe); mtx_unlock(&Giant); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; cpipe = (struct pipe *)kn->kn_fp->f_data; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; cpipe = cpipe->pipe_peer; if (cpipe == NULL) /* other end of pipe has been closed */ return (EBADF); break; default: return (1); } kn->kn_hook = cpipe; PIPE_LOCK(cpipe); SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = (struct pipe *)kn->kn_hook; PIPE_LOCK(cpipe); SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; struct pipe *wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); kn->kn_data = rpipe->pipe_buffer.cnt; if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; PIPE_UNLOCK(rpipe); return (1); } PIPE_UNLOCK(rpipe); return (kn->kn_data > 0); } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data; struct pipe *wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; PIPE_UNLOCK(rpipe); return (1); } kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; PIPE_UNLOCK(rpipe); return (kn->kn_data >= PIPE_BUF); }