Index: head/sys/kern/kern_event.c
===================================================================
--- head/sys/kern/kern_event.c	(revision 101986)
+++ head/sys/kern/kern_event.c	(revision 101987)
@@ -1,1085 +1,1085 @@
 /*-
  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/malloc.h> 
 #include <sys/unistd.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/selinfo.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/eventvar.h>
 #include <sys/poll.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/uio.h>
 
 #include <vm/uma.h>
 
 /*
  * Malloc type used in this file for kqueue structures, timer callouts,
  * the per-descriptor knote list array and the knote hash table.
  */
 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
 /*
  * Forward declarations: the event scanner, the file-operation handlers
  * that are collected in kqueueops below, and the wakeup helper.
  */
 static int	kqueue_scan(struct file *fp, int maxevents,
 		    struct kevent *ulistp, const struct timespec *timeout,
 		    struct thread *td);
 static int 	kqueue_read(struct file *fp, struct uio *uio,
 		    struct ucred *active_cred, int flags, struct thread *td);
 static int	kqueue_write(struct file *fp, struct uio *uio,
 		    struct ucred *active_cred, int flags, struct thread *td);
 static int	kqueue_ioctl(struct file *fp, u_long com, void *data,
 		    struct thread *td);
 static int 	kqueue_poll(struct file *fp, int events,
 		    struct ucred *active_cred, struct thread *td);
 static int 	kqueue_kqfilter(struct file *fp, struct knote *kn);
 static int 	kqueue_stat(struct file *fp, struct stat *st,
 		    struct ucred *active_cred, struct thread *td);
 static int 	kqueue_close(struct file *fp, struct thread *td);
 static void 	kqueue_wakeup(struct kqueue *kq);
 
 /*
  * File operations vector for kqueue descriptors; kqueue() installs it in
  * fp->f_ops.  The initializer is positional, so the entry order must
  * follow the member order of struct fileops (declared elsewhere).
  */
 static struct fileops kqueueops = {
 	kqueue_read,
 	kqueue_write,
 	kqueue_ioctl,
 	kqueue_poll,
 	kqueue_kqfilter,
 	kqueue_stat,
 	kqueue_close
 };
 
 /*
  * Forward declarations for the knote bookkeeping helpers.
  */
 static void 	knote_attach(struct knote *kn, struct filedesc *fdp);
 static void 	knote_drop(struct knote *kn, struct thread *td);
 static void 	knote_enqueue(struct knote *kn);
 static void 	knote_dequeue(struct knote *kn);
 static void 	knote_init(void);
 static struct 	knote *knote_alloc(void);
 static void 	knote_free(struct knote *kn);
 
 /*
  * Forward declarations for the filter routines implemented in this file
  * (kqueue-on-kqueue, process, generic file, and timer filters).
  */
 static void	filt_kqdetach(struct knote *kn);
 static int	filt_kqueue(struct knote *kn, long hint);
 static int	filt_procattach(struct knote *kn);
 static void	filt_procdetach(struct knote *kn);
 static int	filt_proc(struct knote *kn, long hint);
 static int	filt_fileattach(struct knote *kn);
 static void	filt_timerexpire(void *knx);
 static int	filt_timerattach(struct knote *kn);
 static void	filt_timerdetach(struct knote *kn);
 static int	filt_timer(struct knote *kn, long hint);
 
 /*
  * Filter operation tables.  The initializers are positional; presumably
  * the order is { f_isfd, f_attach, f_detach, f_event }, which are the
  * member names this file dereferences -- TODO confirm against the
  * struct filterops declaration.
  *
  * file_filtops only supplies an attach routine; filt_fileattach() hands
  * the knote to the file's own kqfilter method.
  */
 static struct filterops file_filtops =
 	{ 1, filt_fileattach, NULL, NULL };
 static struct filterops kqread_filtops =
 	{ 1, NULL, filt_kqdetach, filt_kqueue };
 static struct filterops proc_filtops =
 	{ 0, filt_procattach, filt_procdetach, filt_proc };
 static struct filterops timer_filtops =
 	{ 0, filt_timerattach, filt_timerdetach, filt_timer };
 
 /* UMA zone that knotes are allocated from; created in knote_init(). */
 static uma_zone_t	knote_zone;
 /* Timer callouts currently allocated by filt_timerattach(). */
 static int 		kq_ncallouts = 0;
 /* Upper bound on kq_ncallouts; read-write sysctl under the _kern node. */
 static int 		kq_calloutmax = (4 * 1024);
 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
 
 /*
  * Mark a knote active and, unless it is already queued or has been
  * disabled, put it on its kqueue's pending list via knote_enqueue().
  * The argument is parenthesized so that any pointer expression may be
  * passed; note that it is still evaluated more than once.
  */
 #define KNOTE_ACTIVATE(kn) do { 					\
 	(kn)->kn_status |= KN_ACTIVE;					\
 	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
 		knote_enqueue(kn);					\
 } while(0)
 
 #define	KN_HASHSIZE		64		/* XXX should be tunable */
 /*
  * Fold the second byte of an identifier into the low bits and reduce it
  * with the hash mask.  Every use of val is parenthesized so that
  * compound expressions hash correctly.
  */
 #define KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
 
 /*
  * Attach routine for filter slots that have no implementation
  * registered: always refuses the attach with ENXIO.
  */
 static int
 filt_nullattach(struct knote *kn)
 {
 
 	return (ENXIO);
 }
 
 /*
  * Placeholder filter table whose attach routine always fails.
  * kqueue_add_filteropts() and kqueue_del_filteropts() compare table
  * slots against this object to tell whether a slot is free.
  */
 struct filterops null_filtops =
 	{ 0, filt_nullattach, NULL, NULL };
 
 /* Signal filter table; defined outside this file. */
 extern struct filterops sig_filtops;
 
 /*
  * Table for all system-defined filters.
  *
  * Filter numbers are negative; the table is indexed with ~filter (see
  * kqueue_register()), so the entries appear in filter-number order.
  */
 static struct filterops *sysfilt_ops[] = {
 	&file_filtops,			/* EVFILT_READ */
 	&file_filtops,			/* EVFILT_WRITE */
 	&null_filtops,			/* EVFILT_AIO */
 	&file_filtops,			/* EVFILT_VNODE */
 	&proc_filtops,			/* EVFILT_PROC */
 	&sig_filtops,			/* EVFILT_SIGNAL */
 	&timer_filtops,			/* EVFILT_TIMER */
 	&file_filtops,			/* EVFILT_NETDEV */
 };
 
 /*
  * Attach routine shared by the descriptor-backed filters: hand the knote
  * to the kqfilter method of the file it refers to and return its result.
  */
 static int
 filt_fileattach(struct knote *kn)
 {
 	struct file *target = kn->kn_fp;
 
 	return (fo_kqfilter(target, kn));
 }
 
 /*ARGSUSED*/
 static int
 kqueue_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 
 	if (kn->kn_filter != EVFILT_READ)
 		return (1);
 
 	kn->kn_fop = &kqread_filtops;
 	SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
 	return (0);
 }
 
 /*
  * Detach routine for kqread_filtops: unlink the knote from the si_note
  * list of the kqueue it was watching.
  */
 static void
 filt_kqdetach(struct knote *kn)
 {
 	struct kqueue *watched;
 
 	watched = (struct kqueue *)kn->kn_fp->f_data;
 	SLIST_REMOVE(&watched->kq_sel.si_note, kn, knote, kn_selnext);
 }
 
 /*
  * Event routine for kqread_filtops: record the watched kqueue's pending
  * count in kn_data and report the knote as ready when it is positive.
  */
 /*ARGSUSED*/
 static int
 filt_kqueue(struct knote *kn, long hint)
 {
 	struct kqueue *watched;
 
 	watched = (struct kqueue *)kn->kn_fp->f_data;
 	kn->kn_data = watched->kq_count;
 	if (kn->kn_data > 0)
 		return (1);
 	return (0);
 }
 
 /*
  * Attach routine for the process filter.  Looks up the process named by
  * kn_id, checks that the current thread may see it, and links the knote
  * onto the process's p_klist.
  *
  * Returns 0 on success, ESRCH when pfind() finds no such process, or the
  * error reported by p_cansee().
  */
 static int
 filt_procattach(struct knote *kn)
 {
 	struct proc *target;
 	int rc;
 
 	target = pfind(kn->kn_id);
 	if (target == NULL)
 		return (ESRCH);
 
 	rc = p_cansee(curthread, target);
 	if (rc != 0) {
 		PROC_UNLOCK(target);
 		return (rc);
 	}
 
 	kn->kn_ptr.p_proc = target;
 	/* EV_CLEAR is always forced on for this filter. */
 	kn->kn_flags |= EV_CLEAR;
 
 	/*
 	 * EV_FLAG1 is the internal marker for a registration made by the
 	 * kernel itself (see filt_proc()); consume it and report the
 	 * parent pid that was passed in kn_sdata.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) != 0) {
 		kn->kn_flags &= ~EV_FLAG1;
 		kn->kn_fflags = NOTE_CHILD;
 		kn->kn_data = kn->kn_sdata;		/* ppid */
 	}
 
 	SLIST_INSERT_HEAD(&target->p_klist, kn, kn_selnext);
 	PROC_UNLOCK(target);
 
 	return (0);
 }
 
 /*
  * Detach routine for the process filter.
  *
  * A knote can outlive the process it watches: when that process exits,
  * filt_proc() flags the knote KN_DETACHED (and ONESHOT, so it is deleted
  * once read out).  Deleting the knote still calls this routine, so the
  * flag is checked first and nothing is unlinked in that case, because
  * the process no longer exists.
  */
 static void
 filt_procdetach(struct knote *kn)
 {
 	struct proc *watched;
 
 	if ((kn->kn_status & KN_DETACHED) == 0) {
 		watched = kn->kn_ptr.p_proc;
 		PROC_LOCK(watched);
 		SLIST_REMOVE(&watched->p_klist, kn, knote, kn_selnext);
 		PROC_UNLOCK(watched);
 	}
 }
 
 static int
 filt_proc(struct knote *kn, long hint)
 {
 	u_int event;
 
 	/*
 	 * mask off extra data
 	 */
 	event = (u_int)hint & NOTE_PCTRLMASK;
 
 	/*
 	 * if the user is interested in this event, record it.
 	 */
 	if (kn->kn_sfflags & event)
 		kn->kn_fflags |= event;
 
 	/*
 	 * process is gone, so flag the event as finished.
 	 */
 	if (event == NOTE_EXIT) {
 		kn->kn_status |= KN_DETACHED;
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT); 
 		return (1);
 	}
 
 	/*
 	 * process forked, and user wants to track the new process,
 	 * so attach a new knote to it, and immediately report an
 	 * event with the parent's pid.
 	 */
 	if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
 		struct kevent kev;
 		int error;
 
 		/*
 		 * register knote with new process.
 		 */
 		kev.ident = hint & NOTE_PDATAMASK;	/* pid */
 		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
 		kev.fflags = kn->kn_sfflags;
 		kev.data = kn->kn_id;			/* parent */
 		kev.udata = kn->kn_kevent.udata;	/* preserve udata */
 		error = kqueue_register(kn->kn_kq, &kev, NULL);
 		if (error)
 			kn->kn_fflags |= NOTE_TRACKERR;
 	}
 
 	return (kn->kn_fflags != 0);
 }
 
 static void
 filt_timerexpire(void *knx)
 {
 	struct knote *kn = knx;
 	struct callout *calloutp;
 	struct timeval tv;
 	int tticks;
 
 	kn->kn_data++;
 	KNOTE_ACTIVATE(kn);
 
 	if ((kn->kn_flags & EV_ONESHOT) == 0) {
 		tv.tv_sec = kn->kn_sdata / 1000;
 		tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
 		tticks = tvtohz(&tv);
 		calloutp = (struct callout *)kn->kn_hook;
 		callout_reset(calloutp, tticks, filt_timerexpire, kn);
 	}
 }
 
 /*
  * Attach routine for the timer filter.
  *
  * data (kn_sdata) contains the amount of time to sleep, in milliseconds.
  * A callout is allocated per knote and stored in kn_hook; the number of
  * such callouts is capped by kq_calloutmax.
  *
  * Returns 0 on success or ENOMEM when the callout limit is reached.
  */
 static int
 filt_timerattach(struct knote *kn)
 {
 	struct callout *calloutp;
 	struct timeval tv;
 	int tticks;
 
 	if (kq_ncallouts >= kq_calloutmax)
 		return (ENOMEM);
 	kq_ncallouts++;
 
 	tv.tv_sec = kn->kn_sdata / 1000;
 	tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
 	tticks = tvtohz(&tv);
 
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 	MALLOC(calloutp, struct callout *, sizeof(*calloutp),
 	    M_KQUEUE, M_WAITOK);
 	callout_init(calloutp, 0);
 	/*
 	 * Publish the callout in kn_hook before arming it:
 	 * filt_timerexpire() reads kn_hook to re-arm the timer, so the
 	 * pointer must be valid by the time the callout can fire.
 	 */
 	kn->kn_hook = calloutp;
 	callout_reset(calloutp, tticks, filt_timerexpire, kn);
 
 	return (0);
 }
 
 /*
  * Detach routine for the timer filter: stop the knote's callout, release
  * its memory and give the slot back to the kq_ncallouts budget.
  */
 static void
 filt_timerdetach(struct knote *kn)
 {
 	struct callout *timer;
 
 	timer = (struct callout *)kn->kn_hook;
 	callout_stop(timer);
 	FREE(timer, M_KQUEUE);
 	kq_ncallouts -= 1;
 }
 
 /*
  * Event routine for the timer filter: the knote is ready whenever its
  * expiry counter (kn_data) is non-zero.  hint is unused.
  */
 static int
 filt_timer(struct knote *kn, long hint)
 {
 
 	if (kn->kn_data == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * MPSAFE
  *
  * kqueue() system call: allocate a file descriptor backed by a new,
  * empty kqueue.  On success the descriptor number is returned in
  * td->td_retval[0]; otherwise the error from falloc() is returned.
  */
 int
 kqueue(struct thread *td, struct kqueue_args *uap)
 {
 	struct filedesc *fdp;
 	struct kqueue *kq;
 	struct file *fp;
 	int fd, error;
 
 	mtx_lock(&Giant);
 	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd);
 	if (error)
 		goto done2;
 	/*
 	 * Fully initialize the kqueue (pending list and owning descriptor
 	 * table) before it becomes reachable through fp->f_data.
 	 */
 	kq = malloc(sizeof(*kq), M_KQUEUE, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&kq->kq_head);
 	kq->kq_fdp = fdp;
 	FILE_LOCK(fp);
 	fp->f_flag = FREAD | FWRITE;
 	fp->f_type = DTYPE_KQUEUE;
 	fp->f_ops = &kqueueops;
 	fp->f_data = kq;
 	FILE_UNLOCK(fp);
 	FILEDESC_LOCK(fdp);
 	td->td_retval[0] = fd;
 	if (fdp->fd_knlistsize < 0)
 		fdp->fd_knlistsize = 0;		/* this process has a kq */
 	FILEDESC_UNLOCK(fdp);
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Argument layout for the kevent() system call.  Only compiled when
  * _SYS_SYSPROTO_H_ is not defined (presumably sys/sysproto.h supplies
  * the real declaration -- TODO confirm).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct kevent_args {
 	int	fd;
 	const struct kevent *changelist;
 	int	nchanges;
 	struct	kevent *eventlist;
 	int	nevents;
 	const struct timespec *timeout;
 };
 #endif
 /*
  * MPSAFE
  *
  * kevent() system call: apply the caller's change list to the kqueue
  * named by uap->fd, then collect pending events via kqueue_scan().
  *
  * Returns 0 or an errno value.  When one or more registrations failed
  * and the caller supplied room in eventlist, the failures are copied out
  * as EV_ERROR records, td->td_retval[0] is set to their count, and no
  * scan is performed.
  */
 int
 kevent(struct thread *td, struct kevent_args *uap)
 {
 	struct kevent *kevp;
 	struct kqueue *kq;
 	struct file *fp;
 	struct timespec ts;
 	int i, n, nerrors, error;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_KQUEUE) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	/* Pull the timeout into kernel memory and point uap at the copy. */
 	if (uap->timeout != NULL) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			goto done_nogiant;
 		uap->timeout = &ts;
 	}
 	mtx_lock(&Giant);
 
 	kq = (struct kqueue *)fp->f_data;
 	nerrors = 0;
 
 	/* Process the change list in batches of at most KQ_NEVENTS. */
 	while (uap->nchanges > 0) {
 		n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
 		error = copyin(uap->changelist, kq->kq_kev,
 		    n * sizeof(struct kevent));
 		if (error)
 			goto done;
 		for (i = 0; i < n; i++) {
 			kevp = &kq->kq_kev[i];
 			/* Userland may not set the system-reserved flags. */
 			kevp->flags &= ~EV_SYSFLAGS;
 			error = kqueue_register(kq, kevp, td);
 			if (error) {
 				/*
 				 * Report the failure in the event list if
 				 * there is room; otherwise fail the call.
 				 * The copyout result is deliberately
 				 * ignored.
 				 */
 				if (uap->nevents != 0) {
 					kevp->flags = EV_ERROR;
 					kevp->data = error;
 					(void) copyout(kevp,
 					    uap->eventlist,
 					    sizeof(*kevp));
 					uap->eventlist++;
 					uap->nevents--;
 					nerrors++;
 				} else {
 					goto done;
 				}
 			}
 		}
 		uap->nchanges -= n;
 		uap->changelist += n;
 	}
 	if (nerrors) {
         	td->td_retval[0] = nerrors;
 		error = 0;
 		goto done;
 	}
 
 	error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td);
 done:
 	mtx_unlock(&Giant);
 done_nogiant:
 	/* fp is always non-NULL here; fget() succeeded above. */
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Install filtops in the system filter table slot for filter number
  * filt.  Valid filter numbers are negative and no smaller than
  * -EVFILT_SYSCOUNT; the slot must currently hold the null_filtops
  * placeholder.  Any violation panics.  Always returns 0.
  *
  * filt == 0 is rejected as well: ~0 is -1, which would index one element
  * before the start of sysfilt_ops.
  */
 int
 kqueue_add_filteropts(int filt, struct filterops *filtops)
 {
 
 	if (filt >= 0)
 		panic("filt(%d) >= 0", filt);
 	if (filt + EVFILT_SYSCOUNT < 0)
 		panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0",
 		    filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT);
 	if (sysfilt_ops[~filt] != &null_filtops)
 		panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt);
 	sysfilt_ops[~filt] = filtops;
 	return (0);
 }
 
 /*
  * Remove the filter table registered for filter number filt, putting the
  * null_filtops placeholder back in its slot.  Valid filter numbers are
  * negative and no smaller than -EVFILT_SYSCOUNT; the slot must currently
  * hold something other than the placeholder.  Any violation panics.
  * Always returns 0.
  *
  * filt == 0 is rejected as well: ~0 is -1, which would index one element
  * before the start of sysfilt_ops.
  */
 int
 kqueue_del_filteropts(int filt)
 {
 
 	if (filt >= 0)
 		panic("filt(%d) >= 0", filt);
 	if (filt + EVFILT_SYSCOUNT < 0)
 		panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0",
 		    filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT);
 	/* Nothing is registered in this slot, so there is nothing to remove. */
 	if (sysfilt_ops[~filt] == &null_filtops)
 		panic("sysfilt_ops[~filt(%d)] == &null_filtops", filt);
 	sysfilt_ops[~filt] = &null_filtops;
 	return (0);
 }
 
 /*
  * Apply one change record (kev) to kqueue kq: add, modify, delete,
  * enable or disable the knote identified by (ident, filter).
  *
  * Returns 0 on success; EINVAL for an unknown filter, EBADF for a bad
  * descriptor, ENOENT when no matching knote exists and EV_ADD was not
  * given, ENOMEM if no knote could be allocated, or the error from the
  * filter's attach routine.
  *
  * NOTE(review): filt_proc() calls this with td == NULL; confirm that the
  * knote_drop()/fdrop() calls reachable from here tolerate that.
  */
 int
 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
 {
 	struct filedesc *fdp = kq->kq_fdp;
 	struct filterops *fops;
 	struct file *fp = NULL;
 	struct knote *kn = NULL;
 	int s, error = 0;
 
 	if (kev->filter < 0) {
 		if (kev->filter + EVFILT_SYSCOUNT < 0)
 			return (EINVAL);
 		fops = sysfilt_ops[~kev->filter];	/* to 0-base index */
 	} else {
 		/*
 		 * XXX
 		 * filter attach routine is responsible for insuring that
 		 * the identifier can be attached to it.
 		 */
 		printf("unknown filter: %d\n", kev->filter);
 		return (EINVAL);
 	}
 
 	/* Look for an existing knote for this (kq, ident, filter). */
 	FILEDESC_LOCK(fdp);
 	if (fops->f_isfd) {
 		/* validate descriptor */
 		if ((u_int)kev->ident >= fdp->fd_nfiles ||
 		    (fp = fdp->fd_ofiles[kev->ident]) == NULL) {
 			FILEDESC_UNLOCK(fdp);
 			return (EBADF);
 		}
 		/* Hold the file; released at done: unless a knote keeps it. */
 		fhold(fp);
 
 		/* Descriptor knotes live in a per-fd list indexed by ident. */
 		if (kev->ident < fdp->fd_knlistsize) {
 			SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
 				if (kq == kn->kn_kq &&
 				    kev->filter == kn->kn_filter)
 					break;
 		}
 	} else {
 		/* Non-descriptor knotes live in a hash keyed on ident. */
 		if (fdp->fd_knhashmask != 0) {
 			struct klist *list;
 			
 			list = &fdp->fd_knhash[
 			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
 			SLIST_FOREACH(kn, list, kn_link)
 				if (kev->ident == kn->kn_id &&
 				    kq == kn->kn_kq &&
 				    kev->filter == kn->kn_filter)
 					break;
 		}
 	}
 	FILEDESC_UNLOCK(fdp);
 
 	if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
 		error = ENOENT;
 		goto done;
 	}
 
 	/*
 	 * kn now contains the matching knote, or NULL if no match
 	 */
 	if (kev->flags & EV_ADD) {
 
 		if (kn == NULL) {
 			kn = knote_alloc();
 			if (kn == NULL) {
 				error = ENOMEM;
 				goto done;
 			}
 			kn->kn_fp = fp;
 			kn->kn_kq = kq;
 			kn->kn_fop = fops;
 
 			/*
 			 * apply reference count to knote structure, and
 			 * do not release it at the end of this routine.
 			 */
 			fp = NULL;
 
 			/*
 			 * Keep the user's fflags/data as the saved filter
 			 * parameters and start the reported values at zero.
 			 */
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			kev->fflags = 0;
 			kev->data = 0;
 			kn->kn_kevent = *kev;
 
 			knote_attach(kn, fdp);
 			if ((error = fops->f_attach(kn)) != 0) {
 				knote_drop(kn, td);
 				goto done;
 			}
 		} else {
 			/*
 			 * The user may change some filter values after the
 			 * initial EV_ADD, but doing so will not reset any 
 			 * filter which have already been triggered.
 			 */
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			kn->kn_kevent.udata = kev->udata;
 		}
 
 		/* Poll the filter once so an already-true event is queued. */
 		s = splhigh();
 		if (kn->kn_fop->f_event(kn, 0))
 			KNOTE_ACTIVATE(kn);
 		splx(s);
 
 	} else if (kev->flags & EV_DELETE) {
 		kn->kn_fop->f_detach(kn);
 		knote_drop(kn, td);
 		goto done;
 	}
 
 	if ((kev->flags & EV_DISABLE) &&
 	    ((kn->kn_status & KN_DISABLED) == 0)) {
 		s = splhigh();
 		kn->kn_status |= KN_DISABLED;
 		splx(s);
 	}
 
 	/* Re-enabling queues the knote if it became active while disabled. */
 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
 		s = splhigh();
 		kn->kn_status &= ~KN_DISABLED;
 		if ((kn->kn_status & KN_ACTIVE) &&
 		    ((kn->kn_status & KN_QUEUED) == 0))
 			knote_enqueue(kn);
 		splx(s);
 	}
 
 done:
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Collect up to maxevents pending events from the kqueue behind fp and
  * copy them out to ulistp, sleeping for events if none are pending.
  *
  * tsp == NULL requests a sleep with no deadline; a zero timespec
  * requests a non-blocking poll (timeout is set to -1, and the resulting
  * EWOULDBLOCK is turned into success); otherwise tsp is a relative
  * timeout, capped at 24 hours per sleep.
  *
  * Returns 0 or an errno value; the number of events delivered is stored
  * in td->td_retval[0].
  */
 static int
 kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp,
 	const struct timespec *tsp, struct thread *td)
 {
 	struct kqueue *kq;
 	struct kevent *kevp;
 	struct timeval atv, rtv, ttv;
 	struct knote *kn, marker;
 	int s, count, timeout, nkev = 0, error = 0;
 
 	FILE_LOCK_ASSERT(fp, MA_NOTOWNED);
 
 	kq = (struct kqueue *)fp->f_data;
 	count = maxevents;
 	if (count == 0)
 		goto done;
 
 	if (tsp != NULL) {
 		TIMESPEC_TO_TIMEVAL(&atv, tsp);
 		if (itimerfix(&atv)) {
 			error = EINVAL;
 			goto done;
 		}
 		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
 			timeout = -1;
 		else 
 			timeout = atv.tv_sec > 24 * 60 * 60 ?
 			    24 * 60 * 60 * hz : tvtohz(&atv);
 		/* Turn atv into an absolute uptime deadline. */
 		getmicrouptime(&rtv);
 		timevaladd(&atv, &rtv);
 	} else {
 		atv.tv_sec = 0;
 		atv.tv_usec = 0;
 		timeout = 0;
 	}
 	goto start;
 
 retry:
 	/* With a deadline set, stop once it passes, else recompute ticks. */
 	if (atv.tv_sec || atv.tv_usec) {
 		getmicrouptime(&rtv);
 		if (timevalcmp(&rtv, &atv, >=))
 			goto done;
 		ttv = atv;
 		timevalsub(&ttv, &rtv);
 		timeout = ttv.tv_sec > 24 * 60 * 60 ?
 			24 * 60 * 60 * hz : tvtohz(&ttv);
 	}
 
 start:
 	kevp = kq->kq_kev;
 	s = splhigh();
 	if (kq->kq_count == 0) {
 		if (timeout < 0) { 
 			error = EWOULDBLOCK;
 		} else {
 			kq->kq_state |= KQ_SLEEP;
 			error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
 		}
 		splx(s);
 		if (error == 0)
 			goto retry;
 		/* don't restart after signals... */
 		if (error == ERESTART)
 			error = EINTR;
 		else if (error == EWOULDBLOCK)
 			error = 0;
 		goto done;
 	}
 
 	/*
 	 * Append a marker so the loop below stops after the entries that
 	 * were queued when the scan began.
 	 */
 	TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); 
 	while (count) {
 		kn = TAILQ_FIRST(&kq->kq_head);
 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); 
 		if (kn == &marker) {
 			splx(s);
 			/* Nothing was delivered on this pass: wait again. */
 			if (count == maxevents)
 				goto retry;
 			goto done;
 		}
 		if (kn->kn_status & KN_DISABLED) {
 			kn->kn_status &= ~KN_QUEUED;
 			kq->kq_count--;
 			continue;
 		}
 		/* Re-check the filter; drop entries that are no longer true. */
 		if ((kn->kn_flags & EV_ONESHOT) == 0 &&
 		    kn->kn_fop->f_event(kn, 0) == 0) {
 			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
 			kq->kq_count--;
 			continue;
 		}
 		*kevp = kn->kn_kevent;
 		kevp++;
 		nkev++;
 		if (kn->kn_flags & EV_ONESHOT) {
 			/* One-shot knotes are destroyed once reported. */
 			kn->kn_status &= ~KN_QUEUED;
 			kq->kq_count--;
 			splx(s);
 			kn->kn_fop->f_detach(kn);
 			knote_drop(kn, td);
 			s = splhigh();
 		} else if (kn->kn_flags & EV_CLEAR) {
 			/* Clear-on-read knotes reset their state. */
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 			kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
 			kq->kq_count--;
 		} else {
 			/* Other knotes stay queued, behind the marker. */
 			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); 
 		}
 		count--;
 		/* Flush the staging buffer to userland when it fills. */
 		if (nkev == KQ_NEVENTS) {
 			splx(s);
 			error = copyout(&kq->kq_kev, ulistp,
 			    sizeof(struct kevent) * nkev);
 			ulistp += nkev;
 			nkev = 0;
 			kevp = kq->kq_kev;
 			s = splhigh();
 			if (error)
 				break;
 		}
 	}
 	TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); 
 	splx(s);
 done:
 	/* Copy out whatever is left in the staging buffer. */
 	if (nkev != 0)
 		error = copyout(&kq->kq_kev, ulistp,
 		    sizeof(struct kevent) * nkev);
         td->td_retval[0] = maxevents - count;
 	return (error);
 }
 
 /*
  * XXX
  * read file operation for kqueue descriptors.  Not supported: every call
  * fails with ENXIO.  This could be expanded to call kqueue_scan, if
  * desired.
  */
 /*ARGSUSED*/
 static int
 kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
 	int flags, struct thread *td)
 {
 	int error = ENXIO;
 
 	return (error);
 }
 
 /*
  * write file operation for kqueue descriptors.  Not supported: every
  * call fails with ENXIO.
  */
 /*ARGSUSED*/
 static int
 kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
 	 int flags, struct thread *td)
 {
 	int error = ENXIO;
 
 	return (error);
 }
 
 /*
  * ioctl file operation for kqueue descriptors.  No commands are
  * recognized: every call fails with ENOTTY.
  */
 /*ARGSUSED*/
 static int
 kqueue_ioctl(struct file *fp, u_long com, void *data, struct thread *td)
 {
 	int error = ENOTTY;
 
 	return (error);
 }
 
 /*
  * poll file operation for kqueue descriptors.  A kqueue is readable
  * (POLLIN/POLLRDNORM) when it has pending events; otherwise the thread is
  * recorded for select wakeup and KQ_SEL is set so that kqueue_wakeup()
  * calls selwakeup().  Returns the subset of events that are ready.
  */
 /*ARGSUSED*/
 static int
 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
-    struct thread *td)
+	struct thread *td)
 {
 	struct kqueue *kq;
 	int revents = 0;
 	int s = splnet();
 
 	kq = (struct kqueue *)fp->f_data;
         if (events & (POLLIN | POLLRDNORM)) {
                 if (kq->kq_count) {
                         revents |= events & (POLLIN | POLLRDNORM);
 		} else {
                         selrecord(td, &kq->kq_sel);
 			kq->kq_state |= KQ_SEL;
 		}
 	}
 	splx(s);
 	return (revents);
 }
 
 /*
  * stat file operation for kqueue descriptors.  Zeroes *st and reports
  * the number of pending events as st_size, the size of one kevent as
  * st_blksize, and S_IFIFO as the mode.  Always returns 0.
  */
 /*ARGSUSED*/
 static int
 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
-    struct thread *td)
+	struct thread *td)
 {
 	struct kqueue *kq;
 
 	kq = (struct kqueue *)fp->f_data;
 	bzero((void *)st, sizeof(*st));
 	st->st_size = kq->kq_count;
 	st->st_blksize = sizeof(struct kevent);
 	st->st_mode = S_IFIFO;
 	return (0);
 }
 
 /*
  * close file operation for kqueue descriptors.  Walks both knote tables
  * of the closing thread's descriptor table (the per-descriptor lists and
  * the hash), detaches and frees every knote that belongs to this kqueue,
  * then frees the kqueue itself.  Always returns 0.
  *
  * The filedesc lock is dropped around the calls that release a file
  * reference or free a knote, and retaken before the walk continues.
  *
  * NOTE(review): the tables are taken from td->td_proc->p_fd, whereas
  * knotes are filed under kq->kq_fdp by kqueue_register(); confirm these
  * are always the same table here.
  */
 /*ARGSUSED*/
 static int
 kqueue_close(struct file *fp, struct thread *td)
 {
 	struct kqueue *kq = (struct kqueue *)fp->f_data;
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct knote **knp, *kn, *kn0;
 	int i;
 
 	FILEDESC_LOCK(fdp);
 	/* Pass 1: descriptor-based knotes, one list per descriptor. */
 	for (i = 0; i < fdp->fd_knlistsize; i++) {
 		/* knp tracks the link that points at kn, for unlinking. */
 		knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
 		kn = *knp;
 		while (kn != NULL) {
 			kn0 = SLIST_NEXT(kn, kn_link);
 			if (kq == kn->kn_kq) {
 				kn->kn_fop->f_detach(kn);
 				*knp = kn0;
 				/* Drop the file reference the knote held. */
 				FILE_LOCK(kn->kn_fp);
 				FILEDESC_UNLOCK(fdp);
 				fdrop_locked(kn->kn_fp, td);
 				knote_free(kn);
 				FILEDESC_LOCK(fdp);
 			} else {
 				knp = &SLIST_NEXT(kn, kn_link);
 			}
 			kn = kn0;
 		}
 	}
 	/* Pass 2: non-descriptor knotes kept in the hash table. */
 	if (fdp->fd_knhashmask != 0) {
 		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
 			knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
 			kn = *knp;
 			while (kn != NULL) {
 				kn0 = SLIST_NEXT(kn, kn_link);
 				if (kq == kn->kn_kq) {
 					kn->kn_fop->f_detach(kn);
 					*knp = kn0;
 		/* XXX non-fd release of kn->kn_ptr */
 					FILEDESC_UNLOCK(fdp);
 					knote_free(kn);
 					FILEDESC_LOCK(fdp);
 				} else {
 					knp = &SLIST_NEXT(kn, kn_link);
 				}
 				kn = kn0;
 			}
 		}
 	}
 	FILEDESC_UNLOCK(fdp);
 	free(kq, M_KQUEUE);
 	fp->f_data = NULL;
 
 	return (0);
 }
 
 /*
  * Notify everyone waiting on a kqueue: the sleeper in kqueue_scan()
  * (KQ_SLEEP), pollers recorded by kqueue_poll() (KQ_SEL), and any knotes
  * that other kqueues have attached to this one.
  */
 static void
 kqueue_wakeup(struct kqueue *kq)
 {
 
 	if ((kq->kq_state & KQ_SLEEP) != 0) {
 		kq->kq_state &= ~KQ_SLEEP;
 		wakeup(kq);
 	}
 
 	if ((kq->kq_state & KQ_SEL) != 0) {
 		kq->kq_state &= ~KQ_SEL;
 		selwakeup(&kq->kq_sel);
 	}
 
 	KNOTE(&kq->kq_sel.si_note, 0);
 }
 
 /*
  * walk down a list of knotes, activating them if their event has triggered.
  */
 void
 knote(struct klist *list, long hint)
 {
 	struct knote *kn;
 
 	SLIST_FOREACH(kn, list, kn_selnext)
 		if (kn->kn_fop->f_event(kn, hint))
 			KNOTE_ACTIVATE(kn);
 }
 
 /*
  * Empty a klist: repeatedly take the first knote, detach its filter and
  * drop it, until the list has no entries left.
  */
 void
 knote_remove(struct thread *td, struct klist *list)
 {
 	struct knote *head;
 
 	for (;;) {
 		head = SLIST_FIRST(list);
 		if (head == NULL)
 			break;
 		head->kn_fop->f_detach(head);
 		knote_drop(head, td);
 	}
 }
 
 /*
  * Remove every knote that references descriptor fd in the calling
  * thread's descriptor table.  The list head is looked up under the
  * filedesc lock; the removal itself runs after the lock is released.
  */
 void
 knote_fdclose(struct thread *td, int fd)
 {
 	struct filedesc *fdp;
 	struct klist *fdnotes;
 
 	fdp = td->td_proc->p_fd;
 
 	FILEDESC_LOCK(fdp);
 	fdnotes = &fdp->fd_knlist[fd];
 	FILEDESC_UNLOCK(fdp);
 
 	knote_remove(td, fdnotes);
 }
 
 static void
 knote_attach(struct knote *kn, struct filedesc *fdp)
 {
 	struct klist *list, *oldlist;
 	int size, newsize;
 
 	FILEDESC_LOCK(fdp);
 
 	if (! kn->kn_fop->f_isfd) {
 		if (fdp->fd_knhashmask == 0)
 			fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
 			    &fdp->fd_knhashmask);
 		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
 		goto done;
 	}
 
 	if (fdp->fd_knlistsize <= kn->kn_id) {
 retry:
 		size = fdp->fd_knlistsize;
 		while (size <= kn->kn_id)
 			size += KQEXTENT;
 		FILEDESC_UNLOCK(fdp);
 		MALLOC(list, struct klist *,
 		    size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
 		FILEDESC_LOCK(fdp);
 		newsize = fdp->fd_knlistsize;
 		while (newsize <= kn->kn_id)
 			newsize += KQEXTENT;
 		if (newsize != size) {
 			FILEDESC_UNLOCK(fdp);
 			free(list, M_TEMP);
 			FILEDESC_LOCK(fdp);
 			goto retry;
 		}
 		bcopy(fdp->fd_knlist, list,
 		    fdp->fd_knlistsize * sizeof(struct klist *));
 		bzero((caddr_t)list +
 		    fdp->fd_knlistsize * sizeof(struct klist *),
 		    (size - fdp->fd_knlistsize) * sizeof(struct klist *));
 		if (fdp->fd_knlist != NULL)
 			oldlist = fdp->fd_knlist;
 		else
 			oldlist = NULL;
 		fdp->fd_knlistsize = size;
 		fdp->fd_knlist = list;
 		FILEDESC_UNLOCK(fdp);
 		if (oldlist != NULL)
 			FREE(oldlist, M_KQUEUE);
 		FILEDESC_LOCK(fdp);
 	}
 	list = &fdp->fd_knlist[kn->kn_id];
 done:
 	FILEDESC_UNLOCK(fdp);
 	SLIST_INSERT_HEAD(list, kn, kn_link);
 	kn->kn_status = 0;
 }
 
 /*
  * Unlink a knote from its descriptor table, take it off its kqueue's
  * pending list if queued, release the file reference held by a
  * descriptor-based knote, and free it.  The caller has already run the
  * filter's detach routine.
  *
  * should be called at spl == 0, since we don't want to hold spl
  * while calling fdrop and free.
  *
  * The descriptor table is taken from the knote's kqueue (kq_fdp), which
  * is the table kqueue_register() passed to knote_attach() when the knote
  * was filed.  It is not taken from td: filt_proc() reaches this routine
  * through kqueue_register() with td == NULL, and td's process need not
  * be the one that owns the kqueue.
  */
 static void
 knote_drop(struct knote *kn, struct thread *td)
 {
 	struct filedesc *fdp = kn->kn_kq->kq_fdp;
 	struct klist *list;
 
 	FILEDESC_LOCK(fdp);
 	if (kn->kn_fop->f_isfd)
 		list = &fdp->fd_knlist[kn->kn_id];
 	else
 		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
 	/* Take the file lock now; fdrop_locked() below expects it held. */
 	if (kn->kn_fop->f_isfd)
 		FILE_LOCK(kn->kn_fp);
 	FILEDESC_UNLOCK(fdp);
 
 	SLIST_REMOVE(list, kn, knote, kn_link);
 	if (kn->kn_status & KN_QUEUED)
 		knote_dequeue(kn);
 	if (kn->kn_fop->f_isfd)
 		fdrop_locked(kn->kn_fp, td);
 	knote_free(kn);
 }
 
 
 /*
  * Append a knote to its kqueue's pending list, mark it KN_QUEUED, bump
  * the pending count, and wake up anyone waiting on the kqueue.  The
  * knote must not already be queued.
  */
 static void
 knote_enqueue(struct knote *kn)
 {
 	struct kqueue *owner;
 	int ipl;
 
 	owner = kn->kn_kq;
 	ipl = splhigh();
 
 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
 
 	TAILQ_INSERT_TAIL(&owner->kq_head, kn, kn_tqe);
 	kn->kn_status |= KN_QUEUED;
 	owner->kq_count++;
 	splx(ipl);
 
 	kqueue_wakeup(owner);
 }
 
 /*
  * Take a knote off its kqueue's pending list, clear KN_QUEUED, and drop
  * the pending count.  The knote must currently be queued.
  */
 static void
 knote_dequeue(struct knote *kn)
 {
 	struct kqueue *owner;
 	int ipl;
 
 	owner = kn->kn_kq;
 	ipl = splhigh();
 
 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
 
 	TAILQ_REMOVE(&owner->kq_head, kn, kn_tqe);
 	kn->kn_status &= ~KN_QUEUED;
 	owner->kq_count--;
 	splx(ipl);
 }
 
 static void
 knote_init(void)
 {
 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 
 }
 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
 
 /*
  * Allocate a knote from knote_zone with M_WAITOK.  The contents are not
  * initialized here.
  */
 static struct knote *
 knote_alloc(void)
 {
 	struct knote *fresh;
 
 	fresh = (struct knote *)uma_zalloc(knote_zone, M_WAITOK);
 	return (fresh);
 }
 
 /*
  * Return a knote to knote_zone.
  */
 static void
 knote_free(struct knote *kn)
 {
 
 	uma_zfree(knote_zone, kn);
 }
Index: head/sys/kern/sys_pipe.c
===================================================================
--- head/sys/kern/sys_pipe.c	(revision 101986)
+++ head/sys/kern/sys_pipe.c	(revision 101987)
@@ -1,1504 +1,1504 @@
 /*
  * Copyright (c) 1996 John S. Dyson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice immediately at the beginning of the file, without modification,
  *    this list of conditions, and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Absolutely no warranty of function or purpose is made by the author
  *    John S. Dyson.
  * 4. Modifications may be freely made to this file if the above conditions
  *    are met.
  *
  * $FreeBSD$
  */
 
 /*
  * This file contains a high-performance replacement for the socket-based
  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
  * all features of sockets, but does do everything that pipes normally
  * do.
  */
 
 /*
  * This code has two modes of operation, a small write mode and a large
  * write mode.  The small write mode acts like conventional pipes with
  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
  * the receiving process can copy it directly from the pages in the sending
  * process.
  *
  * If the sending process receives a signal, it is possible that it will
  * go away, and certainly its address space can change, because control
  * is returned back to the user-mode side.  In that case, the pipe code
  * arranges to copy the buffer supplied by the user process, to a pageable
  * kernel buffer, and the receiving process will grab the data from the
  * pageable kernel buffer.  Since signals don't happen all that often,
  * the copy operation is normally eliminated.
  *
  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
  * happen for small transfers so that the system will not spend all of
  * its time context switching.  PIPE_SIZE is constrained by the
  * amount of kernel virtual memory.
  */
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/mutex.h>
 #include <sys/ttycom.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
 #include <sys/sysproto.h>
 #include <sys/pipe.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/uio.h>
 #include <sys/event.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 /*
  * Use this define if you want to disable *fancy* VM things.  Expect an
  * approx 30% decrease in transfer rate.  This could be useful for
  * NetBSD or OpenBSD.
  */
 /* #define PIPE_NODIRECT */
 
 /*
  * interfaces to the outside world
  */
 static int pipe_read(struct file *fp, struct uio *uio, 
 		struct ucred *active_cred, int flags, struct thread *td);
 static int pipe_write(struct file *fp, struct uio *uio, 
 		struct ucred *active_cred, int flags, struct thread *td);
 static int pipe_close(struct file *fp, struct thread *td);
 static int pipe_poll(struct file *fp, int events, struct ucred *active_cred,
 		struct thread *td);
 static int pipe_kqfilter(struct file *fp, struct knote *kn);
 static int pipe_stat(struct file *fp, struct stat *sb,
-		struct ucred *active_cred,  struct thread *td);
+		struct ucred *active_cred, struct thread *td);
 static int pipe_ioctl(struct file *fp, u_long cmd, void *data,
 		struct thread *td);
 
 /*
  * File operations vector installed in f_ops for both ends of a pipe.
  */
 static struct fileops pipeops = {
 	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
 	pipe_stat, pipe_close
 };
 
 static void	filt_pipedetach(struct knote *kn);
 static int	filt_piperead(struct knote *kn, long hint);
 static int	filt_pipewrite(struct knote *kn, long hint);
 
 /*
  * kqueue filter tables for the read and write directions; they share
  * the detach routine and differ only in the event routine.
  * NOTE(review): the leading 1 is presumably the f_isfd flag -- confirm
  * against struct filterops.
  */
 static struct filterops pipe_rfiltops =
 	{ 1, NULL, filt_pipedetach, filt_piperead };
 static struct filterops pipe_wfiltops =
 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
 
 /*
  * PIPE_GET_GIANT: trade the pipe mutex for Giant.  The caller must hold
  * the pipe I/O lock (PIPE_LOCKFL), which is asserted; the pipe mutex is
  * released before Giant is acquired.
  */
 #define PIPE_GET_GIANT(pipe)						\
 	do {								\
 		KASSERT(((pipe)->pipe_state & PIPE_LOCKFL) != 0,	\
 		    ("%s:%d PIPE_GET_GIANT: line pipe not locked",	\
 		     __FILE__, __LINE__));				\
 		PIPE_UNLOCK(pipe);					\
 		mtx_lock(&Giant);					\
 	} while (0)
 
 /*
  * PIPE_DROP_GIANT: the reverse of PIPE_GET_GIANT -- release Giant, then
  * re-take the pipe mutex.
  */
 #define PIPE_DROP_GIANT(pipe)						\
 	do {								\
 		mtx_unlock(&Giant);					\
 		PIPE_LOCK(pipe);					\
 	} while (0)
 
 /*
  * Default pipe buffer size(s), this can be kind-of large now because pipe
  * space is pageable.  The pipe code will try to maintain locality of
  * reference for performance reasons, so small amounts of outstanding I/O
  * will not wipe the cache.
  */
 #define MINPIPESIZE (PIPE_SIZE/3)
 #define MAXPIPESIZE (2*PIPE_SIZE/3)
 
 /*
  * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
  * is there so that on large systems, we don't exhaust it.
  */
 #define MAXPIPEKVA (8*1024*1024)
 
 /*
  * Limit on the kva used for direct transfers; we cannot, of course,
  * limit the amount of kva used by pipes in general.
  */
 #define LIMITPIPEKVA (16*1024*1024)
 
 /*
  * Limit the number of "big" pipes
  */
 #define LIMITBIGPIPES	32
 static int nbigpipe;
 
 static int amountpipekva;
 
 static void pipeinit(void *dummy __unused);
 static void pipeclose(struct pipe *cpipe);
 static void pipe_free_kmem(struct pipe *cpipe);
 static int pipe_create(struct pipe **cpipep);
 static __inline int pipelock(struct pipe *cpipe, int catch);
 static __inline void pipeunlock(struct pipe *cpipe);
 static __inline void pipeselwakeup(struct pipe *cpipe);
 #ifndef PIPE_NODIRECT
 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
 static void pipe_destroy_write_buffer(struct pipe *wpipe);
 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
 static void pipe_clone_write_buffer(struct pipe *wpipe);
 #endif
 static int pipespace(struct pipe *cpipe, int size);
 
 static uma_zone_t pipe_zone;
 
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
 
 /*
  * SYSINIT hook: create the UMA zone ("PIPE") that struct pipe objects
  * are allocated from.  The argument is unused.
  */
 static void
 pipeinit(void *dummy __unused)
 {
 	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 
 /*
  * The pipe system call for the DTYPE_PIPE type of pipes
  */
 
 /* ARGSUSED */
 int
 pipe(td, uap)
 	struct thread *td;
 	struct pipe_args /* {
 		int	dummy;
 	} */ *uap;
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *rf, *wf;
 	struct pipe *rpipe, *wpipe;
 	struct mtx *pmtx;
 	int fd, error;
 	
 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
 
 	pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO);
 	
 	rpipe = wpipe = NULL;
 	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
 		pipeclose(rpipe); 
 		pipeclose(wpipe); 
 		free(pmtx, M_TEMP);
 		return (ENFILE);
 	}
 	
 	rpipe->pipe_state |= PIPE_DIRECTOK;
 	wpipe->pipe_state |= PIPE_DIRECTOK;
 
 	error = falloc(td, &rf, &fd);
 	if (error) {
 		pipeclose(rpipe);
 		pipeclose(wpipe);
 		free(pmtx, M_TEMP);
 		return (error);
 	}
 	fhold(rf);
 	td->td_retval[0] = fd;
 
 	/*
 	 * Warning: once we've gotten past allocation of the fd for the
 	 * read-side, we can only drop the read side via fdrop() in order
 	 * to avoid races against processes which manage to dup() the read
 	 * side while we are blocked trying to allocate the write side.
 	 */
 	FILE_LOCK(rf);
 	rf->f_flag = FREAD | FWRITE;
 	rf->f_type = DTYPE_PIPE;
 	rf->f_data = rpipe;
 	rf->f_ops = &pipeops;
 	FILE_UNLOCK(rf);
 	error = falloc(td, &wf, &fd);
 	if (error) {
 		FILEDESC_LOCK(fdp);
 		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
 			fdp->fd_ofiles[td->td_retval[0]] = NULL;
 			FILEDESC_UNLOCK(fdp);
 			fdrop(rf, td);
 		} else
 			FILEDESC_UNLOCK(fdp);
 		fdrop(rf, td);
 		/* rpipe has been closed by fdrop(). */
 		pipeclose(wpipe);
 		free(pmtx, M_TEMP);
 		return (error);
 	}
 	FILE_LOCK(wf);
 	wf->f_flag = FREAD | FWRITE;
 	wf->f_type = DTYPE_PIPE;
 	wf->f_data = wpipe;
 	wf->f_ops = &pipeops;
 	FILE_UNLOCK(wf);
 	td->td_retval[1] = fd;
 	rpipe->pipe_peer = wpipe;
 	wpipe->pipe_peer = rpipe;
 #ifdef MAC
 	/*
 	 * struct pipe represents a pipe endpoint.  The MAC label is shared
 	 * between the connected endpoints.  As a result mac_init_pipe() and
 	 * mac_create_pipe() should only be called on one of the endpoints
 	 * after they have been connected.
 	 */
 	mac_init_pipe(rpipe);
 	mac_create_pipe(td->td_ucred, rpipe);
 #endif
 	mtx_init(pmtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
 	rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
 	fdrop(rf, td);
 
 	return (0);
 }
 
 /*
  * Allocate kva for pipe circular buffer, the space is pageable
  * This routine will 'realloc' the size of a pipe safely, if it fails
  * it will retain the old buffer.
  * If it fails it will return ENOMEM.
  */
 static int
 pipespace(cpipe, size)
 	struct pipe *cpipe;
 	int size;
 {
 	struct vm_object *object;
 	caddr_t buffer;
 	int npages, error;
 
 	GIANT_REQUIRED;
 	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
 	       ("pipespace: pipe mutex locked"));
 
 	npages = round_page(size)/PAGE_SIZE;
 	/*
 	 * Create an object, I don't like the idea of paging to/from
 	 * kernel_object.
 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
 	 */
 	object = vm_object_allocate(OBJT_DEFAULT, npages);
 	buffer = (caddr_t) vm_map_min(kernel_map);
 
 	/*
 	 * Insert the object into the kernel map, and allocate kva for it.
 	 * The map entry is, by default, pageable.
 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
 	 */
 	error = vm_map_find(kernel_map, object, 0,
 		(vm_offset_t *) &buffer, size, 1,
 		VM_PROT_ALL, VM_PROT_ALL, 0);
 
 	if (error != KERN_SUCCESS) {
 		vm_object_deallocate(object);
 		return (ENOMEM);
 	}
 
 	/* free old resources if we're resizing */
 	pipe_free_kmem(cpipe);
 	cpipe->pipe_buffer.object = object;
 	cpipe->pipe_buffer.buffer = buffer;
 	cpipe->pipe_buffer.size = size;
 	cpipe->pipe_buffer.in = 0;
 	cpipe->pipe_buffer.out = 0;
 	cpipe->pipe_buffer.cnt = 0;
 	amountpipekva += cpipe->pipe_buffer.size;
 	return (0);
 }
 
 /*
  * Initialize and allocate VM and memory for a pipe.
  *
  * Takes one struct pipe from pipe_zone, puts every field that later code
  * inspects into a known state, gives it a PIPE_SIZE buffer and stamps
  * its times.
  *
  * *cpipep is set to the new pipe as soon as it is allocated and is left
  * set when pipespace() fails, so the caller is expected to dispose of it
  * (pipe() does so with pipeclose()).
  *
  * Returns 0 on success, ENOMEM if the zone allocation yields NULL, or
  * the error returned by pipespace().
  */
 static int
 pipe_create(cpipep)
 	struct pipe **cpipep;
 {
 	struct pipe *cpipe;
 	int error;
 
 	*cpipep = uma_zalloc(pipe_zone, M_WAITOK);
 	if (*cpipep == NULL)
 		return (ENOMEM);
 
 	cpipe = *cpipep;
 	
 	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
 	cpipe->pipe_buffer.object = NULL;
 
 	/*
 	 * protect so pipeclose() doesn't follow a junk pointer
 	 * if pipespace() fails.
 	 */
 	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
 	cpipe->pipe_state = 0;
 	cpipe->pipe_peer = NULL;
 	cpipe->pipe_busy = 0;
 
 	/*
 	 * The zone allocation above does not ask for zeroed memory, and
 	 * pipeselwakeup() and pipe_close() both look at pipe_sigio, so
 	 * start out explicitly with no SIGIO owner.
 	 */
 	cpipe->pipe_sigio = NULL;
 
 #ifndef PIPE_NODIRECT
 	/*
 	 * pipe data structure initializations to support direct pipe I/O.
 	 * kva is an address held as an integer, so it is cleared with 0;
 	 * this must happen before pipespace() is called below.
 	 */
 	cpipe->pipe_map.cnt = 0;
 	cpipe->pipe_map.kva = 0;
 	cpipe->pipe_map.pos = 0;
 	cpipe->pipe_map.npages = 0;
 	/* cpipe->pipe_map.ms[] = invalid */
 #endif
 
 	cpipe->pipe_mtxp = NULL;	/* avoid pipespace assertion */
 	error = pipespace(cpipe, PIPE_SIZE);
 	if (error)
 		return (error);
 
 	vfs_timestamp(&cpipe->pipe_ctime);
 	cpipe->pipe_atime = cpipe->pipe_ctime;
 	cpipe->pipe_mtime = cpipe->pipe_ctime;
 
 	return (0);
 }
 
 
 /*
  * lock a pipe for I/O, blocking other access
  */
 static __inline int
 pipelock(cpipe, catch)
 	struct pipe *cpipe;
 	int catch;
 {
 	int error;
 
 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
 	while (cpipe->pipe_state & PIPE_LOCKFL) {
 		cpipe->pipe_state |= PIPE_LWANT;
 		error = msleep(cpipe, PIPE_MTX(cpipe),
 		    catch ? (PRIBIO | PCATCH) : PRIBIO,
 		    "pipelk", 0);
 		if (error != 0) 
 			return (error);
 	}
 	cpipe->pipe_state |= PIPE_LOCKFL;
 	return (0);
 }
 
 /*
  * Release the pipe I/O lock taken by pipelock() and wake up anybody
  * who registered interest through PIPE_LWANT.  Called with the pipe
  * mutex held.
  */
 static __inline void
 pipeunlock(struct pipe *cpipe)
 {
 
 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
 
 	cpipe->pipe_state &= ~PIPE_LOCKFL;
 	if ((cpipe->pipe_state & PIPE_LWANT) == 0)
 		return;
 
 	cpipe->pipe_state &= ~PIPE_LWANT;
 	wakeup(cpipe);
 }
 
 /*
  * Notify everybody waiting for activity on this pipe endpoint:
  * select/poll waiters (when PIPE_SEL is set), the SIGIO owner (when
  * PIPE_ASYNC is set and an owner is registered), and attached knotes.
  */
 static __inline void
 pipeselwakeup(struct pipe *cpipe)
 {
 
 	if ((cpipe->pipe_state & PIPE_SEL) != 0) {
 		cpipe->pipe_state &= ~PIPE_SEL;
 		selwakeup(&cpipe->pipe_sel);
 	}
 
 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) {
 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
 	}
 
 	KNOTE(&cpipe->pipe_sel.si_note, 0);
 }
 
 /*
  * Read entry point of pipeops.
  *
  * Takes the pipe mutex, marks the pipe busy and acquires the pipe I/O
  * lock, then loops while the caller still wants data.  Data is taken
  * from the circular kernel buffer when it holds any; otherwise (unless
  * PIPE_NODIRECT) from a pending direct-write mapping.  When neither has
  * data the loop ends on EOF or once something has been read; failing
  * that, a FNONBLOCK descriptor gets EAGAIN and anything else sleeps
  * until woken and then retries.
  *
  * On the way out pipe_busy is dropped and blocked writers, closers and
  * select/poll waiters are woken as appropriate.
  *
  * Returns 0 on success (EOF included) or an errno value.
  */
 /* ARGSUSED */
 static int
 pipe_read(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	struct thread *td;
 	int flags;
 {
 	struct pipe *rpipe = (struct pipe *) fp->f_data;
 	int error;
 	int nread = 0;
 	u_int size;
 
 	PIPE_LOCK(rpipe);
 	++rpipe->pipe_busy;
 	error = pipelock(rpipe, 1);
 	if (error)
 		goto unlocked_error;
 
 #ifdef MAC
 	error = mac_check_pipe_op(active_cred, rpipe, MAC_OP_PIPE_READ);
 	if (error)
 		goto locked_error;
 #endif
 
 	while (uio->uio_resid) {
 		/*
 		 * normal pipe buffer receive
 		 */
 		if (rpipe->pipe_buffer.cnt > 0) {
 			/*
 			 * Copy at most up to the end of the buffer, the
 			 * amount of data present, and what the caller
 			 * still wants.
 			 */
 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
 			if (size > rpipe->pipe_buffer.cnt)
 				size = rpipe->pipe_buffer.cnt;
 			if (size > (u_int) uio->uio_resid)
 				size = (u_int) uio->uio_resid;
 
 			/*
 			 * The mutex is dropped around the copy; the I/O
 			 * lock (PIPE_LOCKFL) stays held throughout.
 			 */
 			PIPE_UNLOCK(rpipe);
 			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
 					size, uio);
 			PIPE_LOCK(rpipe);
 			if (error)
 				break;
 
 			rpipe->pipe_buffer.out += size;
 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
 				rpipe->pipe_buffer.out = 0;
 
 			rpipe->pipe_buffer.cnt -= size;
 
 			/*
 			 * If there is no more to read in the pipe, reset
 			 * its pointers to the beginning.  This improves
 			 * cache hit stats.
 			 */
 			if (rpipe->pipe_buffer.cnt == 0) {
 				rpipe->pipe_buffer.in = 0;
 				rpipe->pipe_buffer.out = 0;
 			}
 			nread += size;
 #ifndef PIPE_NODIRECT
 		/*
 		 * Direct copy, bypassing a kernel buffer.
 		 */
 		} else if ((size = rpipe->pipe_map.cnt) &&
 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
 			caddr_t	va;
 			if (size > (u_int) uio->uio_resid)
 				size = (u_int) uio->uio_resid;
 
 			va = (caddr_t) rpipe->pipe_map.kva +
 			    rpipe->pipe_map.pos;
 			PIPE_UNLOCK(rpipe);
 			error = uiomove(va, size, uio);
 			PIPE_LOCK(rpipe);
 			if (error)
 				break;
 			nread += size;
 			rpipe->pipe_map.pos += size;
 			rpipe->pipe_map.cnt -= size;
 			/*
 			 * Mapping fully consumed: end the direct write and
 			 * wake whoever sleeps on the pipe.
 			 */
 			if (rpipe->pipe_map.cnt == 0) {
 				rpipe->pipe_state &= ~PIPE_DIRECTW;
 				wakeup(rpipe);
 			}
 #endif
 		} else {
 			/*
 			 * detect EOF condition
 			 * read returns 0 on EOF, no need to set error
 			 */
 			if (rpipe->pipe_state & PIPE_EOF)
 				break;
 
 			/*
 			 * If the "write-side" has been blocked, wake it up now.
 			 */
 			if (rpipe->pipe_state & PIPE_WANTW) {
 				rpipe->pipe_state &= ~PIPE_WANTW;
 				wakeup(rpipe);
 			}
 
 			/*
 			 * Break if some data was read.
 			 */
 			if (nread > 0)
 				break;
 
 			/*
 			 * Unlock the pipe buffer for our remaining processing.  We
 			 * will either break out with an error or we will sleep and
 			 * relock to loop.
 			 */
 			pipeunlock(rpipe);
 
 			/*
 			 * Handle non-blocking mode operation or
 			 * wait for more data.
 			 */
 			if (fp->f_flag & FNONBLOCK) {
 				error = EAGAIN;
 			} else {
 				rpipe->pipe_state |= PIPE_WANTR;
 				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
 				    PRIBIO | PCATCH,
 				    "piperd", 0)) == 0)
 					error = pipelock(rpipe, 1);
 			}
 			if (error)
 				goto unlocked_error;
 		}
 	}
 #ifdef MAC
 locked_error:
 #endif
 	pipeunlock(rpipe);
 
 	/* XXX: should probably do this before getting any locks. */
 	if (error == 0)
 		vfs_timestamp(&rpipe->pipe_atime);
 unlocked_error:
 	/* From here on the mutex is held but the I/O lock is not. */
 	--rpipe->pipe_busy;
 
 	/*
 	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
 	 */
 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
 		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
 		wakeup(rpipe);
 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
 		/*
 		 * Handle write blocking hysteresis.
 		 */
 		if (rpipe->pipe_state & PIPE_WANTW) {
 			rpipe->pipe_state &= ~PIPE_WANTW;
 			wakeup(rpipe);
 		}
 	}
 
 	/* At least PIPE_BUF bytes of room: tell select/poll/kqueue waiters. */
 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
 		pipeselwakeup(rpipe);
 
 	PIPE_UNLOCK(rpipe);
 	return (error);
 }
 
 #ifndef PIPE_NODIRECT
 /*
  * Map the sending processes' buffer into kernel space and wire it.
  * This is similar to a physical write operation.
  *
  * Only the first iovec of the uio is handled, and at most
  * pipe_buffer.size bytes of it.  Requires Giant; the pipe mutex must
  * not be held.  On success the pages are recorded in pipe_map.ms[],
  * mapped at pipe_map.kva, and the uio is advanced past the bytes taken.
  * Returns 0, or EFAULT if a page cannot be faulted in or resolved, in
  * which case the pages wired so far are unwired again.
  */
 static int
 pipe_build_write_buffer(wpipe, uio)
 	struct pipe *wpipe;
 	struct uio *uio;
 {
 	u_int size;
 	int i;
 	vm_offset_t addr, endaddr, paddr;
 
 	GIANT_REQUIRED;
 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
 
 	size = (u_int) uio->uio_iov->iov_len;
 	if (size > wpipe->pipe_buffer.size)
 		size = wpipe->pipe_buffer.size;
 
 	/* Walk every page touched by [iov_base, iov_base + size). */
 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
 		vm_page_t m;
 
 		/*
 		 * vm_fault_quick() can sleep.  Consequently,
 		 * vm_page_lock_queue() and vm_page_unlock_queue()
 		 * should not be performed outside of this loop.
 		 */
 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
 		    (paddr = pmap_extract(vmspace_pmap(curproc->p_vmspace),
 		     addr)) == 0) {
 			int j;
 
 			/* Undo the wiring of the pages collected so far. */
 			vm_page_lock_queues();
 			for (j = 0; j < i; j++)
 				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
 			vm_page_unlock_queues();
 			return (EFAULT);
 		}
 
 		m = PHYS_TO_VM_PAGE(paddr);
 		vm_page_lock_queues();
 		vm_page_wire(m);
 		vm_page_unlock_queues();
 		wpipe->pipe_map.ms[i] = m;
 	}
 
 /*
  * set up the control block
  */
 	wpipe->pipe_map.npages = i;
 	/* pos is the offset of the data within the first mapped page */
 	wpipe->pipe_map.pos =
 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
 	wpipe->pipe_map.cnt = size;
 
 /*
  * and map the buffer
  */
 	if (wpipe->pipe_map.kva == 0) {
 		/*
 		 * We need to allocate space for an extra page because the
 		 * address range might (will) span pages at times.
 		 */
 		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
 			wpipe->pipe_buffer.size + PAGE_SIZE);
 		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
 	}
 	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
 		wpipe->pipe_map.npages);
 
 /*
  * and update the uio data
  */
 
 	uio->uio_iov->iov_len -= size;
 	uio->uio_iov->iov_base += size;
 	if (uio->uio_iov->iov_len == 0)
 		uio->uio_iov++;
 	uio->uio_resid -= size;
 	uio->uio_offset += size;
 	return (0);
 }
 
 /*
  * Unmap and unwire the process buffer set up by
  * pipe_build_write_buffer().  The kva window itself is kept for reuse
  * unless the total pipe kva in use exceeds MAXPIPEKVA, in which case
  * it is given back.  Requires Giant; the pipe mutex must not be held.
  */
 static void
 pipe_destroy_write_buffer(struct pipe *wpipe)
 {
 	vm_offset_t kva;
 	int pg;
 
 	GIANT_REQUIRED;
 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
 
 	if (wpipe->pipe_map.kva) {
 		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
 
 		if (amountpipekva > MAXPIPEKVA) {
 			kva = wpipe->pipe_map.kva;
 			wpipe->pipe_map.kva = 0;
 			kmem_free(kernel_map, kva,
 			    wpipe->pipe_buffer.size + PAGE_SIZE);
 			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
 		}
 	}
 
 	vm_page_lock_queues();
 	for (pg = 0; pg < wpipe->pipe_map.npages; pg++) {
 		vm_page_unwire(wpipe->pipe_map.ms[pg], 1);
 	}
 	vm_page_unlock_queues();
 
 	wpipe->pipe_map.npages = 0;
 }
 
 /*
  * In the case of a signal, the writing process might go away.  This
  * code copies the not-yet-consumed part of a direct write into the
  * circular buffer so that the source pages can be released without
  * loss of data, and ends the direct write.
  *
  * Called with the pipe mutex held; the mutex is exchanged for Giant
  * while the copy and the teardown of the mapping take place.
  */
 static void
 pipe_clone_write_buffer(struct pipe *wpipe)
 {
 	int nbytes, offset;
 
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 
 	nbytes = wpipe->pipe_map.cnt;
 	offset = wpipe->pipe_map.pos;
 
 	/* the circular buffer now holds exactly the remaining bytes */
 	wpipe->pipe_buffer.out = 0;
 	wpipe->pipe_buffer.in = nbytes;
 	wpipe->pipe_buffer.cnt = nbytes;
 	wpipe->pipe_state &= ~PIPE_DIRECTW;
 
 	PIPE_GET_GIANT(wpipe);
 	bcopy((caddr_t) wpipe->pipe_map.kva + offset,
 	    wpipe->pipe_buffer.buffer, nbytes);
 	pipe_destroy_write_buffer(wpipe);
 	PIPE_DROP_GIANT(wpipe);
 }
 
 /*
  * This implements the pipe buffer write mechanism.  Note that only
  * a direct write OR a normal pipe write can be pending at any given time.
  * If there are any characters in the pipe buffer, the direct write will
  * be deferred until the receiving process grabs all of the bytes from
  * the pipe buffer.  Then the direct mapping write is set-up.
  *
  * Called with the pipe mutex held.  Returns 0 once the reader has
  * consumed the mapping, EPIPE if the pipe reaches EOF, or the error
  * from an interrupted sleep or from pipe_build_write_buffer().
  */
 static int
 pipe_direct_write(wpipe, uio)
 	struct pipe *wpipe;
 	struct uio *uio;
 {
 	int error;
 
 retry:
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	/* Wait for any direct write already in progress to finish. */
 	while (wpipe->pipe_state & PIPE_DIRECTW) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 		wpipe->pipe_state |= PIPE_WANTW;
 		error = msleep(wpipe, PIPE_MTX(wpipe),
 		    PRIBIO | PCATCH, "pipdww", 0);
 		if (error)
 			goto error1;
 		if (wpipe->pipe_state & PIPE_EOF) {
 			error = EPIPE;
 			goto error1;
 		}
 	}
 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
 	/*
 	 * Buffered data must drain first: sleep, then start over from
 	 * the top because the state may have changed meanwhile.
 	 */
 	if (wpipe->pipe_buffer.cnt > 0) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 			
 		wpipe->pipe_state |= PIPE_WANTW;
 		error = msleep(wpipe, PIPE_MTX(wpipe),
 		    PRIBIO | PCATCH, "pipdwc", 0);
 		if (error)
 			goto error1;
 		if (wpipe->pipe_state & PIPE_EOF) {
 			error = EPIPE;
 			goto error1;
 		}
 		goto retry;
 	}
 
 	wpipe->pipe_state |= PIPE_DIRECTW;
 
 	/*
 	 * Wire and map the user pages.  That work needs Giant and must
 	 * run without the pipe mutex, so the I/O lock is taken around it.
 	 */
 	pipelock(wpipe, 0);
 	PIPE_GET_GIANT(wpipe);
 	error = pipe_build_write_buffer(wpipe, uio);
 	PIPE_DROP_GIANT(wpipe);
 	pipeunlock(wpipe);
 	if (error) {
 		wpipe->pipe_state &= ~PIPE_DIRECTW;
 		goto error1;
 	}
 
 	error = 0;
 	/*
 	 * Sleep until the reader clears PIPE_DIRECTW (see pipe_read()),
 	 * the pipe hits EOF, or the sleep is interrupted.
 	 */
 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
 		if (wpipe->pipe_state & PIPE_EOF) {
 			pipelock(wpipe, 0);
 			PIPE_GET_GIANT(wpipe);
 			pipe_destroy_write_buffer(wpipe);
 			PIPE_DROP_GIANT(wpipe);
 			pipeunlock(wpipe);
 			pipeselwakeup(wpipe);
 			error = EPIPE;
 			goto error1;
 		}
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 		pipeselwakeup(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
 		    "pipdwt", 0);
 	}
 
 	pipelock(wpipe,0);
 	if (wpipe->pipe_state & PIPE_DIRECTW) {
 		/*
 		 * this bit of trickery substitutes a kernel buffer for
 		 * the process that might be going away.
 		 */
 		pipe_clone_write_buffer(wpipe);
 	} else {
 		PIPE_GET_GIANT(wpipe);
 		pipe_destroy_write_buffer(wpipe);
 		PIPE_DROP_GIANT(wpipe);
 	}
 	pipeunlock(wpipe);
 	return (error);
 
 error1:
 	/* Let anybody sleeping on the pipe re-evaluate its state. */
 	wakeup(wpipe);
 	return (error);
 }
 #endif
 	
 /*
  * Write entry point of pipeops.
  *
  * fp refers to the endpoint the caller writes on; the data is placed in
  * the buffer of its peer, which is the endpoint read from.  Both
  * endpoints share one mutex (see pipe()), which is why the code locks
  * through rpipe while manipulating wpipe.
  *
  * Large iovecs on blocking descriptors go through pipe_direct_write()
  * (unless PIPE_NODIRECT); everything else is copied into the peer's
  * circular buffer, sleeping for room when necessary.  Writes of at most
  * PIPE_BUF bytes are only started once they fit completely.
  *
  * Returns 0 on success, EPIPE when the peer is gone or at EOF, EAGAIN
  * when a FNONBLOCK write would have to wait, or another errno value
  * from a sleep, uiomove() or the MAC check.
  */
 static int
 pipe_write(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	struct thread *td;
 	int flags;
 {
 	int error = 0;
 	int orig_resid;
 	struct pipe *wpipe, *rpipe;
 
 	rpipe = (struct pipe *) fp->f_data;
 	wpipe = rpipe->pipe_peer;
 
 	PIPE_LOCK(rpipe);
 	/*
 	 * detect loss of pipe read side, issue SIGPIPE if lost.
 	 */
 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
 		PIPE_UNLOCK(rpipe);
 		return (EPIPE);
 	}
 #ifdef MAC
 	error = mac_check_pipe_op(active_cred, wpipe, MAC_OP_PIPE_WRITE);
 	if (error) {
 		PIPE_UNLOCK(rpipe);
 		return (error);
 	}
 #endif
 	++wpipe->pipe_busy;
 
 	/*
 	 * If it is advantageous to resize the pipe buffer, do
 	 * so.
 	 */
 	if ((uio->uio_resid > PIPE_SIZE) &&
 		(nbigpipe < LIMITBIGPIPES) &&
 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
 		(wpipe->pipe_buffer.cnt == 0)) {
 
 		if ((error = pipelock(wpipe,1)) == 0) {
 			PIPE_GET_GIANT(wpipe);
 			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
 				nbigpipe++;
 			PIPE_DROP_GIANT(wpipe);
 			pipeunlock(wpipe);
 		}
 	}
 
 	/*
 	 * If an early error occurred unbusy and return, waking up any pending
 	 * readers.
 	 */
 	if (error) {
 		--wpipe->pipe_busy;
 		if ((wpipe->pipe_busy == 0) && 
 		    (wpipe->pipe_state & PIPE_WANT)) {
 			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
 			wakeup(wpipe);
 		}
 		PIPE_UNLOCK(rpipe);
 		return(error);
 	}
 		
 	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
 
 	orig_resid = uio->uio_resid;
 
 	while (uio->uio_resid) {
 		int space;
 
 #ifndef PIPE_NODIRECT
 		/*
 		 * If the transfer is large, we can gain performance if
 		 * we do process-to-process copies directly.
 		 * If the write is non-blocking, we don't use the
 		 * direct write mechanism.
 		 *
 		 * The direct write mechanism will detect the reader going
 		 * away on us.
 		 */
 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
 		    (fp->f_flag & FNONBLOCK) == 0 &&
 		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA))) {
 			error = pipe_direct_write(wpipe, uio);
 			if (error)
 				break;
 			continue;
 		}
 #endif
 
 		/*
 		 * Pipe buffered writes cannot be coincidental with
 		 * direct writes.  We wait until the currently executing
 		 * direct write is completed before we start filling the
 		 * pipe buffer.  We break out if a signal occurs or the
 		 * reader goes away.
 		 */
 	retrywrite:
 		while (wpipe->pipe_state & PIPE_DIRECTW) {
 			if (wpipe->pipe_state & PIPE_WANTR) {
 				wpipe->pipe_state &= ~PIPE_WANTR;
 				wakeup(wpipe);
 			}
 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
 			    "pipbww", 0);
 			if (wpipe->pipe_state & PIPE_EOF)
 				break;
 			if (error)
 				break;
 		}
 		if (wpipe->pipe_state & PIPE_EOF) {
 			error = EPIPE;
 			break;
 		}
 
 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
 
 		/* Writes of size <= PIPE_BUF must be atomic. */
 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
 			space = 0;
 
 		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
 			if ((error = pipelock(wpipe,1)) == 0) {
 				int size;	/* Transfer size */
 				int segsize;	/* first segment to transfer */
 
 				/*
 				 * It is possible for a direct write to
 				 * slip in on us... handle it here...
 				 */
 				if (wpipe->pipe_state & PIPE_DIRECTW) {
 					pipeunlock(wpipe);
 					goto retrywrite;
 				}
 				/* 
 				 * If a process blocked in uiomove, our
 				 * value for space might be bad.
 				 *
 				 * XXX will we be ok if the reader has gone
 				 * away here?
 				 */
 				if (space > wpipe->pipe_buffer.size - 
 				    wpipe->pipe_buffer.cnt) {
 					pipeunlock(wpipe);
 					goto retrywrite;
 				}
 
 				/*
 				 * Transfer size is minimum of uio transfer
 				 * and free space in pipe buffer.
 				 */
 				if (space > uio->uio_resid)
 					size = uio->uio_resid;
 				else
 					size = space;
 				/*
 				 * First segment to transfer is minimum of 
 				 * transfer size and contiguous space in
 				 * pipe buffer.  If first segment to transfer
 				 * is less than the transfer size, we've got
 				 * a wraparound in the buffer.
 				 */
 				segsize = wpipe->pipe_buffer.size - 
 					wpipe->pipe_buffer.in;
 				if (segsize > size)
 					segsize = size;
 				
 				/* Transfer first segment */
 
 				/*
 				 * The mutex is dropped around the copy; the
 				 * I/O lock taken above stays held.
 				 */
 				PIPE_UNLOCK(rpipe);
 				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], 
 						segsize, uio);
 				PIPE_LOCK(rpipe);
 				
 				if (error == 0 && segsize < size) {
 					/* 
 					 * Transfer remaining part now, to
 					 * support atomic writes.  Wraparound
 					 * happened.
 					 */
 					if (wpipe->pipe_buffer.in + segsize != 
 					    wpipe->pipe_buffer.size)
 						panic("Expected pipe buffer wraparound disappeared");
 						
 					PIPE_UNLOCK(rpipe);
 					error = uiomove(&wpipe->pipe_buffer.buffer[0],
 							size - segsize, uio);
 					PIPE_LOCK(rpipe);
 				}
 				if (error == 0) {
 					wpipe->pipe_buffer.in += size;
 					if (wpipe->pipe_buffer.in >=
 					    wpipe->pipe_buffer.size) {
 						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
 							panic("Expected wraparound bad");
 						wpipe->pipe_buffer.in = size - segsize;
 					}
 				
 					wpipe->pipe_buffer.cnt += size;
 					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
 						panic("Pipe buffer overflow");
 				
 				}
 				pipeunlock(wpipe);
 			}
 			if (error)
 				break;
 
 		} else {
 			/*
 			 * If the "read-side" has been blocked, wake it up now.
 			 */
 			if (wpipe->pipe_state & PIPE_WANTR) {
 				wpipe->pipe_state &= ~PIPE_WANTR;
 				wakeup(wpipe);
 			}
 
 			/*
 			 * don't block on non-blocking I/O
 			 */
 			if (fp->f_flag & FNONBLOCK) {
 				error = EAGAIN;
 				break;
 			}
 
 			/*
 			 * We have no more space and have something to offer,
 			 * wake up select/poll.
 			 */
 			pipeselwakeup(wpipe);
 
 			wpipe->pipe_state |= PIPE_WANTW;
 			error = msleep(wpipe, PIPE_MTX(rpipe),
 			    PRIBIO | PCATCH, "pipewr", 0);
 			if (error != 0)
 				break;
 			/*
 			 * If read side wants to go away, we just issue a signal
 			 * to ourselves.
 			 */
 			if (wpipe->pipe_state & PIPE_EOF) {
 				error = EPIPE;
 				break;
 			}	
 		}
 	}
 
 	--wpipe->pipe_busy;
 
 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
 		wakeup(wpipe);
 	} else if (wpipe->pipe_buffer.cnt > 0) {
 		/*
 		 * If we have put any characters in the buffer, we wake up
 		 * the reader.
 		 */
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 	}
 
 	/*
 	 * Don't return EPIPE if I/O was successful
 	 */
 	if ((wpipe->pipe_buffer.cnt == 0) &&
 	    (uio->uio_resid == 0) &&
 	    (error == EPIPE)) {
 		error = 0;
 	}
 
 	if (error == 0)
 		vfs_timestamp(&wpipe->pipe_mtime);
 
 	/*
 	 * We have something to offer,
 	 * wake up select/poll.
 	 */
 	if (wpipe->pipe_buffer.cnt)
 		pipeselwakeup(wpipe);
 
 	PIPE_UNLOCK(rpipe);
 	return (error);
 }
 
 /*
  * We implement a very minimal set of ioctls for compatibility with
  * sockets: FIONBIO (accepted, nothing to do here), FIOASYNC, FIONREAD
  * and the SIGIO owner get/set requests.  Any other request yields
  * ENOTTY.
  */
 int
 pipe_ioctl(struct file *fp, u_long cmd, void *data, struct thread *td)
 {
 	struct pipe *mpipe = (struct pipe *)fp->f_data;
 #ifdef MAC
 	int error;
 
 	/* XXXMAC: Pipe should be locked for this check. */
 	error = mac_check_pipe_ioctl(td->td_ucred, mpipe, cmd, data);
 	if (error)
 		return (error);
 #endif
 
 	switch (cmd) {
 	case FIONBIO:
 		return (0);
 
 	case FIOASYNC:
 		PIPE_LOCK(mpipe);
 		if (*(int *)data != 0)
 			mpipe->pipe_state |= PIPE_ASYNC;
 		else
 			mpipe->pipe_state &= ~PIPE_ASYNC;
 		PIPE_UNLOCK(mpipe);
 		return (0);
 
 	case FIONREAD:
 		/* bytes pending: direct-write mapping or circular buffer */
 		PIPE_LOCK(mpipe);
 		if ((mpipe->pipe_state & PIPE_DIRECTW) != 0)
 			*(int *)data = mpipe->pipe_map.cnt;
 		else
 			*(int *)data = mpipe->pipe_buffer.cnt;
 		PIPE_UNLOCK(mpipe);
 		return (0);
 
 	case FIOSETOWN:
 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(mpipe->pipe_sigio);
 		return (0);
 
 	case TIOCSPGRP:
 		/* This is deprecated, FIOSETOWN should be used instead. */
 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
 
 	case TIOCGPGRP:
 		/* This is deprecated, FIOGETOWN should be used instead. */
 		*(int *)data = -fgetown(mpipe->pipe_sigio);
 		return (0);
 
 	default:
 		break;
 	}
 	return (ENOTTY);
 }
 
 /*
  * Poll entry point of pipeops.
  *
  * Reports readability of this endpoint, writability of its peer, and
  * POLLHUP once either side is at EOF or the peer is gone.  When nothing
  * is ready the calling thread is recorded on the relevant endpoint(s)
  * for a later pipeselwakeup().  Returns the ready event mask.
  */
 int
 pipe_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct pipe *rpipe, *wpipe;
 	int rdmask, wrmask, revents;
 #ifdef MAC
 	int error;
 #endif
 
 	rpipe = (struct pipe *)fp->f_data;
 	revents = 0;
 	rdmask = events & (POLLIN | POLLRDNORM);
 	wrmask = events & (POLLOUT | POLLWRNORM);
 
 	wpipe = rpipe->pipe_peer;
 	PIPE_LOCK(rpipe);
 #ifdef MAC
 	error = mac_check_pipe_op(active_cred, rpipe, MAC_OP_PIPE_POLL);
 	if (error)
 		goto locked_error;
 #endif
 	/* readable: direct write pending, buffered data, or EOF */
 	if (rdmask != 0 &&
 	    ((rpipe->pipe_state & PIPE_DIRECTW) ||
 	     (rpipe->pipe_buffer.cnt > 0) ||
 	     (rpipe->pipe_state & PIPE_EOF)))
 		revents |= rdmask;
 
 	/* writable: peer gone or at EOF, or room for PIPE_BUF bytes */
 	if (wrmask != 0 &&
 	    (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
 	     (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
 	      (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)))
 		revents |= wrmask;
 
 	if ((rpipe->pipe_state & PIPE_EOF) || wpipe == NULL ||
 	    (wpipe->pipe_state & PIPE_EOF))
 		revents |= POLLHUP;
 
 	if (revents == 0) {
 		if (rdmask != 0) {
 			selrecord(td, &rpipe->pipe_sel);
 			rpipe->pipe_state |= PIPE_SEL;
 		}
 
 		if (wrmask != 0) {
 			selrecord(td, &wpipe->pipe_sel);
 			wpipe->pipe_state |= PIPE_SEL;
 		}
 	}
 #ifdef MAC
 locked_error:
 #endif
 	PIPE_UNLOCK(rpipe);
 
 	return (revents);
 }
 
 /*
  * Fill in a stat structure describing the pipe.
  *
  * No pipe lock is taken: the pipe fields are only read, so racing with a
  * concurrent update is considered acceptable.
  */
 static int
 pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
     struct thread *td)
 {
 	struct pipe *cpipe;
 #ifdef MAC
 	int error;
 #endif
 
 	cpipe = (struct pipe *)fp->f_data;
 #ifdef MAC
 	/* XXXMAC: Pipe should be locked for this check. */
 	error = mac_check_pipe_op(active_cred, cpipe, MAC_OP_PIPE_STAT);
 	if (error != 0)
 		return (error);
 #endif
 
 	/* Start from an all-zero structure; unset fields stay 0. */
 	bzero(ub, sizeof(*ub));
 
 	/* Ownership comes from the credential attached to the file. */
 	ub->st_uid = fp->f_cred->cr_uid;
 	ub->st_gid = fp->f_cred->cr_gid;
 
 	ub->st_mode = S_IFIFO;
 	ub->st_atimespec = cpipe->pipe_atime;
 	ub->st_mtimespec = cpipe->pipe_mtime;
 	ub->st_ctimespec = cpipe->pipe_ctime;
 
 	/* Size is the buffered byte count; blocks are rounded up. */
 	ub->st_blksize = cpipe->pipe_buffer.size;
 	ub->st_size = cpipe->pipe_buffer.cnt;
 	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
 
 	/*
 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
 	 * XXX (st_dev, st_ino) should be unique.
 	 */
 	return (0);
 }
 
 /*
  * Close the file's end of a pipe.  The file is pointed at badfileops
  * and its data pointer is cleared before the pipe itself is shut down.
  * Always returns 0.
  */
 /* ARGSUSED */
 static int
 pipe_close(struct file *fp, struct thread *td)
 {
 	struct pipe *closing;
 
 	/* Remember the pipe before it is unhooked from the file. */
 	closing = (struct pipe *)fp->f_data;
 
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 
 	funsetown(&closing->pipe_sigio);
 	pipeclose(closing);
 
 	return (0);
 }
 
 /*
  * Release the kernel memory backing a pipe: its data buffer and, unless
  * PIPE_NODIRECT is defined, the KVA used for direct writes.  The global
  * counters nbigpipe and amountpipekva are adjusted to match.
  *
  * Giant must be held (GIANT_REQUIRED) and the pipe mutex, if the pipe
  * has one, must not be owned by the caller.
  */
 static void
 pipe_free_kmem(struct pipe *cpipe)
 {
 
 	GIANT_REQUIRED;
 	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
 	       ("pipe_free_kmem: pipe mutex locked"));
 
 	if (cpipe->pipe_buffer.buffer != NULL) {
 		/* Buffers larger than PIPE_SIZE are counted as "big". */
 		if (cpipe->pipe_buffer.size > PIPE_SIZE)
 			--nbigpipe;
 		amountpipekva -= cpipe->pipe_buffer.size;
 		kmem_free(kernel_map,
 			(vm_offset_t)cpipe->pipe_buffer.buffer,
 			cpipe->pipe_buffer.size);
 		cpipe->pipe_buffer.buffer = NULL;
 	}
 #ifndef PIPE_NODIRECT
 	/* kva is an address value, not a pointer: compare against 0. */
 	if (cpipe->pipe_map.kva != 0) {
 		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
 		kmem_free(kernel_map,
 			cpipe->pipe_map.kva,
 			cpipe->pipe_buffer.size + PAGE_SIZE);
 		cpipe->pipe_map.cnt = 0;
 		cpipe->pipe_map.kva = 0;
 		cpipe->pipe_map.pos = 0;
 		cpipe->pipe_map.npages = 0;
 	}
 #endif
 }
 
 /*
  * Shut down one end of a pipe and free it.
  *
  * Wakes any selectors and waits until the pipe is no longer busy, then
  * disconnects from the peer (marking it EOF and waking its sleepers),
  * releases the pipe mutex and finally frees the pipe's kernel memory and
  * the pipe structure itself.  The mutex is destroyed only when no peer
  * remained at the time of the close.  A NULL cpipe is ignored.
  */
 static void
 pipeclose(cpipe)
 	struct pipe *cpipe;
 {
 	struct pipe *ppipe;
 	int hadpeer;
 
 	if (cpipe == NULL)
 		return;
 
 	/* Set below if a peer was still attached when we disconnected. */
 	hadpeer = 0;
 
 	/* partially created pipes won't have a valid mutex. */
 	if (PIPE_MTX(cpipe) != NULL)
 		PIPE_LOCK(cpipe);
 		
 	pipeselwakeup(cpipe);
 
 	/*
 	 * If the other side is blocked, wake it up saying that
 	 * we want to close it down.
 	 */
 	while (cpipe->pipe_busy) {
 		wakeup(cpipe);
 		/*
 		 * Flag EOF and that someone is waiting, then sleep on the
 		 * pipe; presumably the busy holder issues the wakeup when it
 		 * sees PIPE_WANT -- confirm against the busy/unbusy helpers.
 		 */
 		cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
 	}
 
 #ifdef MAC
 	/* The MAC label is destroyed only when no peer is attached. */
 	if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
 		mac_destroy_pipe(cpipe);
 #endif
 
 	/*
 	 * Disconnect from peer
 	 */
 	if ((ppipe = cpipe->pipe_peer) != NULL) {
 		hadpeer++;
 		pipeselwakeup(ppipe);
 
 		/* Mark the peer EOF and notify its sleepers and knotes. */
 		ppipe->pipe_state |= PIPE_EOF;
 		wakeup(ppipe);
 		KNOTE(&ppipe->pipe_sel.si_note, 0);
 		ppipe->pipe_peer = NULL;
 	}
 	/*
 	 * free resources
 	 */
 	if (PIPE_MTX(cpipe) != NULL) {
 		PIPE_UNLOCK(cpipe);
 		/*
 		 * Without a peer nobody else refers to the mutex any more,
 		 * so it is destroyed and its storage released here.
 		 */
 		if (!hadpeer) {
 			mtx_destroy(PIPE_MTX(cpipe));
 			free(PIPE_MTX(cpipe), M_TEMP);
 		}
 	}
 	/* pipe_free_kmem() asserts that Giant is held. */
 	mtx_lock(&Giant);
 	pipe_free_kmem(cpipe);
 	uma_zfree(pipe_zone, cpipe);
 	mtx_unlock(&Giant);
 }
 
 /*ARGSUSED*/
 static int
 pipe_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct pipe *cpipe;
 
 	cpipe = (struct pipe *)kn->kn_fp->f_data;
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &pipe_rfiltops;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &pipe_wfiltops;
 		cpipe = cpipe->pipe_peer;
 		if (cpipe == NULL)
 			/* other end of pipe has been closed */
 			return (EBADF);
 		break;
 	default:
 		return (1);
 	}
 	kn->kn_hook = cpipe;
 
 	PIPE_LOCK(cpipe);
 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
 	PIPE_UNLOCK(cpipe);
 	return (0);
 }
 
 /*
  * Detach a knote: remove it from the knote list of the pipe recorded in
  * kn_hook, holding that pipe's lock while the list is modified.
  */
 static void
 filt_pipedetach(struct knote *kn)
 {
 	struct pipe *hooked;
 
 	hooked = (struct pipe *)kn->kn_hook;
 
 	PIPE_LOCK(hooked);
 	SLIST_REMOVE(&hooked->pipe_sel.si_note, kn, knote, kn_selnext);
 	PIPE_UNLOCK(hooked);
 }
 
 /*
  * Read filter for pipes.
  *
  * Stores the number of readable bytes in kn_data: the buffered count,
  * or the direct-write map count when the buffer is empty and a direct
  * write is pending.  Returns 1 with EV_EOF set when either end is at
  * EOF or the peer is gone; otherwise returns whether any data is
  * available.
  */
 /*ARGSUSED*/
 static int
 filt_piperead(struct knote *kn, long hint)
 {
 	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
 	struct pipe *wpipe;
 	int ready;
 
 	PIPE_LOCK(rpipe);
 	/*
 	 * Read the peer pointer under the lock; pipeclose() clears it and
 	 * frees the pipe with the lock held, so an earlier read could be
 	 * stale by the time it is dereferenced.
 	 */
 	wpipe = rpipe->pipe_peer;
 
 	kn->kn_data = rpipe->pipe_buffer.cnt;
 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
 		kn->kn_data = rpipe->pipe_map.cnt;
 
 	if ((rpipe->pipe_state & PIPE_EOF) ||
 	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
 		kn->kn_flags |= EV_EOF;
 		ready = 1;
 	} else {
 		ready = (kn->kn_data > 0);
 	}
 	PIPE_UNLOCK(rpipe);
 	return (ready);
 }
 
 /*
  * Write filter for pipes.
  *
  * Stores the free space of the peer's buffer in kn_data (0 while a
  * direct write is in progress).  Returns 1 with EV_EOF set and kn_data
  * cleared when the peer is gone or at EOF; otherwise returns whether at
  * least PIPE_BUF bytes can be written.
  */
 /*ARGSUSED*/
 static int
 filt_pipewrite(struct knote *kn, long hint)
 {
 	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
 	struct pipe *wpipe;
 	int ready;
 
 	PIPE_LOCK(rpipe);
 	/*
 	 * Read the peer pointer under the lock; pipeclose() clears it and
 	 * frees the pipe with the lock held, so an earlier read could be
 	 * stale by the time it is dereferenced.
 	 */
 	wpipe = rpipe->pipe_peer;
 
 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
 		kn->kn_data = 0;
 		kn->kn_flags |= EV_EOF;
 		ready = 1;
 	} else {
 		kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
 		if (wpipe->pipe_state & PIPE_DIRECTW)
 			kn->kn_data = 0;
 		ready = (kn->kn_data >= PIPE_BUF);
 	}
 	PIPE_UNLOCK(rpipe);
 	return (ready);
 }