Index: head/sys/fs/specfs/spec_vnops.c
===================================================================
--- head/sys/fs/specfs/spec_vnops.c	(revision 130550)
+++ head/sys/fs/specfs/spec_vnops.c	(revision 130551)
@@ -1,852 +1,852 @@
 /*
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)spec_vnops.c	8.14 (Berkeley) 5/21/95
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/stat.h>
 #include <sys/fcntl.h>
 #include <sys/vmmeter.h>
 #include <sys/sysctl.h>
 #include <sys/tty.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 static int	spec_advlock(struct vop_advlock_args *);
 static int	spec_close(struct vop_close_args *);
 static int	spec_freeblks(struct vop_freeblks_args *);
 static int	spec_fsync(struct  vop_fsync_args *);
 static int	spec_getpages(struct vop_getpages_args *);
 static int	spec_ioctl(struct vop_ioctl_args *);
 static int	spec_kqfilter(struct vop_kqfilter_args *);
 static int	spec_open(struct vop_open_args *);
 static int	spec_poll(struct vop_poll_args *);
 static int	spec_print(struct vop_print_args *);
 static int	spec_read(struct vop_read_args *);
 static int	spec_specstrategy(struct vop_specstrategy_args *);
 static int	spec_write(struct vop_write_args *);
 
 vop_t **spec_vnodeop_p;
 static struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
 	{ &vop_default_desc,		(vop_t *) vop_defaultop },
 	{ &vop_access_desc,		(vop_t *) vop_ebadf },
 	{ &vop_advlock_desc,		(vop_t *) spec_advlock },
 	{ &vop_bmap_desc,		(vop_t *) vop_panic },
 	{ &vop_close_desc,		(vop_t *) spec_close },
 	{ &vop_create_desc,		(vop_t *) vop_panic },
 	{ &vop_freeblks_desc,		(vop_t *) spec_freeblks },
 	{ &vop_fsync_desc,		(vop_t *) spec_fsync },
 	{ &vop_getpages_desc,		(vop_t *) spec_getpages },
 	{ &vop_getwritemount_desc, 	(vop_t *) vop_stdgetwritemount },
 	{ &vop_ioctl_desc,		(vop_t *) spec_ioctl },
 	{ &vop_kqfilter_desc,		(vop_t *) spec_kqfilter },
 	{ &vop_lease_desc,		(vop_t *) vop_null },
 	{ &vop_link_desc,		(vop_t *) vop_panic },
 	{ &vop_mkdir_desc,		(vop_t *) vop_panic },
 	{ &vop_mknod_desc,		(vop_t *) vop_panic },
 	{ &vop_open_desc,		(vop_t *) spec_open },
 	{ &vop_pathconf_desc,		(vop_t *) vop_stdpathconf },
 	{ &vop_poll_desc,		(vop_t *) spec_poll },
 	{ &vop_print_desc,		(vop_t *) spec_print },
 	{ &vop_read_desc,		(vop_t *) spec_read },
 	{ &vop_readdir_desc,		(vop_t *) vop_panic },
 	{ &vop_readlink_desc,		(vop_t *) vop_panic },
 	{ &vop_reallocblks_desc,	(vop_t *) vop_panic },
 	{ &vop_reclaim_desc,		(vop_t *) vop_null },
 	{ &vop_remove_desc,		(vop_t *) vop_panic },
 	{ &vop_rename_desc,		(vop_t *) vop_panic },
 	{ &vop_rmdir_desc,		(vop_t *) vop_panic },
 	{ &vop_setattr_desc,		(vop_t *) vop_ebadf },
 	{ &vop_specstrategy_desc,	(vop_t *) spec_specstrategy },
 	{ &vop_strategy_desc,		(vop_t *) vop_panic },
 	{ &vop_symlink_desc,		(vop_t *) vop_panic },
 	{ &vop_write_desc,		(vop_t *) spec_write },
 	{ NULL, NULL }
 };
 static struct vnodeopv_desc spec_vnodeop_opv_desc =
 	{ &spec_vnodeop_p, spec_vnodeop_entries };
 
 VNODEOP_SET(spec_vnodeop_opv_desc);
 
 int
 spec_vnoperate(ap)
 	struct vop_generic_args /* {
 		struct vnodeop_desc *a_desc;
 		<other random data follows, presumably>
 	} */ *ap;
 {
 	return (VOCALL(spec_vnodeop_p, ap->a_desc->vdesc_offset, ap));
 }
 
 /*
  * Open a special file.
  */
 /* ARGSUSED */
 static int
 spec_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	dev_t dev = vp->v_rdev;
 	int error;
 	struct cdevsw *dsw;
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
 
 	/* Don't allow open if fs is mounted -nodev. */
 	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
 		return (ENXIO);
 
 	if (dev == NODEV)
 		return (ENXIO);
 
 	dsw = devsw(dev);
 	if (dsw == NULL || dsw->d_open == NULL)
 		return (ENXIO);
 
 	/* Make this field valid before any I/O in d_open. */
 	if (dev->si_iosize_max == 0)
 		dev->si_iosize_max = DFLTPHYS;
 
 	/*
 	 * XXX: Disks get special billing here, but it is mostly wrong.
 	 * XXX: Disk partitions can overlap and the real checks should
 	 * XXX: take this into account, and consequently they need to
 	 * XXX: live in the disk slice code.  Some checks do.
 	 */
 	if (vn_isdisk(vp, NULL) && ap->a_cred != FSCRED &&
 	    (ap->a_mode & FWRITE)) {
 		/*
 		 * Never allow opens for write if the disk is mounted R/W.
 		 */
 		if (vp->v_rdev->si_mountpoint != NULL &&
 		    !(vp->v_rdev->si_mountpoint->mnt_flag & MNT_RDONLY))
 			return (EBUSY);
 
 		/*
 		 * When running in secure mode, do not allow opens
 		 * for writing if the disk is mounted.
 		 */
 		error = securelevel_ge(td->td_ucred, 1);
 		if (error && vfs_mountedon(vp))
 			return (error);
 
 		/*
 		 * When running in very secure mode, do not allow
 		 * opens for writing of any disks.
 		 */
 		error = securelevel_ge(td->td_ucred, 2);
 		if (error)
 			return (error);
 	}
 
 	/* XXX: Special casing of ttys for deadfs.  Probably redundant. */
 	if (dsw->d_flags & D_TTY)
 		vp->v_vflag |= VV_ISTTY;
 
 	VOP_UNLOCK(vp, 0, td);
 	dev_ref(dev);
 	cdevsw_ref(dsw);
 	if(!(dsw->d_flags & D_NEEDGIANT)) {
 		DROP_GIANT();
 		if (dsw->d_fdopen != NULL)
 			error = dsw->d_fdopen(dev, ap->a_mode, td, ap->a_fdidx);
 		else
 			error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 		PICKUP_GIANT();
 	} else if (dsw->d_fdopen != NULL)
 		error = dsw->d_fdopen(dev, ap->a_mode, td, ap->a_fdidx);
 	else
 		error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 	cdevsw_rel(dsw);
 	if (error != 0)
 		dev_rel(dev);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	if (error)
 		return (error);
 
 	if (dsw->d_flags & D_TTY) {
 		if (dev->si_tty) {
 			struct tty *tp;
 			tp = dev->si_tty;
 			if (!tp->t_stop) {
 				printf("Warning:%s: no t_stop, using nottystop\n", devtoname(dev));
 				tp->t_stop = nottystop;
 			}
 		}
 	}
 
 	if (vn_isdisk(vp, NULL)) {
 		if (!dev->si_bsize_phys)
 			dev->si_bsize_phys = DEV_BSIZE;
 	}
 	return (error);
 }
 
 /*
  * Vnode op for read
  */
 /* ARGSUSED */
 static int
 spec_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct thread *td;
 	struct uio *uio;
 	dev_t dev;
 	int error, resid;
 	struct cdevsw *dsw;
 
 	vp = ap->a_vp;
 	dev = vp->v_rdev;
 	uio = ap->a_uio;
 	td = uio->uio_td;
 	resid = uio->uio_resid;
 
 	if (resid == 0)
 		return (0);
 
 	dsw = devsw(dev);
 	VOP_UNLOCK(vp, 0, td);
 	KASSERT(dev->si_refcount > 0,
 	    ("specread() on un-referenced dev_t (%s)", devtoname(dev)));
 	cdevsw_ref(dsw);
 	if (!(dsw->d_flags & D_NEEDGIANT)) {
 		DROP_GIANT();
 		error = dsw->d_read(dev, uio, ap->a_ioflag);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_read(dev, uio, ap->a_ioflag);
 	cdevsw_rel(dsw);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0))
 		vfs_timestamp(&dev->si_atime);
 	return (error);
 }
 
 /*
  * Vnode op for write
  */
 /* ARGSUSED */
 static int
 spec_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct thread *td;
 	struct uio *uio;
 	dev_t dev;
 	int error, resid;
 	struct cdevsw *dsw;
 
 	vp = ap->a_vp;
 	dev = vp->v_rdev;
 	dsw = devsw(dev);
 	uio = ap->a_uio;
 	td = uio->uio_td;
 	resid = uio->uio_resid;
 
 	VOP_UNLOCK(vp, 0, td);
 	KASSERT(dev->si_refcount > 0,
 	    ("spec_write() on un-referenced dev_t (%s)", devtoname(dev)));
 	cdevsw_ref(dsw);
 	if (!(dsw->d_flags & D_NEEDGIANT)) {
 		DROP_GIANT();
 		error = dsw->d_write(dev, uio, ap->a_ioflag);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_write(dev, uio, ap->a_ioflag);
 	cdevsw_rel(dsw);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0)) {
 		vfs_timestamp(&dev->si_ctime);
 		dev->si_mtime = dev->si_ctime;
 	}
 	return (error);
 }
 
 /*
  * Device ioctl operation.
  */
 /* ARGSUSED */
 static int
 spec_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long  a_command;
 		caddr_t  a_data;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	dev_t dev;
 	int error;
 	struct cdevsw *dsw;
 
 	dev = ap->a_vp->v_rdev;
 	dsw = devsw(dev);
 	KASSERT(dev->si_refcount > 0,
 	    ("spec_ioctl() on un-referenced dev_t (%s)", devtoname(dev)));
 	cdevsw_ref(dsw);
 	if (!(dsw->d_flags & D_NEEDGIANT)) {
 		DROP_GIANT();
 		error = dsw->d_ioctl(dev, ap->a_command,
 		    ap->a_data, ap->a_fflag, ap->a_td);
 		PICKUP_GIANT();
 	} else 
 		error = dsw->d_ioctl(dev, ap->a_command,
 		    ap->a_data, ap->a_fflag, ap->a_td);
 	cdevsw_rel(dsw);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 spec_poll(ap)
 	struct vop_poll_args /* {
 		struct vnode *a_vp;
 		int  a_events;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	dev_t dev;
 	struct cdevsw *dsw;
 	int error;
 
 	dev = ap->a_vp->v_rdev;
 	dsw = devsw(dev);
 	KASSERT(dev->si_refcount > 0,
 	    ("spec_poll() on un-referenced dev_t (%s)", devtoname(dev)));
 	cdevsw_ref(dsw);
 	if (!(dsw->d_flags & D_NEEDGIANT)) {
 		/* XXX: not yet DROP_GIANT(); */
 		error = dsw->d_poll(dev, ap->a_events, ap->a_td);
 		/* XXX: not yet PICKUP_GIANT(); */
 	} else
 		error = dsw->d_poll(dev, ap->a_events, ap->a_td);
 	cdevsw_rel(dsw);
 	return(error);
 }
 
 /* ARGSUSED */
 static int
 spec_kqfilter(ap)
 	struct vop_kqfilter_args /* {
 		struct vnode *a_vp;
 		struct knote *a_kn;
 	} */ *ap;
 {
 	dev_t dev;
 	struct cdevsw *dsw;
 	int error;
 
 	dev = ap->a_vp->v_rdev;
 	dsw = devsw(dev);
 	KASSERT(dev->si_refcount > 0,
 	    ("spec_kqfilter() on un-referenced dev_t (%s)", devtoname(dev)));
 	cdevsw_ref(dsw);
 	if (!(dsw->d_flags & D_NEEDGIANT)) {
 		DROP_GIANT();
 		error = dsw->d_kqfilter(dev, ap->a_kn);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_kqfilter(dev, ap->a_kn);
 	cdevsw_rel(dsw);
 	return (error);
 }
 
 /*
  * Synch buffers associated with a block device
  */
 /* ARGSUSED */
 static int
 spec_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		struct ucred *a_cred;
 		int  a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 	if (!vn_isdisk(ap->a_vp, NULL))
 		return (0);
 
 	return (vop_stdfsync(ap));
 }
 
 /*
  * Mutex to use when delaying niced I/O bound processes in spec_strategy().
  */
 static struct mtx strategy_mtx;
 static void
 strategy_init(void)
 {
 
 	mtx_init(&strategy_mtx, "strategy", NULL, MTX_DEF);
 }
 SYSINIT(strategy, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, strategy_init, NULL)
 
 static int doslowdown = 0;
 SYSCTL_INT(_debug, OID_AUTO, doslowdown, CTLFLAG_RW, &doslowdown, 0, "");
 
 /*
  * Just call the device strategy routine
  */
 static int
 spec_xstrategy(struct vnode *vp, struct buf *bp)
 {
 	struct mount *mp;
 	struct cdevsw *dsw;
 	struct thread *td = curthread;
 	
 	KASSERT(bp->b_iocmd == BIO_READ ||
 		bp->b_iocmd == BIO_WRITE ||
 		bp->b_iocmd == BIO_DELETE, 
 		("Wrong b_iocmd buf=%p cmd=%d", bp, bp->b_iocmd));
 
 	/*
 	 * Slow down disk requests for niced processes.
 	 */
-	if (doslowdown && td && td->td_ksegrp->kg_nice > 0) {
+	if (doslowdown && td && td->td_proc->p_nice > 0) {
 		mtx_lock(&strategy_mtx);
 		msleep(&strategy_mtx, &strategy_mtx,
 		    PPAUSE | PCATCH | PDROP, "ioslow",
-		    td->td_ksegrp->kg_nice);
+		    td->td_proc->p_nice);
 	}
 	/*
 	 * Collect statistics on synchronous and asynchronous read
 	 * and write counts for disks that have associated filesystems.
 	 */
 	if (vn_isdisk(vp, NULL) && (mp = vp->v_rdev->si_mountpoint) != NULL) {
 		if (bp->b_iocmd == BIO_WRITE) {
 			if (bp->b_lock.lk_lockholder == LK_KERNPROC)
 				mp->mnt_stat.f_asyncwrites++;
 			else
 				mp->mnt_stat.f_syncwrites++;
 		} else {
 			if (bp->b_lock.lk_lockholder == LK_KERNPROC)
 				mp->mnt_stat.f_asyncreads++;
 			else
 				mp->mnt_stat.f_syncreads++;
 		}
 	}
 	dsw = devsw(bp->b_dev);
 	if (dsw == NULL) {
 		bp->b_io.bio_error = ENXIO;
 		bp->b_io.bio_flags |= BIO_ERROR;
 		biodone(&bp->b_io);
 		return (0);
 	}
 	KASSERT(dsw->d_strategy != NULL,
 	   ("No strategy on dev %s responsible for buffer %p\n",
 	   devtoname(bp->b_dev), bp));
 	
 	if (!(dsw->d_flags & D_NEEDGIANT)) {
 		/* XXX: notyet DROP_GIANT(); */
 		DEV_STRATEGY(bp);
 		/* XXX: notyet PICKUP_GIANT(); */
 	} else
 		DEV_STRATEGY(bp);
 		
 	return (0);
 }
 
 static int
 spec_specstrategy(ap)
 	struct vop_specstrategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 
 	KASSERT(ap->a_vp->v_rdev == ap->a_bp->b_dev,
 	    ("%s, dev %s != %s", __func__,
 	    devtoname(ap->a_vp->v_rdev),
 	    devtoname(ap->a_bp->b_dev)));
 	return spec_xstrategy(ap->a_vp, ap->a_bp);
 }
 
 static int
 spec_freeblks(ap)
 	struct vop_freeblks_args /* {
 		struct vnode *a_vp;
 		daddr_t a_addr;
 		daddr_t a_length;
 	} */ *ap;
 {
 	struct buf *bp;
 
 	/*
 	 * XXX: This assumes that strategy does the deed right away.
 	 * XXX: this may not be TRTTD.
 	 */
 	if ((ap->a_vp->v_rdev->si_flags & SI_CANDELETE) == 0)
 		return (0);
 	bp = geteblk(ap->a_length);
 	bp->b_iocmd = BIO_DELETE;
 	bp->b_dev = ap->a_vp->v_rdev;
 	bp->b_blkno = ap->a_addr;
 	bp->b_offset = dbtob(ap->a_addr);
 	bp->b_iooffset = bp->b_offset;
 	bp->b_bcount = ap->a_length;
 	BUF_KERNPROC(bp);
 	DEV_STRATEGY(bp);
 	return (0);
 }
 
 /*
  * Device close routine
  */
 /* ARGSUSED */
 static int
 spec_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp, *oldvp;
 	struct thread *td = ap->a_td;
 	dev_t dev = vp->v_rdev;
 	struct cdevsw *dsw;
 	int error;
 
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
 	 * We cannot easily tell that a character device is
 	 * a controlling terminal, unless it is the closing
 	 * process' controlling terminal.  In that case,
 	 * if the reference count is 2 (this last descriptor
 	 * plus the session), release the reference from the session.
 	 */
 
 	/*
 	 * This needs to be rewritten to take the vp interlock into
 	 * consideration.
 	 */
 
 	dsw = devsw(dev);
 	oldvp = NULL;
 	sx_xlock(&proctree_lock);
 	if (td && vp == td->td_proc->p_session->s_ttyvp) {
 		SESS_LOCK(td->td_proc->p_session);
 		VI_LOCK(vp);
 		if (count_dev(dev) == 2 && (vp->v_iflag & VI_XLOCK) == 0) {
 			td->td_proc->p_session->s_ttyvp = NULL;
 			oldvp = vp;
 		}
 		VI_UNLOCK(vp);
 		SESS_UNLOCK(td->td_proc->p_session);
 	}
 	sx_xunlock(&proctree_lock);
 	if (oldvp != NULL)
 		vrele(oldvp);
 	/*
 	 * We do not want to really close the device if it
 	 * is still in use unless we are trying to close it
 	 * forcibly. Since every use (buffer, vnode, swap, cmap)
 	 * holds a reference to the vnode, and because we mark
 	 * any other vnodes that alias this device, when the
 	 * sum of the reference counts on all the aliased
 	 * vnodes descends to one, we are on last close.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_iflag & VI_XLOCK) {
 		/* Forced close. */
 	} else if (dsw->d_flags & D_TRACKCLOSE) {
 		/* Keep device updated on status. */
 	} else if (count_dev(dev) > 1) {
 		VI_UNLOCK(vp);
 		return (0);
 	}
 	VI_UNLOCK(vp);
 	KASSERT(dev->si_refcount > 0,
 	    ("spec_close() on un-referenced dev_t (%s)", devtoname(dev)));
 	cdevsw_ref(dsw);
 	if (!(dsw->d_flags & D_NEEDGIANT)) {
 		DROP_GIANT();
 		error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td);
 		PICKUP_GIANT();
 	} else
 		error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td);
 	cdevsw_rel(dsw);
 	dev_rel(dev);
 	return (error);
 }
 
 /*
  * Print out the contents of a special device vnode.
  */
 static int
 spec_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 
 	printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev));
 	return (0);
 }
 
 /*
  * Special device advisory byte-level locks.
  */
 /* ARGSUSED */
 static int
 spec_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 
 	return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL);
 }
 
 static int
 spec_getpages(ap)
 	struct vop_getpages_args *ap;
 {
 	vm_offset_t kva;
 	int error;
 	int i, pcount, size, s;
 	daddr_t blkno;
 	struct buf *bp;
 	vm_page_t m;
 	vm_ooffset_t offset;
 	int toff, nextoff, nread;
 	struct vnode *vp = ap->a_vp;
 	int blksiz;
 	int gotreqpage;
 
 	GIANT_REQUIRED;
 
 	error = 0;
 	pcount = round_page(ap->a_count) / PAGE_SIZE;
 
 	/*
 	 * Calculate the offset of the transfer and do a sanity check.
 	 * FreeBSD currently only supports an 8 TB range due to b_blkno
 	 * being in DEV_BSIZE ( usually 512 ) byte chunks on call to
 	 * VOP_STRATEGY.  XXX
 	 */
 	offset = IDX_TO_OFF(ap->a_m[0]->pindex) + ap->a_offset;
 	blkno = btodb(offset);
 
 	/*
 	 * Round up physical size for real devices.  We cannot round using
 	 * v_mount's block size data because v_mount has nothing to do with
 	 * the device.  i.e. it's usually '/dev'.  We need the physical block
 	 * size for the device itself.
 	 *
 	 * We can't use v_rdev->si_mountpoint because it only exists when the
 	 * block device is mounted.  However, we can use v_rdev.
 	 */
 
 	if (vn_isdisk(vp, NULL))
 		blksiz = vp->v_rdev->si_bsize_phys;
 	else
 		blksiz = DEV_BSIZE;
 
 	size = (ap->a_count + blksiz - 1) & ~(blksiz - 1);
 
 	bp = getpbuf(NULL);
 	kva = (vm_offset_t)bp->b_data;
 
 	/*
 	 * Map the pages to be read into the kva.
 	 */
 	pmap_qenter(kva, ap->a_m, pcount);
 
 	/* Build a minimal buffer header. */
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = bdone;
 
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
 	bp->b_wcred = crhold(curthread->td_ucred);
 	bp->b_iooffset = offset;
 	bp->b_blkno = blkno;
 	bp->b_lblkno = blkno;
 	pbgetvp(ap->a_vp, bp);
 	bp->b_bcount = size;
 	bp->b_bufsize = size;
 	bp->b_resid = 0;
 	bp->b_runningbufspace = bp->b_bufsize;
 	runningbufspace += bp->b_runningbufspace;
 
 	cnt.v_vnodein++;
 	cnt.v_vnodepgsin += pcount;
 
 	/* Do the input. */
 	spec_xstrategy(bp->b_vp, bp);
 
 	s = splbio();
 	bwait(bp, PVM, "spread");
 	splx(s);
 
 	if ((bp->b_ioflags & BIO_ERROR) != 0) {
 		if (bp->b_error)
 			error = bp->b_error;
 		else
 			error = EIO;
 	}
 
 	nread = size - bp->b_resid;
 
 	if (nread < ap->a_count) {
 		bzero((caddr_t)kva + nread,
 			ap->a_count - nread);
 	}
 	pmap_qremove(kva, pcount);
 
 	gotreqpage = 0;
 	/*
 	 * While the page is busy, its object field is immutable.
 	 */
 	VM_OBJECT_LOCK(ap->a_m[ap->a_reqpage]->object);
 	vm_page_lock_queues();
 	for (i = 0, toff = 0; i < pcount; i++, toff = nextoff) {
 		nextoff = toff + PAGE_SIZE;
 		m = ap->a_m[i];
 
 		if (nextoff <= nread) {
 			m->valid = VM_PAGE_BITS_ALL;
 			vm_page_undirty(m);
 		} else if (toff < nread) {
 			/*
 			 * Since this is a VM request, we have to supply the
 			 * unaligned offset to allow vm_page_set_validclean()
 			 * to zero sub-DEV_BSIZE'd portions of the page.
 			 */
 			vm_page_set_validclean(m, 0, nread - toff);
 		} else {
 			m->valid = 0;
 			vm_page_undirty(m);
 		}
 
 		if (i != ap->a_reqpage) {
 			/*
 			 * Just in case someone was asking for this page we
 			 * now tell them that it is ok to use.
 			 */
 			if (!error || (m->valid == VM_PAGE_BITS_ALL)) {
 				if (m->valid) {
 					if (m->flags & PG_WANTED) {
 						vm_page_activate(m);
 					} else {
 						vm_page_deactivate(m);
 					}
 					vm_page_wakeup(m);
 				} else {
 					vm_page_free(m);
 				}
 			} else {
 				vm_page_free(m);
 			}
 		} else if (m->valid) {
 			gotreqpage = 1;
 			/*
 			 * Since this is a VM request, we need to make the
 			 * entire page presentable by zeroing invalid sections.
 			 */
 			if (m->valid != VM_PAGE_BITS_ALL)
 				vm_page_zero_invalid(m, FALSE);
 		}
 	}
 	vm_page_unlock_queues();
 	if (!gotreqpage) {
 		m = ap->a_m[ap->a_reqpage];
 		printf(
 	    "spec_getpages:(%s) I/O read failure: (error=%d) bp %p vp %p\n",
 			devtoname(bp->b_dev), error, bp, bp->b_vp);
 		printf(
 	    "               size: %d, resid: %ld, a_count: %d, valid: 0x%lx\n",
 		    size, bp->b_resid, ap->a_count, (u_long)m->valid);
 		printf(
 	    "               nread: %d, reqpage: %d, pindex: %lu, pcount: %d\n",
 		    nread, ap->a_reqpage, (u_long)m->pindex, pcount);
 		VM_OBJECT_UNLOCK(m->object);
 		/*
 		 * Free the buffer header back to the swap buffer pool.
 		 */
 		relpbuf(bp, NULL);
 		return VM_PAGER_ERROR;
 	}
 	VM_OBJECT_UNLOCK(ap->a_m[ap->a_reqpage]->object);
 	/*
 	 * Free the buffer header back to the swap buffer pool.
 	 */
 	relpbuf(bp, NULL);
 	return VM_PAGER_OK;
 }
Index: head/sys/i386/ibcs2/ibcs2_misc.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_misc.c	(revision 130550)
+++ head/sys/i386/ibcs2/ibcs2_misc.c	(revision 130551)
@@ -1,1205 +1,1205 @@
 /*
  * Copyright (c) 1995 Steven Wallace
  * Copyright (c) 1994, 1995 Scott Bartram
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This software was developed by the Computer Systems Engineering group
  * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
  * contributed to Berkeley.
  *
  * All advertising materials mentioning features or use of this software
  * must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Lawrence Berkeley Laboratory.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: Header: sun_misc.c,v 1.16 93/04/07 02:46:27 torek Exp 
  *
  *	@(#)sun_misc.c	8.1 (Berkeley) 6/18/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IBCS2 compatibility module.
  *
  * IBCS2 system calls that are implemented differently in BSD are
  * handled here.
  */
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/file.h>			/* Must come after sys/malloc.h */
 #include <sys/mutex.h>
 #include <sys/reboot.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/time.h>
 #include <sys/times.h>
 #include <sys/vnode.h>
 #include <sys/wait.h>
 
 #include <machine/cpu.h>
 
 #include <i386/ibcs2/ibcs2_dirent.h>
 #include <i386/ibcs2/ibcs2_signal.h>
 #include <i386/ibcs2/ibcs2_proto.h>
 #include <i386/ibcs2/ibcs2_unistd.h>
 #include <i386/ibcs2/ibcs2_util.h>
 #include <i386/ibcs2/ibcs2_utime.h>
 #include <i386/ibcs2/ibcs2_xenix.h>
 
 int
 ibcs2_ulimit(td, uap)
 	struct thread *td;
 	struct ibcs2_ulimit_args *uap;
 {
 	struct rlimit rl;
 	struct proc *p;
 	int error;
 #define IBCS2_GETFSIZE		1
 #define IBCS2_SETFSIZE		2
 #define IBCS2_GETPSIZE		3
 #define IBCS2_GETDTABLESIZE	4
 
 	p = td->td_proc;
 	switch (uap->cmd) {
 	case IBCS2_GETFSIZE:
 		PROC_LOCK(p);
 		td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE);
 		PROC_UNLOCK(p);
 		if (td->td_retval[0] == -1)
 			td->td_retval[0] = 0x7fffffff;
 		return 0;
 	case IBCS2_SETFSIZE:
 		PROC_LOCK(p);
 		rl.rlim_max = lim_max(p, RLIMIT_FSIZE);
 		PROC_UNLOCK(p);
 		rl.rlim_cur = uap->newlimit;
 		error = kern_setrlimit(td, RLIMIT_FSIZE, &rl);
 		if (!error) {
 			PROC_LOCK(p);
 			td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE);
 			PROC_UNLOCK(p);
 		} else {
 			DPRINTF(("failed "));
 		}
 		return error;
 	case IBCS2_GETPSIZE:
 		PROC_LOCK(p);
 		td->td_retval[0] = lim_cur(p, RLIMIT_RSS); /* XXX */
 		PROC_UNLOCK(p);
 		return 0;
 	case IBCS2_GETDTABLESIZE:
 		uap->cmd = IBCS2_SC_OPEN_MAX;
 		return ibcs2_sysconf(td, (struct ibcs2_sysconf_args *)uap);
 	default:
 		return ENOSYS;
 	}
 }
 
 #define IBCS2_WSTOPPED       0177
 #define IBCS2_STOPCODE(sig)  ((sig) << 8 | IBCS2_WSTOPPED)
 int
 ibcs2_wait(td, uap)
 	struct thread *td;
 	struct ibcs2_wait_args *uap;
 {
 	int error, options, status;
 	int *statusp;
 	pid_t pid;
         struct trapframe *tf = td->td_frame;
 	
 	if ((tf->tf_eflags & (PSL_Z|PSL_PF|PSL_N|PSL_V))
             == (PSL_Z|PSL_PF|PSL_N|PSL_V)) {
 		/* waitpid */
 		pid = uap->a1;
 		statusp = (int *)uap->a2;
 		options = uap->a3;
 	} else {
 		/* wait */
 		pid = WAIT_ANY;
 		statusp = (int *)uap->a1;
 		options = 0;
 	}
 	error = kern_wait(td, pid, &status, options, NULL);
 	if (error)
 		return error;
 	if (statusp) {
 		/*
 		 * Convert status/signal result.
 		 */
 		if (WIFSTOPPED(status)) {
 			if (WSTOPSIG(status) <= 0 ||
 			    WSTOPSIG(status) > IBCS2_SIGTBLSZ)
 				return (EINVAL);
 			status =
 			  IBCS2_STOPCODE(bsd_to_ibcs2_sig[_SIG_IDX(WSTOPSIG(status))]);
 		} else if (WIFSIGNALED(status)) {
 			if (WTERMSIG(status) <= 0 ||
 			    WTERMSIG(status) > IBCS2_SIGTBLSZ)
 				return (EINVAL);
 			status = bsd_to_ibcs2_sig[_SIG_IDX(WTERMSIG(status))];
 		}
 		/* else exit status -- identical */
 
 		/* record result/status */
 		td->td_retval[1] = status;
 		return copyout(&status, statusp, sizeof(status));
 	}
 
 	return 0;
 }
 
 int
 ibcs2_execv(td, uap)
 	struct thread *td;
 	struct ibcs2_execv_args *uap;
 {
 	struct execve_args ea;
 	caddr_t sg = stackgap_init();
 
         CHECKALTEXIST(td, &sg, uap->path);
 	ea.fname = uap->path;
 	ea.argv = uap->argp;
 	ea.envv = NULL;
 	return execve(td, &ea);
 }
 
 int
 ibcs2_execve(td, uap) 
         struct thread *td;
         struct ibcs2_execve_args *uap;
 {
         caddr_t sg = stackgap_init();
         CHECKALTEXIST(td, &sg, uap->path);
         return execve(td, (struct execve_args *)uap);
 }
 
 int
 ibcs2_umount(td, uap)
 	struct thread *td;
 	struct ibcs2_umount_args *uap;
 {
 	struct unmount_args um;
 
 	um.path = uap->name;
 	um.flags = 0;
 	return unmount(td, &um);
 }
 
 int
 ibcs2_mount(td, uap)
 	struct thread *td;
 	struct ibcs2_mount_args *uap;
 {
 #ifdef notyet
 	int oflags = uap->flags, nflags, error;
 	char fsname[MFSNAMELEN];
 
 	if (oflags & (IBCS2_MS_NOSUB | IBCS2_MS_SYS5))
 		return (EINVAL);
 	if ((oflags & IBCS2_MS_NEWTYPE) == 0)
 		return (EINVAL);
 	nflags = 0;
 	if (oflags & IBCS2_MS_RDONLY)
 		nflags |= MNT_RDONLY;
 	if (oflags & IBCS2_MS_NOSUID)
 		nflags |= MNT_NOSUID;
 	if (oflags & IBCS2_MS_REMOUNT)
 		nflags |= MNT_UPDATE;
 	uap->flags = nflags;
 
 	if (error = copyinstr((caddr_t)uap->type, fsname, sizeof fsname,
 			      (u_int *)0))
 		return (error);
 
 	if (strcmp(fsname, "4.2") == 0) {
 		uap->type = (caddr_t)STACK_ALLOC();
 		if (error = copyout("ufs", uap->type, sizeof("ufs")))
 			return (error);
 	} else if (strcmp(fsname, "nfs") == 0) {
 		struct ibcs2_nfs_args sna;
 		struct sockaddr_in sain;
 		struct nfs_args na;
 		struct sockaddr sa;
 
 		if (error = copyin(uap->data, &sna, sizeof sna))
 			return (error);
 		if (error = copyin(sna.addr, &sain, sizeof sain))
 			return (error);
 		bcopy(&sain, &sa, sizeof sa);
 		sa.sa_len = sizeof(sain);
 		uap->data = (caddr_t)STACK_ALLOC();
 		na.addr = (struct sockaddr *)((int)uap->data + sizeof na);
 		na.sotype = SOCK_DGRAM;
 		na.proto = IPPROTO_UDP;
 		na.fh = (nfsv2fh_t *)sna.fh;
 		na.flags = sna.flags;
 		na.wsize = sna.wsize;
 		na.rsize = sna.rsize;
 		na.timeo = sna.timeo;
 		na.retrans = sna.retrans;
 		na.hostname = sna.hostname;
 
 		if (error = copyout(&sa, na.addr, sizeof sa))
 			return (error);
 		if (error = copyout(&na, uap->data, sizeof na))
 			return (error);
 	}
 	return (mount(td, uap));
 #else
 	return EINVAL;
 #endif
 }
 
 /*
  * Read iBCS2-style directory entries.  We suck them into kernel space so
  * that they can be massaged before being copied out to user code.  Like
  * SunOS, we squish out `empty' entries.
  *
  * The descriptor uap->fd must be open for reading (else EBADF) and must
  * refer to a directory vnode (else EINVAL).  Up to uap->nbytes bytes of
  * converted entries are copied out to uap->buf; on success the number of
  * bytes produced is stored in td->td_retval[0] and the file offset is
  * advanced past the entries that were consumed.
  *
  * This is quite ugly, but what do you expect from compatibility code?
  */
 
 int
 ibcs2_getdents(td, uap)
 	struct thread *td;
 	register struct ibcs2_getdents_args *uap;
 {
 	register struct vnode *vp;
 	register caddr_t inp, buf;	/* BSD-format */
 	register int len, reclen;	/* BSD-format */
 	register caddr_t outp;		/* iBCS2-format */
 	register int resid;		/* iBCS2-format */
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	struct ibcs2_dirent idb;
 	off_t off;			/* true file offset */
 	int buflen, error, eofflag;
 	u_long *cookies = NULL, *cookiep;
 	int ncookies;
 	/*
 	 * Helper macros; they are not #undef'd here, so they remain defined
 	 * for the rest of the file.
 	 */
 #define	BSD_DIRENT(cp)		((struct dirent *)(cp))
 #define	IBCS2_RECLEN(reclen)	(reclen + sizeof(u_short))
 
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	/* From here on every exit path must drop the reference on fp. */
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 	if (vp->v_type != VDIR) {	/* XXX  vnode readdir op should do this */
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 
 	off = fp->f_offset;
 #define	DIRBLKSIZ	512		/* XXX we used to use ufs's DIRBLKSIZ */
 	/* Clamp the kernel bounce buffer to [DIRBLKSIZ, MAXBSIZE] bytes. */
 	buflen = max(DIRBLKSIZ, uap->nbytes);
 	buflen = min(buflen, MAXBSIZE);
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 again:
 	/* (Re)build the uio describing a read of buflen bytes at off. */
 	aiov.iov_base = buf;
 	aiov.iov_len = buflen;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = buflen;
 	auio.uio_offset = off;
 
 	/* Discard cookies left over from a previous pass through "again". */
 	if (cookies) {
 		free(cookies, M_TEMP);
 		cookies = NULL;
 	}
 
 #ifdef MAC
 	error = mac_check_vnode_readdir(td->td_ucred, vp);
 	if (error)
 		goto out;
 #endif
 
 	/*
 	 * First we read into the malloc'ed buffer, then
 	 * we massage it into user space, one record at a time.
 	 */
 	if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0)
 		goto out;
 	inp = buf;
 	outp = uap->buf;
 	resid = uap->nbytes;
 	/* Nothing was read: report zero bytes without touching f_offset. */
 	if ((len = buflen - auio.uio_resid) <= 0)
 		goto eof;
 
 	cookiep = cookies;
 
 	if (cookies) {
 		/*
 		 * When using cookies, the vfs has the option of reading from
 		 * a different offset than that supplied (UFS truncates the
 		 * offset to a block boundary to make sure that it never reads
 		 * partway through a directory entry, even if the directory
 		 * has been compacted).
 		 */
 		while (len > 0 && ncookies > 0 && *cookiep <= off) {
 			len -= BSD_DIRENT(inp)->d_reclen;
 			inp += BSD_DIRENT(inp)->d_reclen;
 			cookiep++;
 			ncookies--;
 		}
 	}
 
 	for (; len > 0; len -= reclen) {
 		if (cookiep && ncookies == 0)
 			break;
 		reclen = BSD_DIRENT(inp)->d_reclen;
 		/* Record lengths that are not a multiple of 4 are rejected. */
 		if (reclen & 3) {
 		        printf("ibcs2_getdents: reclen=%d\n", reclen);
 		        error = EFAULT;
 			goto out;
 		}
 		if (BSD_DIRENT(inp)->d_fileno == 0) {
 			inp += reclen;	/* it is a hole; squish it out */
 			if (cookiep) {
 				off = *cookiep++;
 				ncookies--;
 			} else
 				off += reclen;
 			continue;
 		}
 		if (reclen > len || resid < IBCS2_RECLEN(reclen)) {
 			/* entry too big for buffer, so just stop */
 			/*
 			 * Bumping outp makes it differ from uap->buf, so the
 			 * "squished out the whole block" test below does not
 			 * send us back to "again".
 			 */
 			outp++;
 			break;
 		}
 		/*
 		 * Massage in place to make an iBCS2-shaped dirent (otherwise
 		 * we have to worry about touching user memory outside of
 		 * the copyout() call).
 		 */
 		idb.d_ino = (ibcs2_ino_t)BSD_DIRENT(inp)->d_fileno;
 		idb.d_off = (ibcs2_off_t)off;
 		idb.d_reclen = (u_short)IBCS2_RECLEN(reclen);
 		/*
 		 * The fixed header is copied as 10 bytes and the name
 		 * (including its NUL) follows at offset 10.
 		 * NOTE(review): 10 is presumably the offset of the name
 		 * field in struct ibcs2_dirent -- confirm against its
 		 * declaration.
 		 */
 		if ((error = copyout((caddr_t)&idb, outp, 10)) != 0 ||
 		    (error = copyout(BSD_DIRENT(inp)->d_name, outp + 10,
 				     BSD_DIRENT(inp)->d_namlen + 1)) != 0)
 			goto out;
 		/* advance past this real entry */
 		if (cookiep) {
 			off = *cookiep++;
 			ncookies--;
 		} else
 			off += reclen;
 		inp += reclen;
 		/* advance output past iBCS2-shaped entry */
 		outp += IBCS2_RECLEN(reclen);
 		resid -= IBCS2_RECLEN(reclen);
 	}
 	/* if we squished out the whole block, try again */
 	if (outp == uap->buf)
 		goto again;
 	fp->f_offset = off;		/* update the vnode offset */
 eof:
 	td->td_retval[0] = uap->nbytes - resid;
 out:
 	/* Common exit: unlock the vnode, drop fp, release both buffers. */
 	VOP_UNLOCK(vp, 0, td);
 	fdrop(fp, td);
 	if (cookies)
 		free(cookies, M_TEMP);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * iBCS2 read().  For anything that is not a directory the call is passed
  * straight to the native read().  For a directory the BSD entries are read
  * into a kernel buffer and copied out as fixed-size records holding an
  * inode number and a name of at most 14 bytes.
  *
  * Returns 0 with the byte count in td->td_retval[0], EBADF when the
  * descriptor is not open for reading, or the error from getvnode(),
  * read(), VOP_READDIR() or copyout().
  */
 int
 ibcs2_read(td, uap)
 	struct thread *td;
 	struct ibcs2_read_args *uap;
 {
 	register struct vnode *vp;
 	register caddr_t inp, buf;	/* BSD-format */
 	register int len, reclen;	/* BSD-format */
 	register caddr_t outp;		/* iBCS2-format */
 	register int resid;		/* iBCS2-format */
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	struct ibcs2_direct {
 		ibcs2_ino_t ino;
 		char name[14];
 	} idb;
 	off_t off;			/* true file offset */
 	int buflen, error, eofflag, size;
 	u_long *cookies = NULL, *cookiep;
 	int ncookies;
 
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) {
 		/* EINVAL from getvnode() is handed to the native read(). */
 		if (error == EINVAL)
 			return read(td, (struct read_args *)uap);
 		else
 			return error;
 	}
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 	if (vp->v_type != VDIR) {
 		/* Not a directory: drop our reference and use plain read(). */
 		fdrop(fp, td);
 		return read(td, (struct read_args *)uap);
 	}
 
 	/*
 	 * vp is known to be a directory from here on; the second, unreachable
 	 * VDIR test that used to follow has been removed.
 	 */
 	off = fp->f_offset;
 
 	DPRINTF(("ibcs2_read: read directory\n"));
 
 	/* Clamp the kernel bounce buffer to [DIRBLKSIZ, MAXBSIZE] bytes. */
 	buflen = max(DIRBLKSIZ, uap->nbytes);
 	buflen = min(buflen, MAXBSIZE);
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 again:
 	aiov.iov_base = buf;
 	aiov.iov_len = buflen;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = buflen;
 	auio.uio_offset = off;
 
 	/* Discard cookies left over from a previous pass through "again". */
 	if (cookies) {
 		free(cookies, M_TEMP);
 		cookies = NULL;
 	}
 
 #ifdef MAC
 	error = mac_check_vnode_readdir(td->td_ucred, vp);
 	if (error)
 		goto out;
 #endif
 
 	/*
 	 * First we read into the malloc'ed buffer, then
 	 * we massage it into user space, one record at a time.
 	 */
 	if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) {
 		DPRINTF(("VOP_READDIR failed: %d\n", error));
 		goto out;
 	}
 	inp = buf;
 	outp = uap->buf;
 	resid = uap->nbytes;
 	/* Nothing was read: report zero bytes without touching f_offset. */
 	if ((len = buflen - auio.uio_resid) <= 0)
 		goto eof;
 
 	cookiep = cookies;
 
 	if (cookies) {
 		/*
 		 * When using cookies, the vfs has the option of reading from
 		 * a different offset than that supplied (UFS truncates the
 		 * offset to a block boundary to make sure that it never reads
 		 * partway through a directory entry, even if the directory
 		 * has been compacted).
 		 */
 		while (len > 0 && ncookies > 0 && *cookiep <= off) {
 			len -= BSD_DIRENT(inp)->d_reclen;
 			inp += BSD_DIRENT(inp)->d_reclen;
 			cookiep++;
 			ncookies--;
 		}
 	}
 
 	for (; len > 0 && resid > 0; len -= reclen) {
 		if (cookiep && ncookies == 0)
 			break;
 		reclen = BSD_DIRENT(inp)->d_reclen;
 		/* Record lengths that are not a multiple of 4 are rejected. */
 		if (reclen & 3) {
 			printf("ibcs2_read: reclen=%d\n", reclen);
 			error = EFAULT;
 			goto out;
 		}
 		if (BSD_DIRENT(inp)->d_fileno == 0) {
 			inp += reclen;	/* it is a hole; squish it out */
 			if (cookiep) {
 				off = *cookiep++;
 				ncookies--;
 			} else
 				off += reclen;
 			continue;
 		}
 		if (reclen > len || resid < sizeof(struct ibcs2_direct)) {
 			/*
 			 * Entry too big for buffer, so just stop.  Bumping
 			 * outp makes it differ from uap->buf so the retry
 			 * test after the loop does not fire.
 			 */
 			outp++;
 			break;
 		}
 		/*
 		 * Massage in place to make an iBCS2-shaped dirent (otherwise
 		 * we have to worry about touching user memory outside of
 		 * the copyout() call).
 		 *
 		 * TODO: if length(filename) > 14, then break filename into
 		 * multiple entries and set inode = 0xffff except last
 		 */
 		/* Inode numbers above 0xfffe are clamped to 0xfffe. */
 		idb.ino = (BSD_DIRENT(inp)->d_fileno > 0xfffe) ? 0xfffe :
 			BSD_DIRENT(inp)->d_fileno;
 		/* Copy at most 14 name bytes and zero-fill the remainder. */
 		(void)copystr(BSD_DIRENT(inp)->d_name, idb.name, 14, &size);
 		bzero(idb.name + size, 14 - size);
 		if ((error = copyout(&idb, outp, sizeof(struct ibcs2_direct))) != 0)
 			goto out;
 		/* advance past this real entry */
 		if (cookiep) {
 			off = *cookiep++;
 			ncookies--;
 		} else
 			off += reclen;
 		inp += reclen;
 		/* advance output past iBCS2-shaped entry */
 		outp += sizeof(struct ibcs2_direct);
 		resid -= sizeof(struct ibcs2_direct);
 	}
 	/* if we squished out the whole block, try again */
 	if (outp == uap->buf)
 		goto again;
 	fp->f_offset = off;		/* update the vnode offset */
 eof:
 	td->td_retval[0] = uap->nbytes - resid;
 out:
 	/* Common exit: unlock the vnode, drop fp, release both buffers. */
 	VOP_UNLOCK(vp, 0, td);
 	fdrop(fp, td);
 	if (cookies)
 		free(cookies, M_TEMP);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 int
 ibcs2_mknod(td, uap)
 	struct thread *td;
 	struct ibcs2_mknod_args *uap;
 {
         caddr_t sg = stackgap_init();
 
         CHECKALTCREAT(td, &sg, uap->path);
 	if (S_ISFIFO(uap->mode)) {
                 struct mkfifo_args ap;
                 ap.path = uap->path;
                 ap.mode = uap->mode;
 		return mkfifo(td, &ap);
 	} else {
                 struct mknod_args ap;
                 ap.path = uap->path;
                 ap.mode = uap->mode;
                 ap.dev = uap->dev;
                 return mknod(td, &ap);
 	}
 }
 
 /*
  * iBCS2 getgroups(): call the native getgroups() with a scratch gid_t
  * array in the stackgap, then narrow each entry to ibcs2_gid_t and copy
  * the result out to uap->gidset.
  *
  * A negative gidsetsize yields EINVAL; a value above NGROUPS_MAX is
  * clamped.  When gidsetsize is 0 only the native call is made and no
  * data is copied.  The loop below relies on getgroups() leaving the
  * number of groups in td->td_retval[0].
  */
 int
 ibcs2_getgroups(td, uap)
 	struct thread *td;
 	struct ibcs2_getgroups_args *uap;
 {
 	int error, i;
 	ibcs2_gid_t *iset = NULL;
 	struct getgroups_args sa;
 	gid_t *gp;
 	caddr_t sg = stackgap_init();
 
 	if (uap->gidsetsize < 0)
 		return (EINVAL);
 	if (uap->gidsetsize > NGROUPS_MAX)
 		uap->gidsetsize = NGROUPS_MAX;
 	sa.gidsetsize = uap->gidsetsize;
 	/* Do not hand an uninitialised pointer to getgroups(). */
 	sa.gidset = NULL;
 	if (uap->gidsetsize) {
 		/* Size the scratch arrays by element type, not pointer type. */
 		sa.gidset = stackgap_alloc(&sg, NGROUPS_MAX * sizeof(gid_t));
 		iset = stackgap_alloc(&sg, uap->gidsetsize *
 				      sizeof(ibcs2_gid_t));
 	}
 	if ((error = getgroups(td, &sa)) != 0)
 		return error;
 	if (uap->gidsetsize == 0)
 		return 0;
 
 	/* Narrow every native gid to the iBCS2 representation. */
 	for (i = 0, gp = sa.gidset; i < td->td_retval[0]; i++)
 		iset[i] = (ibcs2_gid_t)*gp++;
 	if (td->td_retval[0] && (error = copyout((caddr_t)iset,
 					  (caddr_t)uap->gidset,
 					  sizeof(ibcs2_gid_t) * td->td_retval[0])))
 		return error;
 	return 0;
 }
 
 /*
  * iBCS2 setgroups(): copy in uap->gidsetsize ibcs2_gid_t values from
  * uap->gidset, widen each one to gid_t in a stackgap array and call the
  * native setgroups().
  *
  * Returns EINVAL when gidsetsize is negative or exceeds NGROUPS_MAX, the
  * copyin() error if the user array cannot be read, otherwise whatever
  * setgroups() returns.
  */
 int
 ibcs2_setgroups(td, uap)
 	struct thread *td;
 	struct ibcs2_setgroups_args *uap;
 {
 	int error, i;
 	ibcs2_gid_t *iset;
 	struct setgroups_args sa;
 	gid_t *gp;
 	caddr_t sg = stackgap_init();
 
 	if (uap->gidsetsize < 0 || uap->gidsetsize > NGROUPS_MAX)
 		return (EINVAL);
 	sa.gidsetsize = uap->gidsetsize;
 	/* Size both arrays by their element type, not by a pointer type. */
 	sa.gidset = stackgap_alloc(&sg, sa.gidsetsize * sizeof(gid_t));
 	iset = stackgap_alloc(&sg, sa.gidsetsize * sizeof(ibcs2_gid_t));
 	if (sa.gidsetsize) {
 		/*
 		 * Read exactly gidsetsize elements; using the pointer size
 		 * here would read past the end of the caller's array.
 		 */
 		if ((error = copyin((caddr_t)uap->gidset, (caddr_t)iset,
 				   sizeof(ibcs2_gid_t) *
 				   uap->gidsetsize)) != 0)
 			return error;
 	}
 	/* Widen every iBCS2 gid to the native representation. */
 	for (i = 0, gp = sa.gidset; i < sa.gidsetsize; i++)
 		*gp++ = (gid_t)iset[i];
 	return setgroups(td, &sa);
 }
 
 int
 ibcs2_setuid(td, uap)
 	struct thread *td;
 	struct ibcs2_setuid_args *uap;
 {
 	struct setuid_args sa;
 
 	sa.uid = (uid_t)uap->uid;
 	return setuid(td, &sa);
 }
 
 int
 ibcs2_setgid(td, uap)
 	struct thread *td;
 	struct ibcs2_setgid_args *uap;
 {
 	struct setgid_args sa;
 
 	sa.gid = (gid_t)uap->gid;
 	return setgid(td, &sa);
 }
 
 int
 ibcs2_time(td, uap)
 	struct thread *td;
 	struct ibcs2_time_args *uap;
 {
 	struct timeval tv;
 
 	microtime(&tv);
 	td->td_retval[0] = tv.tv_sec;
 	if (uap->tp)
 		return copyout((caddr_t)&tv.tv_sec, (caddr_t)uap->tp,
 			       sizeof(ibcs2_time_t));
 	else
 		return 0;
 }
 
 /*
  * iBCS2 pathconf(): shift the selector by one and reuse the argument
  * structure for the native pathconf().
  */
 int
 ibcs2_pathconf(td, uap)
 	struct thread *td;
 	struct ibcs2_pathconf_args *uap;
 {
 
 	/* iBCS2 _PC_* defines are offset by one */
 	uap->name += 1;
 	return (pathconf(td, (struct pathconf_args *)uap));
 }
 
 /*
  * iBCS2 fpathconf(): shift the selector by one and reuse the argument
  * structure for the native fpathconf().
  */
 int
 ibcs2_fpathconf(td, uap)
 	struct thread *td;
 	struct ibcs2_fpathconf_args *uap;
 {
 
 	/* iBCS2 _PC_* defines are offset by one */
 	uap->name += 1;
 	return (fpathconf(td, (struct fpathconf_args *)uap));
 }
 
 int
 ibcs2_sysconf(td, uap)
 	struct thread *td;
 	struct ibcs2_sysconf_args *uap;
 {
 	int mib[2], value, len, error;
 	struct proc *p;
 
 	p = td->td_proc;
 	switch(uap->name) {
 	case IBCS2_SC_ARG_MAX:
 		mib[1] = KERN_ARGMAX;
 		break;
 
 	case IBCS2_SC_CHILD_MAX:
 		PROC_LOCK(p);
 		td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NPROC);
 		PROC_UNLOCK(p);
 		return 0;
 
 	case IBCS2_SC_CLK_TCK:
 		td->td_retval[0] = hz;
 		return 0;
 
 	case IBCS2_SC_NGROUPS_MAX:
 		mib[1] = KERN_NGROUPS;
 		break;
 
 	case IBCS2_SC_OPEN_MAX:
 		PROC_LOCK(p);
 		td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NOFILE);
 		PROC_UNLOCK(p);
 		return 0;
 		
 	case IBCS2_SC_JOB_CONTROL:
 		mib[1] = KERN_JOB_CONTROL;
 		break;
 		
 	case IBCS2_SC_SAVED_IDS:
 		mib[1] = KERN_SAVED_IDS;
 		break;
 		
 	case IBCS2_SC_VERSION:
 		mib[1] = KERN_POSIX1;
 		break;
 		
 	case IBCS2_SC_PASS_MAX:
 		td->td_retval[0] = 128;		/* XXX - should we create PASS_MAX ? */
 		return 0;
 
 	case IBCS2_SC_XOPEN_VERSION:
 		td->td_retval[0] = 2;		/* XXX: What should that be? */
 		return 0;
 		
 	default:
 		return EINVAL;
 	}
 
 	mib[0] = CTL_KERN;
 	len = sizeof(value);
 	error = kernel_sysctl(td, mib, 2, &value, &len, NULL, 0, NULL);
 	if (error)
 		return error;
 	td->td_retval[0] = value;
 	return 0;
 }
 
 int
 ibcs2_alarm(td, uap)
 	struct thread *td;
 	struct ibcs2_alarm_args *uap;
 {
 	int error;
         struct itimerval *itp, *oitp;
 	struct setitimer_args sa;
 	caddr_t sg = stackgap_init();
 
         itp = stackgap_alloc(&sg, sizeof(*itp));
 	oitp = stackgap_alloc(&sg, sizeof(*oitp));
         timevalclear(&itp->it_interval);
         itp->it_value.tv_sec = uap->sec;
         itp->it_value.tv_usec = 0;
 
 	sa.which = ITIMER_REAL;
 	sa.itv = itp;
 	sa.oitv = oitp;
         error = setitimer(td, &sa);
 	if (error)
 		return error;
         if (oitp->it_value.tv_usec)
                 oitp->it_value.tv_sec++;
         td->td_retval[0] = oitp->it_value.tv_sec;
         return 0;
 }
 
 int
 ibcs2_times(td, uap)
 	struct thread *td;
 	struct ibcs2_times_args *uap;
 {
 	int error;
 	struct getrusage_args ga;
 	struct tms tms;
         struct timeval t;
 	caddr_t sg = stackgap_init();
         struct rusage *ru = stackgap_alloc(&sg, sizeof(*ru));
 #define CONVTCK(r)      (r.tv_sec * hz + r.tv_usec / (1000000 / hz))
 
 	ga.who = RUSAGE_SELF;
 	ga.rusage = ru;
 	error = getrusage(td, &ga);
 	if (error)
                 return error;
         tms.tms_utime = CONVTCK(ru->ru_utime);
         tms.tms_stime = CONVTCK(ru->ru_stime);
 
 	ga.who = RUSAGE_CHILDREN;
         error = getrusage(td, &ga);
 	if (error)
 		return error;
         tms.tms_cutime = CONVTCK(ru->ru_utime);
         tms.tms_cstime = CONVTCK(ru->ru_stime);
 
 	microtime(&t);
         td->td_retval[0] = CONVTCK(t);
 	
 	return copyout((caddr_t)&tms, (caddr_t)uap->tp,
 		       sizeof(struct tms));
 }
 
 int
 ibcs2_stime(td, uap)
 	struct thread *td;
 	struct ibcs2_stime_args *uap;
 {
 	int error;
 	struct settimeofday_args sa;
 	caddr_t sg = stackgap_init();
 
 	sa.tv = stackgap_alloc(&sg, sizeof(*sa.tv));
 	sa.tzp = NULL;
 	if ((error = copyin((caddr_t)uap->timep,
 			   &(sa.tv->tv_sec), sizeof(long))) != 0)
 		return error;
 	sa.tv->tv_usec = 0;
 	if ((error = settimeofday(td, &sa)) != 0)
 		return EPERM;
 	return 0;
 }
 
 int
 ibcs2_utime(td, uap)
 	struct thread *td;
 	struct ibcs2_utime_args *uap;
 {
 	int error;
 	struct utimes_args sa;
 	struct timeval *tp;
 	caddr_t sg = stackgap_init();
 
         CHECKALTEXIST(td, &sg, uap->path);
 	sa.path = uap->path;
 	if (uap->buf) {
 		struct ibcs2_utimbuf ubuf;
 
 		if ((error = copyin((caddr_t)uap->buf, (caddr_t)&ubuf,
 				   sizeof(ubuf))) != 0)
 			return error;
 		sa.tptr = stackgap_alloc(&sg,
 						  2 * sizeof(struct timeval *));
 		tp = (struct timeval *)sa.tptr;
 		tp->tv_sec = ubuf.actime;
 		tp->tv_usec = 0;
 		tp++;
 		tp->tv_sec = ubuf.modtime;
 		tp->tv_usec = 0;
 	} else
 		sa.tptr = NULL;
 	return utimes(td, &sa);
 }
 
 /*
  * iBCS2 nice(): add uap->incr to the caller's current nice value through
  * setpriority(PRIO_PROCESS, 0, ...) and return the resulting nice value in
  * td->td_retval[0].  Any setpriority() failure is reported as EPERM.
  */
 int
 ibcs2_nice(td, uap)
 	struct thread *td;
 	struct ibcs2_nice_args *uap;
 {
 	int error;
 	struct setpriority_args sa;
 
 	sa.which = PRIO_PROCESS;
 	sa.who = 0;
 	/* This revision reads the nice value from the process, not the ksegrp. */
-	sa.prio = td->td_ksegrp->kg_nice + uap->incr;
+	sa.prio = td->td_proc->p_nice + uap->incr;
 	if ((error = setpriority(td, &sa)) != 0)
 		return EPERM;
 	/* Re-read the nice value after setpriority() for the return value. */
-	td->td_retval[0] = td->td_ksegrp->kg_nice;
+	td->td_retval[0] = td->td_proc->p_nice;
 	return 0;
 }
 
 /*
  * iBCS2 getpgrp, setpgrp, setsid, and setpgid
  */
 
 int
 ibcs2_pgrpsys(td, uap)
 	struct thread *td;
 	struct ibcs2_pgrpsys_args *uap;
 {
 	struct proc *p = td->td_proc;
 	switch (uap->type) {
 	case 0:			/* getpgrp */
 		PROC_LOCK(p);
 		td->td_retval[0] = p->p_pgrp->pg_id;
 		PROC_UNLOCK(p);
 		return 0;
 
 	case 1:			/* setpgrp */
 	    {
 		struct setpgid_args sa;
 
 		sa.pid = 0;
 		sa.pgid = 0;
 		setpgid(td, &sa);
 		PROC_LOCK(p);
 		td->td_retval[0] = p->p_pgrp->pg_id;
 		PROC_UNLOCK(p);
 		return 0;
 	    }
 
 	case 2:			/* setpgid */
 	    {
 		struct setpgid_args sa;
 
 		sa.pid = uap->pid;
 		sa.pgid = uap->pgid;
 		return setpgid(td, &sa);
 	    }
 
 	case 3:			/* setsid */
 		return setsid(td, NULL);
 
 	default:
 		return EINVAL;
 	}
 }
 
 /*
  * XXX - need to check for nested calls
  */
 
 /*
  * iBCS2 plock(): privileged no-op.  A caller that fails suser() gets
  * EPERM; the four known commands succeed without doing anything, and
  * every other command returns EINVAL.
  */
 int
 ibcs2_plock(td, uap)
 	struct thread *td;
 	struct ibcs2_plock_args *uap;
 {
 #define IBCS2_UNLOCK	0
 #define IBCS2_PROCLOCK	1
 #define IBCS2_TEXTLOCK	2
 #define IBCS2_DATALOCK	4
 
 	if (suser(td) != 0)
 		return (EPERM);
 
 	if (uap->cmd == IBCS2_UNLOCK || uap->cmd == IBCS2_PROCLOCK ||
 	    uap->cmd == IBCS2_TEXTLOCK || uap->cmd == IBCS2_DATALOCK)
 		return (0);	/* XXX - TODO */
 
 	return (EINVAL);
 }
 
 /*
  * iBCS2 uadmin(): administrative control, restricted to callers that pass
  * suser() (EPERM otherwise).
  *
  * SCO_A_REBOOT and SCO_A_SHUTDOWN translate uap->func into a reboot()
  * request: the halt/power-down functions use RB_HALT, the boot functions
  * use RB_AUTOBOOT, anything else is EINVAL.  If reboot() returns, its
  * result is passed back to the caller.  SCO_A_REMOUNT, SCO_A_CLOCK and
  * SCO_A_SETCONFIG are accepted and ignored; every other command returns
  * EINVAL.
  */
 int
 ibcs2_uadmin(td, uap)
 	struct thread *td;
 	struct ibcs2_uadmin_args *uap;
 {
 #define SCO_A_REBOOT        1
 #define SCO_A_SHUTDOWN      2
 #define SCO_A_REMOUNT       4
 #define SCO_A_CLOCK         8
 #define SCO_A_SETCONFIG     128
 #define SCO_A_GETDEV        130
 
 #define SCO_AD_HALT         0
 #define SCO_AD_BOOT         1
 #define SCO_AD_IBOOT        2
 #define SCO_AD_PWRDOWN      3
 #define SCO_AD_PWRNAP       4
 
 #define SCO_AD_PANICBOOT    1
 
 #define SCO_AD_GETBMAJ      0
 #define SCO_AD_GETCMAJ      1
 
 	struct reboot_args r;
 
 	if (suser(td))
 		return EPERM;
 
 	switch(uap->cmd) {
 	case SCO_A_REBOOT:
 	case SCO_A_SHUTDOWN:
 		switch(uap->func) {
 		case SCO_AD_HALT:
 		case SCO_AD_PWRDOWN:
 		case SCO_AD_PWRNAP:
 			r.opt = RB_HALT;
 			/*
 			 * Return here: a failed halt must not fall through
 			 * into the autoboot request below.
 			 */
 			return reboot(td, &r);
 		case SCO_AD_BOOT:
 		case SCO_AD_IBOOT:
 			r.opt = RB_AUTOBOOT;
 			return reboot(td, &r);
 		}
 		return EINVAL;
 	case SCO_A_REMOUNT:
 	case SCO_A_CLOCK:
 	case SCO_A_SETCONFIG:
 		return 0;
 	case SCO_A_GETDEV:
 		return EINVAL;	/* XXX - TODO */
 	}
 	return EINVAL;
 }
 
 /*
  * iBCS2 sysfs(): not implemented.  The three known commands are
  * recognised but, like every other value, end in EINVAL.
  */
 int
 ibcs2_sysfs(td, uap)
 	struct thread *td;
 	struct ibcs2_sysfs_args *uap;
 {
 #define IBCS2_GETFSIND        1
 #define IBCS2_GETFSTYP        2
 #define IBCS2_GETNFSTYP       3
 
 	switch (uap->cmd) {
 	case IBCS2_GETFSIND:
 	case IBCS2_GETFSTYP:
 	case IBCS2_GETNFSTYP:
 		/* Known command, nothing done for it yet. */
 		break;
 	}
 
 	/* XXX - TODO */
 	return (EINVAL);
 }
 
 /*
  * iBCS2 unlink(): run the path through CHECKALTEXIST, then call the
  * native unlink() on the same argument structure.
  */
 int
 ibcs2_unlink(td, uap)
 	struct thread *td;
 	struct ibcs2_unlink_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTEXIST(td, &sg, uap->path);
 
 	return (unlink(td, (struct unlink_args *)uap));
 }
 
 /*
  * iBCS2 chdir(): run the path through CHECKALTEXIST, then call the
  * native chdir() on the same argument structure.
  */
 int
 ibcs2_chdir(td, uap)
 	struct thread *td;
 	struct ibcs2_chdir_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTEXIST(td, &sg, uap->path);
 
 	return (chdir(td, (struct chdir_args *)uap));
 }
 
 /*
  * iBCS2 chmod(): run the path through CHECKALTEXIST, then call the
  * native chmod() on the same argument structure.
  */
 int
 ibcs2_chmod(td, uap)
 	struct thread *td;
 	struct ibcs2_chmod_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTEXIST(td, &sg, uap->path);
 
 	return (chmod(td, (struct chmod_args *)uap));
 }
 
 /*
  * iBCS2 chown(): run the path through CHECKALTEXIST, then call the
  * native chown() on the same argument structure.
  */
 int
 ibcs2_chown(td, uap)
 	struct thread *td;
 	struct ibcs2_chown_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTEXIST(td, &sg, uap->path);
 
 	return (chown(td, (struct chown_args *)uap));
 }
 
 /*
  * iBCS2 rmdir(): run the path through CHECKALTEXIST, then call the
  * native rmdir() on the same argument structure.
  */
 int
 ibcs2_rmdir(td, uap)
 	struct thread *td;
 	struct ibcs2_rmdir_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTEXIST(td, &sg, uap->path);
 
 	return (rmdir(td, (struct rmdir_args *)uap));
 }
 
 /*
  * iBCS2 mkdir(): run the path through CHECKALTCREAT, then call the
  * native mkdir() on the same argument structure.
  */
 int
 ibcs2_mkdir(td, uap)
 	struct thread *td;
 	struct ibcs2_mkdir_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTCREAT(td, &sg, uap->path);
 
 	return (mkdir(td, (struct mkdir_args *)uap));
 }
 
 /*
  * iBCS2 symlink(): the existing path goes through CHECKALTEXIST and the
  * link name through CHECKALTCREAT, in that order, before the native
  * symlink() is called on the same argument structure.
  */
 int
 ibcs2_symlink(td, uap)
 	struct thread *td;
 	struct ibcs2_symlink_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTEXIST(td, &sg, uap->path);
 	CHECKALTCREAT(td, &sg, uap->link);
 
 	return (symlink(td, (struct symlink_args *)uap));
 }
 
 /*
  * iBCS2 rename(): the source path goes through CHECKALTEXIST and the
  * destination through CHECKALTCREAT, in that order, before the native
  * rename() is called on the same argument structure.
  */
 int
 ibcs2_rename(td, uap)
 	struct thread *td;
 	struct ibcs2_rename_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTEXIST(td, &sg, uap->from);
 	CHECKALTCREAT(td, &sg, uap->to);
 
 	return (rename(td, (struct rename_args *)uap));
 }
 
 /*
  * iBCS2 readlink(): run the path through CHECKALTEXIST, then call the
  * native readlink() on the same argument structure.
  */
 int
 ibcs2_readlink(td, uap)
 	struct thread *td;
 	struct ibcs2_readlink_args *uap;
 {
 	caddr_t sg;
 
 	sg = stackgap_init();
 	CHECKALTEXIST(td, &sg, uap->path);
 
 	return (readlink(td, (struct readlink_args *)uap));
 }
Index: head/sys/kern/init_main.c
===================================================================
--- head/sys/kern/init_main.c	(revision 130550)
+++ head/sys/kern/init_main.c	(revision 130551)
@@ -1,733 +1,733 @@
 /*
  * Copyright (c) 1995 Terrence R. Lambert
  * All rights reserved.
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)init_main.c	8.9 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_init_path.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/exec.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/sysent.h>
 #include <sys/reboot.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/sysproto.h>
 #include <sys/vmmeter.h>
 #include <sys/unistd.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
 
 #include <machine/cpu.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/user.h>
 #include <sys/copyright.h>
 
 void mi_startup(void);				/* Should be elsewhere */
 
 /* Components of the first process -- never freed. */
 static struct session session0;
 static struct pgrp pgrp0;
 struct	proc proc0;
 struct	thread thread0;
 struct	kse kse0;
 struct	ksegrp ksegrp0;
 static struct filedesc0 filedesc0;
 struct	vmspace vmspace0;
 struct	proc *initproc;
 
 struct	vnode *rootvp;
 int	boothowto = 0;		/* initialized so that it can be patched */
 SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "");
 int	bootverbose;
 SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "");
 
 /*
  * This ensures that there is at least one entry so that the sysinit_set
  * symbol is not undefined.  A subsystem ID of SI_SUB_DUMMY is never
  * executed.
  */
 SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL)
 
 /*
  * The sysinit table itself.  Items are checked off as they are run.
  * If we want to register new sysinit types, add them to newsysinit.
  */
 SET_DECLARE(sysinit_set, struct sysinit);
 struct sysinit **sysinit, **sysinit_end;
 struct sysinit **newsysinit, **newsysinit_end;
 
 /*
  * Merge a new sysinit set into the current set, reallocating it if
  * necessary.  This can only be called after malloc is running.
  */
 void
 sysinit_add(struct sysinit **set, struct sysinit **set_end)
 {
 	struct sysinit **base, **base_end;	/* table being extended */
 	struct sysinit **merged;		/* freshly allocated table */
 	struct sysinit **src, **dst;
 	int total;
 
 	/*
 	 * Extend the pending table if one already exists, otherwise start
 	 * from the table currently in use.
 	 */
 	if (newsysinit) {
 		base = newsysinit;
 		base_end = newsysinit_end;
 	} else {
 		base = sysinit;
 		base_end = sysinit_end;
 	}
 
 	total = set_end - set;
 	total += base_end - base;
 	merged = malloc(total * sizeof(*merged), M_TEMP, M_NOWAIT);
 	if (merged == NULL)
 		panic("cannot malloc for sysinit");
 
 	/* Existing entries first, then the entries being added. */
 	dst = merged;
 	for (src = base; src < base_end; src++)
 		*dst++ = *src;
 	for (src = set; src < set_end; src++)
 		*dst++ = *src;
 
 	/* Only a previously allocated pending table is freed here. */
 	if (newsysinit)
 		free(newsysinit, M_TEMP);
 	newsysinit = merged;
 	newsysinit_end = merged + total;
 }
 
 /*
  * System startup; initialize the world, create process 0, mount root
  * filesystem, and fork to create init and pagedaemon.  Most of the
  * hard work is done in the lower-level initialization routines including
  * startup(), which does memory initialization and autoconfiguration.
  *
  * This allows simple addition of new kernel subsystems that require
  * boot time initialization.  It also allows substitution of subsystem
  * (for instance, a scheduler, kernel profiler, or VM system) by object
  * module.  Finally, it allows for optional "kernel threads".
  */
 void
 mi_startup(void)
 {
 	struct sysinit **cur;	/* slot being filled, then task being run */
 	struct sysinit **scan;	/* later slot compared against cur */
 	struct sysinit *tmp;	/* swap temporary */
 
 	/* First call: start from the linker-provided sysinit set. */
 	if (sysinit == NULL) {
 		sysinit = SET_BEGIN(sysinit_set);
 		sysinit_end = SET_LIMIT(sysinit_set);
 	}
 
 restart:
 	/*
 	 * Order the table in place by subsystem, then by order within a
 	 * subsystem.  Two entries are exchanged only when the earlier one
 	 * sorts strictly after the later one.
 	 */
 	for (cur = sysinit; cur < sysinit_end; cur++) {
 		for (scan = cur + 1; scan < sysinit_end; scan++) {
 			if ((*cur)->subsystem > (*scan)->subsystem ||
 			    ((*cur)->subsystem == (*scan)->subsystem &&
 			    (*cur)->order > (*scan)->order)) {
 				tmp = *cur;
 				*cur = *scan;
 				*scan = tmp;
 			}
 		}
 	}
 
 	/*
 	 * Walk the ordered table and run each task that has not run yet.
 	 *
 	 * The last item on the list is expected to be the scheduler,
 	 * which will not return.
 	 */
 	for (cur = sysinit; cur < sysinit_end; cur++) {
 		/* Dummy entries and entries already run are skipped. */
 		if ((*cur)->subsystem == SI_SUB_DUMMY ||
 		    (*cur)->subsystem == SI_SUB_DONE)
 			continue;
 
 		/* Run the task, then check it off. */
 		(*((*cur)->func))((*cur)->udata);
 		(*cur)->subsystem = SI_SUB_DONE;
 
 		/* No new sysinit items were installed: carry on. */
 		if (newsysinit == NULL)
 			continue;
 
 		/*
 		 * More items were installed (via KLD): adopt the merged
 		 * table, freeing the old one unless it is the linker set,
 		 * then sort and scan again from the top.
 		 */
 		if (sysinit != SET_BEGIN(sysinit_set))
 			free(sysinit, M_TEMP);
 		sysinit = newsysinit;
 		sysinit_end = newsysinit_end;
 		newsysinit = NULL;
 		newsysinit_end = NULL;
 		goto restart;
 	}
 
 	panic("Shouldn't get here!");
 	/* NOTREACHED*/
 }
 
 
 /*
  ***************************************************************************
  ****
  **** The following SYSINIT's belong elsewhere, but have not yet
  **** been moved.
  ****
  ***************************************************************************
  */
 /*
  * SYSINIT helper: print the NUL-terminated string passed as the sysinit
  * argument.  The parameter is used, so it carries no __unused marker.
  */
 static void
 print_caddr_t(void *data)
 {
 	printf("%s", (char *)data);
 }
 SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright)
 SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version)
 
 #ifdef WITNESS
 static char wit_warn[] =
      "WARNING: WITNESS option enabled, expect reduced performance.\n";
 SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_SECOND + 1,
    print_caddr_t, wit_warn)
 #endif
 
 #ifdef DIAGNOSTIC
 static char diag_warn[] =
      "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
 SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_SECOND + 2,
     print_caddr_t, diag_warn)
 #endif
 
 /*
  * SYSINIT helper: bump bootverbose when RB_VERBOSE is set in boothowto.
  */
 static void
 set_boot_verbose(void *data __unused)
 {
 
 	if ((boothowto & RB_VERBOSE) == 0)
 		return;
 	bootverbose++;
 }
 SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL)
 
 /*
  * Placeholder system-call vector named "null".  proc0_init() installs it
  * as proc0's p_sysent and takes the initial vm_map bounds from its
  * sv_minuser/sv_maxuser members.
  *
  * The initializer is positional, so entries must stay in the declaration
  * order of struct sysentvec.
  * NOTE(review): the member that each position maps to is not visible
  * here -- check the struct declaration before adding or reordering
  * entries.
  */
 struct sysentvec null_sysvec = {
 	0,
 	NULL,
 	0,
 	0,
 	NULL,
 	0,
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	"null",
 	NULL,
 	NULL,
 	0,
 	PAGE_SIZE,
 	VM_MIN_ADDRESS,
 	VM_MAXUSER_ADDRESS,
 	USRSTACK,
 	PS_STRINGS,
 	VM_PROT_ALL,
 	NULL,
 	NULL,
 	NULL
 };
 
 /*
  ***************************************************************************
  ****
  **** The two following SYSINIT's are proc0 specific glue code.  I am not
  **** convinced that they can not be safely combined, but their order of
  **** operation has been maintained as the same as the original init_main.c
  **** for right now.
  ****
  **** These probably belong in init_proc.c or kern_proc.c, since they
  **** deal with proc0 (the fork template process).
  ****
  ***************************************************************************
  */
 /* ARGSUSED*/
 static void
 proc0_init(void *dummy __unused)
 {
 	register struct proc		*p;
 	register struct filedesc0	*fdp;
 	register unsigned i;
 	struct thread *td;
 	struct ksegrp *kg;
 	struct kse *ke;
 
 	GIANT_REQUIRED;
 	p = &proc0;
 	td = &thread0;
 	ke = &kse0;
 	kg = &ksegrp0;
 
 	ke->ke_sched = kse0_sched;
 	kg->kg_sched = ksegrp0_sched;
 	p->p_sched = proc0_sched;
 	td->td_sched = thread0_sched;
 
 	/*
 	 * Initialize magic number.
 	 */
 	p->p_magic = P_MAGIC;
 
 	/*
 	 * Initialize thread, process and pgrp structures.
 	 */
 	procinit();
 	threadinit();
 
 	/*
 	 * Initialize sleep queue hash table
 	 */
 	sleepinit();
 
 	/*
 	 * additional VM structures
 	 */
 	vm_init2();
 
 	/*
 	 * Create process 0 (the swapper).
 	 */
 	LIST_INSERT_HEAD(&allproc, p, p_list);
 	LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
 	mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
 	p->p_pgrp = &pgrp0;
 	LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
 	LIST_INIT(&pgrp0.pg_members);
 	LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
 
 	pgrp0.pg_session = &session0;
 	mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
 	session0.s_count = 1;
 	session0.s_leader = p;
 
 	p->p_sysent = &null_sysvec;
 
 	/*
 	 * proc_linkup was already done in init_i386() or alphainit() etc.
 	 * because the earlier code needed to follow td->td_proc. Otherwise
 	 * I would have done it here.. maybe this means this should be
 	 * done earlier too.
 	 */
 	p->p_flag = P_SYSTEM;
 	p->p_sflag = PS_INMEM;
 	p->p_state = PRS_NORMAL;
+	p->p_nice = NZERO;
 	td->td_state = TDS_RUNNING;
-	kg->kg_nice = NZERO;
 	kg->kg_pri_class = PRI_TIMESHARE;
 	kg->kg_user_pri = PUSER;
 	td->td_priority = PVM;
 	td->td_base_pri = PUSER;
 	td->td_kse = ke; /* XXXKSE */
 	td->td_oncpu = 0;
 	ke->ke_state = KES_THREAD;
 	ke->ke_thread = td;
 	p->p_peers = 0;
 	p->p_leader = p;
 
 
 	bcopy("swapper", p->p_comm, sizeof ("swapper"));
 
 	callout_init(&p->p_itcallout, CALLOUT_MPSAFE);
 	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
 
 	/* Create credentials. */
 	p->p_ucred = crget();
 	p->p_ucred->cr_ngroups = 1;	/* group 0 */
 	p->p_ucred->cr_uidinfo = uifind(0);
 	p->p_ucred->cr_ruidinfo = uifind(0);
 	p->p_ucred->cr_prison = NULL;	/* Don't jail it. */
 #ifdef MAC
 	mac_create_proc0(p->p_ucred);
 #endif
 	td->td_ucred = crhold(p->p_ucred);
 
 	/* Create sigacts. */
 	p->p_sigacts = sigacts_alloc();
 
 	/* Initialize signal state for process 0. */
 	siginit(&proc0);
 
 	/* Create the file descriptor table. */
 	/* XXX this duplicates part of fdinit() */
 	fdp = &filedesc0;
 	p->p_fd = &fdp->fd_fd;
 	p->p_fdtol = NULL;
 	mtx_init(&fdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
 	fdp->fd_fd.fd_refcnt = 1;
 	fdp->fd_fd.fd_cmask = CMASK;
 	fdp->fd_fd.fd_ofiles = fdp->fd_dfiles;
 	fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags;
 	fdp->fd_fd.fd_nfiles = NDFILE;
 	fdp->fd_fd.fd_map = fdp->fd_dmap;
 
 	/* Create the limits structures. */
 	p->p_limit = lim_alloc();
 	for (i = 0; i < RLIM_NLIMITS; i++)
 		p->p_limit->pl_rlimit[i].rlim_cur =
 		    p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
 	p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
 	    p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
 	p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
 	    p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
 	i = ptoa(cnt.v_free_count);
 	p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = i;
 	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i;
 	p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3;
 	p->p_cpulimit = RLIM_INFINITY;
 
 	/* Allocate a prototype map so we have something to fork. */
 	pmap_pinit0(vmspace_pmap(&vmspace0));
 	p->p_vmspace = &vmspace0;
 	vmspace0.vm_refcnt = 1;
 	vm_map_init(&vmspace0.vm_map, p->p_sysent->sv_minuser,
 	    p->p_sysent->sv_maxuser);
 	vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0);
 
 	/*
 	 * We continue to place resource usage info
 	 * in the user struct so that it's pageable.
 	 */
 	p->p_stats = &p->p_uarea->u_stats;
 
 	/*
 	 * Charge root for one process.
 	 */
 	(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
 }
 SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)
 
 /*
  * Late fix-up of process accounting, run at SI_SUB_INTRINSIC_POST.
  * The argument is unused.
  */
 /* ARGSUSED*/
 static void
 proc0_post(void *dummy __unused)
 {
 	struct timespec now;
 	struct proc *proc;
 
 	/*
 	 * The clock is usable at this point (it has had a chance to be
 	 * verified against the filesystem), so restart the accounting of
 	 * every existing process: stamp its start time with the current
 	 * uptime and forget any run time accumulated so far.
 	 */
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(proc, &allproc, p_list) {
 		microuptime(&proc->p_stats->p_start);
 		proc->p_runtime.frac = 0;
 		proc->p_runtime.sec = 0;
 	}
 	sx_sunlock(&allproc_lock);
 
 	/* Re-base this CPU's context switch time and tick count as well. */
 	binuptime(PCPU_PTR(switchtime));
 	PCPU_SET(switchticks, ticks);
 
 	/*
 	 * Seed the ``random'' number generator from the current time.
 	 */
 	nanotime(&now);
 	srandom(now.tv_sec ^ now.tv_nsec);
 }
 SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)
 
 /*
  ***************************************************************************
  ****
  **** The following SYSINIT's and glue code should be moved to the
  **** respective files on a per subsystem basis.
  ****
  ***************************************************************************
  */
 
 
 /*
  ***************************************************************************
  ****
  **** The following code probably belongs in another file, like
  **** kern/init_init.c.
  ****
  ***************************************************************************
  */
 
 /*
  * List of paths to try when searching for "init".
  *
  * The entries are separated by ':' and are tried in order by start_init().
  * The compiled-in default may be replaced at build time by defining
  * INIT_PATH, and start_init() overwrites the buffer with the value of the
  * "init_path" kernel environment variable when that variable is set.
  * The current value is exported read-only through sysctl.
  */
 static char init_path[MAXPATHLEN] =
 #ifdef	INIT_PATH
     __XSTRING(INIT_PATH);
 #else
     "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall";
 #endif
 SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
 	"Path used to search the init process");
 
 /*
  * Start the initial user process; try exec'ing each pathname in init_path.
  * The program is invoked with one argument containing the boot flags.
  *
  * Installed by create_init() as the fork handler of init's first thread.
  * Mounts the root filesystem, sets the process's current and root
  * directories to the root vnode, tries to mount devfs on /dev, builds a
  * fake argument vector at the top of the user stack and calls execve().
  * Returns only after a successful execve(); if no entry of init_path can
  * be executed the function panics.  The dummy argument is unused.
  */
 static void
 start_init(void *dummy)
 {
 	vm_offset_t addr;
 	struct execve_args args;
 	int options, error;
 	char *var, *path, *next, *s;
 	char *ucp, **uap, *arg0, *arg1;
 	struct thread *td;
 	struct proc *p;
 	int init_does_devfs = 0;
 
 	/* Giant is held until execve() succeeds (released just before return). */
 	mtx_lock(&Giant);
 
 	GIANT_REQUIRED;
 
 	td = curthread;
 	p = td->td_proc;
 
 	vfs_mountroot();
 
 	/* Get the vnode for '/'.  Set p->p_fd->fd_cdir to reference it. */
 	if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode))
 		panic("cannot find root vnode");
 	FILEDESC_LOCK(p->p_fd);
 	p->p_fd->fd_cdir = rootvnode;
 	VREF(p->p_fd->fd_cdir);
 	p->p_fd->fd_rdir = rootvnode;
 	VREF(p->p_fd->fd_rdir);
 	FILEDESC_UNLOCK(p->p_fd);
 	VOP_UNLOCK(rootvnode, 0, td);
 #ifdef MAC
 	mac_create_root_mount(td->td_ucred, TAILQ_FIRST(&mountlist));
 #endif
 
 	/*
 	 * For disk based systems, we probably cannot do this yet
 	 * since the fs will be read-only.  But a NFS root
 	 * might be ok.  It is worth a shot.
 	 *
 	 * An already existing /dev is not an error.  If either the mkdir
 	 * or the devfs mount fails, remember that so the 'd' flag is added
 	 * to init's boot flag argument below.
 	 */
 	error = kern_mkdir(td, "/dev", UIO_SYSSPACE, 0700);
 	if (error == EEXIST)
 		error = 0;
 	if (error == 0)
 		error = kernel_vmount(0, "fstype", "devfs",
 		    "fspath", "/dev", NULL);
 	if (error != 0)
 		init_does_devfs = 1;
 
 	/*
 	 * Need just enough stack to hold the faked-up "execve()" arguments.
 	 * A single page ending at sv_usrstack is mapped for that purpose.
 	 */
 	addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
 	if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
 			FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
 		panic("init: couldn't allocate argument space");
 	p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
 	p->p_vmspace->vm_ssize = 1;
 
 	/* The kernel environment may override the compiled-in search path. */
 	if ((var = getenv("init_path")) != NULL) {
 		strlcpy(init_path, var, sizeof(init_path));
 		freeenv(var);
 	}
 	
 	/*
 	 * Walk the ':'-separated entries of init_path.  On each iteration
 	 * [path, next) delimits the current entry; empty entries are skipped.
 	 */
 	for (path = init_path; *path != '\0'; path = next) {
 		while (*path == ':')
 			path++;
 		if (*path == '\0')
 			break;
 		for (next = path; *next != '\0' && *next != ':'; next++)
 			/* nothing */ ;
 		if (bootverbose)
 			printf("start_init: trying %.*s\n", (int)(next - path),
 			    path);
 			
 		/*
 		 * Move out the boot flag argument.
 		 *
 		 * The strings are written one byte at a time, back to
 		 * front, growing downward from the top of the user stack.
 		 * The result is "-" followed by the flag letters, or "--"
 		 * when no flag applies.
 		 */
 		options = 0;
 		ucp = (char *)p->p_sysent->sv_usrstack;
 		(void)subyte(--ucp, 0);		/* trailing zero */
 		if (boothowto & RB_SINGLE) {
 			(void)subyte(--ucp, 's');
 			options = 1;
 		}
 #ifdef notyet
                 if (boothowto & RB_FASTBOOT) {
 			(void)subyte(--ucp, 'f');
 			options = 1;
 		}
 #endif
 
 #ifdef BOOTCDROM
 		(void)subyte(--ucp, 'C');
 		options = 1;
 #endif
 		if (init_does_devfs) {
 			(void)subyte(--ucp, 'd');
 			options = 1;
 		}
 
 		if (options == 0)
 			(void)subyte(--ucp, '-');
 		(void)subyte(--ucp, '-');		/* leading hyphen */
 		arg1 = ucp;
 
 		/*
 		 * Move out the file name (also arg 0).
 		 */
 		(void)subyte(--ucp, 0);
 		for (s = next - 1; s >= path; s--)
 			(void)subyte(--ucp, *s);
 		arg0 = ucp;
 
 		/*
 		 * Move out the arg pointers.
 		 * The vector is placed below the strings, with the address
 		 * rounded down to a multiple of sizeof(intptr_t).
 		 */
 		uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
 		(void)suword((caddr_t)--uap, (long)0);	/* terminator */
 		(void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
 		(void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
 
 		/*
 		 * Point at the arguments.
 		 */
 		args.fname = arg0;
 		args.argv = uap;
 		args.envv = NULL;
 
 		/*
 		 * Now try to exec the program.  If can't for any reason
 		 * other than it doesn't exist, complain.
 		 *
 		 * Otherwise, return via fork_trampoline() all the way
 		 * to user mode as init!
 		 */
 		if ((error = execve(td, &args)) == 0) {
 			mtx_unlock(&Giant);
 			return;
 		}
 		if (error != ENOENT)
 			printf("exec %.*s: error %d\n", (int)(next - path), 
 			    path, error);
 	}
 	/* Every entry failed: there is nothing left to run. */
 	printf("init: not found in path %s\n", init_path);
 	panic("no init");
 }
 
 /*
  * Like kthread_create(), but runs in its own address space.
  * We do this early to reserve pid 1.
  *
  * Note special case - do not make it runnable yet.  Other work
  * in progress will change this more.
  *
  * The process is forked from thread0 in the stopped state (RFSTOPPED),
  * given its own copy of the credentials, marked P_SYSTEM and PS_INMEM,
  * and its first thread is set up to enter start_init().  kick_init()
  * makes that thread runnable later.  The udata argument is unused.
  */
 static void
 create_init(const void *udata __unused)
 {
 	struct ucred *newcred, *oldcred;
 	int error;
 
 	error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc);
 	if (error)
 		panic("cannot fork init: %d\n", error);
 	KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
 	/* divorce init's credentials from the kernel's */
 	newcred = crget();	/* obtained before the process lock is taken */
 	PROC_LOCK(initproc);
 	initproc->p_flag |= P_SYSTEM;
 	oldcred = initproc->p_ucred;
 	crcopy(newcred, oldcred);
 #ifdef MAC
 	mac_create_proc1(newcred);
 #endif
 	initproc->p_ucred = newcred;
 	PROC_UNLOCK(initproc);
 	/* Drop the reference to the old credentials after unlocking. */
 	crfree(oldcred);
 	cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
 	/* p_sflag is modified with sched_lock held. */
 	mtx_lock_spin(&sched_lock);
 	initproc->p_sflag |= PS_INMEM;
 	mtx_unlock_spin(&sched_lock);
 	/* Have init's first thread begin execution in start_init(). */
 	cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
 }
 SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)
 
 /*
  * Make init runnable now: with sched_lock held, mark the first thread of
  * initproc as able to run and place it on a run queue.  The udata
  * argument is unused.
  */
 static void
 kick_init(const void *udata __unused)
 {
 	struct thread *initthread = FIRST_THREAD_IN_PROC(initproc);
 
 	mtx_lock_spin(&sched_lock);
 	TD_SET_CAN_RUN(initthread);
 	setrunqueue(initthread);	/* XXXKSE */
 	mtx_unlock_spin(&sched_lock);
 }
 SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
Index: head/sys/kern/kern_clock.c
===================================================================
--- head/sys/kern/kern_clock.c	(revision 130550)
+++ head/sys/kern/kern_clock.c	(revision 130551)
@@ -1,555 +1,555 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ntp.h"
 #include "opt_ddb.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/ktr.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/sysctl.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/limits.h>
 #include <sys/timetc.h>
 
 #include <machine/cpu.h>
 
 #ifdef GPROF
 #include <sys/gmon.h>
 #endif
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #ifdef DEVICE_POLLING
 extern void hardclock_device_poll(void);
 #endif /* DEVICE_POLLING */
 
 static void initclocks(void *dummy);
 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
 
 /* Some of these don't belong here, but it's easiest to concentrate them. */
 long cp_time[CPUSTATES];
 
 SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time),
     "LU", "CPU time statistics");
 
 #ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
 
 static int watchdog_ticks;
 static int watchdog_enabled;
 static void watchdog_fire(void);
 static void watchdog_config(void *, u_int, int *);
 #endif /* SW_WATCHDOG */
 
 /*
  * Clock handling routines.
  *
  * This code is written to operate with two timers that run independently of
  * each other.
  *
  * The main timer, running hz times per second, is used to trigger interval
  * timers, timeouts and rescheduling as needed.
  *
  * The second timer handles kernel and user profiling,
  * and does resource use estimation.  If the second timer is programmable,
  * it is randomized to avoid aliasing between the two clocks.  For example,
  * the randomization prevents an adversary from always giving up the cpu
  * just before its quantum expires.  Otherwise, it would never accumulate
  * cpu ticks.  The mean frequency of the second timer is stathz.
  *
  * If no second timer exists, stathz will be zero; in this case we drive
  * profiling and statistics off the main clock.  This WILL NOT be accurate;
  * do not do it unless absolutely necessary.
  *
  * The statistics clock may (or may not) be run at a higher rate while
  * profiling.  This profile clock runs at profhz.  We require that profhz
  * be an integral multiple of stathz.
  *
  * If the statistics clock is running fast, it must be divided by the ratio
  * profhz/stathz for statistics.  (For profiling, every tick counts.)
  *
  * Time-of-day is maintained using a "timecounter", which may or may
  * not be related to the hardware generating the above mentioned
  * interrupts.
  */
 
 int	stathz;
 int	profhz;
 int	profprocs;
 int	ticks;
 int	psratio;
 
 /*
  * Initialize clock frequencies and start both clocks running.
  * The dummy argument is unused.
  */
 /* ARGSUSED*/
 static void
 initclocks(dummy)
 	void *dummy;
 {
 	register int statrate;
 
 	/*
 	 * Let the machine-specific code set up and start the clocks
 	 * (divisors are 1 in the normal case).
 	 */
 	cpu_initclocks();
 
 	/*
 	 * The statistics rate is stathz, or hz when there is no separate
 	 * statistics clock.  Default profhz to that rate when it was left
 	 * at zero, then derive the profhz/stathz ratio.
 	 */
 	if (stathz != 0)
 		statrate = stathz;
 	else
 		statrate = hz;
 	if (profhz == 0)
 		profhz = statrate;
 	psratio = profhz / statrate;
 #ifdef SW_WATCHDOG
 	EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0);
 #endif
 }
 
 /*
  * Each time the real-time timer fires, this function is called on all CPUs.
  * Note that hardclock() calls hardclock_process() for the boot CPU, so only
  * the other CPUs in the system need to call this function.
  *
  * Charges one tick to the current process's ITIMER_VIRTUAL and ITIMER_PROF
  * interval timers.  When itimerdecr() returns 0 for a timer, the matching
  * pending flag is set on the process and an AST is requested on the
  * current thread.  Processes with P_SA set are currently skipped.
  */
 void
 hardclock_process(frame)
 	register struct clockframe *frame;
 {
 	struct pstats *pstats;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 
 	/*
 	 * Run current process's virtual and profile time, as needed.
 	 */
 	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
 	if (p->p_flag & P_SA) {
 		/* XXXKSE What to do? */
 	} else {
 		pstats = p->p_stats;
 		/* The virtual timer only runs when the tick hit user mode. */
 		if (CLKF_USERMODE(frame) &&
 		    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
 		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
 			p->p_sflag |= PS_ALRMPEND;
 			td->td_flags |= TDF_ASTPENDING;
 		}
 		/* The profiling timer runs in both user and kernel mode. */
 		if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
 		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
 			p->p_sflag |= PS_PROFPEND;
 			td->td_flags |= TDF_ASTPENDING;
 		}
 	}
 	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
 }
 
 /*
  * The real-time timer, interrupting hz times per second.
  *
  * Runs the per-process interval timers, ticks the timecounter code,
  * drives the profiling and statistics clocks when there is no separate
  * statistics clock, advances the global tick count, schedules the
  * softclock software interrupt when callouts are due, and (with
  * SW_WATCHDOG) counts down the software watchdog.
  */
 void
 hardclock(frame)
 	register struct clockframe *frame;
 {
 	int need_softclock = 0;
 
 	CTR0(KTR_CLK, "hardclock fired");
 	hardclock_process(frame);
 
 	tc_ticktock();
 	/*
 	 * If no separate statistics clock is available, run it from here.
 	 *
 	 * XXX: this only works for UP
 	 */
 	if (stathz == 0) {
 		profclock(frame);
 		statclock(frame);
 	}
 
 #ifdef DEVICE_POLLING
 	hardclock_device_poll();	/* this is very short and quick */
 #endif /* DEVICE_POLLING */
 
 	/*
 	 * Process callouts at a very low cpu priority, so we don't keep the
 	 * relatively high clock interrupt priority any longer than necessary.
 	 *
 	 * If the callwheel bucket for the new tick is non-empty, softclock
 	 * has work to do.  Otherwise, when softticks is exactly one behind,
 	 * advance it here so it keeps up with ticks.
 	 */
 	mtx_lock_spin_flags(&callout_lock, MTX_QUIET);
 	ticks++;
 	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
 		need_softclock = 1;
 	} else if (softticks + 1 == ticks)
 		++softticks;
 	mtx_unlock_spin_flags(&callout_lock, MTX_QUIET);
 
 	/*
 	 * swi_sched acquires sched_lock, so we don't want to call it with
 	 * callout_lock held; incorrect locking order.
 	 */
 	if (need_softclock)
 		swi_sched(softclock_ih, 0);
 
 #ifdef SW_WATCHDOG
 	/* The watchdog fires once its tick budget has been used up. */
 	if (watchdog_enabled > 0 && --watchdog_ticks <= 0)
 		watchdog_fire();
 #endif /* SW_WATCHDOG */
 }
 
 /*
  * Compute number of ticks in the specified amount of time.
  *
  * tv holds a time difference.  The result is rounded up, includes one
  * extra tick to allow for the current tick to expire, and is clamped to
  * INT_MAX.  A negative difference yields 1 (and, under DIAGNOSTIC, a
  * console message).
  */
 int
 tvtohz(tv)
 	struct timeval *tv;
 {
 	register unsigned long ticks;
 	register long sec, usec;
 
 	/*
 	 * If the number of usecs in the whole seconds part of the time
 	 * difference fits in a long, then the total number of usecs will
 	 * fit in an unsigned long.  Compute the total and convert it to
 	 * ticks, rounding up and adding 1 to allow for the current tick
 	 * to expire.  Rounding also depends on unsigned long arithmetic
 	 * to avoid overflow.
 	 *
 	 * Otherwise, if the number of ticks in the whole seconds part of
 	 * the time difference fits in a long, then convert the parts to
 	 * ticks separately and add, using similar rounding methods and
 	 * overflow avoidance.  This method would work in the previous
 	 * case but it is slightly slower and assumes that hz is integral.
 	 *
 	 * Otherwise, round the time difference down to the maximum
 	 * representable value.
 	 *
 	 * If ints have 32 bits, then the maximum value for any timeout in
 	 * 10ms ticks is 248 days.
 	 */
 	sec = tv->tv_sec;
 	usec = tv->tv_usec;
 	/* Normalize so that usec is not negative. */
 	if (usec < 0) {
 		sec--;
 		usec += 1000000;
 	}
 	if (sec < 0) {
 #ifdef DIAGNOSTIC
 		/* Undo the normalization so the message shows a signed pair. */
 		if (usec > 0) {
 			sec++;
 			usec -= 1000000;
 		}
 		printf("tvtohz: negative time difference %ld sec %ld usec\n",
 		       sec, usec);
 #endif
 		ticks = 1;
 	} else if (sec <= LONG_MAX / 1000000)
 		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
 			/ tick + 1;
 	else if (sec <= LONG_MAX / hz)
 		ticks = sec * hz
 			+ ((unsigned long)usec + (tick - 1)) / tick + 1;
 	else
 		ticks = LONG_MAX;
 	if (ticks > INT_MAX)
 		ticks = INT_MAX;
 	return ((int)ticks);
 }
 
 /*
  * Start profiling on a process.
  *
  * Kernel profiling passes proc0 which never exits and hence
  * keeps the profile clock running constantly.
  *
  * The caller must hold the process lock (asserted).  Nothing is done
  * while P_STOPPROF is set or when the process already has P_PROFIL.
  * Otherwise P_PROFIL is set and, if this is the first profiled process,
  * the profiling clock is started.
  */
 void
 startprofclock(p)
 	register struct proc *p;
 {
 
 	/*
 	 * XXX; Right now sched_lock protects statclock(), but perhaps
 	 * it should be protected later on by a time_lock, which would
 	 * cover psdiv, etc. as well.
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (p->p_flag & P_STOPPROF)
 		return;
 	if ((p->p_flag & P_PROFIL) == 0) {
 		mtx_lock_spin(&sched_lock);
 		p->p_flag |= P_PROFIL;
 		/* profprocs counts the processes being profiled. */
 		if (++profprocs == 1)
 			cpu_startprofclock();
 		mtx_unlock_spin(&sched_lock);
 	}
 }
 
 /*
  * Stop profiling on a process.
  *
  * The caller must hold the process lock (asserted).  If p_profthreads is
  * non-zero, P_STOPPROF is set and the caller sleeps on p_profthreads
  * until the count reaches zero.  P_PROFIL is then cleared and the
  * profiling clock is stopped once no profiled processes remain.
  */
 void
 stopprofclock(p)
 	register struct proc *p;
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (p->p_flag & P_PROFIL) {
 		if (p->p_profthreads != 0) {
 			p->p_flag |= P_STOPPROF;
 			while (p->p_profthreads != 0)
 				msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
 				    "stopprof", 0);
 			p->p_flag &= ~P_STOPPROF;
 		}
 		/*
 		 * Re-check after the wait above; presumably P_PROFIL may
 		 * have been cleared by someone else while we slept.
 		 */
 		if ((p->p_flag & P_PROFIL) == 0)
 			return;
 		mtx_lock_spin(&sched_lock);
 		p->p_flag &= ~P_PROFIL;
 		if (--profprocs == 0)
 			cpu_stopprofclock();
 		mtx_unlock_spin(&sched_lock);
 	}
 }
 
 /*
  * Statistics clock.  Grab profile sample, and if divider reaches 0,
  * do process and kernel statistics.  Most of the statistics are only
  * used by user-level statistics programs.  The main exceptions are
  * p->p_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu.
  * This should be called by all active processors.
  *
  * With sched_lock held, one tick is charged to the current process and
  * to the matching system-wide cp_time[] bucket, the scheduler is notified
  * through sched_clock(), and the resource usage integrals are updated.
  */
 void
 statclock(frame)
 	register struct clockframe *frame;
 {
 	struct pstats *pstats;
 	struct rusage *ru;
 	struct vmspace *vm;
 	struct thread *td;
 	struct proc *p;
 	long rss;
 
 	td = curthread;
 	p = td->td_proc;
 
 	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
 	if (CLKF_USERMODE(frame)) {
 		/*
 		 * Charge the time as appropriate.
 		 * User time of a process with a nice value above NZERO
 		 * is accounted as CP_NICE rather than CP_USER.
 		 */
 		if (p->p_flag & P_SA)
 			thread_statclock(1);
 		p->p_uticks++;
-		if (td->td_ksegrp->kg_nice > NZERO)
+		if (p->p_nice > NZERO)
 			cp_time[CP_NICE]++;
 		else
 			cp_time[CP_USER]++;
 	} else {
 		/*
 		 * Came from kernel mode, so we were:
 		 * - handling an interrupt,
 		 * - doing syscall or trap work on behalf of the current
 		 *   user process, or
 		 * - spinning in the idle loop.
 		 * Whichever it is, charge the time as appropriate.
 		 * Note that we charge interrupts to the current process,
 		 * regardless of whether they are ``for'' that process,
 		 * so that we know how much of its real time was spent
 		 * in ``non-process'' (i.e., interrupt) work.
 		 */
 		if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) {
 			p->p_iticks++;
 			cp_time[CP_INTR]++;
 		} else {
 			if (p->p_flag & P_SA)
 				thread_statclock(0);
 			td->td_sticks++;
 			p->p_sticks++;
 			/* Time in this CPU's idle thread counts as idle. */
 			if (p != PCPU_GET(idlethread)->td_proc)
 				cp_time[CP_SYS]++;
 			else
 				cp_time[CP_IDLE]++;
 		}
 	}
 
 	sched_clock(td);
 
 	/* Update resource usage integrals and maximums. */
 	if ((pstats = p->p_stats) != NULL &&
 	    (ru = &pstats->p_ru) != NULL &&
 	    (vm = p->p_vmspace) != NULL) {
 		ru->ru_ixrss += pgtok(vm->vm_tsize);
 		ru->ru_idrss += pgtok(vm->vm_dsize);
 		ru->ru_isrss += pgtok(vm->vm_ssize);
 		rss = pgtok(vmspace_resident_count(vm));
 		if (ru->ru_maxrss < rss)
 			ru->ru_maxrss = rss;
 	}
 	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
 }
 
 /*
  * Profiling clock.  A tick that interrupted user mode is recorded against
  * the current thread when its process has P_PROFIL set.  With GPROF, a
  * tick that interrupted kernel mode is added to the kernel profiling
  * histogram while kernel profiling is on.
  */
 void
 profclock(frame)
 	register struct clockframe *frame;
 {
 	struct thread *td;
 #ifdef GPROF
 	struct gmonparam *g;
 	int i;
 #endif
 
 	td = curthread;
 	if (CLKF_USERMODE(frame)) {
 		/*
 		 * Came from user mode; CPU was in user state.
 		 * If this process is being profiled, record the tick.
 		 * if there is no related user location yet, don't
 		 * bother trying to count it.
 		 */
 		if (td->td_proc->p_flag & P_PROFIL)
 			addupc_intr(td, CLKF_PC(frame), 1);
 	}
 #ifdef GPROF
 	else {
 		/*
 		 * Kernel statistics are just like addupc_intr, only easier.
 		 * The interrupted PC is made relative to lowpc and, when
 		 * it lies below textsize, scaled down to a kcount[] index.
 		 */
 		g = &_gmonparam;
 		if (g->state == GMON_PROF_ON) {
 			i = CLKF_PC(frame) - g->lowpc;
 			if (i < g->textsize) {
 				i /= HISTFRACTION * sizeof(*g->kcount);
 				g->kcount[i]++;
 			}
 		}
 	}
 #endif
 }
 
 /*
  * Return information about system clocks.
  */
 static int
 sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
 {
 	struct clockinfo clkinfo;
 	/*
 	 * Construct clockinfo structure.
 	 */
 	bzero(&clkinfo, sizeof(clkinfo));
 	clkinfo.hz = hz;
 	clkinfo.tick = tick;
 	clkinfo.profhz = profhz;
 	clkinfo.stathz = stathz ? stathz : hz;
 	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
 }
 
 SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
 	0, 0, sysctl_kern_clockrate, "S,clockinfo",
 	"Rate and period of various kernel clocks");
 
 #ifdef SW_WATCHDOG
 
 /*
  * Watchdog event handler: arm or disarm the software watchdog.
  *
  * The timeout exponent is taken from the WD_INTERVAL bits of cmd.  A
  * non-zero cmd whose exponent is at least WD_TO_1SEC arms the watchdog
  * with (1 << (exponent - WD_TO_1SEC)) * hz ticks and reports success
  * through *err.  Anything else disarms it and leaves *err untouched.
  */
 static void
 watchdog_config(void *unused __unused, u_int cmd, int *err)
 {
 	u_int interval;
 
 	interval = cmd & WD_INTERVAL;
 	if (cmd == 0 || interval < WD_TO_1SEC) {
 		watchdog_enabled = 0;
 		return;
 	}
 	watchdog_ticks = (1 << (interval - WD_TO_1SEC)) * hz;
 	watchdog_enabled = 1;
 	*err = 0;
 }
 
 /*
  * Handle a watchdog timeout by dumping interrupt information and
  * then either dropping to DDB or panicking.
  *
  * One line is printed for every interrupt counter that is non-zero,
  * followed by the sum of all counters.
  */
 static void
 watchdog_fire(void)
 {
 	int nintr;
 	u_int64_t inttotal;
 	u_long *curintr;
 	char *curname;
 
 	curintr = intrcnt;
 	curname = intrnames;
 	inttotal = 0;
 	nintr = eintrcnt - intrcnt;
 	
 	printf("interrupt                   total\n");
 	while (--nintr >= 0) {
 		if (*curintr)
 			printf("%-12s %20lu\n", curname, *curintr);
 		/* Step to the next NUL-terminated name and the next counter. */
 		curname += strlen(curname) + 1;
 		inttotal += *curintr++;
 	}
 	printf("Total        %20ju\n", (uintmax_t)inttotal);
 
 #ifdef DDB
 	db_print_backtrace();
 	Debugger("watchdog timeout");
 #else /* !DDB */
 	panic("watchdog timeout");
 #endif /* DDB */
 }
 
 #endif /* SW_WATCHDOG */
Index: head/sys/kern/kern_proc.c
===================================================================
--- head/sys/kern/kern_proc.c	(revision 130550)
+++ head/sys/kern/kern_proc.c	(revision 130551)
@@ -1,1242 +1,1242 @@
 /*
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_proc.c	8.7 (Berkeley) 2/14/95
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/filedesc.h>
 #include <sys/tty.h>
 #include <sys/signalvar.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/jail.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/uma.h>
 #include <machine/critical.h>
 
 MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
 MALLOC_DEFINE(M_SESSION, "session", "session header");
 static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
 MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
 
 static void doenterpgrp(struct proc *, struct pgrp *);
 static void orphanpg(struct pgrp *pg);
 static void pgadjustjobc(struct pgrp *pgrp, int entering);
 static void pgdelete(struct pgrp *);
 static void proc_ctor(void *mem, int size, void *arg);
 static void proc_dtor(void *mem, int size, void *arg);
 static void proc_init(void *mem, int size);
 static void proc_fini(void *mem, int size);
 
 /*
  * Other process lists
  */
 struct pidhashhead *pidhashtbl;
 u_long pidhash;
 struct pgrphashhead *pgrphashtbl;
 u_long pgrphash;
 struct proclist allproc;
 struct proclist zombproc;
 struct sx allproc_lock;
 struct sx proctree_lock;
 struct mtx pargs_ref_lock;
 struct mtx ppeers_lock;
 uma_zone_t proc_zone;
 uma_zone_t ithread_zone;
 
 int kstack_pages = KSTACK_PAGES;
 int uarea_pages = UAREA_PAGES;
 SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "");
 SYSCTL_INT(_kern, OID_AUTO, uarea_pages, CTLFLAG_RD, &uarea_pages, 0, "");
 
 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
 
 CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
 
 /*
  * Initialize global process hashing structures.
  *
  * Sets up the process-related locks, the allproc and zombproc lists, the
  * pid and process group hash tables (both sized from maxproc / 4), the
  * UMA zone that struct proc is allocated from, and the uidinfo hash.
  */
 void
 procinit()
 {
 
 	sx_init(&allproc_lock, "allproc");
 	sx_init(&proctree_lock, "proctree");
 	mtx_init(&pargs_ref_lock, "struct pargs.ref", NULL, MTX_DEF);
 	mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
 	LIST_INIT(&allproc);
 	LIST_INIT(&zombproc);
 	pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
 	pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
 	/* The item size is supplied by the scheduler via sched_sizeof_proc(). */
 	proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
 	    proc_ctor, proc_dtor, proc_init, proc_fini,
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uihashinit();
 }
 
 /*
  * Prepare a proc for use.
  *
  * Constructor callback registered for proc_zone in procinit().  There is
  * currently nothing to set up per allocation, so mem, size and arg are
  * all ignored and the item is left untouched.
  */
 static void
 proc_ctor(void *mem, int size, void *arg)
 {
 
 	/* Nothing to do. */
 }
 
 /*
  * Reclaim a proc after use.
  *
  * Destructor callback registered for proc_zone in procinit().  Checks
  * that the process is down to one thread with its first thread, ksegrp
  * and kse still attached, disposes of an alternate kernel stack on kernel
  * threads, and re-links the four structures.
  */
 static void
 proc_dtor(void *mem, int size, void *arg)
 {
 	struct proc *p;
 	struct thread *td;
 	struct ksegrp *kg;
 	struct kse *ke;
 
 	/* INVARIANTS checks go here */
 	p = (struct proc *)mem;
 	KASSERT((p->p_numthreads == 1),
 	    ("bad number of threads in exiting process"));
         td = FIRST_THREAD_IN_PROC(p);
 	KASSERT((td != NULL), ("proc_dtor: bad thread pointer"));
         kg = FIRST_KSEGRP_IN_PROC(p);
 	KASSERT((kg != NULL), ("proc_dtor: bad kg pointer"));
         ke = FIRST_KSE_IN_KSEGRP(kg);
 	KASSERT((ke != NULL), ("proc_dtor: bad ke pointer"));
 
 	/* Dispose of an alternate kstack, if it exists.
 	 * XXX What if there are more than one thread in the proc?
 	 *     The first thread in the proc is special and not
 	 *     freed, so you gotta do this here.
 	 */
 	if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
 		vm_thread_dispose_altkstack(td);
 
 	/*
 	 * We want to make sure we know the initial linkages.
 	 * so for now tear them down and remake them.
 	 * This is probably un-needed as we can probably rely
 	 * on the state coming in here from wait4().
 	 */
 	proc_linkup(p, kg, ke, td);
 }
 
 /*
  * Initialize type-stable parts of a proc (when newly created).
  *
  * Init callback registered for proc_zone in procinit().  Allocates the
  * initial thread, kse and ksegrp, links them to the proc and initializes
  * the process mutex.
  */
 static void
 proc_init(void *mem, int size)
 {
 	struct proc *p;
 	struct thread *td;
 	struct ksegrp *kg;
 	struct kse *ke;
 
 	p = (struct proc *)mem;
 	/*
 	 * The scheduler's per-process data starts right after struct proc;
 	 * presumably the zone item size from sched_sizeof_proc() covers it.
 	 */
 	p->p_sched = (struct p_sched *)&p[1];
 	vm_proc_new(p);
 	td = thread_alloc();
 	ke = kse_alloc();
 	kg = ksegrp_alloc();
 	proc_linkup(p, kg, ke, td);
 	/* The mutex storage is zeroed before it is initialized. */
 	bzero(&p->p_mtx, sizeof(struct mtx));
 	mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 }
 
 /*
  * Tear down type-stable parts of a proc (just before being discarded)
  *
  * Fini callback registered for proc_zone in procinit().  Checks that the
  * process still has exactly one thread with its first thread, ksegrp and
  * kse attached, then releases what proc_init() set up: the VM resources
  * of the proc, the thread, the ksegrp, the kse and the process mutex.
  */
 static void
 proc_fini(void *mem, int size)
 {
 	struct proc *p;
 	struct thread *td;
 	struct ksegrp *kg;
 	struct kse *ke;
 
 	p = (struct proc *)mem;
 	KASSERT((p->p_numthreads == 1),
 	    ("bad number of threads in freeing process"));
 	/* Name this function in the messages so a failure is attributable. */
 	td = FIRST_THREAD_IN_PROC(p);
 	KASSERT((td != NULL), ("proc_fini: bad thread pointer"));
 	kg = FIRST_KSEGRP_IN_PROC(p);
 	KASSERT((kg != NULL), ("proc_fini: bad kg pointer"));
 	ke = FIRST_KSE_IN_KSEGRP(kg);
 	KASSERT((ke != NULL), ("proc_fini: bad ke pointer"));
 	vm_proc_dispose(p);
 	thread_free(td);
 	ksegrp_free(kg);
 	kse_free(ke);
 	mtx_destroy(&p->p_mtx);
 }
 
 /*
  * Is p an inferior of the current process?
  *
  * Follows the parent pointers starting at p.  Returns 1 as soon as the
  * current process is reached (including when p is the current process
  * itself) and 0 when the process with pid 0 is reached first.  The
  * caller must hold proctree_lock.
  */
 int
 inferior(p)
 	register struct proc *p;
 {
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	while (p != curproc) {
 		if (p->p_pid == 0)
 			return (0);
 		p = p->p_pptr;
 	}
 	return (1);
 }
 
 /*
  * Locate a process by number
  */
 struct proc *
 pfind(pid)
 	register pid_t pid;
 {
 	register struct proc *p;
 
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(p, PIDHASH(pid), p_hash)
 		if (p->p_pid == pid) {
 			PROC_LOCK(p);
 			break;
 		}
 	sx_sunlock(&allproc_lock);
 	return (p);
 }
 
 /*
  * Locate a process group by number.
  * The caller must hold proctree_lock.
  */
 struct pgrp *
 pgfind(pgid)
 	register pid_t pgid;
 {
 	register struct pgrp *pgrp;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 
 	/*
 	 * Walk the hash chain for this pgid.  A match is returned with the
 	 * group lock held; if the chain is exhausted pgrp is left NULL.
 	 */
 	LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
 		if (pgrp->pg_id != pgid)
 			continue;
 		PGRP_LOCK(pgrp);
 		break;
 	}
 	return (pgrp);
 }
 
 /*
  * Create a new process group.
  * pgid must be equal to the pid of p.
  * Begin a new session if required.
  */
 int
 enterpgrp(p, pgid, pgrp, sess)
 	register struct proc *p;
 	pid_t pgid;
 	struct pgrp *pgrp;
 	struct session *sess;
 {
 	struct pgrp *pgrp2;
 
 	/*
 	 * pgrp (and sess, when non-NULL) is storage supplied by the caller;
 	 * it is initialized here.  Always returns 0.
 	 */
 	sx_assert(&proctree_lock, SX_XLOCKED);
 
 	KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
 	KASSERT(p->p_pid == pgid,
 	    ("enterpgrp: new pgrp and pid != pgid"));
 
 	/* The lookup result is only used to assert that pgid is unused. */
 	pgrp2 = pgfind(pgid);
 
 	KASSERT(pgrp2 == NULL,
 	    ("enterpgrp: pgrp with pgid exists"));
 	KASSERT(!SESS_LEADER(p),
 	    ("enterpgrp: session leader attempted setpgrp"));
 
 	mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
 
 	if (sess != NULL) {
 		/*
 		 * New session: p becomes its leader, drops its controlling
 		 * terminal flag, and the login name is inherited from the
 		 * session p is leaving.
 		 */
 		mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
 		PROC_LOCK(p);
 		p->p_flag &= ~P_CONTROLT;
 		PROC_UNLOCK(p);
 		PGRP_LOCK(pgrp);
 		sess->s_leader = p;
 		sess->s_sid = p->p_pid;
 		sess->s_count = 1;
 		sess->s_ttyvp = NULL;
 		sess->s_ttyp = NULL;
 		bcopy(p->p_session->s_login, sess->s_login,
 			    sizeof(sess->s_login));
 		pgrp->pg_session = sess;
 		KASSERT(p == curproc,
 		    ("enterpgrp: mksession and p != curproc"));
 	} else {
 		/* Existing session: it gains one more process group. */
 		pgrp->pg_session = p->p_session;
 		SESS_LOCK(pgrp->pg_session);
 		pgrp->pg_session->s_count++;
 		SESS_UNLOCK(pgrp->pg_session);
 		PGRP_LOCK(pgrp);
 	}
 	/* Both branches leave the new group locked at this point. */
 	pgrp->pg_id = pgid;
 	LIST_INIT(&pgrp->pg_members);
 
 	/*
 	 * As we have an exclusive lock of proctree_lock,
 	 * this should not deadlock.
 	 */
 	LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
 	pgrp->pg_jobc = 0;
 	SLIST_INIT(&pgrp->pg_sigiolst);
 	PGRP_UNLOCK(pgrp);
 
 	/* Finally move p from its old group into the new one. */
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to an existing process group
  */
 int
 enterthispgrp(p, pgrp)
 	register struct proc *p;
 	struct pgrp *pgrp;
 {
 
 	/*
 	 * The caller holds proctree_lock exclusively and none of the
 	 * process, group or session locks involved.
 	 */
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 	/* The target must be a different group within p's own session. */
 	KASSERT(pgrp->pg_session == p->p_session,
 		("%s: pgrp's session %p, p->p_session %p.\n",
 		__func__,
 		pgrp->pg_session,
 		p->p_session));
 	KASSERT(pgrp != p->p_pgrp,
 		("%s: p belongs to pgrp.", __func__));
 
 	doenterpgrp(p, pgrp);
 
 	/* Always succeeds. */
 	return (0);
 }
 
 /*
  * Move p to a process group
  */
 static void
 doenterpgrp(p, pgrp)
 	struct proc *p;
 	struct pgrp *pgrp;
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 
 	/* Remember the group being left; it may have to be deleted below. */
 	savepgrp = p->p_pgrp;
 
 	/*
 	 * Adjust eligibility of affected pgrps to participate in job control.
 	 * Increment eligibility counts before decrementing, otherwise we
 	 * could reach 0 spuriously during the first call.
 	 */
 	fixjobc(p, pgrp, 1);
 	fixjobc(p, p->p_pgrp, 0);
 
 	/*
 	 * Move p between the member lists with the new group, the old group
 	 * and the process locked, in that order.
 	 */
 	PGRP_LOCK(pgrp);
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = pgrp;
 	PROC_UNLOCK(p);
 	LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
 	PGRP_UNLOCK(savepgrp);
 	PGRP_UNLOCK(pgrp);
 	/* The old group is deleted once its last member has left. */
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 }
 
 /*
  * remove process from process group
  */
 int
 leavepgrp(p)
 	register struct proc *p;
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	savepgrp = p->p_pgrp;
 	/* Unlink p from its group with both the group and process locked. */
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = NULL;
 	PROC_UNLOCK(p);
 	PGRP_UNLOCK(savepgrp);
 	/* The group is deleted once its last member has left. */
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 	/* Always succeeds. */
 	return (0);
 }
 
 /*
  * delete a process group
  */
 static void
 pgdelete(pgrp)
 	register struct pgrp *pgrp;
 {
 	struct session *savesess;
 	int i;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pgid.
 	 */
 	funsetownlst(&pgrp->pg_sigiolst);
 
 	PGRP_LOCK(pgrp);
 	/* Detach the group from the session's terminal if it is recorded there. */
 	if (pgrp->pg_session->s_ttyp != NULL &&
 	    pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
 		pgrp->pg_session->s_ttyp->t_pgrp = NULL;
 	LIST_REMOVE(pgrp, pg_hash);
 	/* Drop this group's reference on its session; i is the count left. */
 	savesess = pgrp->pg_session;
 	SESS_LOCK(savesess);
 	i = --savesess->s_count;
 	SESS_UNLOCK(savesess);
 	PGRP_UNLOCK(pgrp);
 	if (i == 0) {
 		/* Last group of the session: release the tty and the session. */
 		if (savesess->s_ttyp != NULL)
 			ttyrel(savesess->s_ttyp);
 		mtx_destroy(&savesess->s_mtx);
 		FREE(savesess, M_SESSION);
 	}
 	mtx_destroy(&pgrp->pg_mtx);
 	FREE(pgrp, M_PGRP);
 }
 
 /*
  * Bump or drop the job control count of a process group under the group
  * lock.  A drop that brings the count to zero orphans the group.
  */
 static void
 pgadjustjobc(pgrp, entering)
 	struct pgrp *pgrp;
 	int entering;
 {
 
 	PGRP_LOCK(pgrp);
 	if (!entering) {
 		pgrp->pg_jobc--;
 		if (pgrp->pg_jobc == 0)
 			orphanpg(pgrp);
 	} else
 		pgrp->pg_jobc++;
 	PGRP_UNLOCK(pgrp);
 }
 
 /*
  * Adjust pgrp jobc counters when specified process changes process group.
  * We count the number of processes in each process group that "qualify"
  * the group for terminal job control (those with a parent in a different
  * process group of the same session).  If that count reaches zero, the
  * process group becomes orphaned.  Check both the specified process'
  * process group and that of its children.
  * entering == 0 => p is leaving specified group.
  * entering == 1 => p is entering specified group.
  */
 void
 fixjobc(p, pgrp, entering)
 	register struct proc *p;
 	register struct pgrp *pgrp;
 	int entering;
 {
 	register struct pgrp *hispgrp;
 	register struct session *mysession;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Check p's parent to see whether p qualifies its own process
 	 * group; if so, adjust count for p's process group.
 	 */
 	mysession = pgrp->pg_session;
 	if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
 	    hispgrp->pg_session == mysession)
 		pgadjustjobc(pgrp, entering);
 
 	/*
 	 * Check this process' children to see whether they qualify
 	 * their process groups; if so, adjust counts for children's
 	 * process groups.
 	 *
 	 * Note that p is reused as the loop cursor, so it no longer refers
 	 * to the original process from here on.
 	 */
 	LIST_FOREACH(p, &p->p_children, p_sibling) {
 		hispgrp = p->p_pgrp;
 		if (hispgrp == pgrp ||
 		    hispgrp->pg_session != mysession)
 			continue;
 		/* Zombie children do not count. */
 		PROC_LOCK(p);
 		if (p->p_state == PRS_ZOMBIE) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		PROC_UNLOCK(p);
 		pgadjustjobc(hispgrp, entering);
 	}
 }
 
 /*
  * A process group has become orphaned;
  * if there are any stopped processes in the group,
  * hang up all processes in that group.
  */
 static void
 orphanpg(pg)
 	struct pgrp *pg;
 {
 	register struct proc *p;
 
 	/* The caller must hold the group lock. */
 	PGRP_LOCK_ASSERT(pg, MA_OWNED);
 
 	/* Look for the first member that should be stopped. */
 	LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 		PROC_LOCK(p);
 		if (P_SHOULDSTOP(p)) {
 			PROC_UNLOCK(p);
 			/*
 			 * Found one: send SIGHUP followed by SIGCONT to every
 			 * member.  The inner loop reuses p, which is fine
 			 * because we return right after it.
 			 */
 			LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 				PROC_LOCK(p);
 				psignal(p, SIGHUP);
 				psignal(p, SIGCONT);
 				PROC_UNLOCK(p);
 			}
 			return;
 		}
 		PROC_UNLOCK(p);
 	}
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /*
  * Debugger command that prints every non-empty process group hash bucket,
  * each group found in it, and each member of that group.
  */
 DB_SHOW_COMMAND(pgrpdump, pgrpdump)
 {
 	register struct pgrp *pgrp;
 	register struct proc *p;
 	register int i;
 
 	/* The bound is inclusive: buckets 0 through pgrphash are visited. */
 	for (i = 0; i <= pgrphash; i++) {
 		if (!LIST_EMPTY(&pgrphashtbl[i])) {
 			printf("\tindx %d\n", i);
 			LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
 				printf(
 			"\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
 				    (void *)pgrp, (long)pgrp->pg_id,
 				    (void *)pgrp->pg_session,
 				    pgrp->pg_session->s_count,
 				    (void *)LIST_FIRST(&pgrp->pg_members));
 				LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 					printf("\t\tpid %ld addr %p pgrp %p\n", 
 					    (long)p->p_pid, (void *)p,
 					    (void *)p->p_pgrp);
 				}
 			}
 		}
 	}
 }
 #endif /* DDB */
 void
 fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 
 /*
  * Fill in a kinfo_proc structure for the specified process.
  * Must be called with the target process locked.
  */
 void
 fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
 {
 	/* Describe the process through its first thread. */
 	fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp);
 }
 
 /*
  * Fill in a kinfo_proc structure for the given thread and the process it
  * belongs to.  kp is zeroed first, so fields not set below stay zero.
  * The process lock must be held (it is asserted); sched_lock and the
  * session lock are taken internally around the state they cover.
  */
 void
 fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
 {
 	struct proc *p;
 	struct thread *td0;
 	struct kse *ke;
 	struct ksegrp *kg;
 	struct tty *tp;
 	struct session *sp;
 	struct timeval tv;
 	struct sigacts *ps;
 
 	p = td->td_proc;
 
 	bzero(kp, sizeof(*kp));
 
 	kp->ki_structsize = sizeof(*kp);
 	kp->ki_paddr = p;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */
 	kp->ki_args = p->p_args;
 	kp->ki_textvp = p->p_textvp;
 #ifdef KTRACE
 	kp->ki_tracep = p->p_tracevp;
 	mtx_lock(&ktrace_mtx);
 	kp->ki_traceflag = p->p_traceflag;
 	mtx_unlock(&ktrace_mtx);
 #endif
 	kp->ki_fd = p->p_fd;
 	kp->ki_vmspace = p->p_vmspace;
 	/* Credentials; at most KI_NGROUPS group ids are copied out. */
 	if (p->p_ucred) {
 		kp->ki_uid = p->p_ucred->cr_uid;
 		kp->ki_ruid = p->p_ucred->cr_ruid;
 		kp->ki_svuid = p->p_ucred->cr_svuid;
 		/* XXX bde doesn't like KI_NGROUPS */
 		kp->ki_ngroups = min(p->p_ucred->cr_ngroups, KI_NGROUPS);
 		bcopy(p->p_ucred->cr_groups, kp->ki_groups,
 		    kp->ki_ngroups * sizeof(gid_t));
 		kp->ki_rgid = p->p_ucred->cr_rgid;
 		kp->ki_svgid = p->p_ucred->cr_svgid;
 	}
 	/* Signal dispositions are read under the sigacts mutex. */
 	if (p->p_sigacts) {
 		ps = p->p_sigacts;
 		mtx_lock(&ps->ps_mtx);
 		kp->ki_sigignore = ps->ps_sigignore;
 		kp->ki_sigcatch = ps->ps_sigcatch;
 		mtx_unlock(&ps->ps_mtx);
 	}
 	/* Everything up to the matching unlock is read under sched_lock. */
 	mtx_lock_spin(&sched_lock);
 	/* Address space figures, for processes that are neither new nor dead. */
 	if (p->p_state != PRS_NEW &&
 	    p->p_state != PRS_ZOMBIE &&
 	    p->p_vmspace != NULL) {
 		struct vmspace *vm = p->p_vmspace;
 
 		kp->ki_size = vm->vm_map.size;
 		kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
 		if (p->p_sflag & PS_INMEM)
 			kp->ki_rssize += UAREA_PAGES;
 		/* Kernel stacks of all threads are added to the resident size. */
 		FOREACH_THREAD_IN_PROC(p, td0) {
 			if (!TD_IS_SWAPPED(td0))
 				kp->ki_rssize += td0->td_kstack_pages;
 			if (td0->td_altkstack_obj != NULL)
 				kp->ki_rssize += td0->td_altkstack_pages;
 		}
 		kp->ki_swrss = vm->vm_swrss;
 		kp->ki_tsize = vm->vm_tsize;
 		kp->ki_dsize = vm->vm_dsize;
 		kp->ki_ssize = vm->vm_ssize;
 	}
 	/* Start time (offset by boottime) and resource usage. */
 	if ((p->p_sflag & PS_INMEM) && p->p_stats) {
 		kp->ki_start = p->p_stats->p_start;
 		timevaladd(&kp->ki_start, &boottime);
 		kp->ki_rusage = p->p_stats->p_ru;
 		kp->ki_childtime.tv_sec = p->p_stats->p_cru.ru_utime.tv_sec +
 		    p->p_stats->p_cru.ru_stime.tv_sec;
 		kp->ki_childtime.tv_usec = p->p_stats->p_cru.ru_utime.tv_usec +
 		    p->p_stats->p_cru.ru_stime.tv_usec;
 	}
 	if (p->p_state != PRS_ZOMBIE) {
 #if 0
 		if (td == NULL) {
 			/* XXXKSE: This should never happen. */
 			printf("fill_kinfo_proc(): pid %d has no threads!\n",
 			    p->p_pid);
 			mtx_unlock_spin(&sched_lock);
 			return;
 		}
 #endif
 		if (td->td_wmesg != NULL) {
 			strlcpy(kp->ki_wmesg, td->td_wmesg,
 			    sizeof(kp->ki_wmesg));
 		}
 		if (TD_ON_LOCK(td)) {
 			kp->ki_kiflag |= KI_LOCKBLOCK;
 			strlcpy(kp->ki_lockname, td->td_lockname,
 			    sizeof(kp->ki_lockname));
 		}
 
 		/* Derive the reported state from this thread's state. */
 		if (p->p_state == PRS_NORMAL) { /*  XXXKSE very approximate */
 			if (TD_ON_RUNQ(td) ||
 			    TD_CAN_RUN(td) ||
 			    TD_IS_RUNNING(td)) {
 				kp->ki_stat = SRUN;
 			} else if (P_SHOULDSTOP(p)) {
 				kp->ki_stat = SSTOP;
 			} else if (TD_IS_SLEEPING(td)) {
 				kp->ki_stat = SSLEEP;
 			} else if (TD_ON_LOCK(td)) {
 				kp->ki_stat = SLOCK;
 			} else {
 				kp->ki_stat = SWAIT;
 			}
 		} else {
 			kp->ki_stat = SIDL;
 		}
 
 		kp->ki_sflag = p->p_sflag;
 		kp->ki_swtime = p->p_swtime;
 		kp->ki_pid = p->p_pid;
+		kp->ki_nice = p->p_nice;
 		kg = td->td_ksegrp;
 		ke = td->td_kse;
 		/* Run time is reported in microseconds. */
 		bintime2timeval(&p->p_runtime, &tv);
 		kp->ki_runtime =
 		    tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec;
 
 		/* things in the KSE GROUP */
 		kp->ki_estcpu = kg->kg_estcpu;
 		kp->ki_slptime = kg->kg_slptime;
 		kp->ki_pri.pri_user = kg->kg_user_pri;
 		kp->ki_pri.pri_class = kg->kg_pri_class;
-		kp->ki_nice = kg->kg_nice;
 
 		/* Things in the thread */
 		kp->ki_wchan = td->td_wchan;
 		kp->ki_pri.pri_level = td->td_priority;
 		kp->ki_pri.pri_native = td->td_base_pri;
 		kp->ki_lastcpu = td->td_lastcpu;
 		kp->ki_oncpu = td->td_oncpu;
 		kp->ki_tdflags = td->td_flags;
 		kp->ki_pcb = td->td_pcb;
 		kp->ki_kstack = (void *)td->td_kstack;
 		kp->ki_pctcpu = sched_pctcpu(td);
 
 		/* Things in the kse */
 		if (ke)
 			kp->ki_rqindex = ke->ke_rqindex;
 		else
 			kp->ki_rqindex = 0;
 
 	} else {
 		kp->ki_stat = SZOMB;
 	}
 	mtx_unlock_spin(&sched_lock);
 	/* Process group, session and controlling terminal. */
 	sp = NULL;
 	tp = NULL;
 	if (p->p_pgrp) {
 		kp->ki_pgid = p->p_pgrp->pg_id;
 		kp->ki_jobc = p->p_pgrp->pg_jobc;
 		sp = p->p_pgrp->pg_session;
 
 		if (sp != NULL) {
 			kp->ki_sid = sp->s_sid;
 			SESS_LOCK(sp);
 			strlcpy(kp->ki_login, sp->s_login,
 			    sizeof(kp->ki_login));
 			if (sp->s_ttyvp)
 				kp->ki_kiflag |= KI_CTTY;
 			if (SESS_LEADER(p))
 				kp->ki_kiflag |= KI_SLEADER;
 			tp = sp->s_ttyp;
 			SESS_UNLOCK(sp);
 		}
 	}
 	if ((p->p_flag & P_CONTROLT) && tp != NULL) {
 		kp->ki_tdev = dev2udev(tp->t_dev);
 		kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
 		if (tp->t_session)
 			kp->ki_tsid = tp->t_session->s_sid;
 	} else
 		kp->ki_tdev = NOUDEV;
 	if (p->p_comm[0] != '\0') {
 		strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
 		strlcpy(kp->ki_ocomm, p->p_comm, sizeof(kp->ki_ocomm));
 	}
 	/* Pending signals are the union of the process and thread sets. */
 	kp->ki_siglist = p->p_siglist;
         SIGSETOR(kp->ki_siglist, td->td_siglist);
 	kp->ki_sigmask = td->td_sigmask;
 	kp->ki_xstat = p->p_xstat;
 	kp->ki_acflag = p->p_acflag;
 	kp->ki_flag = p->p_flag;
 	/* If jailed(p->p_ucred), emulate the old P_JAILED flag. */
 	/*
 	 * NOTE(review): p_ucred is tested for NULL before the credential
 	 * fields are read above, but it is passed to jailed() here without
 	 * that test -- confirm it cannot be NULL at this point.
 	 */
 	if (jailed(p->p_ucred))
 		kp->ki_flag |= P_JAILED;
 	kp->ki_lock = p->p_lock;
 	if (p->p_pptr)
 		kp->ki_ppid = p->p_pptr->p_pid;
 }
 
 /*
  * Locate a zombie process by number
  */
 struct proc *
 zpfind(pid_t pid)
 {
 	struct proc *p;
 
 	sx_slock(&allproc_lock);
 	/*
 	 * Scan the zombie list for this pid.  A match is returned with its
 	 * process lock held; if the list is exhausted p is left NULL.
 	 */
 	LIST_FOREACH(p, &zombproc, p_list) {
 		if (p->p_pid != pid)
 			continue;
 		PROC_LOCK(p);
 		break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (p);
 }
 
 #define KERN_PROC_ZOMBMASK	0x3
 #define KERN_PROC_NOTHREADS	0x4
 
 /*
  * Must be called with the process locked and will return with it unlocked.
  */
 static int
 sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags)
 {
 	struct thread *td;
 	struct kinfo_proc kinfo_proc;
 	int error = 0;
 	struct proc *np;
 	/* Remember the pid now; it is looked up again once p is unlocked. */
 	pid_t pid = p->p_pid;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Emit one record for the whole process, or one record per thread.
 	 * The process lock is dropped around every copy-out and retaken
 	 * afterwards.
 	 */
 	if (flags & KERN_PROC_NOTHREADS) {
 		fill_kinfo_proc(p, &kinfo_proc);
 		PROC_UNLOCK(p);
 		error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 				   sizeof(kinfo_proc));
 		PROC_LOCK(p);
 	} else {
 		/* Hold the process for the duration of the thread walk. */
 		_PHOLD(p);
 		FOREACH_THREAD_IN_PROC(p, td) {
 			fill_kinfo_thread(td, &kinfo_proc);
 			PROC_UNLOCK(p);
 			error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 					   sizeof(kinfo_proc));
 			PROC_LOCK(p);
 			if (error)
 				break;
 		}
 		_PRELE(p);
 	}
 	PROC_UNLOCK(p);
 	if (error)
 		return (error);
 	/*
 	 * Look the pid up again, in the zombie list or the pid hash depending
 	 * on the low flag bits, and return EAGAIN unless it still resolves to
 	 * the same process.  Pid 0 is not re-checked.
 	 */
 	if (flags & KERN_PROC_ZOMBMASK)
 		np = zpfind(pid);
 	else {
 		if (pid == 0)
 			return (0);
 		np = pfind(pid);
 	}
 	if (np == NULL)
 		return EAGAIN;
 	if (np != p) {
 		PROC_UNLOCK(np);
 		return EAGAIN;
 	}
 	PROC_UNLOCK(np);
 	return (0);
 }
 
 /*
  * Handler shared by the kern.proc.* nodes that return kinfo_proc records.
  * The node's oid number selects the filter (pid, pgrp, session, tty, uid,
  * ruid, rgid, or none) and name[0], where required, carries the value to
  * match.  Returns EINVAL for a name of the wrong length, ESRCH when a
  * requested pid does not exist, or the first error from the output path.
  */
 static int
 sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int*) arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int flags, doingzomb, oid_number;
 	int error = 0;
 
 	/*
 	 * Per-thread records are produced for KERN_PROC_ALL and for any oid
 	 * carrying KERN_PROC_INC_THREAD; that bit is stripped so the filter
 	 * switch below sees the plain oid number.
 	 */
 	oid_number = oidp->oid_number;
 	if (oid_number != KERN_PROC_ALL &&
 	    (oid_number & KERN_PROC_INC_THREAD) == 0)
 		flags = KERN_PROC_NOTHREADS;
 	else {
 		flags = 0;
 		oid_number &= ~KERN_PROC_INC_THREAD;
 	}
 	/* A single pid is looked up directly rather than by scanning. */
 	if (oid_number == KERN_PROC_PID) {
 		if (namelen != 1) 
 			return (EINVAL);
 		p = pfind((pid_t)name[0]);
 		if (!p)
 			return (ESRCH);
 		if ((error = p_cansee(curthread, p))) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 		error = sysctl_out_proc(p, req, flags);
 		return (error);
 	}
 
 	/* Validate the number of name components for the remaining oids. */
 	switch (oid_number) {
 	case KERN_PROC_ALL:
 		if (namelen != 0)
 			return (EINVAL);
 		break;
 	case KERN_PROC_PROC:
 		if (namelen != 0 && namelen != 1)
 			return (EINVAL);
 		break;
 	default:
 		if (namelen != 1)
 			return (EINVAL);
 		break;
 	}
 	
 	if (!req->oldptr) {
 		/* overestimate by 5 procs */
 		error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
 		if (error)
 			return (error);
 	}
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sx_slock(&allproc_lock);
 	/* Two passes: live processes first, then zombies. */
 	for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
 		if (!doingzomb)
 			p = LIST_FIRST(&allproc);
 		else
 			p = LIST_FIRST(&zombproc);
 		for (; p != 0; p = LIST_NEXT(p, p_list)) {
 			/*
 			 * Skip embryonic processes.
 			 */
 			mtx_lock_spin(&sched_lock);
 			if (p->p_state == PRS_NEW) {
 				mtx_unlock_spin(&sched_lock);
 				continue;
 			}
 			mtx_unlock_spin(&sched_lock);
 			PROC_LOCK(p);
 			/*
 			 * Show a user only appropriate processes.
 			 */
 			if (p_cansee(curthread, p)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * TODO - make more efficient (see notes below).
 			 * do by session.
 			 */
 			/* Every rejecting branch unlocks p before moving on. */
 			switch (oid_number) {
 
 			case KERN_PROC_PGRP:
 				/* could do this by traversing pgrp */
 				if (p->p_pgrp == NULL || 
 				    p->p_pgrp->pg_id != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RGID:
 				if (p->p_ucred == NULL ||
 				    p->p_ucred->cr_rgid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_SESSION:
 				if (p->p_session == NULL ||
 				    p->p_session->s_sid != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_TTY:
 				if ((p->p_flag & P_CONTROLT) == 0 ||
 				    p->p_session == NULL) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				/* The session's tty is examined under its lock. */
 				SESS_LOCK(p->p_session);
 				if (p->p_session->s_ttyp == NULL ||
 				    dev2udev(p->p_session->s_ttyp->t_dev) != 
 				    (udev_t)name[0]) {
 					SESS_UNLOCK(p->p_session);
 					PROC_UNLOCK(p);
 					continue;
 				}
 				SESS_UNLOCK(p->p_session);
 				break;
 
 			case KERN_PROC_UID:
 				if (p->p_ucred == NULL || 
 				    p->p_ucred->cr_uid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RUID:
 				if (p->p_ucred == NULL || 
 				    p->p_ucred->cr_ruid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PROC:
 				break;
 
 			default:
 				break;
 
 			}
 
 			/*
 			 * sysctl_out_proc() returns with p unlocked.  doingzomb
 			 * is folded into the flags so that it knows which list
 			 * p came from.
 			 */
 			error = sysctl_out_proc(p, req, flags | doingzomb);
 			if (error) {
 				sx_sunlock(&allproc_lock);
 				return (error);
 			}
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	return (0);
 }
 
 /*
  * Allocate a pargs structure with room for len bytes of argument data.
  * The result starts out with a single reference.
  */
 struct pargs *
 pargs_alloc(int len)
 {
 	struct pargs *pa;
 
 	MALLOC(pa, struct pargs *, sizeof(*pa) + len, M_PARGS, M_WAITOK);
 	pa->ar_length = len;
 	pa->ar_ref = 1;
 	return (pa);
 }
 
 /*
  * Release the storage of a pargs structure unconditionally; the reference
  * count is not consulted here.
  */
 void
 pargs_free(struct pargs *pa)
 {
 
 	FREE(pa, M_PARGS);
 }
 
 /*
  * Take an additional reference on a pargs structure.  A NULL pointer is
  * accepted and ignored.
  */
 void
 pargs_hold(struct pargs *pa)
 {
 
 	if (pa != NULL) {
 		PARGS_LOCK(pa);
 		pa->ar_ref++;
 		PARGS_UNLOCK(pa);
 	}
 }
 
 /*
  * Give up one reference on a pargs structure and free it when that was
  * the last one.  A NULL pointer is accepted and ignored.
  */
 void
 pargs_drop(struct pargs *pa)
 {
 	int remaining;
 
 	if (pa == NULL)
 		return;
 	PARGS_LOCK(pa);
 	remaining = --pa->ar_ref;
 	PARGS_UNLOCK(pa);
 	if (remaining == 0)
 		pargs_free(pa);
 }
 
 /*
  * This sysctl allows a process to retrieve the argument list or process
  * title for another process without groping around in the address space
  * of the other process.  It also allows a process to set its own "process
  * title" to a string of its own choice.
  */
 static int
 sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int*) arg1;
 	u_int namelen = arg2;
 	struct pargs *newpa, *pa;
 	struct proc *p;
 	int error = 0;
 
 	/* Exactly one name component is expected: the target pid. */
 	if (namelen != 1) 
 		return (EINVAL);
 
 	p = pfind((pid_t)name[0]);
 	if (!p)
 		return (ESRCH);
 
 	if ((error = p_cansee(curthread, p)) != 0) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 
 	/* Only the process itself may replace its argument string. */
 	if (req->newptr && curproc != p) {
 		PROC_UNLOCK(p);
 		return (EPERM);
 	}
 
 	/*
 	 * Take a reference on the current arguments so that they can be
 	 * copied out after the process lock has been dropped.
 	 */
 	pa = p->p_args;
 	pargs_hold(pa);
 	PROC_UNLOCK(p);
 	if (req->oldptr != NULL && pa != NULL)
 		error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
 	pargs_drop(pa);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/* Write path: the new string is bounded by ps_arg_cache_limit. */
 	if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
 		return (ENOMEM);
 	newpa = pargs_alloc(req->newlen);
 	error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
 	if (error != 0) {
 		pargs_free(newpa);
 		return (error);
 	}
 	/* Install the new arguments and drop the process's old reference. */
 	PROC_LOCK(p);
 	pa = p->p_args;
 	p->p_args = newpa;
 	PROC_UNLOCK(p);
 	pargs_drop(pa);
 	return (0);
 }
 
 static int
 sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	char *sv_name;
 	int *name;
 	int namelen;
 	int error;
 
 	namelen = arg2;
 	if (namelen != 1) 
 		return (EINVAL);
 
 	name = (int *)arg1;
 	if ((p = pfind((pid_t)name[0])) == NULL)
 		return (ESRCH);
 	if ((error = p_cansee(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	sv_name = p->p_sysent->sv_name;
 	PROC_UNLOCK(p);
 	return (sysctl_handle_string(oidp, sv_name, 0, req));
 }
 
 
 /*
  * The kern.proc sysctl tree.  All record-returning nodes share
  * sysctl_kern_proc(), which tells them apart by oid number.
  */
 SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD,  0, "Process table");
 
 SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT,
 	0, 0, sysctl_kern_proc, "S,proc", "Return entire process table");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD,
 	sysctl_kern_proc, "Return process table, no threads");
 
 /* The only writable node: a process may set its own argument string. */
 SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY,
 	sysctl_kern_proc_args, "Process argument list");
 
 SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD,
 	sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)");
 
 /*
  * The *_td variants below carry KERN_PROC_INC_THREAD in their oid number,
  * which makes sysctl_kern_proc() emit one record per thread.
  */
 SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD), sid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 /*
  * NOTE(review): the description below says "no threads" although this is
  * the thread-including variant; it looks copied from kern.proc.proc.
  */
 SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Return process table, no threads");
Index: head/sys/kern/kern_resource.c
===================================================================
--- head/sys/kern/kern_resource.c	(revision 130550)
+++ head/sys/kern/kern_resource.c	(revision 130551)
@@ -1,1156 +1,1132 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_resource.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/sysent.h>
 #include <sys/time.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 
 static int donice(struct thread *td, struct proc *chgp, int n);
 
 static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
 static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
 #define	UIHASH(uid)	(&uihashtbl[(uid) & uihash])
 static struct mtx uihashtbl_mtx;
 static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
 static u_long uihash;		/* size of hash table - 1 */
 
 static struct uidinfo	*uilookup(uid_t uid);
 
 /*
  * Resource controls and accounting.
  */
 
 #ifndef _SYS_SYSPROTO_H_
 struct getpriority_args {
 	int	which;
 	int	who;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 getpriority(td, uap)
 	struct thread *td;
 	register struct getpriority_args *uap;
 {
-	struct ksegrp *kg;
 	struct proc *p;
 	int error, low;
 
 	error = 0;
 	/* PRIO_MAX + 1 is the "nothing found yet" sentinel. */
 	low = PRIO_MAX + 1;
 	switch (uap->which) {
 
 	case PRIO_PROCESS:
 		/* A who of 0 means the calling process. */
 		if (uap->who == 0)
-			low = td->td_ksegrp->kg_nice;
+			low = td->td_proc->p_nice;
 		else {
 			/* pfind() returns the process locked. */
 			p = pfind(uap->who);
 			if (p == NULL)
 				break;
 			if (p_cansee(td, p) == 0) {
-				FOREACH_KSEGRP_IN_PROC(p, kg) {
-					if (kg->kg_nice < low)
-						low = kg->kg_nice;
-				}
+				low = p->p_nice;
 			}
 			PROC_UNLOCK(p);
 		}
 		break;
 
 	case PRIO_PGRP: {
 		register struct pgrp *pg;
 
 		/*
 		 * Find and lock the group (the caller's own for a who of 0),
 		 * then take the lowest nice among its visible members.
 		 */
 		sx_slock(&proctree_lock);
 		if (uap->who == 0) {
 			pg = td->td_proc->p_pgrp;
 			PGRP_LOCK(pg);
 		} else {
 			pg = pgfind(uap->who);
 			if (pg == NULL) {
 				sx_sunlock(&proctree_lock);
 				break;
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (!p_cansee(td, p)) {
-				FOREACH_KSEGRP_IN_PROC(p, kg) {
-					if (kg->kg_nice < low)
-						low = kg->kg_nice;
-				}
+				if (p->p_nice < low)
+					low = p->p_nice;
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pg);
 		break;
 	}
 
 	case PRIO_USER:
 		/*
 		 * A who of 0 means the caller's uid.  Take the lowest nice
 		 * among all visible processes with that uid.
 		 */
 		if (uap->who == 0)
 			uap->who = td->td_ucred->cr_uid;
 		sx_slock(&allproc_lock);
 		LIST_FOREACH(p, &allproc, p_list) {
 			PROC_LOCK(p);
 			if (!p_cansee(td, p) &&
 			    p->p_ucred->cr_uid == uap->who) {
-				FOREACH_KSEGRP_IN_PROC(p, kg) {
-					if (kg->kg_nice < low)
-						low = kg->kg_nice;
-				}
+				if (p->p_nice < low)
+					low = p->p_nice;
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* An untouched sentinel means no matching, visible process. */
 	if (low == PRIO_MAX + 1 && error == 0)
 		error = ESRCH;
 	td->td_retval[0] = low;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setpriority_args {
 	int	which;
 	int	who;
 	int	prio;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 setpriority(td, uap)
 	struct thread *td;
 	register struct setpriority_args *uap;
 {
 	struct proc *curp;
 	register struct proc *p;
 	int found = 0, error = 0;
 
 	curp = td->td_proc;
 	switch (uap->which) {
 	case PRIO_PROCESS:
 		if (uap->who == 0) {
 			PROC_LOCK(curp);
 			error = donice(td, curp, uap->prio);
 			PROC_UNLOCK(curp);
 		} else {
 			p = pfind(uap->who);
 			if (p == 0)
 				break;
 			if (p_cansee(td, p) == 0)
 				error = donice(td, p, uap->prio);
 			PROC_UNLOCK(p);
 		}
 		found++;
 		break;
 
 	case PRIO_PGRP: {
 		register struct pgrp *pg;
 
 		sx_slock(&proctree_lock);
 		if (uap->who == 0) {
 			pg = curp->p_pgrp;
 			PGRP_LOCK(pg);
 		} else {
 			pg = pgfind(uap->who);
 			if (pg == NULL) {
 				sx_sunlock(&proctree_lock);
 				break;
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (!p_cansee(td, p)) {
 				error = donice(td, p, uap->prio);
 				found++;
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pg);
 		break;
 	}
 
 	case PRIO_USER:
 		if (uap->who == 0)
 			uap->who = td->td_ucred->cr_uid;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_ucred->cr_uid == uap->who &&
 			    !p_cansee(td, p)) {
 				error = donice(td, p, uap->prio);
 				found++;
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (found == 0 && error == 0)
 		error = ESRCH;
 	return (error);
 }
 
 /* 
- * Set "nice" for a process.  Doesn't really understand threaded processes
- * well but does try.  Has the unfortunate side effect of making all the NICE
- * values for a process's ksegrps the same.  This suggests that
- * NICE values should be stored as a process nice and deltas for the ksegrps.
- * (but not yet).
+ * Set "nice" for a (whole) process.
  */
 static int
 donice(struct thread *td, struct proc *p, int n)
 {
-	struct ksegrp *kg;
-	int error, low;
+	int error;
 
-	low = PRIO_MAX + 1;
 	/* The caller must hold the target's process lock. */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((error = p_cansched(td, p)))
 		return (error);
 	/* Clamp the request to the range PRIO_MIN..PRIO_MAX. */
 	if (n > PRIO_MAX)
 		n = PRIO_MAX;
 	if (n < PRIO_MIN)
 		n = PRIO_MIN;
-	/* 
-	 * Only allow nicing if to more than the lowest nice.
-	 * E.g., for nices of 4,3,2 allow nice to 3 but not 1
-	 */
-	FOREACH_KSEGRP_IN_PROC(p, kg) {
-		if (kg->kg_nice < low)
-			low = kg->kg_nice;
-	}
 	/* Lowering the nice value is refused unless suser() permits it. */
- 	if (n < low && suser(td) != 0)
+ 	if (n <  p->p_nice && suser(td) != 0)
 		return (EACCES);
 	/* The new value is applied by the scheduler under sched_lock. */
 	mtx_lock_spin(&sched_lock);
-	FOREACH_KSEGRP_IN_PROC(p, kg) {
-		sched_nice(kg, n);
-	}
+	sched_nice(p, n);
 	mtx_unlock_spin(&sched_lock);
 	return (0);
 }
 
 /*
  * Set realtime priority
  *
  * MPSAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rtprio_args {
 	int		function;
 	pid_t		pid;
 	struct rtprio	*rtp;
 };
 #endif
 
 /*
  * rtprio system call: look up (RTP_LOOKUP) or set (RTP_SET) the
  * realtime/idle/normal scheduling priority of a process.
  *
  * uap->pid == 0 selects the calling process; any other value is
  * resolved with pfind().  Returns 0 or an errno value (ESRCH, EPERM,
  * EINVAL, or whatever p_cansee()/p_cansched()/copyin()/copyout()/
  * rtp_to_pri() report).
  */
 int
 rtprio(td, uap)
 	struct thread *td;		/* curthread */
 	register struct rtprio_args *uap;
 {
 	struct proc *curp;
 	struct proc *p;
 	struct ksegrp *kg;
 	struct rtprio rtp;
 	int cierror, error;
 
 	/* Perform copyin before acquiring locks if needed. */
 	if (uap->function == RTP_SET)
 		cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 	else
 		cierror = 0;
 
 	/*
 	 * Both branches leave p locked: the pid == 0 case locks it here,
 	 * and the pfind() result is unlocked the same way below, so
 	 * pfind() presumably returns the process locked.
 	 */
 	curp = td->td_proc;
 	if (uap->pid == 0) {
 		p = curp;
 		PROC_LOCK(p);
 	} else {
 		p = pfind(uap->pid);
 		if (p == NULL)
 			return (ESRCH);
 	}
 
 	switch (uap->function) {
 	case RTP_LOOKUP:
 		if ((error = p_cansee(td, p)))
 			break;
 		mtx_lock_spin(&sched_lock);
 		/*
 		 * Return OUR priority if no pid specified,
 		 * or if one is, report the highest priority
 		 * in the process. There isn't much more you can do as
 		 * there is only room to return a single priority.
 		 * XXXKSE  Maybe need a new interface to report
 		 * priorities of multiple system scope threads.
 		 * Note: specifying our own pid is not the same
 		 * as leaving it zero.
 		 */
 		if (uap->pid == 0) {
 			pri_to_rtp(td->td_ksegrp, &rtp);
 		} else {
 			struct rtprio rtp2;
 
 			/*
 			 * Start from idle class at RTP_PRIO_MAX and keep the
 			 * numerically smallest (type, prio) pair seen.
 			 */
 			rtp.type = RTP_PRIO_IDLE;
 			rtp.prio = RTP_PRIO_MAX;
 			FOREACH_KSEGRP_IN_PROC(p, kg) {
 				pri_to_rtp(kg, &rtp2);
 				if ((rtp2.type <  rtp.type) ||
 				    ((rtp2.type == rtp.type) &&
 				     (rtp2.prio < rtp.prio))) {
 					rtp.type = rtp2.type;
 					rtp.prio = rtp2.prio;
 				}
 			}
 		}
 		mtx_unlock_spin(&sched_lock);
 		/* Drop the proc lock before copying out to user space. */
 		PROC_UNLOCK(p);
 		return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 	case RTP_SET:
 		/* A failed copyin is only reported once p_cansched() passes. */
 		if ((error = p_cansched(td, p)) || (error = cierror))
 			break;
 		/* disallow setting rtprio in most cases if not superuser */
 		if (suser(td) != 0) {
 			/* can't set someone else's */
 			if (uap->pid) {
 				error = EPERM;
 				break;
 			}
 			/* can't set realtime priority */
 /*
  * Realtime priority has to be restricted for reasons which should be
  * obvious. However, for idle priority, there is a potential for
  * system deadlock if an idleprio process gains a lock on a resource
  * that other processes need (and the idleprio process can't run
  * due to a CPU-bound normal process). Fix me! XXX
  */
 #if 0
  			if (RTP_PRIO_IS_REALTIME(rtp.type))
 #endif
 			if (rtp.type != RTP_PRIO_NORMAL) {
 				error = EPERM;
 				break;
 			}
 		}
 		mtx_lock_spin(&sched_lock);
 		/*
 		 * If we are setting our own priority, set just our
 		 * KSEGRP but if we are doing another process,
 		 * do all the groups on that process. If we
 		 * specify our own pid we do the latter.
 		 */
 		if (uap->pid == 0) {
 			error = rtp_to_pri(&rtp, td->td_ksegrp);
 		} else {
 			/* Stop at the first group rtp_to_pri() rejects. */
 			FOREACH_KSEGRP_IN_PROC(p, kg) {
 				if ((error = rtp_to_pri(&rtp, kg)) != 0) {
 					break;
 				}
 			}
 		}
 		mtx_unlock_spin(&sched_lock);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* Common exit for every path that did not return above. */
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Apply the rtprio request *rtp to ksegrp kg: set kg_user_pri to the
  * base of the requested class plus rtp->prio and switch the group's
  * scheduling class.  sched_lock must be held.  Returns EINVAL when the
  * priority exceeds RTP_PRIO_MAX or the class is not one of
  * realtime/normal/idle; otherwise 0.
  */
 int
 rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg)
 {
 	int base;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (rtp->prio > RTP_PRIO_MAX)
 		return (EINVAL);
 
 	/* Pick the lowest priority value of the requested class. */
 	switch (RTP_PRIO_BASE(rtp->type)) {
 	case RTP_PRIO_REALTIME:
 		base = PRI_MIN_REALTIME;
 		break;
 	case RTP_PRIO_NORMAL:
 		base = PRI_MIN_TIMESHARE;
 		break;
 	case RTP_PRIO_IDLE:
 		base = PRI_MIN_IDLE;
 		break;
 	default:
 		return (EINVAL);
 	}
 	kg->kg_user_pri = base + rtp->prio;
 	sched_class(kg, rtp->type);
 
 	/* When the caller belongs to kg, apply the new priority at once. */
 	if (kg == curthread->td_ksegrp) {
 		curthread->td_base_pri = kg->kg_user_pri;
 		sched_prio(curthread, kg->kg_user_pri); /* XXX dubious */
 	}
 	return (0);
 }
 
 /*
  * Translate the scheduling class and user priority of ksegrp kg into
  * the rtprio representation *rtp.  sched_lock must be held.
  *
  * rtp->type is always set to kg->kg_pri_class.  rtp->prio is the
  * offset of kg_user_pri from the base of its class; for a class that
  * is not realtime/timeshare/idle it is set to 0 so that callers which
  * pass an uninitialized struct (and may copy it out to user space)
  * never see stale stack contents.
  */
 void
 pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	switch (PRI_BASE(kg->kg_pri_class)) {
 	case PRI_REALTIME:
 		rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME;
 		break;
 	case PRI_TIMESHARE:
 		rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE;
 		break;
 	case PRI_IDLE:
 		rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE;
 		break;
 	default:
 		/* No rtprio equivalent: report a defined value. */
 		rtp->prio = 0;
 		break;
 	}
 	rtp->type = kg->kg_pri_class;
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osetrlimit_args {
 	u_int	which;
 	struct	orlimit *rlp;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 osetrlimit(td, uap)
 	struct thread *td;
 	register struct osetrlimit_args *uap;
 {
 	struct orlimit olim;
 	struct rlimit lim;
 	int error;
 
 	if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
 		return (error);
 	lim.rlim_cur = olim.rlim_cur;
 	lim.rlim_max = olim.rlim_max;
 	error = kern_setrlimit(td, uap->which, &lim);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ogetrlimit_args {
 	u_int	which;
 	struct	orlimit *rlp;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 ogetrlimit(td, uap)
 	struct thread *td;
 	register struct ogetrlimit_args *uap;
 {
 	struct orlimit olim;
 	struct rlimit rl;
 	struct proc *p;
 	int error;
 
 	if (uap->which >= RLIM_NLIMITS)
 		return (EINVAL);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	lim_rlimit(p, uap->which, &rl);
 	PROC_UNLOCK(p);
 
 	/*
 	 * XXX would be more correct to convert only RLIM_INFINITY to the
 	 * old RLIM_INFINITY and fail with EOVERFLOW for other larger
 	 * values.  Most 64->32 and 32->16 conversions, including not
 	 * unimportant ones of uids are even more broken than what we
 	 * do here (they blindly truncate).  We don't do this correctly
 	 * here since we have little experience with EOVERFLOW yet.
 	 * Elsewhere, getuid() can't fail...
 	 */
 	olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
 	olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
 	error = copyout(&olim, uap->rlp, sizeof(olim));
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct __setrlimit_args {
 	u_int	which;
 	struct	rlimit *rlp;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 setrlimit(td, uap)
 	struct thread *td;
 	register struct __setrlimit_args *uap;
 {
 	struct rlimit alim;
 	int error;
 
 	if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
 		return (error);
 	error = kern_setrlimit(td, uap->which, &alim);
 	return (error);
 }
 
 /*
  * Install *limp as the calling process's limit for resource `which'.
  *
  * *limp is modified in place (negative values become RLIM_INFINITY,
  * values are clamped to per-resource maxima).  The process's plimit is
  * not edited directly: a fresh plimit is filled from the old one,
  * updated, and swapped into p->p_limit, after which the reference on
  * the old one is dropped.
  *
  * Returns EINVAL for an out-of-range resource, the suser_cred() error
  * when raising above the current hard limit is not permitted, else 0.
  */
 int
 kern_setrlimit(td, which, limp)
 	struct thread *td;
 	u_int which;
 	struct rlimit *limp;
 {
 	struct plimit *newlim, *oldlim;
 	struct proc *p;
 	register struct rlimit *alimp;
 	rlim_t oldssiz;
 	int error;
 
 	if (which >= RLIM_NLIMITS)
 		return (EINVAL);
 
 	/*
 	 * Preserve historical bugs by treating negative limits as unsigned.
 	 */
 	if (limp->rlim_cur < 0)
 		limp->rlim_cur = RLIM_INFINITY;
 	if (limp->rlim_max < 0)
 		limp->rlim_max = RLIM_INFINITY;
 
 	oldssiz = 0;
 	p = td->td_proc;
 	/* Allocate the replacement before taking the proc lock. */
 	newlim = lim_alloc();
 	PROC_LOCK(p);
 	oldlim = p->p_limit;
 	alimp = &oldlim->pl_rlimit[which];
 	/* Exceeding the current hard limit needs privilege. */
 	if (limp->rlim_cur > alimp->rlim_max ||
 	    limp->rlim_max > alimp->rlim_max)
 		if ((error = suser_cred(td->td_ucred, PRISON_ROOT))) {
 			PROC_UNLOCK(p);
 			lim_free(newlim);
 			return (error);
 	}	/* closes the suser_cred() block despite its indentation */
 	/* The soft limit may never exceed the hard limit. */
 	if (limp->rlim_cur > limp->rlim_max)
 		limp->rlim_cur = limp->rlim_max;
 	lim_copy(newlim, oldlim);
 	/* From here on alimp refers to the slot in the new plimit. */
 	alimp = &newlim->pl_rlimit[which];
 
 	switch (which) {
 
 	case RLIMIT_CPU:
 		/* p_cpulimit is updated under sched_lock. */
 		mtx_lock_spin(&sched_lock);
 		p->p_cpulimit = limp->rlim_cur;
 		mtx_unlock_spin(&sched_lock);
 		break;
 	case RLIMIT_DATA:
 		if (limp->rlim_cur > maxdsiz)
 			limp->rlim_cur = maxdsiz;
 		if (limp->rlim_max > maxdsiz)
 			limp->rlim_max = maxdsiz;
 		break;
 
 	case RLIMIT_STACK:
 		if (limp->rlim_cur > maxssiz)
 			limp->rlim_cur = maxssiz;
 		if (limp->rlim_max > maxssiz)
 			limp->rlim_max = maxssiz;
 		/* Remember the previous soft limit for the remap below. */
 		oldssiz = alimp->rlim_cur;
 		break;
 
 	case RLIMIT_NOFILE:
 		if (limp->rlim_cur > maxfilesperproc)
 			limp->rlim_cur = maxfilesperproc;
 		if (limp->rlim_max > maxfilesperproc)
 			limp->rlim_max = maxfilesperproc;
 		break;
 
 	case RLIMIT_NPROC:
 		/* Keep both values within [1, maxprocperuid]. */
 		if (limp->rlim_cur > maxprocperuid)
 			limp->rlim_cur = maxprocperuid;
 		if (limp->rlim_max > maxprocperuid)
 			limp->rlim_max = maxprocperuid;
 		if (limp->rlim_cur < 1)
 			limp->rlim_cur = 1;
 		if (limp->rlim_max < 1)
 			limp->rlim_max = 1;
 		break;
 	}
 	/* Store the new value, publish the new plimit, drop the old one. */
 	*alimp = *limp;
 	p->p_limit = newlim;
 	PROC_UNLOCK(p);
 	lim_free(oldlim);
 
 	if (which == RLIMIT_STACK) {
 		/*
 		 * Stack is allocated to the max at exec time with only
 		 * "rlim_cur" bytes accessible.  If stack limit is going
 		 * up make more accessible, if going down make inaccessible.
 		 */
 		if (limp->rlim_cur != oldssiz) {
 			vm_offset_t addr;
 			vm_size_t size;
 			vm_prot_t prot;
 
 			mtx_lock(&Giant);
 			if (limp->rlim_cur > oldssiz) {
 				prot = p->p_sysent->sv_stackprot;
 				size = limp->rlim_cur - oldssiz;
 				addr = p->p_sysent->sv_usrstack -
 				    limp->rlim_cur;
 			} else {
 				prot = VM_PROT_NONE;
 				size = oldssiz - limp->rlim_cur;
 				addr = p->p_sysent->sv_usrstack -
 				    oldssiz;
 			}
 			addr = trunc_page(addr);
 			size = round_page(size);
 			/* The result of the protection change is ignored. */
 			(void) vm_map_protect(&p->p_vmspace->vm_map,
 					      addr, addr+size, prot, FALSE);
 			mtx_unlock(&Giant);
 		}
 	}
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __getrlimit_args {
 	u_int	which;
 	struct	rlimit *rlp;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 getrlimit(td, uap)
 	struct thread *td;
 	register struct __getrlimit_args *uap;
 {
 	struct rlimit rlim;
 	struct proc *p;
 	int error;
 
 	if (uap->which >= RLIM_NLIMITS)
 		return (EINVAL);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	lim_rlimit(p, uap->which, &rlim);
 	PROC_UNLOCK(p);
 	error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
 	return(error);
 }
 
 /*
  * Transform the running time and tick information in proc p into user,
  * system, and interrupt time usage.
  *
  * The total run time (p_runtime, plus the current slice when p is the
  * running process) is split in proportion to the user/system/interrupt
  * tick counts.  The results are cached in p_uu/p_su/p_iu and are never
  * allowed to go backwards relative to the cached values.
  *
  * up and sp receive user and system time; ip receives interrupt time
  * and may be NULL.  sched_lock must be held.
  */
 void
 calcru(p, up, sp, ip)
 	struct proc *p;
 	struct timeval *up;
 	struct timeval *sp;
 	struct timeval *ip;
 {
 	struct bintime bt;
 	struct timeval tv;
 	/* {user, system, interrupt, total} {ticks, usec}; previous tu: */
 	u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* XXX: why spl-protect ?  worst case is an off-by-one report */
 
 	ut = p->p_uticks;
 	st = p->p_sticks;
 	it = p->p_iticks;
 
 	tt = ut + st + it;
 	/* No ticks yet: attribute everything to system time, avoid tt == 0. */
 	if (tt == 0) {
 		st = 1;
 		tt = 1;
 	}
 	if (p == curthread->td_proc) {
 		/*
 		 * Adjust for the current time slice.  This is actually fairly
 		 * important since the error here is on the order of a time
 		 * quantum, which is much greater than the sampling error.
 		 * XXXKSE use a different test due to threads on other
 		 * processors also being 'current'.
 		 */
 		binuptime(&bt);
 		bintime_sub(&bt, PCPU_PTR(switchtime));
 		bintime_add(&bt, &p->p_runtime);
 	} else
 		bt = p->p_runtime;
 	bintime2timeval(&bt, &tv);
 	/* Total run time in microseconds, and the previously reported total. */
 	tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
 	ptu = p->p_uu + p->p_su + p->p_iu;
 	/* Never report less than before; complain and reuse the old total. */
 	if (tu < ptu || (int64_t)tu < 0) {
 		printf("calcru: negative time of %jd usec for pid %d (%s)\n",
 		    (intmax_t)tu, p->p_pid, p->p_comm);
 		tu = ptu;
 	}
 
 	/* Subdivide tu. */
 	uu = (tu * ut) / tt;
 	su = (tu * st) / tt;
 	iu = tu - uu - su;
 
 	/* Enforce monotonicity. */
 	if (uu < p->p_uu || su < p->p_su || iu < p->p_iu) {
 		/* Fix user time first, leaving room for the old su and iu. */
 		if (uu < p->p_uu)
 			uu = p->p_uu;
 		else if (uu + p->p_su + p->p_iu > tu)
 			uu = tu - p->p_su - p->p_iu;
 		/* Then split the remainder between system and interrupt. */
 		if (st == 0)
 			su = p->p_su;
 		else {
 			su = ((tu - uu) * st) / (st + it);
 			if (su < p->p_su)
 				su = p->p_su;
 			else if (uu + su + p->p_iu > tu)
 				su = tu - uu - p->p_iu;
 		}
 		KASSERT(uu + su + p->p_iu <= tu,
 		    ("calcru: monotonisation botch 1"));
 		iu = tu - uu - su;
 		KASSERT(iu >= p->p_iu,
 		    ("calcru: monotonisation botch 2"));
 	}
 	/* Cache the split for the next call's monotonicity check. */
 	p->p_uu = uu;
 	p->p_su = su;
 	p->p_iu = iu;
 
 	up->tv_sec = uu / 1000000;
 	up->tv_usec = uu % 1000000;
 	sp->tv_sec = su / 1000000;
 	sp->tv_usec = su % 1000000;
 	if (ip != NULL) {
 		ip->tv_sec = iu / 1000000;
 		ip->tv_usec = iu % 1000000;
 	}
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getrusage_args {
 	int	who;
 	struct	rusage *rusage;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 getrusage(td, uap)
 	register struct thread *td;
 	register struct getrusage_args *uap;
 {
 	struct rusage ru;
 	struct proc *p;
 
 	p = td->td_proc;
 	switch (uap->who) {
 
 	case RUSAGE_SELF:
 		mtx_lock(&Giant);
 		mtx_lock_spin(&sched_lock);
 		calcru(p, &p->p_stats->p_ru.ru_utime, &p->p_stats->p_ru.ru_stime,
 		    NULL);
 		mtx_unlock_spin(&sched_lock);
 		ru = p->p_stats->p_ru;
 		mtx_unlock(&Giant);
 		break;
 
 	case RUSAGE_CHILDREN:
 		mtx_lock(&Giant);
 		ru = p->p_stats->p_cru;
 		mtx_unlock(&Giant);
 		break;
 
 	default:
 		return (EINVAL);
 		break;
 	}
 	return (copyout(&ru, uap->rusage, sizeof(struct rusage)));
 }
 
 /*
  * Accumulate the usage in *ru2 into *ru: user and system times are
  * added, ru_maxrss becomes the larger of the two, and every long
  * counter from ru_first through ru_last (inclusive) is summed.
  */
 void
 ruadd(ru, ru2)
 	register struct rusage *ru, *ru2;
 {
 	register long *dst, *src;
 	register int left;
 
 	timevaladd(&ru->ru_utime, &ru2->ru_utime);
 	timevaladd(&ru->ru_stime, &ru2->ru_stime);
 	if (ru2->ru_maxrss > ru->ru_maxrss)
 		ru->ru_maxrss = ru2->ru_maxrss;
 
 	dst = &ru->ru_first;
 	src = &ru2->ru_first;
 	left = (&ru->ru_last - &ru->ru_first) + 1;
 	while (left-- > 0) {
 		*dst += *src;
 		dst++;
 		src++;
 	}
 }
 
 /*
  * Create a new plimit with a single reference and a mutex taken from
  * the sleep mutex pool.  The rlimit array itself is left for the
  * caller to fill in.
  */
 struct plimit *
 lim_alloc()
 {
 	struct plimit *newlim;
 
 	newlim = malloc(sizeof(*newlim), M_PLIMIT, M_WAITOK);
 	newlim->pl_refcnt = 1;
 	newlim->pl_mtx = mtx_pool_alloc(mtxpool_sleep);
 	return (newlim);
 }
 
 /*
  * Take an additional reference on limp under its lock and return
  * limp for the caller's convenience.
  */
 struct plimit *
 lim_hold(limp)
 	struct plimit *limp;
 {
 
 	LIM_LOCK(limp);
 	limp->pl_refcnt += 1;
 	LIM_UNLOCK(limp);
 	return (limp);
 }
 
 /*
  * Drop one reference on limp; the structure is freed once the last
  * reference is gone.  The lock is released before freeing.
  */
 void
 lim_free(limp)
 	struct plimit *limp;
 {
 	int last;
 
 	LIM_LOCK(limp);
 	KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
 	last = (--limp->pl_refcnt == 0);
 	LIM_UNLOCK(limp);
 	if (last)
 		free((void *)limp, M_PLIMIT);
 }
 
 /*
  * Copy the rlimit array of src into dst.  dst must be private to the
  * caller (exactly one reference), since plimit structures are shared
  * copy-on-write after fork.
  */
 void
 lim_copy(dst, src)
 	struct plimit *dst, *src;
 {
 
 	KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
 	bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(dst->pl_rlimit));
 }
 
 /*
  * Return the hard limit for a particular system resource.  The
  * which parameter specifies the index into the rlimit array.
  */
 rlim_t
 lim_max(struct proc *p, int which)
 {
 	struct rlimit rl;
 
 	lim_rlimit(p, which, &rl);
 	return (rl.rlim_max);
 }
 
 /*
  * Return the current (soft) limit for a particular system resource.
  * The which parameter which specifies the index into the rlimit array
  */
 rlim_t
 lim_cur(struct proc *p, int which)
 {
 	struct rlimit rl;
 
 	lim_rlimit(p, which, &rl);
 	return (rl.rlim_cur);
 }
 
 /*
  * Return a copy of the entire rlimit structure for the system limit
  * specified by 'which' in the rlimit structure pointed to by 'rlp'.
  */
 void
 lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(which >= 0 && which < RLIM_NLIMITS,
 	    ("request for invalid resource limit"));
 	*rlp = p->p_limit->pl_rlimit[which];
 }
 
 /*
  * Set up the uidinfo hash table (sized from maxproc / 16) and the
  * mutex that protects it.  A uidinfo structure is used to track the
  * total resource consumption (process count, socket buffer size,
  * etc.) for a uid and impose limits.
  */
 void
 uihashinit()
 {
 
 	uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
 	mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
 }
 
 /*
  * Look up a uidinfo struct for the parameter uid.
  * uihashtbl_mtx must be locked.
  */
 static struct uidinfo *
 uilookup(uid)
 	uid_t uid;
 {
 	struct uihashhead *uipp;
 	struct uidinfo *uip;
 
 	mtx_assert(&uihashtbl_mtx, MA_OWNED);
 	uipp = UIHASH(uid);
 	LIST_FOREACH(uip, uipp, ui_hash)
 		if (uip->ui_uid == uid)
 			break;
 
 	return (uip);
 }
 
 /*
  * Find or allocate a struct uidinfo for a particular uid.
  * Increase refcount on uidinfo struct returned.
  * uifree() should be called on a struct uidinfo when released.
  *
  * The hash mutex is dropped around the allocation, so the table is
  * searched a second time before a new entry is inserted.
  */
 struct uidinfo *
 uifind(uid)
 	uid_t uid;
 {
 	struct uidinfo *old_uip, *uip;
 
 	mtx_lock(&uihashtbl_mtx);
 	uip = uilookup(uid);
 	if (uip == NULL) {
 		/* Allocate (zeroed) without holding the hash mutex. */
 		mtx_unlock(&uihashtbl_mtx);
 		uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
 		mtx_lock(&uihashtbl_mtx);
 		/*
 		 * There's a chance someone created our uidinfo while we
 		 * were in malloc and not holding the lock, so we have to
 		 * make sure we don't insert a duplicate uidinfo.
 		 */
 		if ((old_uip = uilookup(uid)) != NULL) {
 			/* Someone else beat us to it. */
 			free(uip, M_UIDINFO);
 			uip = old_uip;
 		} else {
 			/* Finish initializing the new entry and publish it. */
 			uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep);
 			uip->ui_uid = uid;
 			LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
 		}
 	}
 	/* Take the caller's reference while the hash mutex is still held. */
 	uihold(uip);
 	mtx_unlock(&uihashtbl_mtx);
 	return (uip);
 }
 
 /*
  * Add one reference to a uidinfo struct, under the uidinfo's own lock.
  */
 void
 uihold(uip)
 	struct uidinfo *uip;
 {
 
 	UIDINFO_LOCK(uip);
 	uip->ui_ref += 1;
 	UIDINFO_UNLOCK(uip);
 }
 
 /*-
  * Since uidinfo structs have a long lifetime, we use an
  * opportunistic refcounting scheme to avoid locking the lookup hash
  * for each release.
  *
  * If the refcount hits 0, we need to free the structure,
  * which means we need to lock the hash.
  * Optimal case:
  *   After locking the struct and lowering the refcount, if we find
  *   that we don't need to free, simply unlock and return.
  * Suboptimal case:
  *   If refcount lowering results in need to free, bump the count
  *   back up, lose the lock and acquire the locks in the proper
  *   order to try again.
  */
 /*
  * Release one reference on uip.  When the count reaches zero the
  * entry is unhooked from the hash and freed; leftover socket-buffer
  * or process counts are reported with printf() first.
  */
 void
 uifree(uip)
 	struct uidinfo *uip;
 {
 
 	/* Prepare for optimal case. */
 	UIDINFO_LOCK(uip);
 
 	/* Not the last reference: nothing more to do. */
 	if (--uip->ui_ref != 0) {
 		UIDINFO_UNLOCK(uip);
 		return;
 	}
 
 	/* Prepare for suboptimal case. */
 	/* Undo the decrement, then retake the locks hash mutex first. */
 	uip->ui_ref++;
 	UIDINFO_UNLOCK(uip);
 	mtx_lock(&uihashtbl_mtx);
 	UIDINFO_LOCK(uip);
 
 	/*
 	 * We must subtract one from the count again because we backed out
 	 * our initial subtraction before dropping the lock.
 	 * Since another thread may have added a reference after we dropped the
 	 * initial lock we have to test for zero again.
 	 */
 	if (--uip->ui_ref == 0) {
 		/* Unhook from the hash while the hash mutex is still held. */
 		LIST_REMOVE(uip, ui_hash);
 		mtx_unlock(&uihashtbl_mtx);
 		if (uip->ui_sbsize != 0)
 			printf("freeing uidinfo: uid = %d, sbsize = %jd\n",
 			    uip->ui_uid, (intmax_t)uip->ui_sbsize);
 		if (uip->ui_proccnt != 0)
 			printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
 			    uip->ui_uid, uip->ui_proccnt);
 		UIDINFO_UNLOCK(uip);
 		FREE(uip, M_UIDINFO);
 		return;
 	}
 
 	/* Someone took a new reference in the meantime; keep the entry. */
 	mtx_unlock(&uihashtbl_mtx);
 	UIDINFO_UNLOCK(uip);
 }
 
 /*
  * Adjust the per-uid process count by `diff'.  An increase that would
  * push the count above `max' is refused (returns 0) unless `max' is 0,
  * which disables the limit; decreases are always applied.  Returns 1
  * when the count was changed.
  */
 int
 chgproccnt(uip, diff, max)
 	struct	uidinfo	*uip;
 	int	diff;
 	int	max;
 {
 	int changed;
 
 	UIDINFO_LOCK(uip);
 	if (max != 0 && diff > 0 && uip->ui_proccnt + diff > max) {
 		/* Over the limit: leave the count untouched. */
 		changed = 0;
 	} else {
 		uip->ui_proccnt += diff;
 		if (uip->ui_proccnt < 0)
 			printf("negative proccnt for uid = %d\n", uip->ui_uid);
 		changed = 1;
 	}
 	UIDINFO_UNLOCK(uip);
 	return (changed);
 }
 
 /*
  * Change one socket buffer's high-water mark from *hiwat to `to' and
  * charge the difference to the owning uid's total.  Growth that would
  * take the total above `max' is refused (returns 0, nothing changed);
  * shrinking is always allowed.  On success *hiwat is updated and 1 is
  * returned.
  */
 int
 chgsbsize(uip, hiwat, to, max)
 	struct	uidinfo	*uip;
 	u_int  *hiwat;
 	u_int	to;
 	rlim_t	max;
 {
 	rlim_t total;
 	int changed, s;
 
 	s = splnet();
 	UIDINFO_LOCK(uip);
 	total = uip->ui_sbsize + to - *hiwat;
 	if (to > *hiwat && total > max) {
 		/* Growing past the limit: refuse. */
 		changed = 0;
 	} else {
 		uip->ui_sbsize = total;
 		*hiwat = to;
 		if (uip->ui_sbsize < 0)
 			printf("negative sbsize for uid = %d\n", uip->ui_uid);
 		changed = 1;
 	}
 	splx(s);
 	UIDINFO_UNLOCK(uip);
 	return (changed);
 }
Index: head/sys/kern/sched_4bsd.c
===================================================================
--- head/sys/kern/sched_4bsd.c	(revision 130550)
+++ head/sys/kern/sched_4bsd.c	(revision 130551)
@@ -1,873 +1,876 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/kthread.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 
 #define KTR_4BSD	0x0
 
 /*
  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
  * the range 100-256 Hz (approximately).
  */
 #define	ESTCPULIM(e) \
     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
 #ifdef SMP
 #define	INVERSE_ESTCPU_WEIGHT	(8 * smp_cpus)
 #else
 #define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
 #endif
 #define	NICE_WEIGHT		1	/* Priorities per nice level. */
 
 /*
  * Scheduler-private per-KSE state for this scheduler; reached through
  * ke->ke_sched (see the ke_runq accessor macro below).
  */
 struct ke_sched {
 	int		ske_cpticks;	/* (j) Ticks of cpu time. */
 	struct runq	*ske_runq;	/* runq the kse is currently on */
 };
 #define ke_runq 	ke_sched->ske_runq
 #define KEF_BOUND	KEF_SCHED1
 
 #define SKE_RUNQ_PCPU(ke)						\
     ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)
 
 /*
  * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
  * cpus.
  */
 #define KSE_CAN_MIGRATE(ke)						\
     ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
 static struct ke_sched ke_sched;
 
 struct ke_sched *kse0_sched = &ke_sched;
 struct kg_sched *ksegrp0_sched = NULL;
 struct p_sched *proc0_sched = NULL;
 struct td_sched *thread0_sched = NULL;
 
 static int	sched_tdcnt;	/* Total runnable threads in the system. */
 static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
 #define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */
 
 static struct callout roundrobin_callout;
 
 static void	setup_runqs(void);
 static void	roundrobin(void *arg);
 static void	schedcpu(void);
 static void	schedcpu_thread(void);
 static void	sched_setup(void *dummy);
 static void	maybe_resched(struct thread *td);
 static void	updatepri(struct ksegrp *kg);
 static void	resetpriority(struct ksegrp *kg);
 
 static struct kproc_desc sched_kp = {
         "schedcpu",
         schedcpu_thread,
         NULL
 };
 SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
 
 /*
  * Global run queue.
  */
 static struct runq runq;
 
 #ifdef SMP
 /*
  * Per-CPU run queues
  */
 static struct runq runq_pcpu[MAXCPU];
 #endif
 
 /*
  * Initialize the run queues: one per possible CPU on SMP kernels,
  * followed by the global queue.
  */
 static void
 setup_runqs(void)
 {
 #ifdef SMP
 	int cpu;
 
 	for (cpu = 0; cpu < MAXCPU; cpu++)
 		runq_init(&runq_pcpu[cpu]);
 #endif
 
 	runq_init(&runq);
 }
 
 static int
 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 {
 	int error, new_val;
 
 	new_val = sched_quantum * tick;
 	error = sysctl_handle_int(oidp, &new_val, 0, req);
         if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (new_val < tick)
 		return (EINVAL);
 	sched_quantum = new_val / tick;
 	hogticks = 2 * sched_quantum;
 	return (0);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
 	0, sizeof sched_quantum, sysctl_kern_quantum, "I",
 	"Roundrobin scheduling quantum in microseconds");
 
 /*
  * Request a reschedule of the current thread when td has a better
  * (numerically lower) priority than curthread and curthread has a
  * KSE.  sched_lock must be held.
  */
 static void
 maybe_resched(struct thread *td)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (td->td_priority >= curthread->td_priority)
 		return;
 	if (!curthread->td_kse)
 		return;
 	curthread->td_flags |= TDF_NEEDRESCHED;
 }
 
 /*
  * Force switch among equal priority processes every 100ms.
  * We don't actually need to force a context switch of the current process.
  * The act of firing the event triggers a context switch to softclock() and
  * then switching back out again which is equivalent to a preemption, thus
  * no further work is needed on the local CPU.
  */
 /* ARGSUSED */
 /*
  * Callout handler: on SMP kernels forward the round-robin event via
  * forward_roundrobin() under sched_lock, then re-arm itself to fire
  * again after sched_quantum ticks.  `arg' is unused.
  */
 static void
 roundrobin(void *arg)
 {
 
 #ifdef SMP
 	mtx_lock_spin(&sched_lock);
 	forward_roundrobin();
 	mtx_unlock_spin(&sched_lock);
 #endif
 
 	/* Re-arm: this callout reschedules itself every quantum. */
 	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
 }
 
 /*
  * Constants for digital decay and forget:
  *	90% of (kg_estcpu) usage in 5 * loadav time
  *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
  *          Note that, as ps(1) mentions, this can let percentages
  *          total over 100% (I've seen 137.9% for 3 processes).
  *
  * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
  *
  * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
  * That is, the system wants to compute a value of decay such
  * that the following for loop:
  * 	for (i = 0; i < (5 * loadavg); i++)
  * 		kg_estcpu *= decay;
  * will compute
  * 	kg_estcpu *= 0.1;
  * for all values of loadavg:
  *
  * Mathematically this loop can be expressed by saying:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * The system computes decay as:
  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
  *
  * We wish to prove that the system's computation of decay
  * will always fulfill the equation:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * If we compute b as:
  * 	b = 2 * loadavg
  * then
  * 	decay = b / (b + 1)
  *
  * We now need to prove two things:
  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
  *
  * Facts:
  *         For x close to zero, exp(x) =~ 1 + x, since
  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
  *         For x close to zero, ln(1+x) =~ x, since
  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
  *         ln(.1) =~ -2.30
  *
  * Proof of (1):
  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
  *	solving for factor,
  *      ln(factor) =~ (-2.30/5*loadav), or
  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
  *
  * Proof of (2):
  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
  *	solving for power,
  *      power*ln(b/(b+1)) =~ -2.30, or
  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
  *
  * Actual power values for the implemented algorithm are as follows:
  *      loadav: 1       2       3       4
  *      power:  5.68    10.32   14.94   19.55
  */
 
 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 #define	loadfactor(loadav)	(2 * (loadav))
 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 
 /* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 
 /*
  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
  *
  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
  *
  * If you don't want to bother with the faster/more-accurate formula, you
  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
  * (more general) method of calculating the %age of CPU used by a process.
  */
 #define	CCPU_SHIFT	11
 
 /*
  * Recompute process priorities, every hz ticks.
  * MP-safe, called without the Giant mutex.
  *
  * Walks every process under a shared allproc_lock; for each process
  * sched_lock is held while its KSEs' ke_pctcpu and its ksegrps'
  * kg_estcpu are decayed and the user priorities are recomputed.
  */
 /* ARGSUSED */
 static void
 schedcpu(void)
 {
 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 	struct thread *td;
 	struct proc *p;
 	struct kse *ke;
 	struct ksegrp *kg;
 	int awake, realstathz;
 
 	/* Fall back to hz when no separate statistics clock rate is set. */
 	realstathz = stathz ? stathz : hz;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/*
 		 * Prevent state changes and protect run queue.
 		 */
 		mtx_lock_spin(&sched_lock);
 		/*
 		 * Increment time in/out of memory.  We ignore overflow; with
 		 * 16-bit int's (remember them?) overflow takes 45 days.
 		 */
 		p->p_swtime++;
 		FOREACH_KSEGRP_IN_PROC(p, kg) { 
 			awake = 0;
 			FOREACH_KSE_IN_GROUP(kg, ke) {
 				/*
 				 * Increment sleep time (if sleeping).  We
 				 * ignore overflow, as above.
 				 */
 				/*
 				 * The kse slptimes are not touched in wakeup
 				 * because the thread may not HAVE a KSE.
 				 */
 				/*
 				 * The group counts as awake if any KSE is on
 				 * a run queue, is running, or ran since the
 				 * last pass (KEF_DIDRUN).
 				 */
 				if (ke->ke_state == KES_ONRUNQ) {
 					awake = 1;
 					ke->ke_flags &= ~KEF_DIDRUN;
 				} else if ((ke->ke_state == KES_THREAD) &&
 				    (TD_IS_RUNNING(ke->ke_thread))) {
 					awake = 1;
 					/* Do not clear KEF_DIDRUN */
 				} else if (ke->ke_flags & KEF_DIDRUN) {
 					awake = 1;
 					ke->ke_flags &= ~KEF_DIDRUN;
 				}
 
 				/*
 				 * ke_pctcpu is only for ps and ttyinfo().
 				 * Do it per kse, and add them up at the end?
 				 * XXXKSE
 				 */
 				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
 				    FSHIFT;
 				/*
 				 * If the kse has been idle the entire second,
 				 * stop recalculating its priority until
 				 * it wakes up.
 				 */
 				if (ke->ke_sched->ske_cpticks == 0)
 					continue;
 				/* Fold this pass's ticks into ke_pctcpu. */
 #if	(FSHIFT >= CCPU_SHIFT)
 				ke->ke_pctcpu += (realstathz == 100)
 				    ? ((fixpt_t) ke->ke_sched->ske_cpticks) <<
 				    (FSHIFT - CCPU_SHIFT) :
 				    100 * (((fixpt_t) ke->ke_sched->ske_cpticks)
 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
 				ke->ke_pctcpu += ((FSCALE - ccpu) *
 				    (ke->ke_sched->ske_cpticks *
 				    FSCALE / realstathz)) >> FSHIFT;
 #endif
 				ke->ke_sched->ske_cpticks = 0;
 			} /* end of kse loop */
 			/* 
 			 * If there are ANY running threads in this KSEGRP,
 			 * then don't count it as sleeping.
 			 */
 			if (awake) {
 				if (kg->kg_slptime > 1) {
 					/*
 					 * In an ideal world, this should not
 					 * happen, because whoever woke us
 					 * up from the long sleep should have
 					 * unwound the slptime and reset our
 					 * priority before we run at the stale
 					 * priority.  Should KASSERT at some
 					 * point when all the cases are fixed.
 					 */
 					updatepri(kg);
 				}
 				kg->kg_slptime = 0;
 			} else
 				kg->kg_slptime++;
 			/* Groups asleep for more than a pass are left alone. */
 			if (kg->kg_slptime > 1)
 				continue;
 			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
 		      	resetpriority(kg);
 			/* Only threads at user-level priorities are adjusted. */
 			FOREACH_THREAD_IN_GROUP(kg, td) {
 				if (td->td_priority >= PUSER) {
 					sched_prio(td, kg->kg_user_pri);
 				}
 			}
 		} /* end of ksegrp loop */
 		mtx_unlock_spin(&sched_lock);
 	} /* end of process loop */
 	sx_sunlock(&allproc_lock);
 }
 
 /*
  * Body of the schedcpu kernel thread: run schedcpu(), sleep for hz
  * ticks, repeat forever.  The sleep channel is a local variable, so
  * the sleep is expected to end by timeout.
  */
 static void
 schedcpu_thread(void)
 {
 	int chan;
 
 	while (1) {
 		schedcpu();
 		tsleep(&chan, curthread->td_priority, "-", hz);
 	}
 }
 
 /*
  * Bring a ksegrp's CPU estimate up to date after it has been asleep.
  * When kg_slptime exceeds 5 * loadfac the estimate is simply cleared;
  * otherwise decay_cpu() is applied while kg_slptime is counted down,
  * stopping early once the estimate reaches zero.  In either case the
  * user priority is then recomputed via resetpriority().
  */
 static void
 updatepri(struct ksegrp *kg)
 {
 	fixpt_t lf;
 	unsigned int est;
 
 	lf = loadfactor(averunnable.ldavg[0]);
 	if (kg->kg_slptime <= 5 * lf) {
 		est = kg->kg_estcpu;
 		/* Undo the increment already applied in schedcpu(). */
 		kg->kg_slptime--;
 		for (;;) {
 			/* A zero estimate cannot decay any further. */
 			if (est == 0)
 				break;
 			if (--kg->kg_slptime == 0)
 				break;
 			est = decay_cpu(lf, est);
 		}
 		kg->kg_estcpu = est;
 	} else {
 		kg->kg_estcpu = 0;
 	}
 	resetpriority(kg);
 }
 
 /*
  * Compute the priority of a process when running in user mode.
  * Arrange to reschedule if the resulting priority is better
  * than that of the current process.
  */
 static void
 resetpriority(struct ksegrp *kg)
 {
 	register unsigned int newpriority;
 	struct thread *td;
 
 	/*
 	 * Only timeshare groups get a recomputed user priority.  The value
 	 * is built from the CPU estimate and the nice value, then clamped
 	 * to the range [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE].
 	 */
 	if (kg->kg_pri_class == PRI_TIMESHARE) {
 		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
-		    NICE_WEIGHT * (kg->kg_nice - PRIO_MIN);
+		    NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
 		    PRI_MAX_TIMESHARE);
 		kg->kg_user_pri = newpriority;
 	}
 	/* Every thread in the group is offered to maybe_resched(). */
 	FOREACH_THREAD_IN_GROUP(kg, td) {
 		maybe_resched(td);			/* XXXKSE silly */
 	}
 }
 
 /*
  * One-time scheduler initialisation.  Sets up the run queues, defaults
  * sched_quantum to SCHED_QUANTUM when it is still zero, derives hogticks
  * as twice the quantum, initialises the round-robin callout and invokes
  * roundrobin() once, and counts thread0 in sched_tdcnt.  The argument is
  * not used.
  */
 /* ARGSUSED */
 static void
 sched_setup(void *dummy)
 {
 	setup_runqs();
 
 	/* A zero quantum means "not configured": fall back to the default. */
 	if (sched_quantum == 0)
 		sched_quantum = SCHED_QUANTUM;
 	hogticks = 2 * sched_quantum;
 
 	callout_init(&roundrobin_callout, CALLOUT_MPSAFE);
 
 	/* Kick off timeout driven events by calling first time. */
 	roundrobin(NULL);
 
 	/* Account for thread0. */
 	sched_tdcnt++;
 }
 
 /* External interfaces start here */
 int
 sched_runnable(void)
 {
 	int queued;
 
 	/*
 	 * Sum of runq_check() over the global run queue and, on SMP, the
 	 * current CPU's private run queue.
 	 */
 	queued = runq_check(&runq);
 #ifdef SMP
 	queued += runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
 #endif
 	return (queued);
 }
 
 int
 sched_rr_interval(void)
 {
 	/*
 	 * Return the round-robin quantum, installing the default first if
 	 * it has not been set yet.
 	 */
 	if (!sched_quantum) {
 		sched_quantum = SCHED_QUANTUM;
 	}
 	return (sched_quantum);
 }
 
 /*
  * We adjust the priority of the current process.  The priority of
  * a process gets worse as it accumulates CPU time.  The cpu usage
  * estimator (kg_estcpu) is increased here.  resetpriority() will
  * compute a different priority each time kg_estcpu increases by
  * INVERSE_ESTCPU_WEIGHT
  * (until MAXPRI is reached).  The cpu usage estimator ramps up
  * quite quickly when the process is running (linearly), and decays
  * away exponentially, at a rate which is proportionally slower when
  * the system is busy.  The basic principle is that the system will
  * 90% forget that the process used a lot of CPU time in 5 * loadav
  * seconds.  This causes the system to favor processes which haven't
  * run much recently, and to round-robin among other processes.
  */
 void
 sched_clock(struct thread *td)
 {
 	struct ksegrp *grp;
 	struct kse *k;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	grp = td->td_ksegrp;
 	k = td->td_kse;
 
 	/* Charge one tick to the KSE and to the group's CPU estimate. */
 	k->ke_sched->ske_cpticks++;
 	grp->kg_estcpu = ESTCPULIM(grp->kg_estcpu + 1);
 
 	/* The priority is recomputed once per INVERSE_ESTCPU_WEIGHT ticks. */
 	if ((grp->kg_estcpu % INVERSE_ESTCPU_WEIGHT) != 0)
 		return;
 	resetpriority(grp);
 	if (td->td_priority >= PUSER)
 		td->td_priority = grp->kg_user_pri;
 }
 
 /*
  * Charge the child's scheduling CPU usage to the parent.
  *
  * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
  * Charge it to the ksegrp that did the wait since process estcpu is sum of
  * all ksegrps, this is strictly as expected.  Assume that the child process
  * aggregated all the estcpu into the 'built-in' ksegrp.
  */
 void
 sched_exit(struct proc *p, struct proc *p1)
 {
 	struct kse *pke, *cke;
 	struct ksegrp *pkg, *ckg;
 	struct thread *ptd, *ctd;
 
 	/*
 	 * Run the three per-object exit hooks on the first KSE, ksegrp and
 	 * thread of each process; p1's objects are passed as the "child".
 	 */
 	pke = FIRST_KSE_IN_PROC(p);
 	cke = FIRST_KSE_IN_PROC(p1);
 	sched_exit_kse(pke, cke);
 
 	pkg = FIRST_KSEGRP_IN_PROC(p);
 	ckg = FIRST_KSEGRP_IN_PROC(p1);
 	sched_exit_ksegrp(pkg, ckg);
 
 	ptd = FIRST_THREAD_IN_PROC(p);
 	ctd = FIRST_THREAD_IN_PROC(p1);
 	sched_exit_thread(ptd, ctd);
 }
 
 /*
  * Per-KSE exit hook.  Intentionally empty: this scheduler does nothing
  * with the KSEs on exit.
  */
 void
 sched_exit_kse(struct kse *ke, struct kse *child)
 {
 }
 
 /*
  * Per-ksegrp exit hook: add the child's kg_estcpu to kg's, limited by
  * ESTCPULIM().  Must be called with sched_lock held.
  */
 void
 sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu);
 }
 
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 	/* Threads of P_NOLOAD processes are not counted in sched_tdcnt. */
 	if (child->td_proc->p_flag & P_NOLOAD)
 		return;
 	sched_tdcnt--;
 }
 
 void
 sched_fork(struct proc *p, struct proc *p1)
 {
 	struct kse *pke, *cke;
 	struct ksegrp *pkg, *ckg;
 	struct thread *ptd, *ctd;
 
 	/*
 	 * Run the three per-object fork hooks on the first KSE, ksegrp and
 	 * thread of each process; p1's objects are passed as the "child".
 	 */
 	pke = FIRST_KSE_IN_PROC(p);
 	cke = FIRST_KSE_IN_PROC(p1);
 	sched_fork_kse(pke, cke);
 
 	pkg = FIRST_KSEGRP_IN_PROC(p);
 	ckg = FIRST_KSEGRP_IN_PROC(p1);
 	sched_fork_ksegrp(pkg, ckg);
 
 	ptd = FIRST_THREAD_IN_PROC(p);
 	ctd = FIRST_THREAD_IN_PROC(p1);
 	sched_fork_thread(ptd, ctd);
 }
 
 /*
  * Per-KSE fork hook: the child KSE starts with a zero tick count.  The
  * parent KSE (ke) is not consulted.
  */
 void
 sched_fork_kse(struct kse *ke, struct kse *child)
 {
 	child->ke_sched->ske_cpticks = 0;
 }
 
 /*
  * Per-ksegrp fork hook: the child inherits the parent's CPU estimate.
  * Must be called with sched_lock held.
  */
 void
 sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	child->kg_estcpu = kg->kg_estcpu;
 }
 
 /*
  * Per-thread fork hook.  Intentionally empty: this scheduler keeps no
  * per-thread state that needs setting up here.
  */
 void
 sched_fork_thread(struct thread *td, struct thread *child)
 {
 }
 
 /*
  * Set the nice value and recompute the affected priorities.  In the
  * updated (+) form the value is stored in the process (p_nice) and
  * resetpriority() is run for every ksegrp of that process; the old (-)
  * form stored it in a single ksegrp.  The process lock and sched_lock
  * must both be held.
  */
 void
-sched_nice(struct ksegrp *kg, int nice)
+sched_nice(struct proc *p, int nice)
 {
+	struct ksegrp *kg;
 
-	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
+	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&sched_lock, MA_OWNED);
-	kg->kg_nice = nice;
-	resetpriority(kg);
+	p->p_nice = nice;
+	FOREACH_KSEGRP_IN_PROC(p, kg) {
+		resetpriority(kg);
+	}
 }
 
 /*
  * Record a new priority class for the ksegrp.  Only kg_pri_class is
  * written; no priority is recomputed here.  Must be called with
  * sched_lock held.
  */
 void
 sched_class(struct ksegrp *kg, int class)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	kg->kg_pri_class = class;
 }
 
 /*
  * Change the priority of a thread.
  * A thread that is sitting on a run queue is handed to adjustrunqueue()
  * together with the new priority; for any other thread td_priority is
  * simply overwritten.  Must be called with sched_lock held.
  */
 void
 sched_prio(struct thread *td, u_char prio)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (!TD_ON_RUNQ(td)) {
 		td->td_priority = prio;
 		return;
 	}
 	adjustrunqueue(td, prio);
 }
 
 void
 sched_sleep(struct thread *td)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* Remember the current priority as the base priority ... */
 	td->td_base_pri = td->td_priority;
 	/* ... and start the group's sleep-time count from zero. */
 	td->td_ksegrp->kg_slptime = 0;
 }
 
 /*
  * Switch the CPU away from td.  The thread's bookkeeping is updated,
  * it is either requeued (still running) or its KSE is offered for
  * reassignment (P_SA process), and the thread returned by choosethread()
  * is switched to when it differs from td.  Must be called with
  * sched_lock held.
  */
 void
 sched_switch(struct thread *td)
 {
 	struct thread *newtd;
 	struct kse *ke;
 	struct proc *p;
 
 	ke = td->td_kse;
 	p = td->td_proc;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((ke->ke_state == KES_THREAD), ("sched_switch: kse state?"));
 
 	/* The outgoing thread no longer counts in sched_tdcnt. */
 	if ((p->p_flag & P_NOLOAD) == 0)
 		sched_tdcnt--;
 	/* Record where the thread last ran, then detach it from the CPU. */
 	td->td_lastcpu = td->td_oncpu;
 	td->td_last_kse = ke;
 	td->td_flags &= ~TDF_NEEDRESCHED;
 	td->td_oncpu = NOCPU;
 	/*
 	 * At the last moment, if this thread is still marked RUNNING,
 	 * then put it back on the run queue as it has not been suspended
 	 * or stopped or any thing else similar.
 	 */
 	if (TD_IS_RUNNING(td)) {
 		/* Put us back on the run queue (kse and all). */
 		setrunqueue(td);
 	} else if (p->p_flag & P_SA) {
 		/*
 		 * We will not be on the run queue. So we must be
 		 * sleeping or similar. As it's available,
 		 * someone else can use the KSE if they need it.
 		 */
 		kse_reassign(ke);
 	}
 	newtd = choosethread();
 	if (td != newtd)
 		cpu_switch(td, newtd);
 	/*
 	 * Record td as the holder of sched_lock and note the CPU it is on.
 	 * NOTE(review): presumably this runs when td is resumed after
 	 * cpu_switch(); confirm against the cpu_switch() contract.
 	 */
 	sched_lock.mtx_lock = (uintptr_t)td;
 	td->td_oncpu = PCPU_GET(cpuid);
 }
 
 void
 sched_wakeup(struct thread *td)
 {
 	struct ksegrp *grp;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	grp = td->td_ksegrp;
 	/* After more than one tick of sleep the CPU estimate is stale. */
 	if (grp->kg_slptime > 1) {
 		updatepri(grp);
 	}
 	grp->kg_slptime = 0;
 	/* Make the thread runnable and see whether it should preempt. */
 	setrunqueue(td);
 	maybe_resched(td);
 }
 
 /*
  * Put the thread's KSE on a run queue.  The KSE must have a thread,
  * must not already be queued and its process must be in memory.  On
  * SMP a KSE that can migrate goes on the global queue; otherwise it
  * goes on a per-CPU queue (the current CPU's, unless one is already
  * recorded).  Must be called with sched_lock held.
  */
 void
 sched_add(struct thread *td)
 {
 	struct kse *ke;
 
 	ke = td->td_kse;
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
 	KASSERT((ke->ke_thread->td_kse != NULL),
 	    ("sched_add: No KSE on thread"));
 	KASSERT(ke->ke_state != KES_ONRUNQ,
 	    ("sched_add: kse %p (%s) already in run queue", ke,
 	    ke->ke_proc->p_comm));
 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 	    ("sched_add: process swapped out"));
 	ke->ke_ksegrp->kg_runq_kses++;
 	ke->ke_state = KES_ONRUNQ;
 
 #ifdef SMP
 	if (KSE_CAN_MIGRATE(ke)) {
 		CTR1(KTR_4BSD, "adding kse:%p to gbl runq", ke);
 		ke->ke_runq = &runq;
 	} else {
 		CTR1(KTR_4BSD, "adding kse:%p to pcpu runq", ke);
 		/* Keep an existing per-CPU queue choice if there is one. */
 		if (!SKE_RUNQ_PCPU(ke))
 			ke->ke_runq = &runq_pcpu[PCPU_GET(cpuid)];
 	}
 #else
 	ke->ke_runq = &runq;
 #endif
 	/* Threads of P_NOLOAD processes are not counted in sched_tdcnt. */
 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
 		sched_tdcnt++;
 	runq_add(ke->ke_runq, ke);
 }
 
 /*
  * Take the thread's KSE off the run queue it is recorded on.  The KSE
  * must currently be queued and its process must be in memory.  Must be
  * called with sched_lock held.
  */
 void
 sched_rem(struct thread *td)
 {
 	struct kse *ke;
 
 	ke = td->td_kse;
 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 	    ("sched_rem: process swapped out"));
 	KASSERT((ke->ke_state == KES_ONRUNQ),
 	    ("sched_rem: KSE not on run queue"));
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	/* Mirror of the accounting done in sched_add(). */
 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
 		sched_tdcnt--;
 	runq_remove(ke->ke_sched->ske_runq, ke);
 
 	ke->ke_state = KES_THREAD;
 	ke->ke_ksegrp->kg_runq_kses--;
 }
 
 /*
  * Pick the next KSE to run and remove it from its run queue.  On SMP
  * both the global queue and the current CPU's queue are consulted and
  * the per-CPU candidate wins when the global queue is empty or when its
  * thread has a numerically lower td_priority.  Returns NULL when no KSE
  * is available.
  */
 struct kse *
 sched_choose(void)
 {
 	struct kse *ke;
 	struct runq *rq;
 
 #ifdef SMP
 	struct kse *kecpu;
 
 	rq = &runq;
 	ke = runq_choose(&runq);
 	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
 
 	if (ke == NULL || 
 	    (kecpu != NULL && 
 	     kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
 		CTR2(KTR_4BSD, "choosing kse %p from pcpu runq %d", kecpu,
 		     PCPU_GET(cpuid));
 		ke = kecpu;
 		rq = &runq_pcpu[PCPU_GET(cpuid)];
 	} else { 
 		CTR1(KTR_4BSD, "choosing kse %p from main runq", ke);
 	}
 
 #else
 	rq = &runq;
 	ke = runq_choose(&runq);
 #endif
 
 	if (ke != NULL) {
 		/* rq is whichever queue the winning candidate came from. */
 		runq_remove(rq, ke);
 		ke->ke_state = KES_THREAD;
 
 		KASSERT((ke->ke_thread != NULL),
 		    ("sched_choose: No thread on KSE"));
 		KASSERT((ke->ke_thread->td_kse != NULL),
 		    ("sched_choose: No KSE on thread"));
 		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 		    ("sched_choose: process swapped out"));
 	}
 	return (ke);
 }
 
 void
 sched_userret(struct thread *td)
 {
 	struct ksegrp *grp;
 
 	/*
 	 * XXX The comparison below is done without sched_lock so that the
 	 * common case (priority already correct) takes no lock at all.
 	 * Setting td_priority here is only an incomplete workaround for it
 	 * not being set properly elsewhere.  Since some interrupt handlers
 	 * are threads, the value can still be clobbered between this point
 	 * and the return to user mode, so no effort is spent on making the
 	 * update perfect.
 	 */
 	grp = td->td_ksegrp;
 	if (td->td_priority == grp->kg_user_pri)
 		return;
 	mtx_lock_spin(&sched_lock);
 	td->td_priority = grp->kg_user_pri;
 	mtx_unlock_spin(&sched_lock);
 }
 
 void
 sched_bind(struct thread *td, int cpu)
 {
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT(TD_IS_RUNNING(td),
 	    ("sched_bind: cannot bind non-running thread"));
 
 	/* Mark the thread's KSE as bound. */
 	ke = td->td_kse;
 	ke->ke_flags |= KEF_BOUND;
 #ifdef SMP
 	/* From now on the KSE is queued on the target CPU's run queue. */
 	ke->ke_runq = &runq_pcpu[cpu];
 	if (PCPU_GET(cpuid) != cpu) {
 		/* Not on the target CPU: switch away voluntarily. */
 		ke->ke_state = KES_THREAD;
 		mi_switch(SW_VOL);
 	}
 #endif
 }
 
 void
 sched_unbind(struct thread* td)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	td->td_kse->ke_flags &= ~KEF_BOUND;
 }
 
 /*
  * Return the scheduler's current thread count (sched_tdcnt), which the
  * add, remove, switch and exit paths keep up to date.
  */
 int
 sched_load(void)
 {
 	return (sched_tdcnt);
 }
 
 /*
  * Size of a KSE including this scheduler's private per-KSE data
  * (struct ke_sched).
  */
 int
 sched_sizeof_kse(void)
 {
 	return (sizeof(struct kse) + sizeof(struct ke_sched));
 }
 /* Size of a ksegrp; no scheduler-private data is added to it here. */
 int
 sched_sizeof_ksegrp(void)
 {
 	return (sizeof(struct ksegrp));
 }
 /* Size of a proc; no scheduler-private data is added to it here. */
 int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
 }
 /* Size of a thread; no scheduler-private data is added to it here. */
 int
 sched_sizeof_thread(void)
 {
 	return (sizeof(struct thread));
 }
 
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	struct kse *ke;
 
 	/*
 	 * Report ke_pctcpu of the thread's KSE, falling back to the last
 	 * KSE it used; a thread with neither reports zero.
 	 */
 	ke = td->td_kse;
 	if (ke == NULL)
 		ke = td->td_last_kse;
 	if (ke == NULL)
 		return (0);
 	return (ke->ke_pctcpu);
 }
Index: head/sys/kern/sched_ule.c
===================================================================
--- head/sys/kern/sched_ule.c	(revision 130550)
+++ head/sys/kern/sched_ule.c	(revision 130551)
@@ -1,1744 +1,1750 @@
 /*-
  * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmmeter.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 
 #include <machine/cpu.h>
 #include <machine/smp.h>
 
 #define KTR_ULE         KTR_NFS
 
 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 /* XXX This is bogus compatibility crap for ps */
 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 
 static void sched_setup(void *dummy);
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
 
 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");
 
 static int slice_min = 1;
 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
 
 static int slice_max = 10;
 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
 
 int realstathz;
 int tickincr = 1;
 
 /*
  * These datastructures are allocated within their parent datastructure but
  * are scheduler specific.
  */
 
 struct ke_sched {
 	int		ske_slice;
 	struct runq	*ske_runq;
 	/* The following variables are only used for pctcpu calculation */
 	int		ske_ltick;	/* Last tick that we were running on */
 	int		ske_ftick;	/* First tick that we were running on */
 	int		ske_ticks;	/* Tick count */
 	/* CPU that we have affinity for. */
 	u_char		ske_cpu;
 };
 #define	ke_slice	ke_sched->ske_slice
 #define	ke_runq		ke_sched->ske_runq
 #define	ke_ltick	ke_sched->ske_ltick
 #define	ke_ftick	ke_sched->ske_ftick
 #define	ke_ticks	ke_sched->ske_ticks
 #define	ke_cpu		ke_sched->ske_cpu
 #define	ke_assign	ke_procq.tqe_next
 
 #define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
 #define	KEF_BOUND	KEF_SCHED1	/* KSE can not migrate. */
 
 struct kg_sched {
 	int	skg_slptime;		/* Number of ticks we vol. slept */
 	int	skg_runtime;		/* Number of ticks we were running */
 };
 #define	kg_slptime	kg_sched->skg_slptime
 #define	kg_runtime	kg_sched->skg_runtime
 
 struct td_sched {
 	int	std_slptime;
 };
 #define	td_slptime	td_sched->std_slptime
 
 struct td_sched td_sched;
 struct ke_sched ke_sched;
 struct kg_sched kg_sched;
 
 struct ke_sched *kse0_sched = &ke_sched;
 struct kg_sched *ksegrp0_sched = &kg_sched;
 struct p_sched *proc0_sched = NULL;
 struct td_sched *thread0_sched = &td_sched;
 
 /*
  * The priority is primarily determined by the interactivity score.  Thus, we
  * give lower(better) priorities to kse groups that use less CPU.  The nice
  * value is then directly added to this to allow nice to have some effect
  * on latency.
  *
  * PRI_RANGE:	Total priority range for timeshare threads.
  * PRI_NRESV:	Number of nice values.
  * PRI_BASE:	The start of the dynamic range.
  */
 #define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
 #define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
 #define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
 #define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
 #define	SCHED_PRI_INTERACT(score)					\
     ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
 
 /*
  * These determine the interactivity of a process.
  *
  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
  *		before throttling back.
  * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
  * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
  * INTERACT_THRESH:	Threshold for placement on the current runq.
  */
 #define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
 #define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
 #define	SCHED_INTERACT_MAX	(100)
 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
 #define	SCHED_INTERACT_THRESH	(30)
 
 /*
  * These parameters and macros determine the size of the time slice that is
  * granted to each thread.
  *
  * SLICE_MIN:	Minimum time slice granted, in units of ticks.
  * SLICE_MAX:	Maximum time slice granted.
  * SLICE_RANGE:	Range of available time slices scaled by hz.
  * SLICE_SCALE:	The number slices granted per val in the range of [0, max].
  * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
  * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
  */
 #define	SCHED_SLICE_MIN			(slice_min)
 #define	SCHED_SLICE_MAX			(slice_max)
 #define	SCHED_SLICE_INTERACTIVE		(slice_max)
 #define	SCHED_SLICE_NTHRESH	(SCHED_PRI_NHALF - 1)
 #define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
 #define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
 #define	SCHED_SLICE_NICE(nice)						\
     (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
 
 /*
  * This macro determines whether or not the kse belongs on the current or
  * next run queue.
  */
 #define	SCHED_INTERACTIVE(kg)						\
     (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
 #define	SCHED_CURR(kg, ke)						\
     (ke->ke_thread->td_priority < kg->kg_user_pri ||			\
     SCHED_INTERACTIVE(kg))
 
 /*
  * Cpu percentage computation macros and defines.
  *
  * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
  * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
  */
 
 #define	SCHED_CPU_TIME	10
 #define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
 
 /*
  * kseq - per processor runqs and statistics.
  */
 struct kseq {
 	struct runq	ksq_idle;		/* Queue of IDLE threads. */
 	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
 	struct runq	*ksq_next;		/* Next timeshare queue. */
 	struct runq	*ksq_curr;		/* Current queue. */
 	int		ksq_load_timeshare;	/* Load for timeshare. */
 	int		ksq_load;		/* Aggregate load. */
 	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
 	short		ksq_nicemin;		/* Least nice. */
 #ifdef SMP
 	/* Count of queued KSEs for which KSE_CAN_MIGRATE() held. */
 	int			ksq_transferable;
 	LIST_ENTRY(kseq)	ksq_siblings;	/* Next in kseq group. */
 	struct kseq_group	*ksq_group;	/* Our processor group. */
 	volatile struct kse	*ksq_assigned;	/* assigned by another CPU. */
 #else
 	int		ksq_sysload;		/* For loadavg, !ITHD load. */
 #endif
 };
 
 #ifdef SMP
 /*
  * kseq groups are groups of processors which can cheaply share threads.  When
  * one processor in the group goes idle it will check the runqs of the other
  * processors in its group prior to halting and waiting for an interrupt.
  * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
  * In a numa environment we'd want an idle bitmap per group and a two tiered
  * load balancer.
  */
 struct kseq_group {
 	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
 	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
 	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
 	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
 	int	ksg_load;		/* Total load of this group. */
 	int	ksg_transferable;	/* Transferable load of this group. */
 	LIST_HEAD(, kseq)	ksg_members; /* Linked list of all members. */
 };
 #endif
 
 /*
  * One kse queue per processor.
  */
 #ifdef SMP
 static cpumask_t kseq_idle;
 static int ksg_maxid;
 static struct kseq	kseq_cpu[MAXCPU];
 static struct kseq_group kseq_groups[MAXCPU];
 static int bal_tick;
 static int gbal_tick;
 
 #define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
 #define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
 #define	KSEQ_ID(x)	((x) - kseq_cpu)
 #define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
 #else	/* !SMP */
 static struct kseq	kseq_cpu;
 
 #define	KSEQ_SELF()	(&kseq_cpu)
 #define	KSEQ_CPU(x)	(&kseq_cpu)
 #endif
 
 static void sched_slice(struct kse *ke);
 static void sched_priority(struct ksegrp *kg);
 static int sched_interact_score(struct ksegrp *kg);
 static void sched_interact_update(struct ksegrp *kg);
 static void sched_interact_fork(struct ksegrp *kg);
 static void sched_pctcpu_update(struct kse *ke);
 
 /* Operations on per processor queues */
 static struct kse * kseq_choose(struct kseq *kseq);
 static void kseq_setup(struct kseq *kseq);
 static void kseq_load_add(struct kseq *kseq, struct kse *ke);
 static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
 static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke);
 static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
 static void kseq_nice_add(struct kseq *kseq, int nice);
 static void kseq_nice_rem(struct kseq *kseq, int nice);
 void kseq_print(int cpu);
 #ifdef SMP
 static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
 static struct kse *runq_steal(struct runq *rq);
 static void sched_balance(void);
 static void sched_balance_groups(void);
 static void sched_balance_group(struct kseq_group *ksg);
 static void sched_balance_pair(struct kseq *high, struct kseq *low);
 static void kseq_move(struct kseq *from, int cpu);
 static int kseq_idled(struct kseq *kseq);
 static void kseq_notify(struct kse *ke, int cpu);
 static void kseq_assign(struct kseq *);
 static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
 /*
  * On P4 Xeons the round-robin interrupt delivery is broken.  As a result of
  * this, we can't pin interrupts to the cpu that they were delivered to, 
  * otherwise all ithreads only run on CPU 0.
  */
 #ifdef __i386__
 #define	KSE_CAN_MIGRATE(ke, class)					\
     ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
 #else /* !__i386__ */
 #define	KSE_CAN_MIGRATE(ke, class)					\
     ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 &&		\
     ((ke)->ke_flags & KEF_BOUND) == 0)
 #endif /* !__i386__ */
 #endif
 
 /*
  * Print the load counters and the non-empty nice bins of the kseq that
  * KSEQ_CPU() returns for the given cpu.
  */
 void
 kseq_print(int cpu)
 {
 	struct kseq *ksq;
 	int slot;
 
 	ksq = KSEQ_CPU(cpu);
 
 	printf("kseq:\n");
 	printf("\tload:           %d\n", ksq->ksq_load);
 	printf("\tload TIMESHARE: %d\n", ksq->ksq_load_timeshare);
 #ifdef SMP
 	printf("\tload transferable: %d\n", ksq->ksq_transferable);
 #endif
 	printf("\tnicemin:\t%d\n", ksq->ksq_nicemin);
 	printf("\tnice counts:\n");
 	for (slot = 0; slot < SCHED_PRI_NRESV; slot++) {
 		/* Empty bins are not printed. */
 		if (ksq->ksq_nice[slot] == 0)
 			continue;
 		/* Bins are indexed by nice + SCHED_PRI_NHALF. */
 		printf("\t\t%d = %d\n",
 		    slot - SCHED_PRI_NHALF, ksq->ksq_nice[slot]);
 	}
 }
 
 /*
  * Add a KSE to the run queue recorded in ke_runq.  On SMP, a KSE that
  * can migrate is also counted in the transferable totals of the kseq
  * and of its group.
  */
 static __inline void
 kseq_runq_add(struct kseq *kseq, struct kse *ke)
 {
 #ifdef SMP
 	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
 		kseq->ksq_transferable++;
 		kseq->ksq_group->ksg_transferable++;
 	}
 #endif
 	runq_add(ke->ke_runq, ke);
 }
 
 /*
  * Remove a KSE from the run queue recorded in ke_runq.  On SMP, a KSE
  * that can migrate is also subtracted from the transferable totals of
  * the kseq and of its group.
  */
 static __inline void
 kseq_runq_rem(struct kseq *kseq, struct kse *ke)
 {
 #ifdef SMP
 	if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) {
 		kseq->ksq_transferable--;
 		kseq->ksq_group->ksg_transferable--;
 	}
 #endif
 	runq_remove(ke->ke_runq, ke);
 }
 
 /*
  * Account for a KSE being added to this kseq: bump the timeshare and
  * aggregate load counters, the group load (SMP) or system load (!SMP)
  * for non-interrupt KSEs of processes without P_NOLOAD, and, for
  * timeshare KSEs, the nice bin of the owning process.  Must be called
  * with sched_lock held.
  */
 static void
 kseq_load_add(struct kseq *kseq, struct kse *ke)
 {
 	int class;
 	mtx_assert(&sched_lock, MA_OWNED);
 	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
 	if (class == PRI_TIMESHARE)
 		kseq->ksq_load_timeshare++;
 	kseq->ksq_load++;
 	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
 #ifdef SMP
 		kseq->ksq_group->ksg_load++;
 #else
 		kseq->ksq_sysload++;
 #endif
 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
 		CTR6(KTR_ULE,
 		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
 		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
-		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
+		    ke->ke_proc->p_nice, kseq->ksq_nicemin);
 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
-		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
+		kseq_nice_add(kseq, ke->ke_proc->p_nice);
 }
 
 /*
  * Reverse of kseq_load_add(): drop the KSE from the load counters and,
  * for timeshare KSEs, from its nice bin.  The KSE's ke_runq is cleared.
  * Must be called with sched_lock held.
  */
 static void
 kseq_load_rem(struct kseq *kseq, struct kse *ke)
 {
 	int class;
 	mtx_assert(&sched_lock, MA_OWNED);
 	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
 	if (class == PRI_TIMESHARE)
 		kseq->ksq_load_timeshare--;
 	if (class != PRI_ITHD  && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
 #ifdef SMP
 		kseq->ksq_group->ksg_load--;
 #else
 		kseq->ksq_sysload--;
 #endif
 	kseq->ksq_load--;
 	ke->ke_runq = NULL;
 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
-		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
+		kseq_nice_rem(kseq, ke->ke_proc->p_nice);
 }
 
 /*
  * Count one more timeshare KSE with the given nice value and lower
  * ksq_nicemin if needed.  Must be called with sched_lock held.
  */
 static void
 kseq_nice_add(struct kseq *kseq, int nice)
 {
 	int slot;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* Bins are indexed from zero, so shift nice by SCHED_PRI_NHALF. */
 	slot = nice + SCHED_PRI_NHALF;
 	kseq->ksq_nice[slot]++;
 	/* A sole timeshare KSE always defines the minimum. */
 	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) {
 		kseq->ksq_nicemin = nice;
 	}
 }
 
 /*
  * Count one fewer timeshare KSE with the given nice value and, when
  * that emptied the bin holding the current minimum, look for the new
  * minimum.  Must be called with sched_lock held.
  */
 static void
 kseq_nice_rem(struct kseq *kseq, int nice)
 {
 	int slot;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* Bins are indexed from zero, so shift nice by SCHED_PRI_NHALF. */
 	slot = nice + SCHED_PRI_NHALF;
 	kseq->ksq_nice[slot]--;
 	KASSERT(kseq->ksq_nice[slot] >= 0, ("Negative nice count."));
 
 	/*
 	 * The minimum only needs recomputing when the KSE removed was the
 	 * last one at the minimum nice and timeshare load remains.
 	 */
 	if (nice == kseq->ksq_nicemin &&
 	    kseq->ksq_nice[slot] == 0 &&
 	    kseq->ksq_load_timeshare != 0) {
 		/* Scan upwards for the first bin still in use. */
 		while (slot < SCHED_PRI_NRESV) {
 			if (kseq->ksq_nice[slot] != 0) {
 				kseq->ksq_nicemin = slot - SCHED_PRI_NHALF;
 				break;
 			}
 			slot++;
 		}
 	}
 }
 
 #ifdef SMP
 /*
  * sched_balance is a simple CPU load balancing algorithm.  It operates by
  * finding the least loaded and most loaded cpu and equalizing their load
  * by migrating some processes.
  *
  * Dealing only with two CPUs at a time has two advantages.  Firstly, most
  * installations will only have 2 cpus.  Secondly, load balancing too much at
  * once can have an unpleasant effect on the system.  The scheduler rarely has
  * enough information to make perfect decisions.  So this algorithm chooses
  * algorithm simplicity and more gradual effects on load in larger systems.
  *
  * It could be improved by considering the priorities and slices assigned to
  * each task prior to balancing them.  There are many pathological cases with
  * any approach and so the semi random algorithm below may work as well as any.
  *
  */
 static void
 sched_balance(void)
 {
 	struct kseq_group *busiest;
 	struct kseq_group *idlest;
 	struct kseq_group *cur;
 	int seen;
 	int idx;
 
 	if (smp_started != 0) {
 		busiest = NULL;
 		idlest = NULL;
 		/* Visit every group once, starting at a random index. */
 		idx = random() % (ksg_maxid + 1);
 		for (seen = 0; seen <= ksg_maxid; seen++) {
 			cur = KSEQ_GROUP(idx);
 			/*
 			 * The donor is the most loaded group that has
 			 * something it can transfer.
 			 */
 			if (cur->ksg_transferable &&
 			    (busiest == NULL ||
 			    cur->ksg_load > busiest->ksg_load))
 				busiest = cur;
 			if (idlest == NULL || cur->ksg_load < idlest->ksg_load)
 				idlest = cur;
 			idx++;
 			if (idx > ksg_maxid)
 				idx = 0;
 		}
 		if (idlest != NULL && busiest != NULL && busiest != idlest) {
 			sched_balance_pair(LIST_FIRST(&busiest->ksg_members),
 			    LIST_FIRST(&idlest->ksg_members));
 		}
 	}
 	/* Pick the tick of the next balancing pass. */
 	bal_tick = ticks + (random() % (hz * 2));
 }
 
 /*
  * Balance the CPUs inside every kseq group, then pick the tick of the
  * next pass.  Must be called with sched_lock held.
  */
 static void
 sched_balance_groups(void)
 {
 	int grp;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (smp_started) {
 		grp = 0;
 		while (grp <= ksg_maxid) {
 			sched_balance_group(KSEQ_GROUP(grp));
 			grp++;
 		}
 	}
 	gbal_tick = ticks + (random() % (hz * 2));
 }
 
 /*
  * Balance load between the most and least loaded kseq of one group.
  * Nothing is done when the group has no transferable KSEs.
  */
 static void
 sched_balance_group(struct kseq_group *ksg)
 {
 	struct kseq *busiest;
 	struct kseq *idlest;
 	struct kseq *cur;
 
 	if (ksg->ksg_transferable == 0)
 		return;
 	busiest = NULL;
 	idlest = NULL;
 	LIST_FOREACH(cur, &ksg->ksg_members, ksq_siblings) {
 		if (busiest == NULL || cur->ksq_load > busiest->ksq_load)
 			busiest = cur;
 		if (idlest == NULL || cur->ksq_load < idlest->ksq_load)
 			idlest = cur;
 	}
 	if (busiest == NULL || idlest == NULL || busiest == idlest)
 		return;
 	sched_balance_pair(busiest, idlest);
 }
 
 /*
  * Move KSEs from 'high' towards 'low' until their loads are roughly
  * even, limited by how many KSEs are transferable.
  */
 static void
 sched_balance_pair(struct kseq *high, struct kseq *low)
 {
 	int avail;
 	int load_hi;
 	int load_lo;
 	int gap;
 	int count;
 	int dest;
 
 	/*
 	 * Across groups the group-wide counters apply, because any member
 	 * of the donor group may give up a KSE.  When transferring within
 	 * one group only this kseq's own counters are relevant.
 	 */
 	if (high->ksq_group != low->ksq_group) {
 		avail = high->ksq_group->ksg_transferable;
 		load_hi = high->ksq_group->ksg_load;
 		load_lo = low->ksq_group->ksg_load;
 	} else {
 		avail = high->ksq_transferable;
 		load_hi = high->ksq_load;
 		load_lo = low->ksq_load;
 	}
 	if (avail == 0)
 		return;
 	/*
 	 * Aim to move half of the load difference (one more when the
 	 * difference is odd), but never more than what is transferable.
 	 */
 	gap = load_hi - load_lo;
 	count = gap / 2;
 	if (gap & 0x1)
 		count++;
 	count = min(count, avail);
 	dest = KSEQ_ID(low);
 	while (count-- > 0)
 		kseq_move(high, dest);
 }
 
 /*
  * Move one KSE from 'from' (or, failing that, from another member of
  * its group) to the kseq of the given cpu.  Panics when no KSE could
  * be obtained.
  */
 static void
 kseq_move(struct kseq *from, int cpu)
 {
 	struct kseq *kseq;
 	struct kseq *to;
 	struct kse *ke;
 
 	kseq = from;
 	to = KSEQ_CPU(cpu);
 	ke = kseq_steal(kseq, 1);
 	if (ke == NULL) {
 		struct kseq_group *ksg;
 
 		/*
 		 * Nothing to take from 'from' itself: try the first other
 		 * group member that reports transferable KSEs.  Note that
 		 * 'kseq' is left pointing at that member for the code below.
 		 */
 		ksg = kseq->ksq_group;
 		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
 			if (kseq == from || kseq->ksq_transferable == 0)
 				continue;
 			ke = kseq_steal(kseq, 1);
 			break;
 		}
 		if (ke == NULL)
 			panic("kseq_move: No KSEs available with a "
 			    "transferable count of %d\n", 
 			    ksg->ksg_transferable);
 	}
 	/* The KSE already lives on the destination: nothing to move. */
 	if (kseq == to)
 		return;
 	/* Detach the KSE from its source and hand it to the target CPU. */
 	ke->ke_state = KES_THREAD;
 	kseq_runq_rem(kseq, ke);
 	kseq_load_rem(kseq, ke);
 	kseq_notify(ke, cpu);
 }
 
 /*
  * Called for a kseq that has nothing to run.  Returns 0 when a KSE was
  * stolen from another CPU of the same group and queued on the current
  * CPU, and 1 when the CPU really is idle.
  */
 static int
 kseq_idled(struct kseq *kseq)
 {
 	struct kseq_group *grp;
 	struct kseq *victim;
 	struct kse *ke;
 
 	grp = kseq->ksq_group;
 	/*
 	 * With more than one cpu in the group, first try to take a KSE
 	 * from one of the other members before going idle.
 	 */
 	if (grp->ksg_cpus > 1 && grp->ksg_transferable) {
 		LIST_FOREACH(victim, &grp->ksg_members, ksq_siblings) {
 			if (victim == kseq || victim->ksq_transferable == 0)
 				continue;
 			ke = kseq_steal(victim, 0);
 			if (ke == NULL)
 				continue;
 			/* Detach from the victim and requeue it here. */
 			ke->ke_state = KES_THREAD;
 			kseq_runq_rem(victim, ke);
 			kseq_load_rem(victim, ke);
 			ke->ke_cpu = PCPU_GET(cpuid);
 			sched_add(ke->ke_thread);
 			return (0);
 		}
 	}
 	/*
 	 * The group's bit in kseq_idle is only set once every cpu in the
 	 * group is idle.  Otherwise a KSE could bounce back and forth
 	 * between two idle cores on separate physical CPUs.
 	 */
 	grp->ksg_idlemask |= PCPU_GET(cpumask);
 	if (grp->ksg_idlemask == grp->ksg_cpumask)
 		atomic_set_int(&kseq_idle, grp->ksg_mask);
 	return (1);
 }
 
 /*
  * Take over the KSEs that other CPUs have handed to this kseq.
  *
  * The whole ksq_assigned list is detached in one step by atomically
  * replacing the list head with NULL.  Each KSE on it then has
  * KEF_ASSIGNED cleared and is passed to sched_add().
  */
 static void
 kseq_assign(struct kseq *kseq)
 {
 	struct kse *nke;
 	struct kse *ke;
 
 	do {
 		/*
 		 * Store through a volatile-qualified view of 'ke' instead
 		 * of casting the left-hand side: the result of a cast is
 		 * not an lvalue in ISO C.
 		 */
 		*(volatile struct kse **)&ke = kseq->ksq_assigned;
 	} while (!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
 	for (; ke != NULL; ke = nke) {
 		/*
 		 * Read the link before sched_add(): ke_assign shares
 		 * storage with the ke_procq run queue linkage.
 		 */
 		nke = ke->ke_assign;
 		ke->ke_flags &= ~KEF_ASSIGNED;
 		sched_add(ke->ke_thread);
 	}
 }
 
 /*
  * Hand a KSE to another cpu.
  *
  * The KSE is marked KEF_ASSIGNED, tagged with the target cpu and pushed
  * onto the head of that cpu's ksq_assigned list with a compare-and-set
  * loop.  The target is then sent an AST IPI when it is running its idle
  * thread or a thread with a numerically higher td_priority than the
  * new KSE's thread.
  */
 static void
 kseq_notify(struct kse *ke, int cpu)
 {
 	struct kseq *kseq;
 	struct thread *td;
 	struct pcpu *pcpu;
 
 	ke->ke_cpu = cpu;
 	ke->ke_flags |= KEF_ASSIGNED;
 
 	kseq = KSEQ_CPU(cpu);
 
 	/*
 	 * Place a KSE on another cpu's queue and force a resched.
 	 */
 	do {
 		/*
 		 * Store through a volatile-qualified view of the link
 		 * instead of casting the left-hand side: the result of a
 		 * cast is not an lvalue in ISO C.
 		 */
 		*(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
 	} while (!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
 	pcpu = pcpu_find(cpu);
 	td = pcpu->pc_curthread;
 	if (ke->ke_thread->td_priority < td->td_priority ||
 	    td == pcpu->pc_idlethread) {
 		td->td_flags |= TDF_NEEDRESCHED;
 		ipi_selected(1 << cpu, IPI_AST);
 	}
 }
 
 /*
  * Walk a run queue from its first status word upward and return the first
  * kse that KSE_CAN_MIGRATE() accepts, or NULL if there is none.  The kse is
  * only located, not removed.  sched_lock must be held.
  */
 static struct kse *
 runq_steal(struct runq *rq)
 {
 	struct rqbits *status;
 	struct rqhead *head;
 	struct kse *cand;
 	int w;
 	int b;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	status = &rq->rq_status;
 	for (w = 0; w < RQB_LEN; w++) {
 		/* An all-zero word means none of its queues are populated. */
 		if (status->rqb_bits[w] != 0) {
 			for (b = 0; b < RQB_BPW; b++) {
 				if ((status->rqb_bits[w] & (1ul << b)) != 0) {
 					head = &rq->rq_queues[b +
 					    (w << RQB_L2BPW)];
 					TAILQ_FOREACH(cand, head, ke_procq) {
 						if (KSE_CAN_MIGRATE(cand,
 						    PRI_BASE(cand->ke_ksegrp->kg_pri_class)))
 							return (cand);
 					}
 				}
 			}
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Pick a kse that may be migrated away from the given kseq.  The queues are
  * tried in the order next, curr and, only when stealidle is non-zero, idle.
  * Returns NULL if no queue yields a candidate.
  */
 static struct kse *
 kseq_steal(struct kseq *kseq, int stealidle)
 {
 	struct kse *stolen;
 
 	/*
 	 * The next queue goes first so that we tend to pick up a
 	 * non-interactive task that may not have run for a while.
 	 */
 	stolen = runq_steal(kseq->ksq_next);
 	if (stolen == NULL)
 		stolen = runq_steal(kseq->ksq_curr);
 	if (stolen == NULL && stealidle)
 		stolen = runq_steal(&kseq->ksq_idle);
 	return (stolen);
 }
 
 /*
  * Try to push a kse from this kseq to an idle cpu.
  *
  * Returns 1 if the kse was handed to another cpu through kseq_notify()
  * (the caller must not queue it locally), 0 if it should stay here.
  * Always returns 0 before smp_started is set.  The class argument is not
  * used by the body.
  */
 int
 kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
 {
 	struct kseq_group *ksg;
 	int cpu;
 
 	if (smp_started == 0)
 		return (0);
 	/* cpu holds an ffs()-style 1-based index; 0 means "none found". */
 	cpu = 0;
 	ksg = kseq->ksq_group;
 
 	/*
 	 * If there are any idle groups, give them our extra load.  The
 	 * threshold at which we start to reassign kses has a large impact
 	 * on the overall performance of the system.  Tuned too high and
 	 * some CPUs may idle.  Too low and there will be excess migration
 	 * and context switches.
 	 */
 	if (ksg->ksg_load > (ksg->ksg_cpus * 2) && kseq_idle) {
 		/*
 		 * Multiple cpus could find this bit simultaneously
 		 * but the race shouldn't be terrible.
 		 */
 		cpu = ffs(kseq_idle);
 		if (cpu)
 			atomic_clear_int(&kseq_idle, 1 << (cpu - 1));
 	}
 	/*
 	 * If another cpu in this group has idled, assign a thread over
 	 * to them after checking to see if there are idled groups.
 	 */
 	if (cpu == 0 && kseq->ksq_load > 1 && ksg->ksg_idlemask) {
 		cpu = ffs(ksg->ksg_idlemask);
 		if (cpu)
 			ksg->ksg_idlemask &= ~(1 << (cpu - 1));
 	}
 	/*
 	 * Now that we've found an idle CPU, migrate the thread.
 	 */
 	if (cpu) {
 		/* Convert the 1-based ffs() result to a cpu id. */
 		cpu--;
 		ke->ke_runq = NULL;
 		kseq_notify(ke, cpu);
 		return (1);
 	}
 	return (0);
 }
 
 #endif	/* SMP */
 
 /*
  * Pick the highest priority task we have and return it.
  *
  * The curr queue is tried first; if it is empty the curr and next queues
  * are swapped once and the lookup is retried.  If both are empty the idle
  * queue is consulted.  The chosen kse is left on its run queue.  Returns
  * NULL when runq_choose() finds nothing on any queue.  sched_lock must be
  * held.
  */
 
 static struct kse *
 kseq_choose(struct kseq *kseq)
 {
 	struct kse *ke;
 	struct runq *swap;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* swap doubles as the "already swapped once" flag. */
 	swap = NULL;
 
 	for (;;) {
 		ke = runq_choose(kseq->ksq_curr);
 		if (ke == NULL) {
 			/*
 			 * We already swapped once and didn't get anywhere.
 			 */
 			if (swap)
 				break;
 			swap = kseq->ksq_curr;
 			kseq->ksq_curr = kseq->ksq_next;
 			kseq->ksq_next = swap;
 			continue;
 		}
 		/*
 		 * If we encounter a slice of 0 the kse is in a
 		 * TIMESHARE kse group and its nice was too far out
 		 * of the range that receives slices.  Recompute its
 		 * slice, move it to the next queue and keep looking.
 		 */
 		if (ke->ke_slice == 0) {
 			runq_remove(ke->ke_runq, ke);
 			sched_slice(ke);
 			ke->ke_runq = kseq->ksq_next;
 			runq_add(ke->ke_runq, ke);
 			continue;
 		}
 		return (ke);
 	}
 
 	return (runq_choose(&kseq->ksq_idle));
 }
 
 /*
  * Bring a kseq to its initial empty state: all three run queues are
  * initialized, curr/next point at the two timeshare queues and the load
  * counters are zeroed.
  */
 static void
 kseq_setup(struct kseq *kseq)
 {
 	int i;
 
 	for (i = 0; i < 2; i++)
 		runq_init(&kseq->ksq_timeshare[i]);
 	runq_init(&kseq->ksq_idle);
 
 	/* Queue 0 starts out as the current queue, queue 1 as the next. */
 	kseq->ksq_curr = &kseq->ksq_timeshare[0];
 	kseq->ksq_next = &kseq->ksq_timeshare[1];
 
 	kseq->ksq_load = 0;
 	kseq->ksq_load_timeshare = 0;
 }
 
 /*
  * One-time scheduler initialization.  Derives the slice bounds from hz,
  * initializes the run queues (on SMP: every per-cpu kseq and the kseq
  * groups, plus the load balancer deadlines) and finally accounts kse0 in
  * the current cpu's load.  The dummy argument is unused.
  */
 static void
 sched_setup(void *dummy)
 {
 #ifdef SMP
 	int balance_groups;
 	int i;
 #endif
 
 	slice_min = (hz/100);	/* 10ms */
 	slice_max = (hz/7);	/* ~140ms */
 
 #ifdef SMP
 	balance_groups = 0;
 	/*
 	 * Initialize the kseqs.
 	 */
 	for (i = 0; i < MAXCPU; i++) {
 		struct kseq *ksq;
 
 		ksq = &kseq_cpu[i];
 		ksq->ksq_assigned = NULL;
 		kseq_setup(&kseq_cpu[i]);
 	}
 	if (smp_topology == NULL) {
 		/* No topology information: one group per cpu. */
 		struct kseq_group *ksg;
 		struct kseq *ksq;
 
 		for (i = 0; i < MAXCPU; i++) {
 			ksq = &kseq_cpu[i];
 			ksg = &kseq_groups[i];
 			/*
 			 * Setup a kseq group with one member.
 			 */
 			ksq->ksq_transferable = 0;
 			ksq->ksq_group = ksg;
 			ksg->ksg_cpus = 1;
 			ksg->ksg_idlemask = 0;
 			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
 			ksg->ksg_load = 0;
 			ksg->ksg_transferable = 0;
 			LIST_INIT(&ksg->ksg_members);
 			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
 		}
 	} else {
 		/* Build one kseq group per group described by smp_topology. */
 		struct kseq_group *ksg;
 		struct cpu_group *cg;
 		int j;
 
 		for (i = 0; i < smp_topology->ct_count; i++) {
 			cg = &smp_topology->ct_group[i];
 			ksg = &kseq_groups[i];
 			/*
 			 * Initialize the group.
 			 */
 			ksg->ksg_idlemask = 0;
 			ksg->ksg_load = 0;
 			ksg->ksg_transferable = 0;
 			ksg->ksg_cpus = cg->cg_count;
 			ksg->ksg_cpumask = cg->cg_mask;
 			LIST_INIT(&ksg->ksg_members);
 			/*
 			 * Find all of the group members and add them.
 			 */
 			for (j = 0; j < MAXCPU; j++) {
 				if ((cg->cg_mask & (1 << j)) != 0) {
 					/*
 					 * ksg_mask becomes the bit of the
 					 * first member found.  NOTE(review):
 					 * this relies on ksg_mask being 0 on
 					 * entry; it is not cleared here.
 					 */
 					if (ksg->ksg_mask == 0)
 						ksg->ksg_mask = 1 << j;
 					kseq_cpu[j].ksq_transferable = 0;
 					kseq_cpu[j].ksq_group = ksg;
 					LIST_INSERT_HEAD(&ksg->ksg_members,
 					    &kseq_cpu[j], ksq_siblings);
 				}
 			}
 			/* Group balancing only matters for multi-cpu groups. */
 			if (ksg->ksg_cpus > 1)
 				balance_groups = 1;
 		}
 		ksg_maxid = smp_topology->ct_count - 1;
 	}
 	/*
 	 * Stagger the group and global load balancer so they do not
 	 * interfere with each other.
 	 */
 	bal_tick = ticks + hz;
 	if (balance_groups)
 		gbal_tick = ticks + (hz / 2);
 #else
 	kseq_setup(KSEQ_SELF());
 #endif
 	/* Account kse0 in the load of the cpu we are running on. */
 	mtx_lock_spin(&sched_lock);
 	kseq_load_add(KSEQ_SELF(), &kse0);
 	mtx_unlock_spin(&sched_lock);
 }
 
 /*
  * Scale the scheduling priority according to the "interactivity" of this
  * process.
  *
  * Only PRI_TIMESHARE ksegrps are touched.  The user priority is the
  * interactivity-derived priority plus SCHED_PRI_BASE plus the nice value,
  * clamped to [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE], and is stored in
  * kg_user_pri.
  */
 static void
 sched_priority(struct ksegrp *kg)
 {
 	int pri;
 
 	if (kg->kg_pri_class != PRI_TIMESHARE)
 		return;
 
 	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
 	pri += SCHED_PRI_BASE;
-	pri += kg->kg_nice;
+	pri += kg->kg_proc->p_nice;
 
 	/* Keep the result inside the timeshare priority range. */
 	if (pri > PRI_MAX_TIMESHARE)
 		pri = PRI_MAX_TIMESHARE;
 	else if (pri < PRI_MIN_TIMESHARE)
 		pri = PRI_MIN_TIMESHARE;
 
 	kg->kg_user_pri = pri;
 
 	return;
 }
 
 /*
  * Calculate a time slice based on the properties of the kseg and the runq
  * that we're on.  This is only for PRI_TIMESHARE ksegrps.
  *
  * The result is stored in ke->ke_slice; a value of 0 means "no slice" and
  * is handled by kseq_choose().
  */
 static void
 sched_slice(struct kse *ke)
 {
 	struct kseq *kseq;
 	struct ksegrp *kg;
 
 	kg = ke->ke_ksegrp;
 	kseq = KSEQ_CPU(ke->ke_cpu);
 
 	/*
 	 * Rationale:
 	 * KSEs in interactive ksegs get the minimum slice so that we
 	 * quickly notice if it abuses its advantage.
 	 *
 	 * KSEs in non-interactive ksegs are assigned a slice that is
 	 * based on the ksegs nice value relative to the least nice kseg
 	 * on the run queue for this cpu.
 	 *
 	 * If the KSE is less nice than all others it gets the maximum
 	 * slice and other KSEs will adjust their slice relative to
 	 * this when they first expire.
 	 *
 	 * There is a 20 point window that starts relative to the least
 	 * nice kse on the run queue.  Slice size is determined by
 	 * the kse distance from the least nice ksegrp.
 	 *
 	 * If the kse is outside of the window it will get no slice
 	 * and will be reevaluated each time it is selected on the
 	 * run queue.  The exception to this is nice 0 ksegs when
 	 * a nice -20 is running.  They are always granted a minimum
 	 * slice.
 	 */
 	if (!SCHED_INTERACTIVE(kg)) {
 		int nice;
 
 		/* Distance of this nice value from the queue's minimum. */
-		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
+		nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
 		if (kseq->ksq_load_timeshare == 0 ||
-		    kg->kg_nice < kseq->ksq_nicemin)
+		    kg->kg_proc->p_nice < kseq->ksq_nicemin)
 			ke->ke_slice = SCHED_SLICE_MAX;
 		else if (nice <= SCHED_SLICE_NTHRESH)
 			ke->ke_slice = SCHED_SLICE_NICE(nice);
-		else if (kg->kg_nice == 0)
+		else if (kg->kg_proc->p_nice == 0)
 			ke->ke_slice = SCHED_SLICE_MIN;
 		else
 			ke->ke_slice = 0;
 	} else
 		ke->ke_slice = SCHED_SLICE_INTERACTIVE;
 
 	CTR6(KTR_ULE,
 	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
-	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
+	    ke, ke->ke_slice, kg->kg_proc->p_nice, kseq->ksq_nicemin,
 	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));
 
 	return;
 }
 
 /*
  * Bound the amount of scheduling history kept in a ksegrp.  Call this
  * after kg_slptime or kg_runtime has been adjusted.  The routine does not
  * operate correctly when either value has been pushed to more than double
  * its maximum.
  */
 static void
 sched_interact_update(struct ksegrp *kg)
 {
 	int total;
 
 	total = kg->kg_runtime + kg->kg_slptime;
 	if (total >= SCHED_SLP_RUN_MAX) {
 		if (total > (SCHED_SLP_RUN_MAX / 5) * 6) {
 			/*
 			 * More than 1/5th over the limit: the 4/5 scaling
 			 * below would not bring us back into range, so
 			 * halve both values instead.  That lands us in
 			 * [3/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX].
 			 */
 			kg->kg_runtime /= 2;
 			kg->kg_slptime /= 2;
 		} else {
 			kg->kg_runtime = (kg->kg_runtime / 5) * 4;
 			kg->kg_slptime = (kg->kg_slptime / 5) * 4;
 		}
 	}
 }
 
 /*
  * Scale down the run/sleep history of a ksegrp when their sum exceeds
  * SCHED_SLP_RUN_FORK.  Both values are divided by the integer ratio of
  * the sum to that limit.
  */
 static void
 sched_interact_fork(struct ksegrp *kg)
 {
 	int total;
 	int scale;
 
 	total = kg->kg_runtime + kg->kg_slptime;
 	if (total <= SCHED_SLP_RUN_FORK)
 		return;
 
 	scale = total / SCHED_SLP_RUN_FORK;
 	kg->kg_runtime /= scale;
 	kg->kg_slptime /= scale;
 }
 
 /*
  * Compute the interactivity score of a ksegrp from its run and sleep
  * history.  When run time dominates the score is at or above
  * SCHED_INTERACT_HALF; when sleep time dominates it is the run time scaled
  * down by the sleep time.  Equal values (including both zero) score 0.
  */
 static int
 sched_interact_score(struct ksegrp *kg)
 {
 	int score;
 	int div;
 
 	/* Covers runtime == slptime, e.g. when both are still 0. */
 	score = 0;
 
 	if (kg->kg_runtime > kg->kg_slptime) {
 		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
 		score = SCHED_INTERACT_HALF +
 		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div));
 	} else if (kg->kg_slptime > kg->kg_runtime) {
 		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
 		score = kg->kg_runtime / div;
 	}
 
 	return (score);
 }
 
 /*
  * Report the round-robin interval.  The value is only approximate: many
  * processes of the same priority switch when their slices run out, and a
  * slice is at most SCHED_SLICE_MAX.
  */
 int
 sched_rr_interval(void)
 {
 	int interval;
 
 	interval = SCHED_SLICE_MAX;
 	return (interval);
 }
 
 /*
  * Rescale a kse's tick count to a window of SCHED_CPU_TICKS ending at the
  * current tick, and move the first/last tick watermarks to match.  These
  * fields feed the pctcpu calculation.
  */
 static void
 sched_pctcpu_update(struct kse *ke)
 {
 	if (ke->ke_ltick <= ticks - SCHED_CPU_TICKS) {
 		/* The last tick fell outside the window: nothing to keep. */
 		ke->ke_ticks = 0;
 	} else {
 		/*
 		 * Work in units shifted left by 10 bits so that the
 		 * division does not round the result away.
 		 */
 		ke->ke_ticks <<= 10;
 		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
 		    SCHED_CPU_TICKS;
 		ke->ke_ticks >>= 10;
 	}
 	ke->ke_ltick = ticks;
 	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
 }
 
 /*
  * Set the priority of a thread.  For a thread that is on a run queue the
  * change goes through adjustrunqueue(); otherwise td_priority is simply
  * overwritten.  sched_lock must be held.
  */
 void
 sched_prio(struct thread *td, u_char prio)
 {
 	struct kse *ke;
 
 	ke = td->td_kse;
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (TD_ON_RUNQ(td)) {
 		/*
 		 * If the priority has been elevated due to priority
 		 * propagation, we may have to move ourselves to a new
 		 * queue.  We still call adjustrunqueue below in case kse
 		 * needs to fix things up.
 		 *
 		 * The move is skipped for a kse that is flagged as assigned
 		 * to another cpu or that already sits on the curr queue.
 		 */
 		if (prio < td->td_priority && ke &&
 		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
 		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
 			runq_remove(ke->ke_runq, ke);
 			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
 			runq_add(ke->ke_runq, ke);
 		}
 		adjustrunqueue(td, prio);
 	} else
 		td->td_priority = prio;
 }
 
 /*
  * Switch away from thread td: record where it last ran, fix up the load
  * accounting for its kse, pick the next thread with choosethread() and
  * switch to it.  sched_lock must be held.
  */
 void
 sched_switch(struct thread *td)
 {
 	struct thread *newtd;
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	ke = td->td_kse;
 
 	td->td_last_kse = ke;
         td->td_lastcpu = td->td_oncpu;
 	td->td_oncpu = NOCPU;
         td->td_flags &= ~TDF_NEEDRESCHED;
 
 	/*
 	 * If the KSE has been assigned it may be in the process of switching
 	 * to the new cpu.  This is the case in sched_bind().
 	 */
 	if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
 		if (TD_IS_RUNNING(td)) {
 			/* Still runnable: drop the load and requeue it. */
 			kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
 			setrunqueue(td);
 		} else {
 			if (ke->ke_runq) {
 				kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
 			} else if ((td->td_flags & TDF_IDLETD) == 0)
 				/*
 				 * A non-idle thread without a run queue:
 				 * emit a backtrace.  NOTE(review): looks
 				 * like a debugging aid - confirm.
 				 */
 				backtrace();
 			/*
 			 * We will not be on the run queue. So we must be
 			 * sleeping or similar.
 			 */
 			if (td->td_proc->p_flag & P_SA)
 				kse_reassign(ke);
 		}
 	}
 	newtd = choosethread();
 	if (td != newtd)
 		cpu_switch(td, newtd);
 	/*
 	 * NOTE(review): presumably re-records td as the owner of sched_lock
 	 * once it is running again - confirm against the mutex code.
 	 */
 	sched_lock.mtx_lock = (uintptr_t)td;
 
 	td->td_oncpu = PCPU_GET(cpuid);
 }
 
 /*
  * Change the nice value of a process.
  *
  * For every PRI_TIMESHARE ksegrp, each kse that has a run queue gets the
  * old nice value removed from, and the new one added to, the nice
  * accounting of its kseq.  The new value is then stored, the user priority
  * of each ksegrp is recomputed and all of its threads are flagged with
  * TDF_NEEDRESCHED.  Both the proc lock and sched_lock must be held.
  */
 void
-sched_nice(struct ksegrp *kg, int nice)
+sched_nice(struct proc *p, int nice)
 {
+	struct ksegrp *kg;
 	struct kse *ke;
 	struct thread *td;
 	struct kseq *kseq;
 
-	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
+	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&sched_lock, MA_OWNED);
 	/*
 	 * We need to adjust the nice counts for running KSEs.
 	 */
-	if (kg->kg_pri_class == PRI_TIMESHARE)
-		FOREACH_KSE_IN_GROUP(kg, ke) {
-			if (ke->ke_runq == NULL)
-				continue;
-			kseq = KSEQ_CPU(ke->ke_cpu);
-			kseq_nice_rem(kseq, kg->kg_nice);
-			kseq_nice_add(kseq, nice);
+	FOREACH_KSEGRP_IN_PROC(p, kg) {
+		if (kg->kg_pri_class == PRI_TIMESHARE) {
+			FOREACH_KSE_IN_GROUP(kg, ke) {
+				if (ke->ke_runq == NULL)
+					continue;
+				kseq = KSEQ_CPU(ke->ke_cpu);
+				kseq_nice_rem(kseq, p->p_nice);
+				kseq_nice_add(kseq, nice);
+			}
 		}
-	kg->kg_nice = nice;
-	sched_priority(kg);
-	FOREACH_THREAD_IN_GROUP(kg, td)
-		td->td_flags |= TDF_NEEDRESCHED;
+	}
+	p->p_nice = nice;
+	FOREACH_KSEGRP_IN_PROC(p, kg) {
+		sched_priority(kg);
+		FOREACH_THREAD_IN_GROUP(kg, td)
+			td->td_flags |= TDF_NEEDRESCHED;
+	}
 }
 
 /*
  * Note that a thread is going to sleep: remember its current priority as
  * the base priority and stamp the tick at which the sleep began.
  * sched_lock must be held.
  */
 void
 sched_sleep(struct thread *td)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	td->td_base_pri = td->td_priority;
 	td->td_slptime = ticks;
 
 	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
 	    td->td_kse, td->td_slptime);
 }
 
 /*
  * Make a sleeping thread runnable again.  If a sleep start tick was
  * recorded, the time slept is credited to the ksegrp, the priority and
  * slice are recomputed and the stamp is cleared.  The thread is then put
  * on a run queue with setrunqueue().  sched_lock must be held.
  */
 void
 sched_wakeup(struct thread *td)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	/*
 	 * Let the kseg know how long we slept for.  This is because process
 	 * interactivity behavior is modeled in the kseg.
 	 */
 	if (td->td_slptime) {
 		struct ksegrp *kg;
 		int hzticks;
 
 		kg = td->td_ksegrp;
 		/* Slept ticks, shifted left by 10 like the kg_* counters. */
 		hzticks = (ticks - td->td_slptime) << 10;
 		if (hzticks >= SCHED_SLP_RUN_MAX) {
 			/* Slept longer than the whole history window. */
 			kg->kg_slptime = SCHED_SLP_RUN_MAX;
 			kg->kg_runtime = 1;
 		} else {
 			kg->kg_slptime += hzticks;
 			sched_interact_update(kg);
 		}
 		sched_priority(kg);
 		if (td->td_kse)
 			sched_slice(td->td_kse);
 		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
 		    td->td_kse, hzticks);
 		td->td_slptime = 0;
 	}
 	setrunqueue(td);
 }
 
 /*
  * Penalize the parent for creating a new child and initialize the child's
  * priority.
  *
  * p is the parent and p1 the child.  The child gets the parent's nice
  * value, then the first ksegrp, kse and thread of each process are passed
  * to the corresponding sched_fork_*() routine.  sched_lock must be held.
  */
 void
 sched_fork(struct proc *p, struct proc *p1)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
+	p1->p_nice = p->p_nice;
 	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
 	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
 	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
 }
 
 /*
  * Initialize the scheduler fields of a child kse from its parent: same
  * cpu, no run queue yet, a slice of one tick, and a copy of the parent's
  * cpu usage estimate.
  */
 void
 sched_fork_kse(struct kse *ke, struct kse *child)
 {
 
 	child->ke_cpu = ke->ke_cpu;
 	child->ke_runq = NULL;
 	/* A one-tick slice so interactivity is learned quickly. */
 	child->ke_slice = 1;
 
 	/* Inherit the parent's cpu estimation state. */
 	child->ke_ftick = ke->ke_ftick;
 	child->ke_ltick = ke->ke_ltick;
 	child->ke_ticks = ke->ke_ticks;
 }
 
 /*
  * Initialize a child ksegrp from its parent kg.  The child copies the
  * parent's sleep/run history and user priority and then has that history
  * scaled by sched_interact_fork(); the parent is charged tickincr << 10 of
  * run time.  The child's proc lock must be held.
  */
 void
 sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
 {
 	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
 
 	child->kg_slptime = kg->kg_slptime;
 	child->kg_runtime = kg->kg_runtime;
 	child->kg_user_pri = kg->kg_user_pri;
-	child->kg_nice = kg->kg_nice;
 	sched_interact_fork(child);
 	/* Charge the parent for the fork. */
 	kg->kg_runtime += tickincr << 10;
 	sched_interact_update(kg);
 
 	CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)",
 	    kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime, 
 	    child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime);
 }
 
 /*
  * Per-thread fork hook.  This scheduler keeps no per-thread state that
  * needs initializing here, so the body is empty.
  */
 void
 sched_fork_thread(struct thread *td, struct thread *child)
 {
 }
 
 /*
  * Change the priority class of a ksegrp.  For every kse of the group that
  * is on a run queue or attached to a thread, the per-kseq accounting that
  * depends on the class (transferable counts on SMP, timeshare load and
  * nice counts) is moved from the old class to the new one.  sched_lock
  * must be held.
  */
 void
 sched_class(struct ksegrp *kg, int class)
 {
 	struct kseq *kseq;
 	struct kse *ke;
 	int nclass;
 	int oclass;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (kg->kg_pri_class == class)
 		return;
 
 	nclass = PRI_BASE(class);
 	oclass = PRI_BASE(kg->kg_pri_class);
 	FOREACH_KSE_IN_GROUP(kg, ke) {
 		if (ke->ke_state != KES_ONRUNQ &&
 		    ke->ke_state != KES_THREAD)
 			continue;
 		kseq = KSEQ_CPU(ke->ke_cpu);
 
 #ifdef SMP
 		/*
 		 * On SMP if we're on the RUNQ we must adjust the transferable
 		 * count because we could be changing to or from an interrupt
 		 * class.
 		 */
 		if (ke->ke_state == KES_ONRUNQ) {
 			if (KSE_CAN_MIGRATE(ke, oclass)) {
 				kseq->ksq_transferable--;
 				kseq->ksq_group->ksg_transferable--;
 			}
 			if (KSE_CAN_MIGRATE(ke, nclass)) {
 				kseq->ksq_transferable++;
 				kseq->ksq_group->ksg_transferable++;
 			}
 		}
 #endif
 		/* Move the timeshare load and nice accounting across. */
 		if (oclass == PRI_TIMESHARE) {
 			kseq->ksq_load_timeshare--;
-			kseq_nice_rem(kseq, kg->kg_nice);
+			kseq_nice_rem(kseq, kg->kg_proc->p_nice);
 		}
 		if (nclass == PRI_TIMESHARE) {
 			kseq->ksq_load_timeshare++;
-			kseq_nice_add(kseq, kg->kg_nice);
+			kseq_nice_add(kseq, kg->kg_proc->p_nice);
 		}
 	}
 
 	kg->kg_pri_class = class;
 }
 
 /*
  * Return some of the child's priority and interactivity to the parent.
  * The first kse and first ksegrp of each process are handed to the
  * per-object exit routines.  sched_lock must be held.
  */
 void
 sched_exit(struct proc *p, struct proc *child)
 {
 	struct ksegrp *parent_kg;
 	struct ksegrp *child_kg;
 	struct kse *parent_ke;
 	struct kse *child_ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	parent_ke = FIRST_KSE_IN_PROC(p);
 	child_ke = FIRST_KSE_IN_PROC(child);
 	sched_exit_kse(parent_ke, child_ke);
 
 	parent_kg = FIRST_KSEGRP_IN_PROC(p);
 	child_kg = FIRST_KSEGRP_IN_PROC(child);
 	sched_exit_ksegrp(parent_kg, child_kg);
 }
 
 /*
  * Remove an exiting child kse from the load of the cpu it is accounted
  * to.  The parent kse is not used.
  */
 void
 sched_exit_kse(struct kse *ke, struct kse *child)
 {
 	struct kseq *home;
 
 	home = KSEQ_CPU(child->ke_cpu);
 	kseq_load_rem(home, child);
 }
 
 /*
  * Fold an exiting child's run time into the parent ksegrp and re-bound
  * the parent's history.  Only kg_runtime is transferred; the child's
  * kg_slptime is not added.
  */
 void
 sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
 {
 	kg->kg_runtime = kg->kg_runtime + child->kg_runtime;
 	sched_interact_update(kg);
 }
 
 /*
  * Per-thread exit hook.  Nothing is returned to the parent at thread
  * granularity, so the body is empty.
  */
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 }
 
 /*
  * Per-tick scheduler work for the thread td.  Runs the SMP load balancers
  * when their deadline tick is reached, lazily initializes realstathz and
  * tickincr, updates the kse's pctcpu tick counters and, for timeshare
  * ksegrps, charges run time and handles slice expiry.  sched_lock must be
  * held.
  */
 void
 sched_clock(struct thread *td)
 {
 	struct kseq *kseq;
 	struct ksegrp *kg;
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 #ifdef SMP
 	if (ticks == bal_tick)
 		sched_balance();
 	if (ticks == gbal_tick)
 		sched_balance_groups();
 #endif
 	/*
 	 * sched_setup() apparently happens prior to stathz being set.  We
 	 * need to resolve the timers earlier in the boot so we can avoid
 	 * calculating this here.
 	 */
 	if (realstathz == 0) {
 		realstathz = stathz ? stathz : hz;
 		tickincr = hz / realstathz;
 		/*
 		 * XXX This does not work for values of stathz that are much
 		 * larger than hz.
 		 */
 		if (tickincr == 0)
 			tickincr = 1;
 	}
 
 	ke = td->td_kse;
 	kg = ke->ke_ksegrp;
 
 	/* Adjust ticks for pctcpu */
 	ke->ke_ticks++;
 	ke->ke_ltick = ticks;
 
 	/* Go up to one second beyond our max and then trim back down */
 	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
 		sched_pctcpu_update(ke);
 
 	/* The idle thread gets no slice or interactivity accounting. */
 	if (td->td_flags & TDF_IDLETD)
 		return;
 
 	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
 	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
 	/*
 	 * We only do slicing code for TIMESHARE ksegrps.
 	 */
 	if (kg->kg_pri_class != PRI_TIMESHARE)
 		return;
 	/*
 	 * We used a tick; charge it to the ksegrp so that we can compute our
 	 * interactivity.
 	 */
 	kg->kg_runtime += tickincr << 10;
 	sched_interact_update(kg);
 
 	/*
 	 * We used up one time slice.
 	 */
 	if (--ke->ke_slice > 0)
 		return;
 	/*
 	 * We're out of time, recompute priorities and requeue.
 	 *
 	 * The load is removed and re-added around the update so that the
 	 * kseq accounting is redone with the kse's new queue.
 	 */
 	kseq = KSEQ_SELF();
 	kseq_load_rem(kseq, ke);
 	sched_priority(kg);
 	sched_slice(ke);
 	if (SCHED_CURR(kg, ke))
 		ke->ke_runq = kseq->ksq_curr;
 	else
 		ke->ke_runq = kseq->ksq_next;
 	kseq_load_add(kseq, ke);
 	td->td_flags |= TDF_NEEDRESCHED;
 }
 
 /*
  * Report whether this cpu has something to run besides the current
  * thread.  On SMP any kses assigned to us by other cpus are pulled in
  * first.  Returns 1 if there is such work, 0 otherwise.
  */
 int
 sched_runnable(void)
 {
 	struct kseq *self;
 
 	self = KSEQ_SELF();
 #ifdef SMP
 	if (self->ksq_assigned) {
 		mtx_lock_spin(&sched_lock);
 		kseq_assign(self);
 		mtx_unlock_spin(&sched_lock);
 	}
 #endif
 	/* From the idle thread any load at all counts as runnable work. */
 	if ((curthread->td_flags & TDF_IDLETD) != 0)
 		return (self->ksq_load > 0);
 
 	/* Otherwise one unit of load is discounted before the test. */
 	return (self->ksq_load - 1 > 0);
 }
 
 /*
  * Reset a thread's priority to its ksegrp's user priority if the two
  * differ.  sched_lock is only taken when a change is needed.
  */
 void
 sched_userret(struct thread *td)
 {
 	struct ksegrp *grp;
 
 	grp = td->td_ksegrp;
 
 	/* Common case: nothing to restore, so avoid the spin lock. */
 	if (td->td_priority == grp->kg_user_pri)
 		return;
 
 	mtx_lock_spin(&sched_lock);
 	td->td_priority = grp->kg_user_pri;
 	mtx_unlock_spin(&sched_lock);
 }
 
 /*
  * Select the next kse to run on this cpu and take it off its run queue.
  * Returns NULL if nothing is runnable.  On SMP, kses assigned by other
  * cpus are pulled in first, and before settling for idle-class work (or
  * nothing) the cpu tries to steal from its group via kseq_idled().
  * sched_lock must be held.
  */
 struct kse *
 sched_choose(void)
 {
 	struct kseq *kseq;
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	kseq = KSEQ_SELF();
 #ifdef SMP
 restart:
 	if (kseq->ksq_assigned)
 		kseq_assign(kseq);
 #endif
 	ke = kseq_choose(kseq);
 	if (ke) {
 #ifdef SMP
 		/*
 		 * Only idle-class work is available.  kseq_idled() returns 0
 		 * when it stole something, so choose again in that case.
 		 */
 		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
 			if (kseq_idled(kseq) == 0)
 				goto restart;
 #endif
 		kseq_runq_rem(kseq, ke);
 		ke->ke_state = KES_THREAD;
 
 		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
 			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
 			    ke, ke->ke_runq, ke->ke_slice,
 			    ke->ke_thread->td_priority);
 		}
 		return (ke);
 	}
 #ifdef SMP
 	/* Nothing queued locally at all: try to steal before giving up. */
 	if (kseq_idled(kseq) == 0)
 		goto restart;
 #endif
 	return (NULL);
 }
 
 /*
  * Make a thread's kse runnable.  A run queue is chosen from the priority
  * class of the ksegrp; on SMP the kse may instead be forwarded to the cpu
  * recorded in ke_cpu or transferred to an idle cpu.  Otherwise it is put
  * on the local kseq and accounted in its load.  A kse flagged
  * KEF_ASSIGNED is left alone.  sched_lock must be held.
  */
 void
 sched_add(struct thread *td)
 {
 	struct kseq *kseq;
 	struct ksegrp *kg;
 	struct kse *ke;
 	int class;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	ke = td->td_kse;
 	kg = td->td_ksegrp;
 	/* Already on its way to another cpu; kseq_assign() will add it. */
 	if (ke->ke_flags & KEF_ASSIGNED)
 		return;
 	kseq = KSEQ_SELF();
 	KASSERT((ke->ke_thread != NULL),
 	    ("sched_add: No thread on KSE"));
 	KASSERT((ke->ke_thread->td_kse != NULL),
 	    ("sched_add: No KSE on thread"));
 	KASSERT(ke->ke_state != KES_ONRUNQ,
 	    ("sched_add: kse %p (%s) already in run queue", ke,
 	    ke->ke_proc->p_comm));
 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 	    ("sched_add: process swapped out"));
 	KASSERT(ke->ke_runq == NULL,
 	    ("sched_add: KSE %p is still assigned to a run queue", ke));
 
 	class = PRI_BASE(kg->kg_pri_class);
 	switch (class) {
 	case PRI_ITHD:
 	case PRI_REALTIME:
 		/* Always on the current queue of this cpu, full slice. */
 		ke->ke_runq = kseq->ksq_curr;
 		ke->ke_slice = SCHED_SLICE_MAX;
 		ke->ke_cpu = PCPU_GET(cpuid);
 		break;
 	case PRI_TIMESHARE:
 		if (SCHED_CURR(kg, ke))
 			ke->ke_runq = kseq->ksq_curr;
 		else
 			ke->ke_runq = kseq->ksq_next;
 		break;
 	case PRI_IDLE:
 		/*
 		 * This is for priority prop.
 		 */
 		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
 			ke->ke_runq = kseq->ksq_curr;
 		else
 			ke->ke_runq = &kseq->ksq_idle;
 		ke->ke_slice = SCHED_SLICE_MIN;
 		break;
 	default:
 		panic("Unknown pri class.");
 		break;
 	}
 #ifdef SMP
 	/* The kse belongs to another cpu: forward it there. */
 	if (ke->ke_cpu != PCPU_GET(cpuid)) {
 		ke->ke_runq = NULL;
 		kseq_notify(ke, ke->ke_cpu);
 		return;
 	}
 	/*
 	 * If we had been idle, clear our bit in the group and potentially
 	 * the global bitmap.  If not, see if we should transfer this thread.
 	 */
 	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
 	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
 		/*
 		 * Check to see if our group is unidling, and if so, remove it
 		 * from the global idle mask.
 		 */
 		if (kseq->ksq_group->ksg_idlemask ==
 		    kseq->ksq_group->ksg_cpumask)
 			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
 		/*
 		 * Now remove ourselves from the group specific idle mask.
 		 */
 		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
 	} else if (kseq->ksq_load > 1 && KSE_CAN_MIGRATE(ke, class))
 		if (kseq_transfer(kseq, ke, class))
 			return;
 #endif
 	/* Ask for a reschedule if the new thread beats the running one. */
         if (td->td_priority < curthread->td_priority)
                 curthread->td_flags |= TDF_NEEDRESCHED;
 
 	ke->ke_ksegrp->kg_runq_kses++;
 	ke->ke_state = KES_ONRUNQ;
 
 	kseq_runq_add(kseq, ke);
 	kseq_load_add(kseq, ke);
 }
 
 /*
  * Take a thread's kse off the run queue of the cpu it is queued on and
  * drop it from that kseq's load.  A kse flagged KEF_ASSIGNED is left
  * untouched.  sched_lock must be held.
  */
 void
 sched_rem(struct thread *td)
 {
 	struct kseq *home;
 	struct kse *ke;
 
 	ke = td->td_kse;
 	/*
 	 * Returning early is safe: sched_rem() is only used where the kse
 	 * is about to be added back immediately, and it will then be added
 	 * with the correct thread and priority when the caller drops the
 	 * sched_lock.
 	 */
 	if ((ke->ke_flags & KEF_ASSIGNED) != 0)
 		return;
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((ke->ke_state == KES_ONRUNQ),
 	    ("sched_rem: KSE not on run queue"));
 
 	ke->ke_state = KES_THREAD;
 	ke->ke_ksegrp->kg_runq_kses--;
 
 	home = KSEQ_CPU(ke->ke_cpu);
 	kseq_runq_rem(home, ke);
 	kseq_load_rem(home, ke);
 }
 
 /*
  * Return the recent cpu usage of a thread as a fixed-point value scaled by
  * FSCALE.  Returns 0 if the thread has no kse or no ticks recorded.  As a
  * side effect the owning process's p_swtime is set from the kse's tick
  * window.  Takes sched_lock internally.
  */
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	fixpt_t pctcpu;
 	struct kse *ke;
 
 	pctcpu = 0;
 	ke = td->td_kse;
 	if (ke == NULL)
 		return (0);
 
 	mtx_lock_spin(&sched_lock);
 	if (ke->ke_ticks) {
 		int rtick;
 
 		/*
 		 * Don't update more frequently than twice a second.  Allowing
 		 * this causes the cpu usage to decay away too quickly due to
 		 * rounding errors.
 		 */
 		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
 		    ke->ke_ltick < (ticks - (hz / 2)))
 			sched_pctcpu_update(ke);
 		/* How many rtick per second ? */
 		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
 		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
 	}
 
 	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
 	mtx_unlock_spin(&sched_lock);
 
 	return (pctcpu);
 }
 
 /*
  * Bind a thread to a cpu.  The kse is flagged KEF_BOUND; on SMP, if the
  * caller is not already running on the requested cpu, the kse is handed to
  * that cpu and the thread switches away with mi_switch().  sched_lock
  * must be held.
  */
 void
 sched_bind(struct thread *td, int cpu)
 {
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	ke = td->td_kse;
 	ke->ke_flags |= KEF_BOUND;
 #ifdef SMP
 	/* Already on the right cpu: setting the flag is all that is needed. */
 	if (PCPU_GET(cpuid) == cpu)
 		return;
 	/* sched_rem without the runq_remove */
 	ke->ke_state = KES_THREAD;
 	ke->ke_ksegrp->kg_runq_kses--;
 	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
 	kseq_notify(ke, cpu);
 	/* When we return from mi_switch we'll be on the correct cpu. */
 	mi_switch(SW_VOL);
 #endif
 }
 
 void
 sched_unbind(struct thread *td)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	td->td_kse->ke_flags &= ~KEF_BOUND;
 }
 
 /*
  * Return the scheduler's load figure.  On SMP this is the sum of ksg_load
  * over kseq groups 0..ksg_maxid; otherwise it is the ksq_sysload of the
  * single kseq.
  */
 int
 sched_load(void)
 {
 #ifdef SMP
 	int sum;
 	int id;
 
 	sum = 0;
 	id = 0;
 	while (id <= ksg_maxid) {
 		sum += KSEQ_GROUP(id)->ksg_load;
 		id++;
 	}
 	return (sum);
 #else
 	return (KSEQ_SELF()->ksq_sysload);
 #endif
 }
 
 /*
  * Return the combined size of struct kse and struct ke_sched.
  */
 int
 sched_sizeof_kse(void)
 {
 	int size;
 
 	size = sizeof(struct kse) + sizeof(struct ke_sched);
 	return (size);
 }
 
 /*
  * Return the combined size of struct ksegrp and struct kg_sched.
  */
 int
 sched_sizeof_ksegrp(void)
 {
 	int size;
 
 	size = sizeof(struct ksegrp) + sizeof(struct kg_sched);
 	return (size);
 }
 
 /*
  * Return the size of struct proc; no scheduler-private part is added.
  */
 int
 sched_sizeof_proc(void)
 {
 	int size;
 
 	size = sizeof(struct proc);
 	return (size);
 }
 
 /*
  * Return the combined size of struct thread and struct td_sched.
  */
 int
 sched_sizeof_thread(void)
 {
 	int size;
 
 	size = sizeof(struct thread) + sizeof(struct td_sched);
 	return (size);
 }
Index: head/sys/sys/proc.h
===================================================================
--- head/sys/sys/proc.h	(revision 130550)
+++ head/sys/sys/proc.h	(revision 130551)
@@ -1,936 +1,936 @@
 /*-
  * Copyright (c) 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)proc.h	8.15 (Berkeley) 5/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PROC_H_
 #define	_SYS_PROC_H_
 
 #include <sys/callout.h>		/* For struct callout. */
 #include <sys/event.h>			/* For struct klist. */
 #ifndef _KERNEL
 #include <sys/filedesc.h>
 #endif
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/queue.h>
 #include <sys/priority.h>
 #include <sys/rtprio.h>			/* XXX. */
 #include <sys/runq.h>
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #ifndef _KERNEL
 #include <sys/time.h>			/* For structs itimerval, timeval. */
 #else
 #include <sys/pcpu.h>
 #endif
 #include <sys/ucontext.h>
 #include <sys/ucred.h>
 #include <machine/proc.h>		/* Machine-dependent proc substruct. */
 
 /*
  * One structure allocated per session.
  *
  * List of locks
  * (m)		locked by s_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  */
 struct session {
 	int		s_count;	/* (m) Ref cnt; pgrps in session. */
 	struct proc	*s_leader;	/* (m + e) Session leader. */
 	struct vnode	*s_ttyvp;	/* (m) Vnode of controlling tty. */
 	struct tty	*s_ttyp;	/* (m) Controlling tty. */
 	pid_t		s_sid;		/* (c) Session ID. */
 					/* (m) Setlogin() name: */
 	char		s_login[roundup(MAXLOGNAME, sizeof(long))];
 	struct mtx	s_mtx;		/* Mutex to protect members. */
 };
 
 /*
  * One structure allocated per process group.
  *
  * List of locks
  * (m)		locked by pg_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  */
 struct pgrp {
 	LIST_ENTRY(pgrp) pg_hash;	/* (e) Hash chain. */
 	LIST_HEAD(, proc) pg_members;	/* (m + e) Pointer to pgrp members. */
 	struct session	*pg_session;	/* (c) Pointer to session. */
 	struct sigiolst	pg_sigiolst;	/* (m) List of sigio sources. */
 	pid_t		pg_id;		/* (c) Pgrp id. */
 	int		pg_jobc;	/* (m) job cntl proc count */
 	struct mtx	pg_mtx;		/*  Mutex to protect members */
 };
 
 /*
  * pargs, used to hold a copy of the command line, if it had a sane length.
  */
 struct pargs {
 	u_int	ar_ref;		/* Reference count. */
 	u_int	ar_length;	/* Length. */
 	/*
 	 * NOTE(review): declared with a single element; presumably the
 	 * pre-C99 trailing-array idiom, with ar_length bytes of argument
 	 * data stored starting here -- confirm against the allocator.
 	 */
 	u_char	ar_args[1];	/* Arguments. */
 };
 
 /*-
  * Description of a process.
  *
  * This structure contains the information needed to manage a thread of
  * control, known in UN*X as a process; it has references to substructures
  * containing descriptions of things that the process uses, but may share
  * with related processes.  The process structure and the substructures
  * are always addressable except for those marked "(CPU)" below,
  * which might be addressable only on a processor on which the process
  * is running.
  *
  * Below is a key of locks used to protect each member of struct proc.  The
  * lock is indicated by a reference to a specific character in parens in the
  * associated comment.
  *      * - not yet protected
  *      a - only touched by curproc or parent during fork/wait
  *      b - created at fork, never changes
  *		(exception aiods switch vmspaces, but they are also
  *		marked 'P_SYSTEM' so hopefully it will be left alone)
  *      c - locked by proc mtx
  *      d - locked by allproc_lock lock
  *      e - locked by proctree_lock lock
  *      f - session mtx
  *      g - process group mtx
  *      h - callout_lock mtx
  *      i - by curproc or the master session mtx
  *      j - locked by sched_lock mtx
  *      k - only accessed by curthread
  *      l - the attaching proc or attaching proc parent
  *      m - Giant
  *      n - not locked, lazy
  *      o - ktrace lock
  *      p - select lock (sellock)
  *      q - td_contested lock
  *      r - p_peers lock
  *      x - created at fork, only changes during single threading in exec
  *      z - zombie threads/kse/ksegroup lock
  *
  * If the locking key specifies two identifiers (for example, p_pptr) then
  * either lock is sufficient for read access, but both locks must be held
  * for write access.
  */
 struct ithd;
 struct ke_sched;
 struct kg_sched;
 struct nlminfo;
 struct p_sched;
 struct sleepqueue;
 struct td_sched;
 struct trapframe;
 struct turnstile;
 
 /*
  * Here we define the four structures used for process information.
  *
  * The first is the thread. It might be thought of as a "Kernel
  * Schedulable Entity Context".
  * This structure contains all the information as to where a thread of
  * execution is now, or was when it was suspended, why it was suspended,
  * and anything else that will be needed to restart it when it is
  * rescheduled. Always associated with a KSE when running, but can be
  * reassigned to an equivalent KSE when being restarted for
  * load balancing. Each of these is associated with a kernel stack
  * and a pcb.
  *
  * It is important to remember that a particular thread structure only
  * exists as long as the system call or kernel entrance (e.g. by pagefault)
  * which it is currently executing. It should therefore NEVER be referenced
  * by pointers in long lived structures that live longer than a single
  * request. If several threads complete their work at the same time,
  * they will all rewind their stacks to the user boundary, report their
  * completion state, and all but one will be freed. That last one will
  * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel
  * entrance. (basically to save freeing and then re-allocating it) The KSE
  * keeps a cached thread available to allow it to quickly
  * get one when it needs a new one. There is also a system
  * cache of free threads. Threads have priority and partake in priority
  * inheritance schemes.
  */
 struct thread;
 
 /*
  * The second structure is the Kernel Schedulable Entity. (KSE)
  * It represents the ability to take a slot in the scheduler queue.
  * As long as this is scheduled, it could continue to run any threads that
  * are assigned to the KSEGRP (see later) until either it runs out
  * of runnable threads of high enough priority, or CPU.
  * It runs on one CPU and is assigned a quantum of time. When a thread is
  * blocked, the KSE continues to run and will search for another thread
  * in a runnable state amongst those it has. It may decide to return to user
  * mode with a new 'empty' thread if there are no runnable threads.
  * Threads are temporarily associated with a KSE for scheduling reasons.
  */
 struct kse;
 
 /*
  * The KSEGRP is allocated resources across a number of CPUs.
  * (Including a number of CPUxQUANTA. It parcels these QUANTA up among
  * its KSEs, each of which should be running in a different CPU.
  * BASE priority and total available quanta are properties of a KSEGRP.
  * Multiple KSEGRPs in a single process compete against each other
  * for total quanta in the same way that a forked child competes against
  * its parent process.
  */
 struct ksegrp;
 
 /*
  * A process is the owner of all system resources allocated to a task
  * except CPU quanta.
  * All KSEGs under one process see, and have the same access to, these
  * resources (e.g. files, memory, sockets, permissions kqueues).
  * A process may compete for CPU cycles on the same basis as a
  * forked process cluster by spawning several KSEGRPs.
  */
 struct proc;
 
 /***************
  * In pictures:
  With a single run queue used by all processors:
 
  RUNQ: --->KSE---KSE--...               SLEEPQ:[]---THREAD---THREAD---THREAD
 	   |   /                               []---THREAD
 	   KSEG---THREAD--THREAD--THREAD       []
 					       []---THREAD---THREAD
 
   (processors run THREADs from the KSEG until they are exhausted or
   the KSEG exhausts its quantum)
 
 With PER-CPU run queues:
 KSEs on the separate run queues directly
 They would be given priorities calculated from the KSEG.
 
  *
  *****************/
 
 /*
  * Kernel runnable context (thread).
  * This is what is put to sleep and reactivated.
  * The first KSE available in the correct group will run this thread.
  * If several are available, use the one on the same CPU as last time.
  * When waiting to be run, threads are hung off the KSEGRP in priority order.
  * with N runnable and queued KSEs in the KSEGRP, the first N threads
  * are linked to them. Other threads are not yet assigned.
  */
 struct thread {
 	struct proc	*td_proc;	/* (*) Associated process. */
 	struct ksegrp	*td_ksegrp;	/* (*) Associated KSEG. */
 	TAILQ_ENTRY(thread) td_plist;	/* (*) All threads in this proc. */
 	TAILQ_ENTRY(thread) td_kglist;	/* (*) All threads in this ksegrp. */
 
 	/* The two queues below should someday be merged. */
 	TAILQ_ENTRY(thread) td_slpq;	/* (j) Sleep queue. */
 	TAILQ_ENTRY(thread) td_lockq;	/* (j) Lock queue. */
 	TAILQ_ENTRY(thread) td_runq;	/* (j/z) Run queue(s). XXXKSE */
 
 	TAILQ_HEAD(, selinfo) td_selq;	/* (p) List of selinfos. */
 	struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
 	struct turnstile *td_turnstile;	/* (k) Associated turnstile. */
 	int		td_tid;		/* (b) Thread ID. */
 
 /* Cleared during fork1() or thread_sched_upcall(). */
 #define	td_startzero td_flags
 	int		td_flags;	/* (j) TDF_* flags. */
 	int		td_inhibitors;	/* (j) Why can not run. */
 	int		td_pflags;	/* (k) Private thread (TDP_*) flags. */
 	struct kse	*td_last_kse;	/* (j) Previous value of td_kse. */
 	struct kse	*td_kse;	/* (j) Current KSE if running. */
 	int		td_dupfd;	/* (k) Ret value from fdopen. XXX */
 	void		*td_wchan;	/* (j) Sleep address. */
 	const char	*td_wmesg;	/* (j) Reason for sleep. */
 	u_char		td_lastcpu;	/* (j) Last cpu we were on. */
 	u_char		td_oncpu;	/* (j) Which cpu we are on. */
 	short		td_locks;	/* (k) DEBUG: lockmgr count of locks. */
 	struct turnstile *td_blocked;	/* (j) Lock process is blocked on. */
 	struct ithd	*td_ithd;	/* (b) For interrupt threads only. */
 	const char	*td_lockname;	/* (j) Name of lock blocked on. */
 	LIST_HEAD(, turnstile) td_contested;	/* (q) Contested locks. */
 	struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */
 	int		td_intr_nesting_level; /* (k) Interrupt recursion. */
 	int		td_pinned;	/* (k) Temporary cpu pin count. */
 	struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */
 	struct ucred	*td_ucred;	/* (k) Reference to credentials. */
 	struct thread	*td_standin;	/* (*) Use this for an upcall. */
 	u_int		td_prticks;	/* (*) Profclock hits in sys for user */
 	struct kse_upcall *td_upcall;	/* (*) Upcall structure. */
 	u_int64_t	td_sticks;	/* (j) Statclock hits in system mode. */
 	u_int		td_uuticks;	/* (*) Statclock in user, for UTS. */
 	u_int		td_usticks;	/* (*) Statclock in kernel, for UTS. */
 	int		td_intrval;	/* (*) Return value of TDF_INTERRUPT. */
 	sigset_t	td_oldsigmask;	/* (k) Saved mask from pre sigpause. */
 	sigset_t	td_sigmask;	/* (c) Current signal mask. */
 	sigset_t	td_siglist;	/* (c) Sigs arrived, not delivered. */
 	sigset_t	*td_waitset;	/* (c) Wait set for sigwait. */
 	TAILQ_ENTRY(thread) td_umtx;	/* (c?) Link for when we're blocked. */
 	volatile u_int	td_generation;	/* (k) Enable detection of preemption */
 	stack_t		td_sigstk;	/* (k) Stack ptr and on-stack flag. */
 	int		td_kflags;	/* (c) Flags for KSE threading. */
 
 #define	td_endzero td_base_pri
 
 /* Copied during fork1() or thread_sched_upcall(). */
 #define	td_startcopy td_endzero
 	u_char		td_base_pri;	/* (j) Thread base kernel priority. */
 	u_char		td_priority;	/* (j) Thread active priority. */
 #define	td_endcopy td_pcb
 
 /*
  * fields that must be manually set in fork1() or thread_sched_upcall()
  * or already have been set in the allocator, constructor, etc.
  */
 	struct pcb	*td_pcb;	/* (k) Kernel VA of pcb and kstack. */
 	enum {
 		TDS_INACTIVE = 0x0,
 		TDS_INHIBITED,
 		TDS_CAN_RUN,
 		TDS_RUNQ,
 		TDS_RUNNING
 	} td_state;
 	register_t	td_retval[2];	/* (k) Syscall aux returns. */
 	struct callout	td_slpcallout;	/* (h) Callout for sleep. */
 	struct trapframe *td_frame;	/* (k) */
 	struct vm_object *td_kstack_obj;/* (a) Kstack object. */
 	vm_offset_t	td_kstack;	/* (a) Kernel VA of kstack. */
 	int		td_kstack_pages; /* (a) Size of the kstack. */
 	struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. */
 	vm_offset_t	td_altkstack;	/* (a) Kernel VA of alternate kstack. */
 	int		td_altkstack_pages; /* (a) Size of the alternate kstack */
 	u_int		td_critnest;	/* (k) Critical section nest level. */
 	struct mdthread td_md;		/* (k) Any machine-dependent fields. */
 	struct td_sched	*td_sched;	/* (*) Scheduler-specific data. */
 };
 
 /* Flags kept in td_flags: */
 #define	TDF_INPANIC	0x000002 /* Caused a panic, let it drive crashdump. */
 #define	TDF_CAN_UNBIND	0x000004 /* Only temporarily bound. */
 #define	TDF_SINTR	0x000008 /* Sleep is interruptible. */
 #define	TDF_TIMEOUT	0x000010 /* Timing out during sleep. */
 #define	TDF_IDLETD	0x000020 /* This is one of the per-CPU idle threads. */
 #define	TDF_SELECT	0x000040 /* Selecting; wakeup/waiting danger. */
 #define	TDF_TSNOBLOCK	0x000100 /* Don't block on a turnstile due to race. */
 #define	TDF_ASTPENDING	0x000800 /* Thread has some asynchronous events. */
 #define	TDF_TIMOFAIL	0x001000 /* Timeout from sleep after we were awake. */
 #define	TDF_INTERRUPT	0x002000 /* Thread is marked as interrupted. */
 #define	TDF_USTATCLOCK	0x004000 /* Finish user statclock hit at next AST. */
 #define	TDF_OWEUPC	0x008000 /* Owe thread an addupc() call at next AST. */
 #define	TDF_NEEDRESCHED	0x010000 /* Thread needs to yield. */
 #define	TDF_NEEDSIGCHK	0x020000 /* Thread may need signal delivery. */
 #define	TDF_UMTXWAKEUP	0x080000 /* Libthr thread must not sleep on a umtx. */
 #define	TDF_THRWAKEUP	0x100000 /* Libthr thread must not suspend itself. */
 
 /* "Private" flags kept in td_pflags: */
 #define	TDP_OLDMASK	0x0001 /* Need to restore mask after suspend. */
 #define	TDP_INKTR	0x0002 /* Thread is currently in KTR code. */
 #define	TDP_INKTRACE	0x0004 /* Thread is currently in KTRACE code. */
 #define	TDP_UPCALLING	0x0008 /* This thread is doing an upcall. */
 #define	TDP_COWINPROGRESS 0x0010 /* Snapshot copy-on-write in progress. */
 #define	TDP_ALTSTACK	0x0020 /* Have alternate signal stack. */
 #define	TDP_DEADLKTREAT	0x0040 /* Lock acquisition - deadlock treatment. */
 #define	TDP_SA		0x0080 /* A scheduler activation based thread. */
 
 #define	TDI_SUSPENDED	0x0001	/* On suspension queue. */
 #define	TDI_SLEEPING	0x0002	/* Actually asleep! (tricky). */
 #define	TDI_SWAPPED	0x0004	/* Stack not in mem.. bad juju if run. */
 #define	TDI_LOCK	0x0008	/* Stopped on a lock. */
 #define	TDI_IWAIT	0x0010	/* Awaiting interrupt. */
 
 #define	TDK_KSEREL	0x0001	/* Blocked in msleep on kg->kg_completed. */
 #define	TDK_KSERELSIG	0x0002	/* Blocked in msleep on p->p_siglist. */
 #define	TDK_WAKEUP	0x0004	/* Thread has been woken by kse_wakeup. */
 
 #define	TD_CAN_UNBIND(td)					\
     (((td)->td_flags & TDF_CAN_UNBIND) == TDF_CAN_UNBIND &&	\
      ((td)->td_upcall != NULL))
 
 #define	TD_IS_SLEEPING(td)	((td)->td_inhibitors & TDI_SLEEPING)
 #define	TD_ON_SLEEPQ(td)	((td)->td_wchan != NULL)
 #define	TD_IS_SUSPENDED(td)	((td)->td_inhibitors & TDI_SUSPENDED)
 #define	TD_IS_SWAPPED(td)	((td)->td_inhibitors & TDI_SWAPPED)
 #define	TD_ON_LOCK(td)		((td)->td_inhibitors & TDI_LOCK)
 #define	TD_AWAITING_INTR(td)	((td)->td_inhibitors & TDI_IWAIT)
 #define	TD_IS_RUNNING(td)	((td)->td_state == TDS_RUNNING)
 #define	TD_ON_RUNQ(td)		((td)->td_state == TDS_RUNQ)
 #define	TD_CAN_RUN(td)		((td)->td_state == TDS_CAN_RUN)
 #define	TD_IS_INHIBITED(td)	((td)->td_state == TDS_INHIBITED)
 
 #define	TD_SET_INHIB(td, inhib) do {			\
 	(td)->td_state = TDS_INHIBITED;			\
 	(td)->td_inhibitors |= (inhib);			\
 } while (0)
 
 #define	TD_CLR_INHIB(td, inhib) do {			\
 	if (((td)->td_inhibitors & (inhib)) &&		\
 	    (((td)->td_inhibitors &= ~(inhib)) == 0))	\
 		(td)->td_state = TDS_CAN_RUN;		\
 } while (0)
 
 /*
  * Convenience wrappers that set one specific TDI_* inhibitor on a thread
  * via TD_SET_INHIB().
  *
  * NOTE(review): TD_SET_EXITING expands to TDI_EXITING, but no TDI_EXITING
  * is defined alongside the other TDI_* inhibitor flags in this header; any
  * use of the macro would fail to compile unless the flag is defined
  * elsewhere -- confirm.
  */
 #define	TD_SET_SLEEPING(td)	TD_SET_INHIB((td), TDI_SLEEPING)
 #define	TD_SET_SWAPPED(td)	TD_SET_INHIB((td), TDI_SWAPPED)
 #define	TD_SET_LOCK(td)		TD_SET_INHIB((td), TDI_LOCK)
 #define	TD_SET_SUSPENDED(td)	TD_SET_INHIB((td), TDI_SUSPENDED)
 #define	TD_SET_IWAIT(td)	TD_SET_INHIB((td), TDI_IWAIT)
 #define	TD_SET_EXITING(td)	TD_SET_INHIB((td), TDI_EXITING)
 
 #define	TD_CLR_SLEEPING(td)	TD_CLR_INHIB((td), TDI_SLEEPING)
 #define	TD_CLR_SWAPPED(td)	TD_CLR_INHIB((td), TDI_SWAPPED)
 #define	TD_CLR_LOCK(td)		TD_CLR_INHIB((td), TDI_LOCK)
 #define	TD_CLR_SUSPENDED(td)	TD_CLR_INHIB((td), TDI_SUSPENDED)
 #define	TD_CLR_IWAIT(td)	TD_CLR_INHIB((td), TDI_IWAIT)
 
 /*
  * Set a thread's td_state directly to one of the runnable states.
  * Each expansion is fully parenthesized so that it behaves as a single
  * expression wherever it is used (for example as an operand of ?:),
  * rather than letting the bare assignment re-associate with surrounding
  * operators.
  */
 #define	TD_SET_RUNNING(td)	((td)->td_state = TDS_RUNNING)
 #define	TD_SET_RUNQ(td)		((td)->td_state = TDS_RUNQ)
 #define	TD_SET_CAN_RUN(td)	((td)->td_state = TDS_CAN_RUN)
 
 /*
  * The schedulable entity that can be given a context to run.
  * A process may have several of these. Probably one per processor
  * but possibly a few more. In this universe they are grouped
  * with a KSEG that contains the priority and niceness
  * for the group.
  */
 struct kse {
 	struct proc	*ke_proc;	/* (*) Associated process. */
 	struct ksegrp	*ke_ksegrp;	/* (*) Associated KSEG. */
 	TAILQ_ENTRY(kse) ke_kglist;	/* (*) Queue of KSEs in ke_ksegrp. */
 	TAILQ_ENTRY(kse) ke_kgrlist;	/* (*) Queue of KSEs in this state. */
 	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
 
 /*
  * ke_startzero and ke_endzero alias the first field of, and the first
  * field after, a contiguous run of members.
  * NOTE(review): presumably this run is zeroed when a KSE is set up, in the
  * same way p_startzero/p_endzero are used for struct proc -- confirm.
  */
 #define	ke_startzero ke_flags
 	int		ke_flags;	/* (j) KEF_* flags. */
 	struct thread	*ke_thread;	/* (*) Active associated thread. */
 	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
 	u_char		ke_oncpu;	/* (j) Which cpu we are on. */
 	char		ke_rqindex;	/* (j) Run queue index. */
 	enum {
 		KES_UNUSED = 0x0,
 		KES_IDLE,
 		KES_ONRUNQ,
 		KES_UNQUEUED,		/* in transit */
 		KES_THREAD		/* slaved to thread state */
 	} ke_state;			/* (j) KSE status. */
 #define	ke_endzero ke_dummy
 	u_char		ke_dummy;	/* Target of the ke_endzero alias. */
 	struct ke_sched	*ke_sched;	/* (*) Scheduler-specific data. */
 };
 
 /* Flags kept in ke_flags: */
 #define	KEF_SCHED0	0x00001	/* For scheduler-specific use. */
 #define	KEF_SCHED1	0x00002	/* For scheduler-specific use. */
 #define	KEF_SCHED2	0x00004	/* For scheduler-specific use. */
 #define	KEF_SCHED3	0x00008	/* For scheduler-specific use. */
 #define	KEF_DIDRUN	0x02000	/* KSE actually ran. */
 #define	KEF_EXIT	0x04000	/* KSE is being killed. */
 
 /*
  * The upcall management structure.
  * The upcall is used when returning to userland.  If a thread does not have
  * an upcall on return to userland the thread exports its context and exits.
  */
 struct kse_upcall {
 	TAILQ_ENTRY(kse_upcall) ku_link;	/* List of upcalls in KSEG. */
 	struct ksegrp		*ku_ksegrp;	/* Associated KSEG. */
 	struct thread		*ku_owner;	/* owning thread */
 	int			ku_flags;	/* KUF_* flags. */
 	struct kse_mailbox	*ku_mailbox;	/* userland mailbox address. */
 	stack_t			ku_stack;	/* userland upcall stack. */
 	void			*ku_func;	/* userland upcall function. */
 	unsigned int		ku_mflags;	/* cached upcall mailbox flags */
 };
 
 #define	KUF_DOUPCALL	0x00001		/* Do upcall now, don't wait. */
 #define	KUF_EXITING	0x00002		/* Upcall structure is exiting. */
 
 /*
  * Kernel-scheduled entity group (KSEG).  The scheduler considers each KSEG to
  * be an indivisible unit from a time-sharing perspective, though each KSEG may
  * contain multiple KSEs.
  */
 struct ksegrp {
 	struct proc	*kg_proc;	/* (*) Process that contains this KSEG. */
 	TAILQ_ENTRY(ksegrp) kg_ksegrp;	/* (*) Queue of KSEGs in kg_proc. */
 	TAILQ_HEAD(, kse) kg_kseq;	/* (ke_kglist) All KSEs. */
 	TAILQ_HEAD(, kse) kg_iq;	/* (ke_kgrlist) All idle KSEs. */
 	TAILQ_HEAD(, thread) kg_threads;/* (td_kglist) All threads. */
 	TAILQ_HEAD(, thread) kg_runq;	/* (td_runq) waiting RUNNABLE threads */
 	TAILQ_HEAD(, thread) kg_slpq;	/* (td_runq) NONRUNNABLE threads. */
 	TAILQ_HEAD(, kse_upcall) kg_upcalls;	/* All upcalls in the group. */
 #define	kg_startzero kg_estcpu
 	u_int		kg_estcpu;	/* (j) Sum of the same field in KSEs. */
 	u_int		kg_slptime;	/* (j) How long completely blocked. */
 	struct thread	*kg_last_assigned; /* (j) Last thread assigned to a KSE. */
 	int		kg_runnable;	/* (j) Num runnable threads on queue. */
 	int		kg_runq_kses;	/* (j) Num KSEs on runq. */
 	int		kg_idle_kses;	/* (j) Num KSEs on iq. */
 	int		kg_numupcalls;	/* (j) Num upcalls. */
 	int		kg_upsleeps;	/* (c) Num threads in kse_release(). */
 	struct kse_thr_mailbox *kg_completed; /* (c) Completed thread mboxes. */
 	int		kg_nextupcall;	/* (*) Next upcall time. */
 	int		kg_upquantum;	/* (*) Quantum to schedule an upcall. */
 #define	kg_endzero kg_pri_class
 
 #define	kg_startcopy	kg_endzero
 	u_char		kg_pri_class;	/* (j) Scheduling class. */
 	u_char		kg_user_pri;	/* (j) User pri from estcpu and nice. */
-	signed char	kg_nice;	/* (c + j) Process "nice" value. */
 #define	kg_endcopy kg_numthreads
 	int		kg_numthreads;	/* (j) Num threads in total. */
 	int		kg_kses;	/* (j) Num KSEs in group. */
 	struct kg_sched	*kg_sched;	/* (*) Scheduler-specific data. */
 };
 
 /*
  * The old-fashioned process. May have multiple threads, KSEGRPs
  * and KSEs. Starts off with a single embedded KSEGRP, KSE and THREAD.
  */
 struct proc {
 	LIST_ENTRY(proc) p_list;	/* (d) List of all processes. */
 	TAILQ_HEAD(, ksegrp) p_ksegrps;	/* (kg_ksegrp) All KSEGs. */
 	TAILQ_HEAD(, thread) p_threads;	/* (td_plist) Threads. (shortcut) */
 	TAILQ_HEAD(, thread) p_suspended; /* (td_runq) Suspended threads. */
 	struct ucred	*p_ucred;	/* (c) Process owner's identity. */
 	struct filedesc	*p_fd;		/* (b) Ptr to open files structure. */
 	struct filedesc_to_leader *p_fdtol; /* (b) Ptr to tracking node */
 					/* Accumulated stats for all KSEs? */
 	struct pstats	*p_stats;	/* (b) Accounting/statistics (CPU). */
 	struct plimit	*p_limit;	/* (c) Process limits. */
 	struct vm_object *p_upages_obj; /* (a) Upages object. */
 	struct sigacts	*p_sigacts;	/* (x) Signal actions, state (CPU). */
 	/*
 	 * The following don't make too much sense..
 	 * See the td_ or ke_ versions of the same flags
 	 */
 	int		p_flag;		/* (c) P_* flags. */
 	int		p_sflag;	/* (j) PS_* flags. */
 	enum {
 		PRS_NEW = 0,		/* In creation */
 		PRS_NORMAL,		/* KSEs can be run. */
 		PRS_ZOMBIE
 	} p_state;			/* (j/c) S* process status. */
 	pid_t		p_pid;		/* (b) Process identifier. */
 	LIST_ENTRY(proc) p_hash;	/* (d) Hash chain. */
 	LIST_ENTRY(proc) p_pglist;	/* (g + e) List of processes in pgrp. */
 	struct proc	*p_pptr;	/* (c + e) Pointer to parent process. */
 	LIST_ENTRY(proc) p_sibling;	/* (e) List of sibling processes. */
 	LIST_HEAD(, proc) p_children;	/* (e) Pointer to list of children. */
 	struct mtx	p_mtx;		/* (n) Lock for this struct. */
 
 /* The following fields are all zeroed upon creation in fork. */
 #define	p_startzero	p_oppid
 	pid_t		p_oppid;	/* (c + e) Save ppid in ptrace. XXX */
 	struct vmspace	*p_vmspace;	/* (b) Address space. */
 	u_int		p_swtime;	/* (j) Time swapped in or out. */
 	struct itimerval p_realtimer;	/* (c) Alarm timer. */
 	struct bintime	p_runtime;	/* (j) Real time. */
 	u_int64_t	p_uu;		/* (j) Previous user time in usec. */
 	u_int64_t	p_su;		/* (j) Previous system time in usec. */
 	u_int64_t	p_iu;		/* (j) Previous intr time in usec. */
 	u_int64_t	p_uticks;	/* (j) Statclock hits in user mode. */
 	u_int64_t	p_sticks;	/* (j) Statclock hits in system mode. */
 	u_int64_t	p_iticks;	/* (j) Statclock hits in intr. */
 	int		p_profthreads;	/* (c) Num threads in addupc_task. */
 	int		p_maxthrwaits;	/* (c) Max threads num waiters */
 	int		p_traceflag;	/* (o) Kernel trace points. */
 	struct vnode	*p_tracevp;	/* (c + o) Trace to vnode. */
 	struct ucred	*p_tracecred;	/* (o) Credentials to trace with. */
 	struct vnode	*p_textvp;	/* (b) Vnode of executable. */
 	sigset_t	p_siglist;	/* (c) Sigs not delivered to a td. */
 	char		p_lock;		/* (c) Proclock (prevent swap) count. */
 	struct klist	p_klist;	/* (c) Knotes attached to this proc. */
 	struct sigiolst	p_sigiolst;	/* (c) List of sigio sources. */
 	int		p_sigparent;	/* (c) Signal to parent on exit. */
 	int		p_sig;		/* (n) For core dump/debugger XXX. */
 	u_long		p_code;		/* (n) For core dump/debugger XXX. */
 	u_int		p_stops;	/* (c) Stop event bitmask. */
 	u_int		p_stype;	/* (c) Stop event type. */
 	char		p_step;		/* (c) Process is stopped. */
 	u_char		p_pfsflags;	/* (c) Procfs flags. */
 	struct nlminfo	*p_nlminfo;	/* (?) Only used by/for lockd. */
 	void		*p_aioinfo;	/* (?) ASYNC I/O info. */
 	struct thread	*p_singlethread;/* (c + j) If single threading this is it */
 	int		p_suspcount;	/* (c) # threads in suspended mode */
 /* End area that is zeroed on creation. */
 #define	p_endzero	p_magic
 
 /* The following fields are all copied upon creation in fork. */
 #define	p_startcopy	p_endzero
 	u_int		p_magic;	/* (b) Magic number. */
 	char		p_comm[MAXCOMLEN + 1];	/* (b) Process name. */
 	struct pgrp	*p_pgrp;	/* (c + e) Pointer to process group. */
 	struct sysentvec *p_sysent;	/* (b) Syscall dispatch info. */
 	struct pargs	*p_args;	/* (c) Process arguments. */
 	rlim_t		p_cpulimit;	/* (j) Current CPU limit in seconds. */
+	signed char	p_nice;		/* (c + j) Process "nice" value. */
 /* End area that is copied on creation. */
 #define	p_endcopy	p_xstat
 
 	u_short		p_xstat;	/* (c) Exit status; also stop sig. */
 	int		p_numthreads;	/* (j) Number of threads. */
 	int		p_numksegrps;	/* (?) number of ksegrps */
 	struct mdproc	p_md;		/* Any machine-dependent fields. */
 	struct callout	p_itcallout;	/* (h + c) Interval timer callout. */
 	struct user	*p_uarea;	/* (k) Kernel VA of u-area (CPU). */
 	u_short		p_acflag;	/* (c) Accounting flags. */
 	struct rusage	*p_ru;		/* (a) Exit information. XXX */
 	struct proc	*p_peers;	/* (r) */
 	struct proc	*p_leader;	/* (b) */
 	void		*p_emuldata;	/* (c) Emulator state data. */
 	struct label	*p_label;	/* (*) Proc (not subject) MAC label. */
 	struct p_sched	*p_sched;	/* (*) Scheduler-specific data. */
 };
 
 #define	p_session	p_pgrp->pg_session
 #define	p_pgid		p_pgrp->pg_id
 
 #define	NOCPU	0xff		/* For when we aren't on a CPU. (SMP) */
 
 /* Status values (p_stat). */
 
 /* These flags are kept in p_flag. */
 #define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock. */
 #define	P_CONTROLT	0x00002	/* Has a controlling terminal. */
 #define	P_KTHREAD	0x00004	/* Kernel thread. (*)*/
 #define	P_NOLOAD	0x00008	/* Ignore during load avg calculations. */
 #define	P_PPWAIT	0x00010	/* Parent is waiting for child to exec/exit. */
 #define	P_PROFIL	0x00020	/* Has started profiling. */
 #define	P_STOPPROF	0x00040	/* Has thread in requesting to stop prof */
 #define	P_SUGID		0x00100	/* Had set id privileges since last exec. */
 #define	P_SYSTEM	0x00200	/* System proc: no sigs, stats or swapping. */
 #define	P_SINGLE_EXIT	0x00400	/* Threads suspending should exit, not wait. */
 #define	P_TRACED	0x00800	/* Debugged process being traced. */
 #define	P_WAITED	0x01000	/* Someone is waiting for us. */
 #define	P_WEXIT		0x02000	/* Working on exiting. */
 #define	P_EXEC		0x04000	/* Process called exec. */
 #define	P_SA		0x08000	/* Using scheduler activations. */
 #define	P_CONTINUED	0x10000	/* Proc has continued from a stopped state. */
 #define	P_STOPPED_SIG	0x20000	/* Stopped due to SIGSTOP/SIGTSTP. */
 #define	P_STOPPED_TRACE	0x40000	/* Stopped because of tracing. */
 #define	P_STOPPED_SINGLE	0x80000	/* Only one thread can continue */
 					/* (not to user) */
 #define	P_PROTECTED	0x100000 /* Do not kill on memory overcommit. */
 #define	P_SIGEVENT	0x200000 /* Process pending signals changed. */
 
 #define	P_JAILED	0x1000000 /* Process is in jail. */
 #define	P_INEXEC	0x4000000 /* Process is in execve(). */
 
 #define	P_STOPPED		(P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
 #define	P_SHOULDSTOP(p)		((p)->p_flag & P_STOPPED)
 
 /* These flags are kept in p_sflag and are protected with sched_lock. */
 #define	PS_INMEM	0x00001	/* Loaded into memory. */
 #define	PS_XCPU		0x00002 /* Exceeded CPU limit. */
 #define	PS_ALRMPEND	0x00020	/* Pending SIGVTALRM needs to be posted. */
 #define	PS_PROFPEND	0x00040	/* Pending SIGPROF needs to be posted. */
 #define	PS_SWAPINREQ	0x00100	/* Swapin request due to wakeup. */
 #define	PS_SWAPPINGOUT	0x00200	/* Process is being swapped out. */
 #define	PS_SWAPPINGIN	0x04000	/* Process is being swapped in. */
 #define	PS_MACPEND	0x08000	/* Ast()-based MAC event pending. */
 
 /* used only in legacy conversion code */
 #define	SIDL	1		/* Process being created by fork. */
 #define	SRUN	2		/* Currently runnable. */
 #define	SSLEEP	3		/* Sleeping on an address. */
 #define	SSTOP	4		/* Process debugging or suspension. */
 #define	SZOMB	5		/* Awaiting collection by parent. */
 #define	SWAIT	6		/* Waiting for interrupt. */
 #define	SLOCK	7		/* Blocked on a lock. */
 
 #define	P_MAGIC		0xbeefface
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_PARGS);
 MALLOC_DECLARE(M_PGRP);
 MALLOC_DECLARE(M_SESSION);
 MALLOC_DECLARE(M_SUBPROC);
 MALLOC_DECLARE(M_ZOMBIE);
 #endif
 
 #define	FOREACH_PROC_IN_SYSTEM(p)					\
 	LIST_FOREACH((p), &allproc, p_list)
 #define	FOREACH_KSEGRP_IN_PROC(p, kg)					\
 	TAILQ_FOREACH((kg), &(p)->p_ksegrps, kg_ksegrp)
 #define	FOREACH_THREAD_IN_GROUP(kg, td)					\
 	TAILQ_FOREACH((td), &(kg)->kg_threads, td_kglist)
 #define	FOREACH_KSE_IN_GROUP(kg, ke)					\
 	TAILQ_FOREACH((ke), &(kg)->kg_kseq, ke_kglist)
 #define	FOREACH_UPCALL_IN_GROUP(kg, ku)					\
 	TAILQ_FOREACH((ku), &(kg)->kg_upcalls, ku_link)
 #define	FOREACH_THREAD_IN_PROC(p, td)					\
 	TAILQ_FOREACH((td), &(p)->p_threads, td_plist)
 
 /* XXXKSE the lines below should probably only be used in 1:1 code */
 #define	FIRST_THREAD_IN_PROC(p)	TAILQ_FIRST(&(p)->p_threads)
 #define	FIRST_KSEGRP_IN_PROC(p)	TAILQ_FIRST(&(p)->p_ksegrps)
 #define	FIRST_KSE_IN_KSEGRP(kg)	TAILQ_FIRST(&(kg)->kg_kseq)
 #define	FIRST_KSE_IN_PROC(p)	FIRST_KSE_IN_KSEGRP(FIRST_KSEGRP_IN_PROC(p))
 
 /*
  * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t,
  * as it is used to represent "no process group".
  */
 #define	PID_MAX		99999
 #define	NO_PID		100000
 
 #define	SESS_LEADER(p)	((p)->p_session->s_leader == (p))
 #define	SESSHOLD(s)	((s)->s_count++)
 /*
  * Drop one reference on session s (decrement s_count) and release it with
  * FREE(..., M_SESSION) when the count reaches zero.  The body is wrapped in
  * do { } while (0) so that "SESSRELE(s);" is exactly one statement and is
  * safe inside an unbraced if/else.  Note that s is evaluated more than once.
  */
 #define	SESSRELE(s) do {						\
 	if (--(s)->s_count == 0)					\
 		FREE((s), M_SESSION);					\
 } while (0)
 
 /*
  * Report stop event e (with value v) for process p, but only if the
  * matching bit is set in p->p_stops.
  *
  * STOPEVENT tests p_stops before taking the process lock, then locks p,
  * calls stopevent() and unlocks again.  _STOPEVENT is the variant for
  * callers that already hold the process lock: it asserts ownership and
  * then performs the same test and call without locking.
  */
 #define	STOPEVENT(p, e, v) do {						\
 	if ((p)->p_stops & (e))	{					\
 		PROC_LOCK(p);						\
 		stopevent((p), (e), (v));				\
 		PROC_UNLOCK(p);						\
 	}								\
 } while (0)
 #define	_STOPEVENT(p, e, v) do {					\
 	PROC_LOCK_ASSERT(p, MA_OWNED);					\
 	if ((p)->p_stops & (e))						\
 		stopevent((p), (e), (v));				\
 } while (0)
 
 /*
  * Lock and unlock a process.
  * All of these operate on the per-process mutex p->p_mtx:
  * PROC_TRYLOCK wraps mtx_trylock(), PROC_LOCKED wraps mtx_owned(), and
  * PROC_LOCK_ASSERT passes "type" (e.g. MA_OWNED) through to mtx_assert().
  */
 #define	PROC_LOCK(p)	mtx_lock(&(p)->p_mtx)
 #define	PROC_TRYLOCK(p)	mtx_trylock(&(p)->p_mtx)
 #define	PROC_UNLOCK(p)	mtx_unlock(&(p)->p_mtx)
 #define	PROC_LOCKED(p)	mtx_owned(&(p)->p_mtx)
 #define	PROC_LOCK_ASSERT(p, type)	mtx_assert(&(p)->p_mtx, (type))
 
 /*
  * Lock and unlock a process group.
  * All of these operate on the per-group mutex pg->pg_mtx; PGRP_LOCKED
  * wraps mtx_owned() and PGRP_LOCK_ASSERT passes "type" to mtx_assert().
  */
 #define	PGRP_LOCK(pg)	mtx_lock(&(pg)->pg_mtx)
 #define	PGRP_UNLOCK(pg)	mtx_unlock(&(pg)->pg_mtx)
 #define	PGRP_LOCKED(pg)	mtx_owned(&(pg)->pg_mtx)
 #define	PGRP_LOCK_ASSERT(pg, type)	mtx_assert(&(pg)->pg_mtx, (type))
 
 /*
  * NULL-tolerant variants of PGRP_LOCK/PGRP_UNLOCK: the process group
  * mutex is taken or released only when pg is not NULL, and nothing
  * happens otherwise.
  */
 #define	PGRP_LOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_LOCK(pg);						\
 } while (0)
 #define	PGRP_UNLOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_UNLOCK(pg);					\
 } while (0)
 
 /*
  * Lock and unlock a session.
  * All of these operate on the per-session mutex s->s_mtx; SESS_LOCKED
  * wraps mtx_owned() and SESS_LOCK_ASSERT passes "type" to mtx_assert().
  */
 #define	SESS_LOCK(s)	mtx_lock(&(s)->s_mtx)
 #define	SESS_UNLOCK(s)	mtx_unlock(&(s)->s_mtx)
 #define	SESS_LOCKED(s)	mtx_owned(&(s)->s_mtx)
 #define	SESS_LOCK_ASSERT(s, type)	mtx_assert(&(s)->s_mtx, (type))
 
 /*
  * Hold process U-area in memory, normally for ptrace/procfs work.
  *
  * PHOLD takes the process lock around _PHOLD.  _PHOLD requires the
  * caller to hold the process lock already (asserted): it increments
  * p->p_lock and, if PS_INMEM is clear in p->p_sflag, calls faultin().
  */
 #define	PHOLD(p) do {							\
 	PROC_LOCK(p);							\
 	_PHOLD(p);							\
 	PROC_UNLOCK(p);							\
 } while (0)
 #define	_PHOLD(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	(p)->p_lock++;							\
 	if (((p)->p_sflag & PS_INMEM) == 0)				\
 		faultin((p));						\
 } while (0)
 
 /*
  * Release a hold on a process by decrementing p->p_lock.
  *
  * PRELE takes the process lock around _PRELE.  _PRELE requires the
  * caller to hold the process lock already (asserted) and only performs
  * the decrement.
  */
 #define	PRELE(p) do {							\
 	PROC_LOCK((p));							\
 	_PRELE((p));							\
 	PROC_UNLOCK((p));						\
 } while (0)
 #define	_PRELE(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	(--(p)->p_lock);						\
 } while (0)
 
 /*
  * Check whether a thread is safe to be swapped out.
  * True when the thread is sleeping or suspended; td is evaluated twice.
  */
 #define	thread_safetoswapout(td) (TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td))
 
 /*
  * Lock and unlock process arguments.
  * Both macros ignore their argument and operate on the single global
  * mutex pargs_ref_lock.
  */
 #define	PARGS_LOCK(p)		mtx_lock(&pargs_ref_lock)
 #define	PARGS_UNLOCK(p)		mtx_unlock(&pargs_ref_lock)
 
 /*
  * Process-ID hash: PIDHASH(pid) yields a pointer to the pidhashtbl bucket
  * (a list head of struct proc) selected by masking pid with pidhash.
  */
 #define	PIDHASH(pid)	(&pidhashtbl[(pid) & pidhash])
 extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
 extern u_long pidhash;
 
 /*
  * Process-group hash: PGRPHASH(pgid) yields a pointer to the pgrphashtbl
  * bucket (a list head of struct pgrp) selected by masking pgid with
  * pgrphash.
  */
 #define	PGRPHASH(pgid)	(&pgrphashtbl[(pgid) & pgrphash])
 extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
 extern u_long pgrphash;
 
 extern struct sx allproc_lock;
 extern struct sx proctree_lock;
 extern struct mtx pargs_ref_lock;
 extern struct mtx ppeers_lock;
 extern struct proc proc0;		/* Process slot for swapper. */
 extern struct thread thread0;		/* Primary thread in proc0. */
 extern struct ksegrp ksegrp0;		/* Primary ksegrp in proc0. */
 extern struct kse kse0;			/* Primary kse in proc0. */
 extern struct vmspace vmspace0;		/* VM space for proc0. */
 extern int hogticks;			/* Limit on kernel cpu hogs. */
 extern int nprocs, maxproc;		/* Current and max number of procs. */
 extern int maxprocperuid;		/* Max procs per uid. */
 extern u_long ps_arg_cache_limit;
 extern int sched_quantum;		/* Scheduling quantum in ticks. */
 
 LIST_HEAD(proclist, proc);
 TAILQ_HEAD(procqueue, proc);
 TAILQ_HEAD(threadqueue, thread);
 extern struct proclist allproc;		/* List of all processes. */
 extern struct proclist zombproc;	/* List of zombie processes. */
 extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
 extern struct proc *updateproc;		/* Process slot for syncer (sic). */
 
 extern struct uma_zone *proc_zone;
 
 extern int lastpid;
 
 struct	proc *pfind(pid_t);	/* Find process by id. */
 struct	pgrp *pgfind(pid_t);	/* Find process group by id. */
 struct	proc *zpfind(pid_t);	/* Find zombie process by id. */
 
 void	adjustrunqueue(struct thread *, int newpri);
 void	ast(struct trapframe *framep);
 struct	thread *choosethread(void);
 int	cr_cansignal(struct ucred *cred, struct proc *proc, int signum);
 int	enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess);
 int	enterthispgrp(struct proc *p, struct pgrp *pgrp);
 void	faultin(struct proc *p);
 void	fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
 int	fork1(struct thread *, int, int, struct proc **);
 void	fork_exit(void (*)(void *, struct trapframe *), void *,
 	    struct trapframe *);
 void	fork_return(struct thread *, struct trapframe *);
 int	inferior(struct proc *p);
 int	leavepgrp(struct proc *p);
 void	mi_switch(int flags);
 /* Flags for mi_switch(). */
 #define	SW_VOL		0x0001		/* Voluntary switch. */
 #define	SW_INVOL	0x0002		/* Involuntary switch. */
 int	p_candebug(struct thread *td, struct proc *p);
 int	p_cansee(struct thread *td, struct proc *p);
 int	p_cansched(struct thread *td, struct proc *p);
 int	p_cansignal(struct thread *td, struct proc *p, int signum);
 struct	pargs *pargs_alloc(int len);
 void	pargs_drop(struct pargs *pa);
 void	pargs_free(struct pargs *pa);
 void	pargs_hold(struct pargs *pa);
 void	procinit(void);
 void	threadinit(void);
 void	proc_linkup(struct proc *p, struct ksegrp *kg,
 	    struct kse *ke, struct thread *td);
 void	proc_reparent(struct proc *child, struct proc *newparent);
 int	securelevel_ge(struct ucred *cr, int level);
 int	securelevel_gt(struct ucred *cr, int level);
 void	setrunnable(struct thread *);
 void	setrunqueue(struct thread *);
 void	setsugid(struct proc *p);
 int	sigonstack(size_t sp);
 void	sleepinit(void);
 void	stopevent(struct proc *, u_int, u_int);
 void	cpu_idle(void);
 extern	void (*cpu_idle_hook)(void);	/* Hook to machdep CPU idler. */
 void	cpu_switch(struct thread *old, struct thread *new);
 void	cpu_throw(struct thread *old, struct thread *new) __dead2;
 void	unsleep(struct thread *);
 void	userret(struct thread *, struct trapframe *, u_int);
 
 void	cpu_exit(struct thread *);
 void	exit1(struct thread *, int) __dead2;
 void	cpu_fork(struct thread *, struct proc *, struct thread *, int);
 void	cpu_set_fork_handler(struct thread *, void (*)(void *), void *);
 
 /* New in KSE. */
 struct	ksegrp *ksegrp_alloc(void);
 void	ksegrp_free(struct ksegrp *kg);
 void	ksegrp_stash(struct ksegrp *kg);
 struct	kse *kse_alloc(void);
 void	kse_free(struct kse *ke);
 void	kse_stash(struct kse *ke);
 void	cpu_set_upcall(struct thread *td, struct thread *td0);
 void	cpu_set_upcall_kse(struct thread *td, struct kse_upcall *ku);
 void	cpu_thread_clean(struct thread *);
 void	cpu_thread_exit(struct thread *);
 void	cpu_thread_setup(struct thread *td);
 void	cpu_thread_siginfo(int sig, u_long code, siginfo_t *si);
 void	cpu_thread_swapin(struct thread *);
 void	cpu_thread_swapout(struct thread *);
 void	kse_reassign(struct kse *ke);
 void	kse_link(struct kse *ke, struct ksegrp *kg);
 void	kse_unlink(struct kse *ke);
 void	ksegrp_link(struct ksegrp *kg, struct proc *p);
 void	ksegrp_unlink(struct ksegrp *kg);
 void	thread_signal_add(struct thread *td, int sig);
 struct	thread *thread_alloc(void);
 void	thread_exit(void) __dead2;
 int	thread_export_context(struct thread *td, int willexit);
 void	thread_free(struct thread *td);
 void	thread_link(struct thread *td, struct ksegrp *kg);
 int	thread_new_tid(void);
 void	thread_reap(void);
 struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku);
 int	thread_single(int how);
 #define	SINGLE_NO_EXIT 0			/* values for 'how' */
 #define	SINGLE_EXIT 1
 void	thread_single_end(void);
 void	thread_stash(struct thread *td);
 int	thread_suspend_check(int how);
 void	thread_suspend_one(struct thread *td);
 void	thread_unlink(struct thread *td);
 void	thread_unsuspend(struct proc *p);
 void	thread_unsuspend_one(struct thread *td);
 int	thread_userret(struct thread *td, struct trapframe *frame);
 int	thread_upcall_check(struct thread *td);
 void	thread_user_enter(struct proc *p, struct thread *td);
 void	thread_wait(struct proc *p);
 int	thread_statclock(int user);
 struct kse_upcall *upcall_alloc(void);
 void	upcall_free(struct kse_upcall *ku);
 void	upcall_link(struct kse_upcall *ku, struct ksegrp *kg);
 void	upcall_unlink(struct kse_upcall *ku);
 void	upcall_remove(struct thread *td);
 void	upcall_stash(struct kse_upcall *ke);
 void	thread_sanity_check(struct thread *td, char *);
 void	thread_stopped(struct proc *p);
 void	thread_switchout(struct thread *td);
 void	thr_exit1(void);
 #endif	/* _KERNEL */
 
 #endif	/* !_SYS_PROC_H_ */
Index: head/sys/sys/sched.h
===================================================================
--- head/sys/sys/sched.h	(revision 130550)
+++ head/sys/sys/sched.h	(revision 130551)
@@ -1,122 +1,122 @@
 /*-
  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_SCHED_H_
 #define	_SYS_SCHED_H_
 
 /*
  * General scheduling info.
  *
  * sched_load:
  *	Total runnable non-ithread threads in the system.
  *
  * sched_runnable:
  *	Runnable threads for this processor.
  */
 int	sched_load(void);
 int	sched_rr_interval(void);
 int	sched_runnable(void);
 
 /* 
  * Proc related scheduling hooks.
  */
 void	sched_exit(struct proc *p, struct proc *child);
 void	sched_fork(struct proc *p, struct proc *child);
 
 /*
  * KSE Groups contain scheduling priority information.  They record the
  * behavior of groups of KSEs and threads.
  */
 void	sched_class(struct ksegrp *kg, int class);
 void	sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child);
 void	sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child);
-void	sched_nice(struct ksegrp *kg, int nice);
+void	sched_nice(struct proc *p, int nice);
 
 /*
  * Threads are switched in and out, block on resources, have temporary
  * priorities inherited from their ksegs, and use up cpu time.
  */
 void	sched_exit_thread(struct thread *td, struct thread *child);
 void	sched_fork_thread(struct thread *td, struct thread *child);
 fixpt_t	sched_pctcpu(struct thread *td);
 void	sched_prio(struct thread *td, u_char prio);
 void	sched_sleep(struct thread *td);
 void	sched_switch(struct thread *td);
 void	sched_userret(struct thread *td);
 void	sched_wakeup(struct thread *td);
 
 /*
  * Threads are moved on and off of run queues
  */
 void	sched_add(struct thread *td);
 struct kse *sched_choose(void);		/* XXX Should be thread * */
 void	sched_clock(struct thread *td);
 void	sched_rem(struct thread *td);
 
 /*
  * Binding makes cpu affinity permanent while pinning is used to temporarily
  * hold a thread on a particular CPU.
  */
 void	sched_bind(struct thread *td, int cpu);
 static __inline void sched_pin(void);
 void	sched_unbind(struct thread *td);
 static __inline void sched_unpin(void);
 
 /*
  * These interfaces will eventually be removed.
  */
 void	sched_exit_kse(struct kse *ke, struct kse *child);
 void	sched_fork_kse(struct kse *ke, struct kse *child);
 
 /*
  * These procedures tell the process data structure allocation code how
  * many bytes to actually allocate.
  */
 int	sched_sizeof_kse(void);
 int	sched_sizeof_ksegrp(void);
 int	sched_sizeof_proc(void);
 int	sched_sizeof_thread(void);
 
 extern struct ke_sched *kse0_sched;
 extern struct kg_sched *ksegrp0_sched;
 extern struct p_sched *proc0_sched;
 extern struct td_sched *thread0_sched;
 
 static __inline void
 sched_pin(void)
 {
 	curthread->td_pinned++;
 }
 
 static __inline void
 sched_unpin(void)
 {
 	curthread->td_pinned--;
 }
 
 #endif /* !_SYS_SCHED_H_ */
Index: head/sys/ufs/ffs/ffs_snapshot.c
===================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	(revision 130550)
+++ head/sys/ufs/ffs/ffs_snapshot.c	(revision 130551)
@@ -1,2120 +1,2120 @@
 /*
  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * Further information about snapshots can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/sched.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #define KERNCRED thread0.td_ucred
 #define DEBUG 1
 
 static int cgaccount(int, struct vnode *, struct buf *, int);
 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int ffs_copyonwrite(struct vnode *, struct buf *);
 static int readblock(struct buf *, ufs2_daddr_t);
 
 /*
  * To ensure the consistency of snapshots across crashes, we must
  * synchronously write out copied blocks before allowing the
  * originals to be modified. Because of the rather severe speed
  * penalty that this imposes, the following flag allows this
  * crash persistence to be disabled.
  */
 int dopersistence = 0;
 
 /*
  * Debugging knobs, exported read/write under the "debug" sysctl tree:
  *   dopersistence     - the crash-persistence flag defined above.
  *   snapdebug         - file-local debug flag, initially 0.
  *   collectsnapstats  - statistics-collection flag, initially 0.
  * The OID descriptions are left as empty strings.
  */
 #ifdef DEBUG
 #include <sys/sysctl.h>
 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
 static int snapdebug = 0;
 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
 int collectsnapstats = 0;
 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
 	0, "");
 #endif /* DEBUG */
 
 /*
  * Create a snapshot file and initialize it for the filesystem.
  */
 int
 ffs_snapshot(mp, snapfile)
 	struct mount *mp;
 	char *snapfile;
 {
 	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
 	int error, cg, snaploc;
 	int i, size, len, loc;
 	int flag = mp->mnt_flag;
 	struct timespec starttime = {0, 0}, endtime;
 	char saved_nice = 0;
 	long redo = 0, snaplistsize = 0;
 	int32_t *lp;
 	void *space;
 	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
 	struct snaphead *snaphead;
 	struct thread *td = curthread;
 	struct inode *ip, *xp;
 	struct buf *bp, *nbp, *ibp, *sbp = NULL;
 	struct nameidata nd;
 	struct mount *wrtmp;
 	struct vattr vat;
 	struct vnode *vp, *xvp, *nvp, *devvp;
 	struct uio auio;
 	struct iovec aiov;
 
 	/*
 	 * Need to serialize access to snapshot code per filesystem.
 	 */
 	/*
 	 * Assign a snapshot slot in the superblock.
 	 */
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 	if (snaploc == FSMAXSNAP)
 		return (ENOSPC);
 	/*
 	 * Create the snapshot file.
 	 */
 restart:
 	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (nd.ni_vp != NULL) {
 		vput(nd.ni_vp);
 		error = EEXIST;
 	}
 	if (nd.ni_dvp->v_mount != mp)
 		error = EXDEV;
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		return (error);
 	}
 	VATTR_NULL(&vat);
 	vat.va_type = VREG;
 	vat.va_mode = S_IRUSR;
 	vat.va_vaflags |= VA_EXCLUSIVE;
 	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
 		wrtmp = NULL;
 	if (wrtmp != mp)
 		panic("ffs_snapshot: mount mismatch");
 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &wrtmp,
 		    V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
 	vput(nd.ni_dvp);
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vn_finished_write(wrtmp);
 		return (error);
 	}
 	vp = nd.ni_vp;
 	ip = VTOI(vp);
 	devvp = ip->i_devvp;
 	/*
 	 * Allocate and copy the last block contents so as to be able
 	 * to set size to that of the filesystem.
 	 */
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 	if (error)
 		goto out;
 	ip->i_size = lblktosize(fs, (off_t)numblks);
 	DIP(ip, i_size) = ip->i_size;
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	if ((error = readblock(bp, numblks - 1)) != 0)
 		goto out;
 	bawrite(bp);
 	/*
 	 * Preallocate critical data structures so that we can copy
 	 * them in without further allocation after we suspend all
 	 * operations on the filesystem. We would like to just release
 	 * the allocated buffers without writing them since they will
 	 * be filled in below once we are ready to go, but this upsets
 	 * the soft update code, so we go ahead and write the new buffers.
 	 *
 	 * Allocate all indirect blocks and mark all of them as not
 	 * needing to be copied.
 	 */
 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
 		if (error)
 			goto out;
 		bawrite(ibp);
 	}
 	/*
 	 * Allocate copies for the superblock and its summary information.
 	 */
 	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
 	    0, &nbp);
 	if (error)
 		goto out;
 	bawrite(nbp);
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	for (loc = 0; loc < len; loc++) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 	}
 	/*
 	 * Allocate all cylinder group blocks.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 	}
 	/*
 	 * Copy all the cylinder group maps. Although the
 	 * filesystem is still active, we hope that only a few
 	 * cylinder groups will change between now and when we
 	 * suspend operations. Thus, we will be able to quickly
 	 * touch up the few cylinder groups that changed during
 	 * the suspension period.
 	 */
 	len = howmany(fs->fs_ncg, NBBY);
 	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
 	bzero(fs->fs_active, len);
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		error = cgaccount(cg, vp, nbp, 1);
 		bawrite(nbp);
 		if (error)
 			goto out;
 	}
 	/*
 	 * Change inode to snapshot type file.
 	 */
 	ip->i_flags |= SF_SNAPSHOT;
 	DIP(ip, i_flags) = ip->i_flags;
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * Ensure that the snapshot is completely on disk.
 	 * Since we have marked it as a snapshot it is safe to
 	 * unlock it as no process will be allowed to write to it.
 	 */
 	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
 		goto out;
 	VOP_UNLOCK(vp, 0, td);
 	/*
 	 * All allocations are done, so we can now snapshot the system.
 	 *
 	 * Rescind nice scheduling while running with the filesystem suspended.
 	 */
-	if (td->td_ksegrp->kg_nice > 0) {
+	if (td->td_proc->p_nice > 0) {
 		PROC_LOCK(td->td_proc);
 		mtx_lock_spin(&sched_lock);
-		saved_nice = td->td_ksegrp->kg_nice;
-		sched_nice(td->td_ksegrp, 0);
+		saved_nice = td->td_proc->p_nice;
+		sched_nice(td->td_proc, 0);
 		mtx_unlock_spin(&sched_lock);
 		PROC_UNLOCK(td->td_proc);
 	}
 	/*
 	 * Suspend operation on filesystem.
 	 */
 	for (;;) {
 		vn_finished_write(wrtmp);
 		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
 			vn_start_write(NULL, &wrtmp, V_WAIT);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 			goto out;
 		}
 		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
 			break;
 		vn_start_write(NULL, &wrtmp, V_WAIT);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (collectsnapstats)
 		nanotime(&starttime);
 	/*
 	 * First, copy all the cylinder group maps that have changed.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
 			continue;
 		redo++;
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out1;
 		error = cgaccount(cg, vp, nbp, 2);
 		bawrite(nbp);
 		if (error)
 			goto out1;
 	}
 	/*
 	 * Grab a copy of the superblock and its summary information.
 	 * We delay writing it until the suspension is released below.
 	 */
 	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
 	    KERNCRED, &sbp);
 	if (error) {
 		brelse(sbp);
 		sbp = NULL;
 		goto out1;
 	}
 	loc = blkoff(fs, fs->fs_sblockloc);
 	copy_fs = (struct fs *)(sbp->b_data + loc);
 	bcopy(fs, copy_fs, fs->fs_sbsize);
 	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 		copy_fs->fs_clean = 1;
 	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
 	if (fs->fs_sbsize < size)
 		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
 	size = blkroundup(fs, fs->fs_cssize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
 	copy_fs->fs_csp = space;
 	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
 	space = (char *)space + fs->fs_cssize;
 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
 	i = fs->fs_frag - loc % fs->fs_frag;
 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
 	if (len > 0) {
 		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
 		    len, KERNCRED, &bp)) != 0) {
 			brelse(bp);
 			free(copy_fs->fs_csp, M_UFSMNT);
 			bawrite(sbp);
 			sbp = NULL;
 			goto out1;
 		}
 		bcopy(bp->b_data, space, (u_int)len);
 		space = (char *)space + len;
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 	}
 	if (fs->fs_contigsumsize > 0) {
 		copy_fs->fs_maxcluster = lp = space;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 	}
 	/*
 	 * We must check for active files that have been unlinked
 	 * (e.g., with a zero link count). We have to expunge all
 	 * trace of these files from the snapshot so that they are
 	 * not reclaimed prematurely by fsck or unnecessarily dumped.
 	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
 	 * spec_strategy about writing on a suspended filesystem.
 	 * Note that we skip unlinked snapshot files as they will
 	 * be handled separately below.
 	 *
 	 * We also calculate the needed size for the snapshot list.
 	 */
 	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
 	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
 	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
 	MNT_ILOCK(mp);
 loop:
 	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
 		/*
 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
 		 * Start over if it has (it won't be on the list anymore).
 		 */
 		if (xvp->v_mount != mp)
 			goto loop;
 		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
 		VI_LOCK(xvp);
 		MNT_IUNLOCK(mp);
 		if ((xvp->v_iflag & VI_XLOCK) ||
 		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
 		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
 			VI_UNLOCK(xvp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
 			MNT_ILOCK(mp);
 			goto loop;
 		}
 		if (snapdebug)
 			vprint("ffs_snapshot: busy vnode", xvp);
 		if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
 		    vat.va_nlink > 0) {
 			VOP_UNLOCK(xvp, 0, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		xp = VTOI(xvp);
 		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
 			VOP_UNLOCK(xvp, 0, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * If there is a fragment, clear it here.
 		 */
 		blkno = 0;
 		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
 		if (loc < NDADDR) {
 			len = fragroundup(fs, blkoff(fs, xp->i_size));
 			if (len < fs->fs_bsize) {
 				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
 				    len, xp->i_number);
 				blkno = DIP(xp, i_db[loc]);
 				DIP(xp, i_db[loc]) = 0;
 			}
 		}
 		snaplistsize += 1;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 			    BLK_NOCOPY);
 		else
 			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
 			    BLK_NOCOPY);
 		if (blkno)
 			DIP(xp, i_db[loc]) = blkno;
 		if (!error)
 			error = ffs_freefile(copy_fs, vp, xp->i_number,
 			    xp->i_mode);
 		VOP_UNLOCK(xvp, 0, td);
 		if (error) {
 			free(copy_fs->fs_csp, M_UFSMNT);
 			bawrite(sbp);
 			sbp = NULL;
 			goto out1;
 		}
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	/*
 	 * If there already exist snapshots on this filesystem, grab a
 	 * reference to their shared lock. If this is the first snapshot
 	 * on this filesystem, we need to allocate a lock for the snapshots
 	 * to share. In either case, acquire the snapshot lock and give
 	 * up our original private lock.
 	 */
 	VI_LOCK(devvp);
 	snaphead = &devvp->v_rdev->si_snapshots;
 	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
 		struct lock *lkp;
 
 		lkp = ITOV(xp)->v_vnlock;
 		VI_UNLOCK(devvp);
 		VI_LOCK(vp);
 		vp->v_vnlock = lkp;
 	} else {
 		struct lock *lkp;
 
 		VI_UNLOCK(devvp);
 		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
 		    M_WAITOK);
 		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
 		    LK_CANRECURSE | LK_NOPAUSE);
 		VI_LOCK(vp);
 		vp->v_vnlock = lkp;
 	}
 	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
 	transferlockers(&vp->v_lock, vp->v_vnlock);
 	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 	/*
 	 * If this is the first snapshot on this filesystem, then we need
 	 * to allocate the space for the list of preallocated snapshot blocks.
 	 * This list will be refined below, but this preliminary one will
 	 * keep us out of deadlock until the full one is ready.
 	 */
 	if (xp == NULL) {
 		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
 		    M_UFSMNT, M_WAITOK);
 		blkp = &snapblklist[1];
 		*blkp++ = lblkno(fs, fs->fs_sblockloc);
 		blkno = fragstoblks(fs, fs->fs_csaddr);
 		for (cg = 0; cg < fs->fs_ncg; cg++) {
 			if (fragstoblks(fs, cgtod(fs, cg) > blkno))
 				break;
 			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
 		}
 		len = howmany(fs->fs_cssize, fs->fs_bsize);
 		for (loc = 0; loc < len; loc++)
 			*blkp++ = blkno + loc;
 		for (; cg < fs->fs_ncg; cg++)
 			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
 		snapblklist[0] = blkp - snapblklist;
 		VI_LOCK(devvp);
 		if (devvp->v_rdev->si_snapblklist != NULL)
 			panic("ffs_snapshot: non-empty list");
 		devvp->v_rdev->si_snapblklist = snapblklist;
 		devvp->v_rdev->si_snaplistsize = blkp - snapblklist;
 		VI_UNLOCK(devvp);
 	}
 	/*
 	 * Record snapshot inode. Since this is the newest snapshot,
 	 * it must be placed at the end of the list.
 	 */
 	VI_LOCK(devvp);
 	fs->fs_snapinum[snaploc] = ip->i_number;
 	if (ip->i_nextsnap.tqe_prev != 0)
 		panic("ffs_snapshot: %d already on list", ip->i_number);
 	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
 	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
 	vp->v_vflag |= VV_SYSTEM;
 out1:
 	/*
 	 * Resume operation on filesystem.
 	 */
 	vfs_write_resume(vp->v_mount);
 	vn_start_write(NULL, &wrtmp, V_WAIT);
 	if (collectsnapstats && starttime.tv_sec > 0) {
 		nanotime(&endtime);
 		timespecsub(&endtime, &starttime);
 		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
 		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
 		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
 	}
 	if (sbp == NULL)
 		goto out;
 	/*
 	 * Copy allocation information from all the snapshots in
 	 * this snapshot and then expunge them from its view.
 	 */
 	snaphead = &devvp->v_rdev->si_snapshots;
 	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
 		if (xp == ip)
 			break;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
 			    BLK_SNAP);
 		else
 			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
 			    BLK_SNAP);
 		if (error) {
 			fs->fs_snapinum[snaploc] = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Allocate space for the full list of preallocated snapshot blocks.
 	 */
 	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	ip->i_snapblklist = &snapblklist[1];
 	/*
 	 * Expunge the blocks used by the snapshots from the set of
 	 * blocks marked as used in the snapshot bitmaps. Also, collect
 	 * the list of allocated blocks in i_snapblklist.
 	 */
 	if (ip->i_ump->um_fstype == UFS1)
 		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
 	else
 		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
 	if (error) {
 		fs->fs_snapinum[snaploc] = 0;
 		FREE(snapblklist, M_UFSMNT);
 		goto done;
 	}
 	if (snaplistsize < ip->i_snapblklist - snapblklist)
 		panic("ffs_snapshot: list too small");
 	snaplistsize = ip->i_snapblklist - snapblklist;
 	snapblklist[0] = snaplistsize;
 	ip->i_snapblklist = 0;
 	/*
 	 * Write out the list of allocated blocks to the end of the snapshot.
 	 */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)snapblklist;
 	aiov.iov_len = snaplistsize * sizeof(daddr_t);
 	auio.uio_resid = aiov.iov_len;;
 	auio.uio_offset = ip->i_size;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		fs->fs_snapinum[snaploc] = 0;
 		FREE(snapblklist, M_UFSMNT);
 		goto done;
 	}
 	/*
 	 * Write the superblock and its summary information
 	 * to the snapshot.
 	 */
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	space = copy_fs->fs_csp;
 	for (loc = 0; loc < len; loc++) {
 		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
 		if (error) {
 			brelse(nbp);
 			fs->fs_snapinum[snaploc] = 0;
 			FREE(snapblklist, M_UFSMNT);
 			goto done;
 		}
 		bcopy(space, nbp->b_data, fs->fs_bsize);
 		space = (char *)space + fs->fs_bsize;
 		bawrite(nbp);
 	}
 	/*
 	 * As this is the newest list, it is the most inclusive, so
 	 * should replace the previous list.
 	 */
 	VI_LOCK(devvp);
 	space = devvp->v_rdev->si_snapblklist;
 	devvp->v_rdev->si_snapblklist = snapblklist;
 	devvp->v_rdev->si_snaplistsize = snaplistsize;
 	VI_UNLOCK(devvp);
 	if (space != NULL)
 		FREE(space, M_UFSMNT);
 done:
 	free(copy_fs->fs_csp, M_UFSMNT);
 	bawrite(sbp);
 out:
 	if (saved_nice > 0) {
 		PROC_LOCK(td->td_proc);
 		mtx_lock_spin(&sched_lock);
-		sched_nice(td->td_ksegrp, saved_nice);
+		sched_nice(td->td_proc, saved_nice);
 		mtx_unlock_spin(&sched_lock);
 		PROC_UNLOCK(td->td_proc);
 	}
 	if (fs->fs_active != 0) {
 		FREE(fs->fs_active, M_DEVBUF);
 		fs->fs_active = 0;
 	}
 	mp->mnt_flag = flag;
 	if (error)
 		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
 	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 	if (error)
 		vput(vp);
 	else
 		VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(wrtmp);
 	return (error);
 }
 
 /*
  * Copy a cylinder group map. All the unallocated blocks are marked
  * BLK_NOCOPY so that the snapshot knows that it need not copy them
  * if they are later written. If passno is one, then this is a first
  * pass, so only setting needs to be done. If passno is 2, then this
  * is a revision to a previous pass which must be undone as the
  * replacement pass is done.
  *
  * Arguments:
  *	cg	- index of the cylinder group to account for
  *	vp	- vnode of the snapshot file being built
  *	nbp	- buffer that receives the copy of the cylinder group block
  *	passno	- 1 for the initial pass, 2 for a revising pass
  *
  * Returns 0 on success, EIO if the cylinder group fails its magic
  * number check, or the error reported by bread() or UFS_BALLOC().
  */
 static int
 cgaccount(cg, vp, nbp, passno)
 	int cg;
 	struct vnode *vp;
 	struct buf *nbp;
 	int passno;
 {
 	struct buf *bp, *ibp;
 	struct inode *ip;
 	struct cg *cgp;
 	struct fs *fs;
 	ufs2_daddr_t base, numblks;
 	int error, len, loc, indiroff;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	/* Read the cylinder group block from the underlying device. */
 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		(int)fs->fs_cgsize, KERNCRED, &bp);
 	if (error) {
 		/* The buffer is released here even though the read failed. */
 		brelse(bp);
 		return (error);
 	}
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp)) {
 		brelse(bp);
 		return (EIO);
 	}
 	/*
 	 * Set this group's bit (ACTIVECGOFF) in its ACTIVECGNUM word.
 	 * NOTE(review): presumably records the group as accounted for in
 	 * the fs_active map -- confirm against the macro definitions.
 	 */
 	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
 	/*
 	 * Copy the group into the caller's buffer and zero whatever part
 	 * of the filesystem block lies beyond fs_cgsize.
 	 */
 	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
 	if (fs->fs_cgsize < fs->fs_bsize)
 		bzero(&nbp->b_data[fs->fs_cgsize],
 		    fs->fs_bsize - fs->fs_cgsize);
 	if (passno == 2)
 		nbp->b_flags |= B_VALIDSUSPWRT;
 	/*
 	 * numblks: filesystem size in blocks (rounded up).
 	 * len:     blocks per cylinder group (rounded up).
 	 * base:    block number of the first block in this group.
 	 */
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	len = howmany(fs->fs_fpg, fs->fs_frag);
 	base = cg * fs->fs_fpg / fs->fs_frag;
 	/* Clip len for a group that reaches the end of the filesystem. */
 	if (base + len >= numblks)
 		len = numblks - base - 1;
 	loc = 0;
 	/*
 	 * A group that starts within the first NDADDR blocks has its
 	 * leading entries recorded in the inode's direct pointers.
 	 */
 	if (base < NDADDR) {
 		for ( ; loc < NDADDR; loc++) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				DIP(ip, i_db[loc]) = BLK_NOCOPY;
 			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				/* Pass 2: no longer free, undo the earlier mark. */
 				DIP(ip, i_db[loc]) = 0;
 			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				panic("ffs_snapshot: lost direct block");
 		}
 	}
 	/*
 	 * Obtain the block covering logical block base + loc; its data is
 	 * indexed below as an array of block pointers.
 	 */
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
 	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 	/* Slot within that pointer block for logical block base + loc. */
 	indiroff = (base + loc - NDADDR) % NINDIR(fs);
 	for ( ; loc < len; loc++, indiroff++) {
 		if (indiroff >= NINDIR(fs)) {
 			/*
 			 * Ran off the end of this pointer block: write it
 			 * out and move on to the next one.
 			 */
 			if (passno == 2)
 				ibp->b_flags |= B_VALIDSUSPWRT;
 			bawrite(ibp);
 			error = UFS_BALLOC(vp,
 			    lblktosize(fs, (off_t)(base + loc)),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error) {
 				brelse(bp);
 				return (error);
 			}
 			indiroff = 0;
 		}
 		/* UFS1 pointer blocks hold ufs1_daddr_t entries. */
 		if (ip->i_ump->um_fstype == UFS1) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
 			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				panic("ffs_snapshot: lost indirect block");
 			continue;
 		}
 		/* Otherwise the entries are ufs2_daddr_t. */
 		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
 		else if (passno == 2 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
 		else if (passno == 1 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			panic("ffs_snapshot: lost indirect block");
 	}
 	/* Done with the on-disk group; queue the last pointer block. */
 	bqrelse(bp);
 	if (passno == 2)
 		ibp->b_flags |= B_VALIDSUSPWRT;
 	bdwrite(ibp);
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  */
 static int
 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs1_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < NDADDR) {
 		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
 	} else {
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - NDADDR) % NINDIR(fs);
 		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	if (blkno != 0) {
 		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 			return (error);
 	} else {
 		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &bp);
 		if (error)
 			return (error);
 		if ((error = readblock(bp, lbn)) != 0)
 			return (error);
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * to be completely unallocated.
 	 */
 	dip = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (expungetype == BLK_NOCOPY)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
 	bdwrite(bp);
 	/*
 	 * Now go through and expunge all the blocks in the file
 	 * using the function requested.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
 	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
 		return (error);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
 	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
 		return (error);
 	blksperindir = 1;
 	lbn = -NDADDR;
 	len = numblks - NDADDR;
 	rlbn = NDADDR;
 	for (i = 0; len > 0 && i < NIADDR; i++) {
 		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  * Arguments:
  *	snapvp		- snapshot vnode handed through to acctfunc
  *	cancelvp	- vnode whose indirect blocks are being walked
  *	level		- depth below this block; 0 means its entries
  *			  are data block pointers
  *	blkno		- disk address of this indirect block (0 if absent)
  *	lbn		- logical number of this indirect block in cancelvp
  *	rlbn		- logical number of the first data block it maps
  *	remblks		- data blocks remaining to be accounted for
  *	blksperindir	- data blocks mapped by each entry of this block
  *	fs		- filesystem superblock used for the geometry
  *	acctfunc	- accounting routine applied to the pointer array
  *	expungetype	- BLK_SNAP or BLK_NOCOPY
  *
  * Returns 0 on success or the first error encountered. Panics if the
  * indirect block is missing (unless expungetype is BLK_NOCOPY) or if
  * the parameters disagree with ufs_getlbns().
  */
 static int
 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs1_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[NIADDR + 2];
 	ufs1_daddr_t last, *bap;
 	struct buf *bp;
 
 	if (blkno == 0) {
 		if (expungetype == BLK_NOCOPY)
 			return (0);
 		panic("indiracct_ufs1: missing indir");
 	}
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	/*
 	 * Validate num first so that indirs[] is never indexed with a
 	 * count too small to make the subscript meaningful.
 	 */
 	if (num < 2 || lbn != indirs[num - 1 - level].in_lbn)
 		panic("indiracct_ufs1: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	/*
 	 * Work from a private copy of the pointers so that the buffer
 	 * can be released before acctfunc and the recursion run.
 	 */
 	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	/* Only level 0 entries carry a real logical block number. */
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 	    level == 0 ? rlbn : -1, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	FREE(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Run snapshot accounting and then, if that succeeded, map accounting
  * over the same run of UFS1 block pointers. Returns the error from
  * whichever step failed first, or the result of the map accounting.
  */
 static int
 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int rc;
 
 	rc = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	if (rc == 0)
 		rc = mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	return (rc);
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs1_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < NDADDR) {
 			blkp = &ip->i_din1->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs1_daddr_t *)(ibp->b_data))
 			    [(lbn - NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct_ufs1: bad block");
 			*blkp = expungetype;
 			if (lbn >= NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Account for a set of blocks allocated in a snapshot inode.
  *
  * Every entry in [oldblkp, lastblkp) other than 0 and BLK_NOCOPY is
  * passed to ffs_blkfree(). A BLK_SNAP entry is first replaced by the
  * address derived from its logical block number. When lblkno is not
  * -1 and expungetype is BLK_SNAP, the logical numbers of the remaining
  * entries are appended to the snapshot's i_snapblklist. Always
  * returns 0.
  */
 static int
 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip = VTOI(vp);
 	ino_t snapino = snapip->i_number;
 	ufs1_daddr_t *cur, daddr;
 	int record = (lblkno != -1);
 
 	for (cur = oldblkp; cur < lastblkp; cur++, lblkno++) {
 		daddr = *cur;
 		if (daddr != 0 && daddr != BLK_NOCOPY) {
 			if (daddr == BLK_SNAP)
 				daddr = blkstofrags(fs, lblkno);
 			else if (record && expungetype == BLK_SNAP)
 				*snapip->i_snapblklist++ = lblkno;
 			ffs_blkfree(fs, vp, daddr, fs->fs_bsize, snapino);
 		}
 	}
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  */
 static int
 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs2_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < NDADDR) {
 		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
 	} else {
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - NDADDR) % NINDIR(fs);
 		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	if (blkno != 0) {
 		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 			return (error);
 	} else {
 		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &bp);
 		if (error)
 			return (error);
 		if ((error = readblock(bp, lbn)) != 0)
 			return (error);
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * to be completely unallocated.
 	 */
 	dip = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (expungetype == BLK_NOCOPY)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
 	bdwrite(bp);
 	/*
 	 * Now go through and expunge all the blocks in the file
 	 * using the function requested.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
 	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
 		return (error);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
 	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
 		return (error);
 	blksperindir = 1;
 	lbn = -NDADDR;
 	len = numblks - NDADDR;
 	rlbn = NDADDR;
 	for (i = 0; len > 0 && i < NIADDR; i++) {
 		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  * Arguments:
  *	snapvp		- snapshot vnode handed through to acctfunc
  *	cancelvp	- vnode whose indirect blocks are being walked
  *	level		- depth below this block; 0 means its entries
  *			  are data block pointers
  *	blkno		- disk address of this indirect block (0 if absent)
  *	lbn		- logical number of this indirect block in cancelvp
  *	rlbn		- logical number of the first data block it maps
  *	remblks		- data blocks remaining to be accounted for
  *	blksperindir	- data blocks mapped by each entry of this block
  *	fs		- filesystem superblock used for the geometry
  *	acctfunc	- accounting routine applied to the pointer array
  *	expungetype	- BLK_SNAP or BLK_NOCOPY
  *
  * Returns 0 on success or the first error encountered. Panics if the
  * indirect block is missing (unless expungetype is BLK_NOCOPY) or if
  * the parameters disagree with ufs_getlbns().
  */
 static int
 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs2_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[NIADDR + 2];
 	ufs2_daddr_t last, *bap;
 	struct buf *bp;
 
 	if (blkno == 0) {
 		if (expungetype == BLK_NOCOPY)
 			return (0);
 		panic("indiracct_ufs2: missing indir");
 	}
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	/*
 	 * Validate num first so that indirs[] is never indexed with a
 	 * count too small to make the subscript meaningful.
 	 */
 	if (num < 2 || lbn != indirs[num - 1 - level].in_lbn)
 		panic("indiracct_ufs2: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	/*
 	 * Work from a private copy of the pointers so that the buffer
 	 * can be released before acctfunc and the recursion run.
 	 */
 	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	/* Only level 0 entries carry a real logical block number. */
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 	    level == 0 ? rlbn : -1, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	FREE(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Run snapshot accounting and then, if that succeeded, map accounting
  * over the same run of UFS2 block pointers. Returns the error from
  * whichever step failed first, or the result of the map accounting.
  */
 static int
 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int rc;
 
 	rc = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	if (rc == 0)
 		rc = mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype);
 	return (rc);
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs2_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < NDADDR) {
 			blkp = &ip->i_din2->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs2_daddr_t *)(ibp->b_data))
 			    [(lbn - NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct_ufs2: bad block");
 			*blkp = expungetype;
 			if (lbn >= NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Account for a set of blocks allocated in a snapshot inode.
  *
  * Every entry in [oldblkp, lastblkp) other than 0 and BLK_NOCOPY is
  * passed to ffs_blkfree(). A BLK_SNAP entry is first replaced by the
  * address derived from its logical block number. When lblkno is not
  * -1 and expungetype is BLK_SNAP, the logical numbers of the remaining
  * entries are appended to the snapshot's i_snapblklist. Always
  * returns 0.
  */
 static int
 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip = VTOI(vp);
 	ino_t snapino = snapip->i_number;
 	ufs2_daddr_t *cur, daddr;
 	int record = (lblkno != -1);
 
 	for (cur = oldblkp; cur < lastblkp; cur++, lblkno++) {
 		daddr = *cur;
 		if (daddr != 0 && daddr != BLK_NOCOPY) {
 			if (daddr == BLK_SNAP)
 				daddr = blkstofrags(fs, lblkno);
 			else if (record && expungetype == BLK_SNAP)
 				*snapip->i_snapblklist++ = lblkno;
 			ffs_blkfree(fs, vp, daddr, fs->fs_bsize, snapino);
 		}
 	}
 	return (0);
 }
 
 /*
  * Decrement extra reference on snapshot when last name is removed.
  * It will not be freed until the last open reference goes away.
  */
 void
 ffs_snapgone(ip)
 	struct inode *ip;
 {
 	struct inode *xp;
 	struct fs *fs;
 	int snaploc;
 
 	/*
 	 * Find snapshot in incore list.
 	 */
 	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
 		if (xp == ip)
 			break;
 	if (xp != NULL)
 		vrele(ITOV(ip));
 	else if (snapdebug)
 		printf("ffs_snapgone: lost snapshot vnode %d\n",
 		    ip->i_number);
 	/*
 	 * Delete snapshot inode from superblock. Keep list dense.
 	 */
 	fs = ip->i_fs;
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == ip->i_number)
 			break;
 	if (snaploc < FSMAXSNAP) {
 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
 			if (fs->fs_snapinum[snaploc] == 0)
 				break;
 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
 		}
 		fs->fs_snapinum[snaploc - 1] = 0;
 	}
 }
 
 /*
  * Prepare a snapshot file for being removed.
  *
  * The snapshot is taken off the device's incore snapshot list (if it is
  * on it), copy-on-write state on the device is torn down when this was
  * the last snapshot, every BLK_NOCOPY/BLK_SNAP marker in the snapshot's
  * block map is cleared, blocks the snapshot had claimed are offered to
  * ffs_snapblkfree(), and finally SF_SNAPSHOT is cleared on the inode.
  *
  * Arguments:
  *	vp	- vnode of the snapshot file being removed
  */
 void
 ffs_snapremove(vp)
 	struct vnode *vp;
 {
 	struct inode *ip;
 	struct vnode *devvp;
 	struct lock *lkp;
 	struct buf *ibp;
 	struct fs *fs;
 	struct thread *td = curthread;
 	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
 	int error, loc, last;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	devvp = ip->i_devvp;
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
 	 * would not have been active).
 	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
 	if (ip->i_nextsnap.tqe_prev != 0) {
 		/*
 		 * Take the vnode's own embedded lock, passing the device
 		 * interlock to lockmgr (LK_INTERLOCK), then take the
 		 * interlock again before touching the snapshot list.
 		 */
 		VI_LOCK(devvp);
 		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
 		    VI_MTX(devvp), td);
 		VI_LOCK(devvp);
 		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
 		/*
 		 * Point the vnode back at its embedded lock and release
 		 * the lock it had been using, remembered in lkp.
 		 * NOTE(review): lkp is presumably the lock shared by all
 		 * snapshots on this device -- confirm in ffs_snapshot().
 		 */
 		lkp = vp->v_vnlock;
 		vp->v_vnlock = &vp->v_lock;
 		lockmgr(lkp, LK_RELEASE, NULL, td);
 		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
 			/* Other snapshots remain; leave device state alone. */
 			VI_UNLOCK(devvp);
 		} else {
 			/*
 			 * That was the last snapshot: detach the block
 			 * list and copy-on-write hook from the device,
 			 * drain and destroy the old lock, and free both.
 			 */
 			snapblklist = devvp->v_rdev->si_snapblklist;
 			devvp->v_rdev->si_snapblklist = 0;
 			devvp->v_rdev->si_snaplistsize = 0;
 			devvp->v_rdev->si_copyonwrite = 0;
 			devvp->v_vflag &= ~VV_COPYONWRITE;
 			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
 			lockmgr(lkp, LK_RELEASE, NULL, td);
 			lockdestroy(lkp);
 			FREE(lkp, M_UFSMNT);
 			FREE(snapblklist, M_UFSMNT);
 		}
 	}
 	/*
 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
 	 * snapshots that want them (see ffs_snapblkfree below).
 	 */
 	/* Direct pointers; note that the scan starts at index 1. */
 	for (blkno = 1; blkno < NDADDR; blkno++) {
 		dblk = DIP(ip, i_db[blkno]);
 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 			DIP(ip, i_db[blkno]) = 0;
 		else if ((dblk == blkstofrags(fs, blkno) &&
 		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
 		     ip->i_number))) {
 			/*
 			 * A claimed block (address equals its logical
 			 * position) was taken over by another snapshot.
 			 */
 			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
 			DIP(ip, i_db[blkno]) = 0;
 		}
 	}
 	/* Pointer blocks, one per NINDIR(fs) logical blocks. */
 	numblks = howmany(ip->i_size, fs->fs_bsize);
 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 		/* Best effort: skip a pointer block that cannot be had. */
 		if (error)
 			continue;
 		if (fs->fs_size - blkno > NINDIR(fs))
 			last = NINDIR(fs);
 		else
 			last = fs->fs_size - blkno;
 		/*
 		 * NOTE(review): the claimed-block test below compares
 		 * against blkstofrags(fs, blkno), i.e. the first logical
 		 * block of this pointer block, not blkno + loc -- confirm
 		 * that this is intended.
 		 */
 		for (loc = 0; loc < last; loc++) {
 			if (ip->i_ump->um_fstype == UFS1) {
 				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
 				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				else if ((dblk == blkstofrags(fs, blkno) &&
 				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
 				     fs->fs_bsize, ip->i_number))) {
 					ip->i_din1->di_blocks -=
 					    btodb(fs->fs_bsize);
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				}
 				continue;
 			}
 			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			else if ((dblk == blkstofrags(fs, blkno) &&
 			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
 			     fs->fs_bsize, ip->i_number))) {
 				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			}
 		}
 		bawrite(ibp);
 	}
 	/*
 	 * Clear snapshot flag and drop reference.
 	 */
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP(ip, i_flags) = ip->i_flags;
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 }
 
 /*
  * Notification that a block is being freed. Return zero if the free
  * should be allowed to proceed. Return non-zero if the snapshot file
  * wants to claim the block. The block will be claimed if it is an
  * uncopied part of one of the snapshots. It will be freed if it is
  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
  * If a fragment is being freed, then all snapshots that care about
  * it must make a copy since a snapshot file can only claim full sized
  * blocks. Note that if more than one snapshot file maps the block,
  * we can pick one at random to claim it. Since none of the snapshots
  * can change, we are assured that they will all see the same unmodified
  * image. When deleting a snapshot file (see ffs_snapremove above), we
  * must push any of these claimed blocks to one of the other snapshots
  * that maps it. These claimed blocks are easily identified as they will
  * have a block number equal to their logical block number within the
  * snapshot. A copied block can never have this property because they
  * must always have been allocated from a BLK_NOCOPY location.
  *
  * Arguments:
  *	fs	- filesystem superblock
  *	devvp	- device vnode whose snapshot list is consulted
  *	bno	- address of the block or fragment being freed
  *	size	- number of bytes being freed
  *	inum	- inode number the block belonged to (used in debug output)
  *
  * Returns 1 when a snapshot claimed a full-sized block, otherwise the
  * last error recorded (0 when there was none).
  */
 int
 ffs_snapblkfree(fs, devvp, bno, size, inum)
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 {
 	struct buf *ibp, *cbp, *savedcbp = 0;
 	struct thread *td = curthread;
 	struct inode *ip;
 	struct vnode *vp = NULL;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t blkno;
 	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
 	struct snaphead *snaphead;
 
 	lbn = fragstoblks(fs, bno);
 	/*
 	 * The scan restarts from here whenever a lock attempt below
 	 * fails or had to sleep.
 	 */
 retry:
 	VI_LOCK(devvp);
 	snaphead = &devvp->v_rdev->si_snapshots;
 	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * Lookup block being written.
 		 */
 		if (lbn < NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			/*
 			 * The lock is needed before UFS_BALLOC can be
 			 * called; snapshot_locked records that it is held
 			 * so that it is only taken once.
 			 */
 			if (snapshot_locked == 0 &&
 			    lockmgr(vp->v_vnlock,
 			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 			      VI_MTX(devvp), td) != 0)
 				goto retry;
 			snapshot_locked = 1;
 			td->td_pflags |= TDP_COWINPROGRESS;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_pflags &= ~TDP_COWINPROGRESS;
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
 			if (ip->i_ump->um_fstype == UFS1)
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 		}
 		/*
 		 * Check to see if block needs to be copied.
 		 */
 		if (blkno == 0) {
 			/*
 			 * A block that we map is being freed. If it has not
 			 * been claimed yet, we will claim or copy it (below).
 			 */
 			claimedblk = 1;
 		} else if (blkno == BLK_SNAP) {
 			/*
 			 * No previous snapshot claimed the block,
 			 * so it will be freed and become a BLK_NOCOPY
 			 * (don't care) for us.
 			 */
 			if (claimedblk)
 				panic("snapblkfree: inconsistent block type");
 			/*
 			 * Try for the lock without sleeping; if that
 			 * fails, drop the pointer block, wait for the
 			 * lock, and start the scan over.
 			 */
 			if (snapshot_locked == 0 &&
 			    lockmgr(vp->v_vnlock,
 			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
 			      VI_MTX(devvp), td) != 0) {
 				if (lbn >= NDADDR)
 					bqrelse(ibp);
 				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
 				goto retry;
 			}
 			snapshot_locked = 1;
 			if (lbn < NDADDR) {
 				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			} else if (ip->i_ump->um_fstype == UFS1) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			}
 			continue;
 		} else /* BLK_NOCOPY or default */ {
 			/*
 			 * If the snapshot has already copied the block
 			 * (default), or does not care about the block,
 			 * it is not needed.
 			 */
 			if (lbn >= NDADDR)
 				bqrelse(ibp);
 			continue;
 		}
 		/*
 		 * If this is a full size block, we will just grab it
 		 * and assign it to the snapshot inode. Otherwise we
 		 * will proceed to copy it. See explanation for this
 		 * routine as to why only a single snapshot needs to
 		 * claim this block.
 		 */
 		if (snapshot_locked == 0 &&
 		    lockmgr(vp->v_vnlock,
 		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
 		      VI_MTX(devvp), td) != 0) {
 			if (lbn >= NDADDR)
 				bqrelse(ibp);
 			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
 			goto retry;
 		}
 		snapshot_locked = 1;
 		if (size == fs->fs_bsize) {
 #ifdef DEBUG
 			if (snapdebug)
 				printf("%s %d lbn %jd from inum %d\n",
 				    "Grabonremove: snapino", ip->i_number,
 				    (intmax_t)lbn, inum);
 #endif
 			/* Record bno as this snapshot's block for lbn. */
 			if (lbn < NDADDR) {
 				DIP(ip, i_db[lbn]) = bno;
 			} else if (ip->i_ump->um_fstype == UFS1) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			}
 			DIP(ip, i_blocks) += btodb(size);
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			VOP_UNLOCK(vp, 0, td);
 			/* Claimed: tell the caller not to free the block. */
 			return (1);
 		}
 		if (lbn >= NDADDR)
 			bqrelse(ibp);
 		/*
 		 * Allocate the block into which to do the copy. Note that this
 		 * allocation will never require any additional allocations for
 		 * the snapshot inode.
 		 */
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug)
 			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
 			    "Copyonremove: snapino ", ip->i_number,
 			    (intmax_t)lbn, "for inum", inum, size,
 			    (intmax_t)cbp->b_blkno);
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
 		if ((error = readblock(cbp, lbn)) != 0) {
 			/* Read failed: write out a zeroed block and stop. */
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			break;
 		}
 		/* Keep the first copy around as the source for later ones. */
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if (dopersistence && VTOI(vp)->i_effnlink > 0)
 			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 	}
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
 	 * not be freed. Although space will be lost, the snapshot
 	 * will stay consistent.
 	 */
 	/*
 	 * Drop whichever lock is still held: the snapshot lock if it was
 	 * taken, otherwise the device interlock from the top of the scan.
 	 */
 	if (snapshot_locked)
 		VOP_UNLOCK(vp, 0, td);
 	else
 		VI_UNLOCK(devvp);
 	return (error);
 }
 
 /*
  * Associate snapshot files when mounting.
  *
  * For every inode number recorded in fs->fs_snapinum[] the vnode is
  * fetched; entries that are not snapshots, or that are reported as old
  * format snapshots, are removed from the superblock list.  Each
  * remaining snapshot vnode is switched over to the lock shared by the
  * snapshots of this device and its inode is linked onto the device's
  * si_snapshots list.  Afterwards the block hints list is read from the
  * snapshot vnode left in vp and copy-on-write handling is enabled on
  * the device vnode.
  *
  * mp: the mount point whose snapshots are being attached.
  */
 void
 ffs_snapshot_mount(mp)
 	struct mount *mp;
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct vnode *devvp = ump->um_devvp;
 	struct fs *fs = ump->um_fs;
 	struct thread *td = curthread;
 	struct snaphead *snaphead;
 	struct vnode *vp;
 	struct inode *ip, *xp;
 	struct uio auio;
 	struct iovec aiov;
 	void *snapblklist;
 	char *reason;
 	daddr_t snaplistsize;
 	int error, snaploc, loc;
 
 	/*
 	 * XXX The following needs to be set before UFS_TRUNCATE or
 	 * VOP_READ can be called.
 	 */
 	mp->mnt_stat.f_iosize = fs->fs_bsize;
 	/*
 	 * Process each snapshot listed in the superblock.
 	 */
 	vp = NULL;
 	snaphead = &devvp->v_rdev->si_snapshots;
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
 		/* A zero inode number terminates the list. */
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
 		    LK_EXCLUSIVE, &vp)) != 0){
 			printf("ffs_snapshot_mount: vget failed %d\n", error);
 			continue;
 		}
 		ip = VTOI(vp);
 		/*
 		 * Reject inodes that do not carry SF_SNAPSHOT, and snapshots
 		 * whose size equals exactly the offset at which the block
 		 * hints list is read further down (reported as "old format").
 		 */
 		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
 			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
 				reason = "non-snapshot";
 			} else {
 				reason = "old format snapshot";
 				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
 				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			}
 			printf("ffs_snapshot_mount: %s inode %d\n",
 			    reason, fs->fs_snapinum[snaploc]);
 			vput(vp);
 			vp = NULL;
 			/*
 			 * Remove the entry from the superblock list by
 			 * shifting the later entries down one slot.
 			 */
 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
 				if (fs->fs_snapinum[loc] == 0)
 					break;
 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
 			}
 			fs->fs_snapinum[loc - 1] = 0;
 			/*
 			 * Step back so the loop increment revisits this
 			 * slot, which now holds the next entry.
 			 */
 			snaploc--;
 			continue;
 		}
 		/*
 		 * If there already exist snapshots on this filesystem, grab a
 		 * reference to their shared lock. If this is the first snapshot
 		 * on this filesystem, we need to allocate a lock for the
 		 * snapshots to share. In either case, acquire the snapshot
 		 * lock and give up our original private lock.
 		 */
 		VI_LOCK(devvp);
 		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
 			struct lock *lkp;
 
 			lkp = ITOV(xp)->v_vnlock;
 			VI_UNLOCK(devvp);
 			VI_LOCK(vp);
 			vp->v_vnlock = lkp;
 		} else {
 			struct lock *lkp;
 
 			VI_UNLOCK(devvp);
 			MALLOC(lkp, struct lock *, sizeof(struct lock),
 			    M_UFSMNT, M_WAITOK);
 			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
 			    LK_CANRECURSE | LK_NOPAUSE);
 			VI_LOCK(vp);
 			vp->v_vnlock = lkp;
 		}
 		/*
 		 * Both branches leave the vnode interlock held; vn_lock()
 		 * is given LK_INTERLOCK and now locks through the new
 		 * v_vnlock.  The private v_lock taken by VFS_VGET() is then
 		 * released.  NOTE(review): transferlockers() is defined
 		 * elsewhere; presumably it moves waiters from the private
 		 * lock to the shared one -- confirm.
 		 */
 		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
 		transferlockers(&vp->v_lock, vp->v_vnlock);
 		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 		/*
 		 * Link it onto the active snapshot list.
 		 */
 		VI_LOCK(devvp);
 		/* A non-zero tqe_prev marks an inode that is already linked. */
 		if (ip->i_nextsnap.tqe_prev != 0)
 			panic("ffs_snapshot_mount: %d already on list",
 			    ip->i_number);
 		else
 			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
 		vp->v_vflag |= VV_SYSTEM;
 		VI_UNLOCK(devvp);
 		VOP_UNLOCK(vp, 0, td);
 	}
 	/*
 	 * No usable snapshots found.
 	 */
 	if (vp == NULL)
 		return;
 	/*
 	 * Allocate the space for the block hints list. We always want to
 	 * use the list from the newest snapshot.
 	 */
 	/* First read: fetch only the list length into snaplistsize. */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)&snaplistsize;
 	aiov.iov_len = sizeof(snaplistsize);
 	auio.uio_resid = aiov.iov_len;
 	auio.uio_offset =
 	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
 		VOP_UNLOCK(vp, 0, td);
 		return;
 	}
 	/*
 	 * NOTE(review): snaplistsize is used exactly as read from the
 	 * snapshot file; no range check is applied before it sizes the
 	 * allocation.
 	 */
 	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = snapblklist;
 	aiov.iov_len = snaplistsize * sizeof (daddr_t);
 	auio.uio_resid = aiov.iov_len;
 	/*
 	 * Step the offset back by the size of the length word (assuming
 	 * the first read advanced uio_offset by the bytes it consumed) so
 	 * that the second read starts at the length word again.
 	 */
 	auio.uio_offset -= sizeof(snaplistsize);
 	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
 		VOP_UNLOCK(vp, 0, td);
 		FREE(snapblklist, M_UFSMNT);
 		return;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	/*
 	 * Publish the hints list on the device and turn on copy-on-write
 	 * processing, all under the device vnode interlock.
 	 */
 	VI_LOCK(devvp);
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
 	devvp->v_rdev->si_snaplistsize = snaplistsize;
 	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
 	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 }
 
 /*
  * Disassociate snapshot files when unmounting.
  *
  * Empties the device's snapshot list, pointing each snapshot vnode back
  * at its own private lock, then frees the block hints list and the
  * lock the snapshots had been sharing, and finally disables
  * copy-on-write processing on the device vnode.
  *
  * mp: the mount point being unmounted.
  */
 void
 ffs_snapshot_unmount(mp)
 	struct mount *mp;
 {
 	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
 	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
 	struct lock *lkp = NULL;
 	struct inode *xp;
 	struct vnode *vp;
 
 	VI_LOCK(devvp);
 	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
 		vp = ITOV(xp);
 		/*
 		 * Remember the lock this vnode was using so it can be
 		 * destroyed after the loop, and restore the private lock.
 		 */
 		lkp = vp->v_vnlock;
 		vp->v_vnlock = &vp->v_lock;
 		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
 		/*
 		 * Mark the inode as off-list; ffs_snapshot_mount() tests
 		 * tqe_prev before inserting.
 		 */
 		xp->i_nextsnap.tqe_prev = 0;
 		/*
 		 * Snapshots that are still linked have a vnode reference
 		 * dropped here; the device interlock is released around
 		 * the vrele() call and retaken afterwards.
 		 */
 		if (xp->i_effnlink > 0) {
 			VI_UNLOCK(devvp);
 			vrele(vp);
 			VI_LOCK(devvp);
 		}
 	}
 	/* Release the block hints list installed at mount time. */
 	if (devvp->v_rdev->si_snapblklist != NULL) {
 		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
 		devvp->v_rdev->si_snapblklist = NULL;
 		devvp->v_rdev->si_snaplistsize = 0;
 	}
 	/* lkp is non-NULL only if at least one snapshot was on the list. */
 	if (lkp != NULL) {
 		lockdestroy(lkp);
 		FREE(lkp, M_UFSMNT);
 	}
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
 	devvp->v_rdev->si_copyonwrite = 0;
 	devvp->v_vflag &= ~VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 }
 
 /*
  * Check for need to copy block that is about to be written,
  * copying the block if necessary.
  *
  * devvp: the device vnode whose snapshot list is consulted.
  * bp:    the buffer that is about to be written.
  *
  * Returns 0 when no copy was needed or every copy succeeded, otherwise
  * the error from UFS_BALLOC() or readblock() that stopped the scan.
  * Panics if the calling thread already has TDP_COWINPROGRESS set.
  */
 static int
 ffs_copyonwrite(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	struct snaphead *snaphead;
 	struct buf *ibp, *cbp, *savedcbp = 0;
 	struct thread *td = curthread;
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp = 0;
 	ufs2_daddr_t lbn, blkno, *snapblklist;
 	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;
 
 	if (td->td_pflags & TDP_COWINPROGRESS)
 		panic("ffs_copyonwrite: recursive call");
 	/*
 	 * First check to see if it is in the preallocated list.
 	 * By doing this check we avoid several potential deadlocks.
 	 */
 	VI_LOCK(devvp);
 	snaphead = &devvp->v_rdev->si_snapshots;
 	/*
 	 * NOTE(review): the first list entry is dereferenced without a
 	 * NULL check, so the snapshot list is assumed to be non-empty.
 	 */
 	ip = TAILQ_FIRST(snaphead);
 	fs = ip->i_fs;
 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	snapblklist = devvp->v_rdev->si_snapblklist;
 	/*
 	 * Binary search of the hints list for lbn.  The search starts at
 	 * index 1, skipping slot 0.  NOTE(review): presumably the list is
 	 * sorted ascending and slot 0 holds the list length -- confirm.
 	 */
 	upper = devvp->v_rdev->si_snaplistsize - 1;
 	lower = 1;
 	while (lower <= upper) {
 		mid = (lower + upper) / 2;
 		if (snapblklist[mid] == lbn)
 			break;
 		if (snapblklist[mid] < lbn)
 			lower = mid + 1;
 		else
 			upper = mid - 1;
 	}
 	/*
 	 * The bounds only remain ordered when the loop was left through
 	 * the break, i.e. lbn was found: nothing needs to be copied.
 	 */
 	if (lower <= upper) {
 		VI_UNLOCK(devvp);
 		return (0);
 	}
 	/*
 	 * Not in the precomputed list, so check the snapshots.
 	 */
 retry:
 	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * We ensure that everything of our own that needs to be
 		 * copied will be done at the time that ffs_snapshot is
 		 * called. Thus we can skip the check here which can
 		 * deadlock in doing the lookup in UFS_BALLOC.
 		 */
 		if (bp->b_vp == vp)
 			continue;
 		/*
 		 * Check to see if block needs to be copied. We do not have
 		 * to hold the snapshot lock while doing this lookup as it
 		 * will never require any additional allocations for the
 		 * snapshot inode.
 		 */
 		if (lbn < NDADDR) {
 			/* Direct block: the pointer lives in the inode. */
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			/*
 			 * Indirect block: take the snapshot lock first.
 			 * LK_INTERLOCK passes the device interlock to
 			 * lockmgr(); when the request fails the interlock
 			 * is retaken here and the scan restarts.
 			 */
 			if (snapshot_locked == 0 &&
 			    lockmgr(vp->v_vnlock,
 			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 			      VI_MTX(devvp), td) != 0) {
 				VI_LOCK(devvp);
 				goto retry;
 			}
 			snapshot_locked = 1;
 			/*
 			 * TDP_COWINPROGRESS is set around UFS_BALLOC() so
 			 * that re-entering this routine is caught by the
 			 * check at the top.
 			 */
 			td->td_pflags |= TDP_COWINPROGRESS;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_pflags &= ~TDP_COWINPROGRESS;
 			if (error)
 				break;
 			/* Pick the entry width that matches the format. */
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
 			if (ip->i_ump->um_fstype == UFS1)
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 			bqrelse(ibp);
 		}
 #ifdef DIAGNOSTIC
 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
 			panic("ffs_copyonwrite: bad copy block");
 #endif
 		/* A non-zero entry means this snapshot needs no copy. */
 		if (blkno != 0)
 			continue;
 		/*
 		 * Allocate the block into which to do the copy. Since
 		 * multiple processes may all try to copy the same block,
 		 * we have to recheck our need to do a copy if we sleep
 		 * waiting for the lock.
 		 *
 		 * Because all snapshots on a filesystem share a single
 		 * lock, we ensure that we will never be in competition
 		 * with another process to allocate a block.
 		 */
 		if (snapshot_locked == 0 &&
 		    lockmgr(vp->v_vnlock,
 		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 		      VI_MTX(devvp), td) != 0) {
 			VI_LOCK(devvp);
 			goto retry;
 		}
 		snapshot_locked = 1;
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug) {
 			printf("Copyonwrite: snapino %d lbn %jd for ",
 			    ip->i_number, (intmax_t)lbn);
 			if (bp->b_vp == devvp)
 				printf("fs metadata");
 			else
 				printf("inum %d", VTOI(bp->b_vp)->i_number);
 			printf(" lblkno %jd to blkno %jd\n",
 			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
 		}
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 */
 		if ((error = readblock(cbp, lbn)) != 0) {
 			/* Read failed: write a zeroed block and stop. */
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 			break;
 		}
 		/*
 		 * Hold on to the first buffer that was filled so later
 		 * snapshots can copy from it; it is written after the loop.
 		 */
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if (dopersistence && VTOI(vp)->i_effnlink > 0)
 			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
 	}
 	/*
 	 * Release whichever lock is still owned: the snapshot lock if it
 	 * was obtained (the interlock went to lockmgr() at that point),
 	 * otherwise the device interlock taken on entry.
 	 */
 	if (snapshot_locked)
 		VOP_UNLOCK(vp, 0, td);
 	else
 		VI_UNLOCK(devvp);
 	return (error);
 }
 
 /*
  * Read the specified block into the given buffer.
  * Much of this boiler-plate comes from bwrite().
  */
 static int
 readblock(bp, lbn)
 	struct buf *bp;
 	ufs2_daddr_t lbn;
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct thread *td = curthread;
 	struct inode *ip = VTOI(bp->b_vp);
 
 	aiov.iov_base = bp->b_data;
 	aiov.iov_len = bp->b_bcount;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
 	auio.uio_resid = bp->b_bcount;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	return (physio(ip->i_devvp->v_rdev, &auio, 0));
 }
Index: head/sys/vm/vm_glue.c
===================================================================
--- head/sys/vm/vm_glue.c	(revision 130550)
+++ head/sys/vm/vm_glue.c	(revision 130551)
@@ -1,1120 +1,1120 @@
 /*
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
 #include "opt_kstack_max_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/shm.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/unistd.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 #include <sys/user.h>
 
 /*
  * Defined elsewhere in the kernel.  NOTE(review): presumably the
  * sleep-time threshold consulted by the swapper -- confirm.
  */
 extern int maxslp;
 
 /*
  * System initialization
  *
  * Note: proc0 from proc.h
  */
 /* vm_init_limits() is registered to run with &proc0 as its argument. */
 static void vm_init_limits(void *);
 SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)
 
 /*
  * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
  *
  * Note: run scheduling should be divorced from the vm system.
  */
 static void scheduler(void *);
 SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL)
 
 /*
  * Helpers that exist only when swapping support is compiled in
  * (NO_SWAPPING not defined).
  */
 #ifndef NO_SWAPPING
 static void swapout(struct proc *);
 static void vm_proc_swapin(struct proc *p);
 static void vm_proc_swapout(struct proc *p);
 #endif
 
 /*
  * MPSAFE
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  In most cases
  * just checking the vm_map_entry is sufficient within the kernel's address
  * space.
  */
 int
 kernacc(addr, len, rw)
 	void *addr;
 	int len, rw;
 {
 	boolean_t rv;
 	vm_offset_t saddr, eaddr;
 	vm_prot_t prot;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
 	prot = rw;
 	saddr = trunc_page((vm_offset_t)addr);
 	eaddr = round_page((vm_offset_t)addr + len);
 	vm_map_lock_read(kernel_map);
 	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
 	vm_map_unlock_read(kernel_map);
 	return (rv == TRUE);
 }
 
 /*
  * MPSAFE
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  vmapbuf(),
  * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be
  * used in conjuction with this call.
  */
 int
 useracc(addr, len, rw)
 	void *addr;
 	int len, rw;
 {
 	boolean_t rv;
 	vm_prot_t prot;
 	vm_map_t map;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
 	prot = rw;
 	map = &curproc->p_vmspace->vm_map;
 	if ((vm_offset_t)addr + len > vm_map_max(map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
 		return (FALSE);
 	}
 	vm_map_lock_read(map);
 	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
 	    round_page((vm_offset_t)addr + len), prot);
 	vm_map_unlock_read(map);
 	return (rv == TRUE);
 }
 
 /*
  * Wire the pages backing [addr, addr + len) in the current process's
  * address space.
  *
  * Returns 0 on success, EINVAL when the range wraps around, ENOMEM
  * when the request exceeds vm_page_max_wired or the process's
  * RLIMIT_MEMLOCK, and EFAULT when vm_map_wire() fails.
  */
 int
 vslock(void *addr, size_t len)
 {
 	struct proc *p;
 	vm_offset_t base, first, limit, past;
 	vm_size_t count;
 	int over, rv;
 
 	p = curproc;
 	base = (vm_offset_t)addr;
 	past = base + len;
 	first = trunc_page(base);
 	limit = round_page(past);
 
 	/* Either sum wrapping around means the range is bogus. */
 	if (past < base || limit < base)
 		return (EINVAL);
 
 	count = atop(limit - first);
 	if (count > vm_page_max_wired)
 		return (ENOMEM);
 
 	/* Compare against the per-process limit with the proc locked. */
 	PROC_LOCK(p);
 	over = ptoa(count +
 	    pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
 	    lim_cur(p, RLIMIT_MEMLOCK);
 	PROC_UNLOCK(p);
 	if (over)
 		return (ENOMEM);
 #if 0
 	/*
 	 * XXX - not yet
 	 *
 	 * The limit for transient usage of wired pages should be
 	 * larger than for "permanent" wired pages (mlock()).
 	 *
 	 * Also, the sysctl code, which is the only present user
 	 * of vslock(), does a hard loop on EAGAIN.
 	 */
 	if (count + cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #endif
 	rv = vm_map_wire(&p->p_vmspace->vm_map, first, limit,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 
 	/*
 	 * Any failure is reported as EFAULT, the way copy{in,out}()
 	 * would, instead of the ENOMEM that mlock() returns.
 	 */
 	if (rv != KERN_SUCCESS)
 		return (EFAULT);
 	return (0);
 }
 
 /*
  * Unwire the pages backing [addr, addr + len) in the current process's
  * address space.  The result of vm_map_unwire() is ignored.
  */
 void
 vsunlock(void *addr, size_t len)
 {
 	vm_offset_t first, limit;
 
 	/* No validation here: vslock() is relied on to have done it. */
 	first = trunc_page((vm_offset_t)addr);
 	limit = round_page((vm_offset_t)addr + len);
 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map, first, limit,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 }
 
 /*
  * Create the U area for a new process.
  * This routine directly affects the fork perf for a process.
  *
  * Reserves UAREA_PAGES pages of kernel virtual address space, creates
  * a VM object holding that many wired pages, and maps the pages at the
  * reserved address.  Panics if the address space cannot be obtained.
  */
 void
 vm_proc_new(struct proc *p)
 {
 	vm_page_t pages[UAREA_PAGES];
 	vm_object_t obj;
 	vm_offset_t kva;
 	u_int idx;
 
 	/* Kernel virtual address range for the U area. */
 	kva = kmem_alloc_nofault(kernel_map, UAREA_PAGES * PAGE_SIZE);
 	if (kva == 0)
 		panic("vm_proc_new: upage allocation failed");
 	p->p_uarea = (struct user *)kva;
 
 	/* Backing object, then one wired page per U area page. */
 	obj = vm_object_allocate(OBJT_DEFAULT, UAREA_PAGES);
 	p->p_upages_obj = obj;
 	VM_OBJECT_LOCK(obj);
 	for (idx = 0; idx < UAREA_PAGES; idx++) {
 		pages[idx] = vm_page_grab(obj, idx,
 		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
 
 		vm_page_lock_queues();
 		vm_page_wakeup(pages[idx]);
 		pages[idx]->valid = VM_PAGE_BITS_ALL;
 		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(obj);
 
 	/* Make the pages visible at the reserved kernel address. */
 	pmap_qenter(kva, pages, UAREA_PAGES);
 }
 
 /*
  * Dispose the U area for a process that has exited.
  * This routine directly impacts the exit perf of a process.
  * XXX proc_zone is marked UMA_ZONE_NOFREE, so this should never be called.
  *
  * Frees every page of the U area object, removes the kernel mapping,
  * returns the kernel virtual address range and drops the object.
  * Panics unless the object holds exactly UAREA_PAGES resident pages.
  */
 void
 vm_proc_dispose(struct proc *p)
 {
 	vm_object_t obj;
 	vm_offset_t kva;
 	vm_page_t pg;
 
 	obj = p->p_upages_obj;
 	kva = (vm_offset_t)p->p_uarea;
 
 	VM_OBJECT_LOCK(obj);
 	if (obj->resident_page_count != UAREA_PAGES)
 		panic("vm_proc_dispose: incorrect number of pages in upobj");
 	vm_page_lock_queues();
 	/* Keep taking the head of the page list until it is empty. */
 	for (;;) {
 		pg = TAILQ_FIRST(&obj->memq);
 		if (pg == NULL)
 			break;
 		vm_page_busy(pg);
 		vm_page_unwire(pg, 0);
 		vm_page_free(pg);
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(obj);
 
 	pmap_qremove(kva, UAREA_PAGES);
 	kmem_free(kernel_map, kva, UAREA_PAGES * PAGE_SIZE);
 	vm_object_deallocate(obj);
 }
 
 #ifndef NO_SWAPPING
 /*
  * Allow the U area for a process to be prejudicially paged out.
  *
  * Every page of the U area object is marked dirty and unwired, and
  * the kernel mapping of the U area is then removed.  Panics unless
  * the object holds exactly UAREA_PAGES resident pages.
  */
 static void
 vm_proc_swapout(struct proc *p)
 {
 	vm_object_t upobj;
 	vm_offset_t up;
 	vm_page_t m;
 
 	upobj = p->p_upages_obj;
 	VM_OBJECT_LOCK(upobj);
 	/* Name this function in the message so the panic is attributable. */
 	if (upobj->resident_page_count != UAREA_PAGES)
 		panic("vm_proc_swapout: incorrect number of pages in upobj");
 	vm_page_lock_queues();
 	TAILQ_FOREACH(m, &upobj->memq, listq) {
 		vm_page_dirty(m);
 		vm_page_unwire(m, 0);
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(upobj);
 	/* Drop the kernel mapping; the address range itself is kept. */
 	up = (vm_offset_t)p->p_uarea;
 	pmap_qremove(up, UAREA_PAGES);
 }
 
 /*
  * Bring the U area for a specified process back in.
  */
 static void
 vm_proc_swapin(struct proc *p)
 {
 	vm_page_t ma[UAREA_PAGES];
 	vm_object_t upobj;
 	vm_offset_t up;
 	vm_page_t m;
 	int rv;
 	int i;
 
 	upobj = p->p_upages_obj;
 	VM_OBJECT_LOCK(upobj);
 	for (i = 0; i < UAREA_PAGES; i++) {
 		m = vm_page_grab(upobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 		if (m->valid != VM_PAGE_BITS_ALL) {
 			rv = vm_pager_get_pages(upobj, &m, 1, 0);
 			if (rv != VM_PAGER_OK)
 				panic("vm_proc_swapin: cannot get upage");
 		}
 		ma[i] = m;
 	}
 	if (upobj->resident_page_count != UAREA_PAGES)
 		panic("vm_proc_swapin: lost pages from upobj");
 	vm_page_lock_queues();
 	TAILQ_FOREACH(m, &upobj->memq, listq) {
 		m->valid = VM_PAGE_BITS_ALL;
 		vm_page_wire(m);
 		vm_page_wakeup(m);
 	}
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(upobj);
 	up = (vm_offset_t)p->p_uarea;
 	pmap_qenter(up, ma, UAREA_PAGES);
 }
 
 /*
  * Swap in the UAREAs of all processes swapped out to the given device.
  * The pages in the UAREA are marked dirty and their swap metadata is freed.
  *
  * devidx: the swap device being checked, passed through to
  *         swap_pager_isswapped().
  *
  * The process list is rescanned from the beginning after every process
  * that had to be brought in, because allproc_lock is dropped while
  * doing so.
  */
 void
 vm_proc_swapin_all(struct swdevt *devidx)
 {
 	struct proc *p;
 	vm_object_t object;
 	vm_page_t m;
 
 retry:
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		object = p->p_upages_obj;
 		if (object != NULL) {
 			VM_OBJECT_LOCK(object);
 			if (swap_pager_isswapped(object, devidx)) {
 				/*
 				 * Release the object and list locks before
 				 * faulting the process in; faultin() is
 				 * entered with the process lock held.
 				 */
 				VM_OBJECT_UNLOCK(object);
 				sx_sunlock(&allproc_lock);
 				faultin(p);
 				PROC_UNLOCK(p);
 				/*
 				 * Dirty every page of the object and release
 				 * its swap space.
 				 */
 				VM_OBJECT_LOCK(object);
 				vm_page_lock_queues();
 				TAILQ_FOREACH(m, &object->memq, listq)
 					vm_page_dirty(m);
 				vm_page_unlock_queues();
 				swap_pager_freespace(object, 0,
 				    object->un_pager.swp.swp_bcount);
 				VM_OBJECT_UNLOCK(object);
 				/* The list lock was dropped: start over. */
 				goto retry;
 			}
 			VM_OBJECT_UNLOCK(object);
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 }
 #endif
 
 /*
  * Upper bound on the number of pages in a thread's kernel stack; it
  * sizes the on-stack page arrays in vm_thread_new() and
  * vm_thread_swapin().  Defaults to 32 unless already defined
  * (opt_kstack_max_pages.h is included above).
  */
 #ifndef KSTACK_MAX_PAGES
 #define KSTACK_MAX_PAGES 32
 #endif
 
 /*
  * Create the kernel stack (including pcb for i386) for a new thread.
  * This routine directly affects the fork perf for a process and
  * create performance for a thread.
  *
  * td:    thread that receives the stack; td_kstack, td_kstack_obj and
  *        td_kstack_pages are filled in.
  * pages: requested stack size in pages.  Values of 1 or less select
  *        KSTACK_PAGES; values above KSTACK_MAX_PAGES are clamped.
  *
  * Panics if kernel virtual address space cannot be obtained.
  */
 void
 vm_thread_new(struct thread *td, int pages)
 {
 	vm_page_t stackpages[KSTACK_MAX_PAGES];
 	vm_object_t obj;
 	vm_offset_t kva;
 	vm_page_t pg;
 	int idx;
 
 	/* Normalise the requested size. */
 	if (pages <= 1)
 		pages = KSTACK_PAGES;
 	else if (pages > KSTACK_MAX_PAGES)
 		pages = KSTACK_MAX_PAGES;
 
 	/* The VM object that will own the stack pages. */
 	obj = vm_object_allocate(OBJT_DEFAULT, pages);
 	td->td_kstack_obj = obj;
 
 	/* Address space for the stack plus any guard pages below it. */
 	kva = kmem_alloc_nofault(kernel_map,
 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 	if (kva == 0)
 		panic("vm_thread_new: kstack allocation failed");
 	if (KSTACK_GUARD_PAGES != 0) {
 		pmap_qremove(kva, KSTACK_GUARD_PAGES);
 		kva += KSTACK_GUARD_PAGES * PAGE_SIZE;
 	}
 	td->td_kstack = kva;
 
 	/* Recorded so the stack can be torn down later. */
 	td->td_kstack_pages = pages;
 
 	/* Back every stack page with a wired physical page. */
 	VM_OBJECT_LOCK(obj);
 	for (idx = 0; idx < pages; idx++) {
 		pg = vm_page_grab(obj, idx,
 		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
 		stackpages[idx] = pg;
 		vm_page_lock_queues();
 		vm_page_wakeup(pg);
 		pg->valid = VM_PAGE_BITS_ALL;
 		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(obj);
 
 	pmap_qenter(kva, stackpages, pages);
 }
 
 /*
  * Dispose of a thread's kernel stack.
  *
  * Unmaps the stack, frees each of its pages, drops the backing object
  * and returns the kernel virtual address range, guard pages included.
  * Panics if a stack page is missing from the object.
  */
 void
 vm_thread_dispose(struct thread *td)
 {
 	vm_object_t obj;
 	vm_offset_t kva;
 	vm_page_t pg;
 	int idx, npages;
 
 	npages = td->td_kstack_pages;
 	obj = td->td_kstack_obj;
 	kva = td->td_kstack;
 
 	pmap_qremove(kva, npages);
 
 	VM_OBJECT_LOCK(obj);
 	for (idx = 0; idx < npages; idx++) {
 		pg = vm_page_lookup(obj, idx);
 		if (pg == NULL)
 			panic("vm_thread_dispose: kstack already missing?");
 		vm_page_lock_queues();
 		vm_page_busy(pg);
 		vm_page_unwire(pg, 0);
 		vm_page_free(pg);
 		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(obj);
 	vm_object_deallocate(obj);
 
 	/* The allocation began KSTACK_GUARD_PAGES below td_kstack. */
 	kmem_free(kernel_map, kva - (KSTACK_GUARD_PAGES * PAGE_SIZE),
 	    (npages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 }
 
 /*
  * Allow a thread's kernel stack to be paged out.
  *
  * After the machine-dependent hook has run, the stack is unmapped and
  * each of its pages is marked dirty and unwired.  Panics if a stack
  * page is missing from the object.
  */
 void
 vm_thread_swapout(struct thread *td)
 {
 	vm_object_t obj;
 	vm_page_t pg;
 	int idx, npages;
 
 	cpu_thread_swapout(td);
 
 	npages = td->td_kstack_pages;
 	obj = td->td_kstack_obj;
 	pmap_qremove(td->td_kstack, npages);
 
 	VM_OBJECT_LOCK(obj);
 	for (idx = 0; idx < npages; idx++) {
 		pg = vm_page_lookup(obj, idx);
 		if (pg == NULL)
 			panic("vm_thread_swapout: kstack already missing?");
 		vm_page_lock_queues();
 		vm_page_dirty(pg);
 		vm_page_unwire(pg, 0);
 		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(obj);
 }
 
 /*
  * Bring the kernel stack for a specified thread back in.
  *
  * Each stack page is grabbed, fetched through the pager when it is not
  * fully valid, then wired and woken up.  The stack is mapped again at
  * td_kstack and the machine-dependent hook is run last.  Panics if a
  * page cannot be fetched.
  */
 void
 vm_thread_swapin(struct thread *td)
 {
 	vm_page_t stackpages[KSTACK_MAX_PAGES];
 	vm_object_t obj;
 	vm_page_t pg;
 	int idx, npages;
 
 	obj = td->td_kstack_obj;
 	npages = td->td_kstack_pages;
 
 	VM_OBJECT_LOCK(obj);
 	for (idx = 0; idx < npages; idx++) {
 		pg = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 		if (pg->valid != VM_PAGE_BITS_ALL) {
 			if (vm_pager_get_pages(obj, &pg, 1, 0) != VM_PAGER_OK)
 				panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid);
 			/* Look the page up again after the pager ran. */
 			pg = vm_page_lookup(obj, idx);
 			pg->valid = VM_PAGE_BITS_ALL;
 		}
 		stackpages[idx] = pg;
 		vm_page_lock_queues();
 		vm_page_wire(pg);
 		vm_page_wakeup(pg);
 		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(obj);
 
 	pmap_qenter(td->td_kstack, stackpages, npages);
 	cpu_thread_swapin(td);
 }
 
 /*
  * Set up a variable-sized alternate kstack.
  *
  * The thread's current stack description is parked in the td_altkstack*
  * fields and a fresh stack of the requested size is created in its
  * place by vm_thread_new().
  */
 void
 vm_thread_new_altkstack(struct thread *td, int pages)
 {
 
 	/* Park the existing stack. */
 	td->td_altkstack_pages = td->td_kstack_pages;
 	td->td_altkstack_obj = td->td_kstack_obj;
 	td->td_altkstack = td->td_kstack;
 
 	/* Build the replacement. */
 	vm_thread_new(td, pages);
 }
 
 /*
  * Restore the original kstack.
  *
  * The stack currently described by td_kstack* is disposed of, the
  * description parked in the td_altkstack* fields is moved back, and
  * those fields are cleared.
  */
 void
 vm_thread_dispose_altkstack(struct thread *td)
 {
 
 	vm_thread_dispose(td);
 
 	/* Move each field back and clear the slot it was parked in. */
 	td->td_kstack = td->td_altkstack;
 	td->td_altkstack = 0;
 
 	td->td_kstack_obj = td->td_altkstack_obj;
 	td->td_altkstack_obj = NULL;
 
 	td->td_kstack_pages = td->td_altkstack_pages;
 	td->td_altkstack_pages = 0;
 }
 
 /*
  * Implement fork's actions on an address space.
  * Here we arrange for the address space to be copied or referenced,
  * set up the statistics area in the child's user struct, then call the
  * machine-dependent layer to make the new process ready to run.  The
  * new process is set up so that it returns directly to user mode to
  * avoid stack copying and relocation problems.
  *
  * td:    the forking thread.
  * p2:    the new process.
  * td2:   the new process's thread.
  * flags: rfork-style flags; RFPROC and RFMEM are examined here and the
  *        whole set is handed on to cpu_fork().
  *
  * Giant must be held.
  */
 void
 vm_forkproc(td, p2, td2, flags)
 	struct thread *td;
 	struct proc *p2;
 	struct thread *td2;
 	int flags;
 {
 	struct proc *parent;
 	struct user *ua;
 	int sharemem;
 
 	GIANT_REQUIRED;
 
 	parent = td->td_proc;
 	sharemem = (flags & RFMEM) != 0;
 
 	if ((flags & RFPROC) == 0) {
 		/*
 		 * No new process is being made.  If memory is not to
 		 * stay shared and the vmspace has other references,
 		 * divorce it so that sharing amongst threads becomes
 		 * COW locally.
 		 */
 		if (!sharemem && parent->p_vmspace->vm_refcnt > 1)
 			vmspace_unshare(parent);
 		cpu_fork(td, p2, td2, flags);
 		return;
 	}
 
 	/* Shared address space: just take another reference. */
 	if (sharemem) {
 		p2->p_vmspace = parent->p_vmspace;
 		parent->p_vmspace->vm_refcnt++;
 	}
 
 	/* Hold off while free pages are severely short. */
 	while (vm_page_count_severe()) {
 		VM_WAIT;
 	}
 
 	/* Private address space: copy it, along with any SysV shm state. */
 	if (!sharemem) {
 		p2->p_vmspace = vmspace_fork(parent->p_vmspace);
 		if (parent->p_vmspace->vm_shm)
 			shmfork(parent, p2);
 	}
 
 	/* XXXKSE this is unsatisfactory but should be adequate */
 	ua = p2->p_uarea;
 	MPASS(p2->p_sigacts != NULL);
 
 	/*
 	 * p_stats lives inside the user struct.  The startzero..endzero
 	 * span is cleared and the startcopy..endcopy span is inherited
 	 * from the parent.
 	 */
 	p2->p_stats = &ua->u_stats;
 	bzero(&ua->u_stats.pstat_startzero,
 	    (unsigned) ((caddr_t) &ua->u_stats.pstat_endzero -
 		(caddr_t) &ua->u_stats.pstat_startzero));
 	bcopy(&parent->p_stats->pstat_startcopy, &ua->u_stats.pstat_startcopy,
 	    ((caddr_t) &ua->u_stats.pstat_endcopy -
 		(caddr_t) &ua->u_stats.pstat_startcopy));
 
 	/*
 	 * The machine-dependent layer copies and updates the pcb, sets
 	 * up the kernel stack, and makes the child ready to run.
 	 */
 	cpu_fork(td, p2, td2, flags);
 }
 
 /*
  * Called after a process has been wait(2)'ed upon and is being reaped.
  * This is where resources that could not be reclaimed while the
  * process was still executing are released; here that is its vmspace.
  *
  * Giant must be held.
  */
 void
 vm_waitproc(p)
 	struct proc *p;
 {
 
 	GIANT_REQUIRED;
 
 	/* Release the address space. */
 	vmspace_exitfree(p);
 }
 
 /*
  * Set default limits for VM system.
  * Called for proc 0, and then inherited by all others.
  *
  * udata: the process whose limits are initialised (registered with
  *        &proc0 by the SYSINIT above).
  *
  * XXX should probably act directly on proc0.
  */
 static void
 vm_init_limits(udata)
 	void *udata;
 {
 	struct proc *p = udata;
 	struct rlimit *rl;
 	int rss_pages;
 
 	rl = p->p_limit->pl_rlimit;
 
 	/* Stack and data segment limits come from the tunables. */
 	rl[RLIMIT_STACK].rlim_cur = dflssiz;
 	rl[RLIMIT_STACK].rlim_max = maxssiz;
 	rl[RLIMIT_DATA].rlim_cur = dfldsiz;
 	rl[RLIMIT_DATA].rlim_max = maxdsiz;
 
 	/*
 	 * The soft resident set size limit is derived from the number of
 	 * free pages at this point, but never fewer than 512 pages (2MB
 	 * with 4KB pages).  Since this is a soft limit, it comes into
 	 * effect only when the system is out of memory; it helps to
 	 * favor smaller processes and reduces thrashing of the object
 	 * cache.  The hard limit is left unlimited.
 	 */
 	rss_pages = max(cnt.v_free_count, 512);
 	rl[RLIMIT_RSS].rlim_cur = ptoa(rss_pages);
 	rl[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
 }
 
 /*
  * faultin:
  *
  *	Make sure process p is swapped in.  The caller must hold the proc
  *	lock (asserted below).
  *
  *	With NO_SWAPPING the function only checks that PS_INMEM is set and
  *	panics otherwise.
  *
  *	Without NO_SWAPPING Giant is required as well.  If some other thread
  *	is already swapping p in (PS_SWAPPINGIN) we sleep on &p->p_sflag.
  *	Otherwise, if p is not in memory, we swap in the process and each of
  *	its threads ourselves.  The proc lock is dropped around the actual
  *	swap-in work and re-taken afterwards, so on that path the function
  *	returns with the proc lock held again.
  */
 void
 faultin(p)
 	struct proc *p;
 {
 #ifdef NO_SWAPPING
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_sflag & PS_INMEM) == 0)
 		panic("faultin: proc swapped out with NO_SWAPPING!");
 #else /* !NO_SWAPPING */
 	struct thread *td;
 
 	GIANT_REQUIRED;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * If another process is swapping in this process,
 	 * just wait until it finishes.
 	 *
 	 * NOTE(review): the flags are not re-checked after msleep() returns;
 	 * presumably callers tolerate that -- confirm.
 	 */
 	if (p->p_sflag & PS_SWAPPINGIN)
 		msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0);
 	else if ((p->p_sflag & PS_INMEM) == 0) {
 		/*
 		 * Don't let another thread swap process p out while we are
 		 * busy swapping it in.
 		 */
 		++p->p_lock;
 		/* p_sflag is modified under sched_lock throughout this file. */
 		mtx_lock_spin(&sched_lock);
 		p->p_sflag |= PS_SWAPPINGIN;
 		mtx_unlock_spin(&sched_lock);
 		PROC_UNLOCK(p);
 
 		/* Swap in the process, then every one of its threads. */
 		vm_proc_swapin(p);
 		FOREACH_THREAD_IN_PROC(p, td)
 			vm_thread_swapin(td);
 
 		PROC_LOCK(p);
 		mtx_lock_spin(&sched_lock);
 		p->p_sflag &= ~PS_SWAPPINGIN;
 		p->p_sflag |= PS_INMEM;
 		/*
 		 * Clear the swapped state of each thread and make runnable
 		 * those that TD_CAN_RUN() now accepts.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td) {
 			TD_CLR_SWAPPED(td);
 			if (TD_CAN_RUN(td))
 				setrunnable(td);
 		}
 		mtx_unlock_spin(&sched_lock);
 
 		/* Wake anyone sleeping in the PS_SWAPPINGIN branch above. */
 		wakeup(&p->p_sflag);
 
 		/* Allow other threads to swap p out now. */
 		--p->p_lock;
 	}
 #endif /* NO_SWAPPING */
 }
 
 /*
  * This swapin algorithm attempts to swap-in processes only if there
  * is enough space for them.  Of course, if a process waits for a long
  * time, it will be swapped in anyway.
  *
  *  XXXKSE - process with the thread with highest priority counts..
  *
  * Giant is still held at this point, to be released in tsleep.
  *
  * The function never returns: every path ends in "goto loop".  Each
  * iteration waits until vm_page_count_min() is false, picks the
  * swapped-out process with the highest computed priority, and brings it
  * in with faultin().  The dummy argument is unused.
  */
 /* ARGSUSED*/
 static void
 scheduler(dummy)
 	void *dummy;
 {
 	struct proc *p;
 	struct thread *td;
 	int pri;
 	struct proc *pp;	/* best candidate found so far */
 	int ppri;		/* priority of that candidate */
 
 	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
 	/* GIANT_REQUIRED */
 
 loop:
 	/* Do not swap anything in while vm_page_count_min() reports true. */
 	if (vm_page_count_min()) {
 		VM_WAIT;
 		goto loop;
 	}
 
 	pp = NULL;
 	ppri = INT_MIN;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		struct ksegrp *kg;
 		/*
 		 * Skip processes that are resident or already in transit
 		 * in either direction.  This is an unlocked peek; the flags
 		 * are re-checked under the proc lock once a candidate has
 		 * been chosen.
 		 */
 		if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
 			continue;
 		}
 		mtx_lock_spin(&sched_lock);
 		FOREACH_THREAD_IN_PROC(p, td) {
 			/*
 			 * An otherwise runnable thread of a process
 			 * swapped out has only the TDI_SWAPPED bit set.
 			 * 
 			 */
 			if (td->td_inhibitors == TDI_SWAPPED) {
 				/*
 				 * Priority is the sum of p_swtime and the
 				 * ksegrp's kg_slptime.  Unless a swap-in
 				 * was explicitly requested (PS_SWAPINREQ),
 				 * the nice value is then subtracted,
 				 * scaled by 8.
 				 */
 				kg = td->td_ksegrp;
 				pri = p->p_swtime + kg->kg_slptime;
 				if ((p->p_sflag & PS_SWAPINREQ) == 0) {
-					pri -= kg->kg_nice * 8;
+					pri -= p->p_nice * 8;
 				}
 
 				/*
 				 * if this ksegrp is higher priority
 				 * and there is enough space, then select
 				 * this process instead of the previous
 				 * selection.
 				 */
 				if (pri > ppri) {
 					pp = p;
 					ppri = pri;
 				}
 			}
 		}
 		mtx_unlock_spin(&sched_lock);
 	}
 	sx_sunlock(&allproc_lock);
 
 	/*
 	 * Nothing to do, back to sleep.
 	 */
 	if ((p = pp) == NULL) {
 		tsleep(&proc0, PVM, "sched", maxslp * hz / 2);
 		goto loop;
 	}
 	PROC_LOCK(p);
 
 	/*
 	 * Another process may be bringing or may have already
 	 * brought this process in while we traverse all threads.
 	 * Or, this process may even be being swapped out again.
 	 */
 	if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) {
 		PROC_UNLOCK(p);
 		goto loop;
 	}
 
 	/* The request, if there was one, is being honored now. */
 	mtx_lock_spin(&sched_lock);
 	p->p_sflag &= ~PS_SWAPINREQ;
 	mtx_unlock_spin(&sched_lock);
 
 	/*
 	 * We would like to bring someone in. (only if there is space).
 	 * [What checks the space? ]
 	 */
 	faultin(p);
 	PROC_UNLOCK(p);
 	/* Restart the swap-time accounting for the freshly resident process. */
 	mtx_lock_spin(&sched_lock);
 	p->p_swtime = 0;
 	mtx_unlock_spin(&sched_lock);
 	goto loop;
 }
 
 #ifndef NO_SWAPPING
 
 /*
  * Swap_idle_threshold1 is the guaranteed swapped in time for a process.
  * swapout_procs() refuses to swap out a process while any of its ksegrps
  * has a kg_slptime below this value.  Exported read-write as
  * vm.swap_idle_threshold1.  Units are those of kg_slptime (presumably
  * seconds -- confirm).
  */
 static int swap_idle_threshold1 = 2;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
     &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
 
 /*
  * Swap_idle_threshold2 is the time that a process can be idle before
  * it will be swapped out, if idle swapping is enabled.  swapout_procs()
  * compares it against kg_slptime when only VM_SWAP_IDLE is requested.
  * Exported read-write as vm.swap_idle_threshold2.
  */
 static int swap_idle_threshold2 = 10;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
     &swap_idle_threshold2, 0, "Time before a process will be swapped out");
 
 /*
  * Swapout is driven by the pageout daemon.  Very simple, we find eligible
  * procs and unwire their u-areas.  We try to always "swap" at least one
  * process in case we need the room for a swapin.
  * If any procs have been sleeping/stopped for at least maxslp seconds,
  * they are swapped.  Else, we swap the longest-sleeping or stopped process,
  * if any, otherwise the longest-resident process.
  *
  * NOTE(review): the code below does not reference maxslp; eligibility is
  * decided from swap_idle_threshold1/2 and the action flags.  The paragraph
  * above may describe an older policy -- confirm.
  *
  * action is a bit mask tested against VM_SWAP_NORMAL and VM_SWAP_IDLE.
  * Giant must be held.  After every successful swapout the allproc lock is
  * dropped and the scan restarts from the beginning of the process list.
  * If at least one process was swapped out, sleepers on &proc0 are woken.
  */
 void
 swapout_procs(action)
 int action;
 {
 	struct proc *p;
 	struct thread *td;
 	struct ksegrp *kg;
 	int didswap = 0;
 
 	GIANT_REQUIRED;
 
 retry:
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		struct vmspace *vm;
 		/* Smallest kg_slptime seen over the ksegrps of this process. */
 		int minslptime = 100000;
 		
 		/*
 		 * Watch out for a process in
 		 * creation.  It may have no
 		 * address space or lock yet.
 		 */
 		mtx_lock_spin(&sched_lock);
 		if (p->p_state == PRS_NEW) {
 			mtx_unlock_spin(&sched_lock);
 			continue;
 		}
 		mtx_unlock_spin(&sched_lock);
 
 		/*
 		 * An aio daemon switches its
 		 * address space while running.
 		 * Perform a quick check whether
 		 * a process has P_SYSTEM.
 		 */
 		if ((p->p_flag & P_SYSTEM) != 0)
 			continue;
 
 		/*
 		 * Do not swapout a process that
 		 * is waiting for VM data
 		 * structures as there is a possible
 		 * deadlock.  Test this first as
 		 * this may block.
 		 *
 		 * Lock the map until swapout
 		 * finishes, or a thread of this
 		 * process may attempt to alter
 		 * the map.
 		 */
 		PROC_LOCK(p);
 		vm = p->p_vmspace;
 		KASSERT(vm != NULL,
 			("swapout_procs: a process has no address space"));
 		/*
 		 * Take a reference on the vmspace; it is dropped again by
 		 * vmspace_free() on every exit path of this iteration.
 		 */
 		++vm->vm_refcnt;
 		PROC_UNLOCK(p);
 		if (!vm_map_trylock(&vm->vm_map))
 			goto nextproc1;
 
 		PROC_LOCK(p);
 		/*
 		 * Skip processes that are held in memory (p_lock), stopped
 		 * for single threading, traced, system processes or exiting.
 		 */
 		if (p->p_lock != 0 ||
 		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
 		    ) != 0) {
 			goto nextproc2;
 		}
 		/*
 		 * only aiod changes vmspace, however it will be
 		 * skipped because of the if statement above checking 
 		 * for P_SYSTEM
 		 */
 		/* Only fully resident processes that are not in transit qualify. */
 		if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM)
 			goto nextproc2;
 
 		switch (p->p_state) {
 		default:
 			/* Don't swap out processes in any sort
 			 * of 'special' state. */
 			break;
 
 		case PRS_NORMAL:
 			/* sched_lock stays held until nextproc or the swapout path. */
 			mtx_lock_spin(&sched_lock);
 			/*
 			 * do not swapout a realtime process
 			 * Check all the thread groups..
 			 */
 			FOREACH_KSEGRP_IN_PROC(p, kg) {
 				if (PRI_IS_REALTIME(kg->kg_pri_class))
 					goto nextproc;
 
 				/*
 				 * Guarantee swap_idle_threshold1
 				 * time in memory.
 				 */
 				if (kg->kg_slptime < swap_idle_threshold1)
 					goto nextproc;
 
 				/*
 				 * Do not swapout a process if it is
 				 * waiting on a critical event of some
 				 * kind or there is a thread whose
 				 * pageable memory may be accessed.
 				 *
 				 * This could be refined to support
 				 * swapping out a thread.
 				 */
 				FOREACH_THREAD_IN_GROUP(kg, td) {
 					if ((td->td_priority) < PSOCK ||
 					    !thread_safetoswapout(td))
 						goto nextproc;
 				}
 				/*
 				 * If the system is under memory stress,
 				 * or if we are swapping
 				 * idle processes >= swap_idle_threshold2,
 				 * then swap the process out.
 				 */
 				if (((action & VM_SWAP_NORMAL) == 0) &&
 				    (((action & VM_SWAP_IDLE) == 0) ||
 				    (kg->kg_slptime < swap_idle_threshold2)))
 					goto nextproc;
 
 				if (minslptime > kg->kg_slptime)
 					minslptime = kg->kg_slptime;
 			}
 
 			/*
 			 * If the process has been asleep for awhile and had
 			 * most of its pages taken away already, swap it out.
 			 */
 			if ((action & VM_SWAP_NORMAL) ||
 				((action & VM_SWAP_IDLE) &&
 				 (minslptime > swap_idle_threshold2))) {
 				/*
 				 * swapout() is entered and left with both
 				 * the proc lock and sched_lock held.  Unwind
 				 * every lock and reference taken for this
 				 * process, then rescan from the start.
 				 */
 				swapout(p);
 				didswap++;
 				mtx_unlock_spin(&sched_lock);
 				PROC_UNLOCK(p);
 				vm_map_unlock(&vm->vm_map);
 				vmspace_free(vm);
 				sx_sunlock(&allproc_lock);
 				goto retry;
 			}
 nextproc:			
 			/* Not swapped: release sched_lock taken in this case. */
 			mtx_unlock_spin(&sched_lock);
 		}
 nextproc2:
 		/* Release the proc lock and the map lock. */
 		PROC_UNLOCK(p);
 		vm_map_unlock(&vm->vm_map);
 nextproc1:
 		/* Drop the vmspace reference taken above. */
 		vmspace_free(vm);
 		continue;
 	}
 	sx_sunlock(&allproc_lock);
 	/*
 	 * If we swapped something out, and another process needed memory,
 	 * then wakeup the sched process.
 	 */
 	if (didswap)
 		wakeup(&proc0);
 }
 
 /*
  * swapout:
  *
  *	Swap out process p and all of its threads.
  *
  *	Must be entered with the proc lock and sched_lock (not recursed)
  *	held; both are asserted.  Both locks are dropped around the calls to
  *	vm_proc_swapout()/vm_thread_swapout() and re-taken before returning,
  *	so the function also returns with both held.
  */
 static void
 swapout(p)
 	struct proc *p;
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
 #if defined(SWAP_DEBUG)
 	printf("swapping out %d\n", p->p_pid);
 #endif
 
 	/*
 	 * The states of this process and its threads may have changed
 	 * by now.  Assuming that there is only one pageout daemon thread,
 	 * this process should still be in memory.
 	 */
 	KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM,
 		("swapout: lost a swapout race?"));
 
 #if defined(INVARIANTS)
 	/*
 	 * Make sure that all threads are safe to be swapped out.
 	 *
 	 * Alternatively, we could swap out only safe threads.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td) {
 		KASSERT(thread_safetoswapout(td),
 			("swapout: there is a thread not safe for swapout"));
 	}
 #endif /* INVARIANTS */
 
 	/* Account for the swap in the process's resource usage. */
 	++p->p_stats->p_ru.ru_nswap;
 	/*
 	 * remember the process resident count
 	 */
 	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
 
 	/* Mark the process as no longer resident and in transit out. */
 	p->p_sflag &= ~PS_INMEM;
 	p->p_sflag |= PS_SWAPPINGOUT;
 	PROC_UNLOCK(p);
 	/* Flag every thread as swapped while sched_lock is still held. */
 	FOREACH_THREAD_IN_PROC(p, td)
 		TD_SET_SWAPPED(td);
 	mtx_unlock_spin(&sched_lock);
 
 	/* The swapout work itself runs with neither lock held. */
 	vm_proc_swapout(p);
 	FOREACH_THREAD_IN_PROC(p, td)
 		vm_thread_swapout(td);
 
 	/* Re-take both locks for the caller and finish the transition. */
 	PROC_LOCK(p);
 	mtx_lock_spin(&sched_lock);
 	p->p_sflag &= ~PS_SWAPPINGOUT;
 	p->p_swtime = 0;
 }
 #endif /* !NO_SWAPPING */
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c	(revision 130550)
+++ head/sys/vm/vm_pageout.c	(revision 130551)
@@ -1,1573 +1,1570 @@
 /*
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	The proverbial page-out daemon.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <machine/mutex.h>
 
 /*
  * System initialization
  */
 
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_pmap_collect(void);
 static void vm_pageout_scan(int pass);
 
 struct proc *pageproc;
 
 /*
  * Descriptor handed to kproc_start() by the SYSINIT below: the name
  * "pagedaemon", the vm_pageout function and the address of pageproc
  * (presumably where the created process is recorded -- confirm against
  * struct kproc_desc).
  */
 static struct kproc_desc page_kp = {
 	"pagedaemon",
 	vm_pageout,
 	&pageproc
 };
 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
 
 #if !defined(NO_SWAPPING)
 /* the kernel process "vm_daemon"*/
 static void vm_daemon(void);
 static struct	proc *vmproc;
 
 /* Same arrangement for "vmdaemon"; only built when swapping is compiled in. */
 static struct kproc_desc vm_kp = {
 	"vmdaemon",
 	vm_daemon,
 	&vmproc
 };
 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
 #endif
 
 
 int vm_pages_needed;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
 int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
 
 #if !defined(NO_SWAPPING)
 static int vm_pageout_req_swapout;	/* XXX */
 static int vm_daemon_needed;
 #endif
 /* Tunables; each is exported through a SYSCTL_INT declaration below. */
 static int vm_max_launder = 32;
 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
 static int vm_pageout_full_stats_interval = 0;
 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
 static int defer_swap_pageouts=0;
 static int disable_swap_pageouts=0;
 
 /*
  * Whole-process swapping switches.  With NO_SWAPPING both are fixed at 0;
  * otherwise process swapout is enabled by default and idle swapout is
  * disabled by default.
  */
 #if defined(NO_SWAPPING)
 static int vm_swap_enabled=0;
 static int vm_swap_idle_enabled=0;
 #else
 static int vm_swap_enabled=1;
 static int vm_swap_idle_enabled=0;
 #endif
 
 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
 	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
 
 SYSCTL_INT(_vm, OID_AUTO, max_launder,
 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
 	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
 	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
 	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
 	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
 
 /*
  * The swap switches are exported read-only (CTLFLAG_RD) when swapping is
  * compiled out and read-write (CTLFLAG_RW) otherwise.
  */
 #if defined(NO_SWAPPING)
 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
 	CTLFLAG_RD, &vm_swap_enabled, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
 	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
 #else
 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
 #endif
 
 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
 	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
 
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
 /* Read-only counter exported as vm.pageout_lock_miss. */
 static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
 /*
  * Upper bound on the number of pages vm_pageout_clean() gathers into one
  * cluster; it also determines the size of that function's page array.
  */
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
 
 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
 
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void vm_req_vmdaemon(void);
 #endif
 static void vm_pageout_page_stats(void);
 
 /*
  * vm_pageout_clean:
  *
  * Clean the page and remove it from the laundry.
  * 
  * We set the busy bit to cause potential page faults on this page to
  * block.  Note the careful timing, however, the busy bit isn't set till
  * late and we cannot do anything that will mess with the page.
  *
  * The page queue mutex and the lock of m's object must be held (both are
  * asserted).  Pages neighbouring m in the same object are gathered into a
  * cluster of at most vm_pageout_page_count pages, which is then handed to
  * vm_pageout_flush().
  *
  * Returns 0 without doing anything if m is held, busy or unmanaged;
  * otherwise returns whatever vm_pageout_flush() returns.
  */
 static int
 vm_pageout_clean(m)
 	vm_page_t m;
 {
 	vm_object_t object;
 	/*
 	 * Cluster buffer.  m sits in the middle slot; the reverse scan fills
 	 * slots below it and the forward scan fills slots above it.
 	 */
 	vm_page_t mc[2*vm_pageout_page_count];
 	int pageout_count;
 	/*
 	 * ib/is: distance from pindex of the next page to try in the
 	 * reverse/forward scan (ib == 0 means the reverse scan is over);
 	 * page_base: index in mc[] of the lowest page of the cluster.
 	 */
 	int ib, is, page_base;
 	vm_pindex_t pindex = m->pindex;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/*
 	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
 	 * with the new swapper, but we could have serious problems paging
 	 * out other object types if there is insufficient memory.  
 	 *
 	 * Unfortunately, checking free memory here is far too late, so the
 	 * check has been moved up a procedural level.
 	 */
 
 	/*
 	 * Don't mess with the page if it's busy, held, or special
 	 */
 	if ((m->hold_count != 0) ||
 	    ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
 		return 0;
 	}
 
 	mc[vm_pageout_page_count] = m;
 	pageout_count = 1;
 	page_base = vm_pageout_page_count;
 	ib = 1;
 	is = 1;
 
 	/*
 	 * Scan object for clusterable pages.
 	 *
 	 * We can cluster ONLY if: ->> the page is NOT
 	 * clean, wired, busy, held, or mapped into a
 	 * buffer, and one of the following:
 	 * 1) The page is inactive, or a seldom used
 	 *    active page.
 	 * -or-
 	 * 2) we force the issue.
 	 *
 	 * During heavy mmap/modification loads the pageout
 	 * daemon can really fragment the underlying file
 	 * due to flushing pages out of order and not trying to
 	 * align the clusters (which leaves sporadic out-of-order
 	 * holes).  To solve this problem we do the reverse scan
 	 * first and attempt to align our cluster, then do a 
 	 * forward scan if room remains.
 	 */
 	object = m->object;
 more:
 	/* Reverse scan: pages at pindex - 1, pindex - 2, ... */
 	while (ib && pageout_count < vm_pageout_page_count) {
 		vm_page_t p;
 
 		/* Ran off the beginning of the object. */
 		if (ib > pindex) {
 			ib = 0;
 			break;
 		}
 
 		if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
 			ib = 0;
 			break;
 		}
 		if (((p->queue - p->pc) == PQ_CACHE) ||
 		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
 			ib = 0;
 			break;
 		}
 		vm_page_test_dirty(p);
 		if ((p->dirty & p->valid) == 0 ||
 		    p->queue != PQ_INACTIVE ||
 		    p->wire_count != 0 ||	/* may be held by buf cache */
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			ib = 0;
 			break;
 		}
 		mc[--page_base] = p;
 		++pageout_count;
 		++ib;
 		/*
 		 * alignment boundary, stop here and switch directions.  Do
 		 * not clear ib.
 		 */
 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
 			break;
 	}
 
 	/* Forward scan: pages at pindex + 1, pindex + 2, ... */
 	while (pageout_count < vm_pageout_page_count && 
 	    pindex + is < object->size) {
 		vm_page_t p;
 
 		if ((p = vm_page_lookup(object, pindex + is)) == NULL)
 			break;
 		if (((p->queue - p->pc) == PQ_CACHE) ||
 		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
 			break;
 		}
 		vm_page_test_dirty(p);
 		if ((p->dirty & p->valid) == 0 ||
 		    p->queue != PQ_INACTIVE ||
 		    p->wire_count != 0 ||	/* may be held by buf cache */
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			break;
 		}
 		mc[page_base + pageout_count] = p;
 		++pageout_count;
 		++is;
 	}
 
 	/*
 	 * If we exhausted our forward scan, continue with the reverse scan
 	 * when possible, even past a page boundary.  This catches boundary
 	 * conditions.
 	 */
 	if (ib && pageout_count < vm_pageout_page_count)
 		goto more;
 
 	/*
 	 * we allow reads during pageouts...
 	 */
 	return (vm_pageout_flush(&mc[page_base], pageout_count, 0));
 }
 
 /*
  * vm_pageout_flush() - launder the given pages
  *
  *	The given pages are laundered.  Note that we setup for the start of
  *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
  *	reference count all in here rather than in the parent.  If we want
  *	the parent to do more sophisticated things we may have to change
  *	the ordering.
  *
  *	mc points at count pages; the object is taken from mc[0].  flags is
  *	passed on to vm_pager_put_pages(), with VM_PAGER_PUT_SYNC added when
  *	the object is kernel_object.
  *
  *	The page queue mutex and the object lock must be held (both are
  *	asserted).  The page queue mutex is dropped around the pager call
  *	and re-taken afterwards.
  *
  *	Returns the number of pages whose pager status was VM_PAGER_OK or
  *	VM_PAGER_PEND.
  */
 int
 vm_pageout_flush(vm_page_t *mc, int count, int flags)
 {
 	vm_object_t object = mc[0]->object;
 	int pageout_status[count];	/* per-page result from the pager */
 	int numpagedout = 0;
 	int i;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/*
 	 * Initiate I/O.  Bump the vm_page_t->busy counter and
 	 * mark the pages read-only.
 	 *
 	 * We do not have to fixup the clean/dirty bits here... we can
 	 * allow the pager to do it after the I/O completes.
 	 *
 	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
 	 * edge case with file fragments.
 	 */
 	for (i = 0; i < count; i++) {
 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
 		vm_page_io_start(mc[i]);
 		pmap_page_protect(mc[i], VM_PROT_READ);
 	}
 	vm_page_unlock_queues();
 	/* One paging-in-progress count per page handed to the pager. */
 	vm_object_pip_add(object, count);
 
 	vm_pager_put_pages(object, mc, count,
 	    (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
 	    pageout_status);
 
 	vm_page_lock_queues();
 	for (i = 0; i < count; i++) {
 		vm_page_t mt = mc[i];
 
 		KASSERT((mt->flags & PG_WRITEABLE) == 0,
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
 		case VM_PAGER_PEND:
 			numpagedout++;
 			break;
 		case VM_PAGER_BAD:
 			/*
 			 * Page outside of range of object. Right now we
 			 * essentially lose the changes by pretending it
 			 * worked.
 			 */
 			pmap_clear_modify(mt);
 			vm_page_undirty(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
 			 * If page couldn't be paged out, then reactivate the
 			 * page so it doesn't clog the inactive list.  (We
 			 * will try paging it out again later).
 			 */
 			vm_page_activate(mt);
 			break;
 		case VM_PAGER_AGAIN:
 			/* Nothing to do here for this status. */
 			break;
 		}
 
 		/*
 		 * If the operation is still going, leave the page busy to
 		 * block all other accesses. Also, leave the paging in
 		 * progress indicator set so that we don't attempt an object
 		 * collapse.
 		 */
 		if (pageout_status[i] != VM_PAGER_PEND) {
 			vm_object_pip_wakeup(object);
 			vm_page_io_finish(mt);
 			/* Under severe shortage, try to cache the page now. */
 			if (vm_page_count_severe())
 				vm_page_try_to_cache(mt);
 		}
 	}
 	return numpagedout;
 }
 
 #if !defined(NO_SWAPPING)
 /*
  *	vm_pageout_object_deactivate_pages
  *
  *	deactivate enough pages to satisfy the inactive target
  *	requirements or if vm_page_proc_limit is set, then
  *	deactivate all of the pages in the object and its
  *	backing_objects.
  *
  *	The object and map must be locked.
  *
  *	The scan starts at first_object and walks down the backing_object
  *	chain.  It stops as soon as pmap_resident_count(pmap) is no larger
  *	than desired, when an object has paging in progress, or when the end
  *	of the chain is reached.  Each backing object is locked before the
  *	previous one is unlocked; first_object itself is never unlocked here
  *	and stays locked for the caller.
  *
  *	NOTE(review): vm_page_proc_limit is not referenced by the code
  *	below -- the sentence above may be stale.
  */
 static void
 vm_pageout_object_deactivate_pages(pmap, first_object, desired)
 	pmap_t pmap;
 	vm_object_t first_object;
 	long desired;
 {
 	vm_object_t backing_object, object;
 	vm_page_t p, next;
 	int actcount, rcount, remove_mode;
 
 	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
 	if (first_object->type == OBJT_DEVICE || first_object->type == OBJT_PHYS)
 		return;
 	for (object = first_object;; object = backing_object) {
 		if (pmap_resident_count(pmap) <= desired)
 			goto unlock_return;
 		if (object->paging_in_progress)
 			goto unlock_return;
 
 		/*
 		 * remove_mode is set when the object has more than one
 		 * shadow; in that case unreferenced active pages are only
 		 * requeued below, never deactivated.
 		 */
 		remove_mode = 0;
 		if (object->shadow_count > 1)
 			remove_mode = 1;
 		/*
 		 * scan the objects entire memory queue
 		 */
 		rcount = object->resident_page_count;
 		p = TAILQ_FIRST(&object->memq);
 		vm_page_lock_queues();
 		while (p && (rcount-- > 0)) {
 			if (pmap_resident_count(pmap) <= desired) {
 				vm_page_unlock_queues();
 				goto unlock_return;
 			}
 			/* Fetch the successor first; p may be moved between queues. */
 			next = TAILQ_NEXT(p, listq);
 			cnt.v_pdpages++;
 			/*
 			 * Skip pages that are wired, held, busy or
 			 * unmanaged, and pages this pmap does not map.
 			 */
 			if (p->wire_count != 0 ||
 			    p->hold_count != 0 ||
 			    p->busy != 0 ||
 			    (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
 			    !pmap_page_exists_quick(pmap, p)) {
 				p = next;
 				continue;
 			}
 			actcount = pmap_ts_referenced(p);
 			if (actcount) {
 				vm_page_flag_set(p, PG_REFERENCED);
 			} else if (p->flags & PG_REFERENCED) {
 				actcount = 1;
 			}
 			if ((p->queue != PQ_ACTIVE) &&
 				(p->flags & PG_REFERENCED)) {
 				/* Referenced but not active: activate it. */
 				vm_page_activate(p);
 				p->act_count += actcount;
 				vm_page_flag_clear(p, PG_REFERENCED);
 			} else if (p->queue == PQ_ACTIVE) {
 				if ((p->flags & PG_REFERENCED) == 0) {
 					/*
 					 * Active and unreferenced: age the
 					 * page, then either unmap and
 					 * deactivate it or just requeue it.
 					 */
 					p->act_count -= min(p->act_count, ACT_DECLINE);
 					if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
 						pmap_remove_all(p);
 						vm_page_deactivate(p);
 					} else {
 						vm_pageq_requeue(p);
 					}
 				} else {
 					/*
 					 * Active and referenced: bump the
 					 * activation count (bounded by
 					 * ACT_MAX) and requeue.
 					 */
 					vm_page_activate(p);
 					vm_page_flag_clear(p, PG_REFERENCED);
 					if (p->act_count < (ACT_MAX - ACT_ADVANCE))
 						p->act_count += ACT_ADVANCE;
 					vm_pageq_requeue(p);
 				}
 			} else if (p->queue == PQ_INACTIVE) {
 				/* Already inactive: just remove its mappings. */
 				pmap_remove_all(p);
 			}
 			p = next;
 		}
 		vm_page_unlock_queues();
 		if ((backing_object = object->backing_object) == NULL)
 			goto unlock_return;
 		/* Lock the next object before letting go of the current one. */
 		VM_OBJECT_LOCK(backing_object);
 		if (object != first_object)
 			VM_OBJECT_UNLOCK(object);
 	}
 unlock_return:
 	/* first_object was locked by the caller and stays locked. */
 	if (object != first_object)
 		VM_OBJECT_UNLOCK(object);
 }
 
 /*
  * deactivate some number of pages in a map, try to do it fairly, but
  * that is really hard to do.
  *
  * The map is only try-locked; if that fails nothing is done.  desired is
  * the resident page count of the map's pmap at which the work stops.
  * When desired is 0 and no map entry is wired, every mapping in the map's
  * address range is removed at the end.
  */
 static void
 vm_pageout_map_deactivate_pages(map, desired)
 	vm_map_t map;
 	long desired;
 {
 	vm_map_entry_t tmpe;
 	vm_object_t obj, bigobj;
 	int nothingwired;
 
 	if (!vm_map_trylock(map))
 		return;
 
 	bigobj = NULL;
 	nothingwired = TRUE;
 
 	/*
 	 * first, search out the biggest object, and try to free pages from
 	 * that.
 	 *
 	 * Objects are only try-locked here, so one whose lock is busy is
 	 * not considered.  The current best candidate stays locked until it
 	 * is replaced or has been processed below.
 	 */
 	tmpe = map->header.next;
 	while (tmpe != &map->header) {
 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = tmpe->object.vm_object;
 			if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) {
 				if (obj->shadow_count <= 1 &&
 				    (bigobj == NULL ||
 				     bigobj->resident_page_count < obj->resident_page_count)) {
 					if (bigobj != NULL)
 						VM_OBJECT_UNLOCK(bigobj);
 					bigobj = obj;
 				} else
 					VM_OBJECT_UNLOCK(obj);
 			}
 		}
 		/* Remember whether any entry of the map is wired. */
 		if (tmpe->wired_count > 0)
 			nothingwired = FALSE;
 		tmpe = tmpe->next;
 	}
 
 	if (bigobj != NULL) {
 		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
 		VM_OBJECT_UNLOCK(bigobj);
 	}
 	/*
 	 * Next, hunt around for other pages to deactivate.  We actually
 	 * do this search sort of wrong -- .text first is not the best idea.
 	 */
 	tmpe = map->header.next;
 	while (tmpe != &map->header) {
 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
 			break;
 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = tmpe->object.vm_object;
 			if (obj != NULL) {
 				VM_OBJECT_LOCK(obj);
 				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
 				VM_OBJECT_UNLOCK(obj);
 			}
 		}
 		tmpe = tmpe->next;
 	}
 
 	/*
 	 * Remove all mappings if a process is swapped out, this will free page
 	 * table pages.
 	 */
 	if (desired == 0 && nothingwired) {
 		GIANT_REQUIRED;
 		vm_page_lock_queues();
 		pmap_remove(vm_map_pmap(map), vm_map_min(map),
 		    vm_map_max(map));
 		vm_page_unlock_queues();
 	}
 	vm_map_unlock(map);
 }
 #endif		/* !defined(NO_SWAPPING) */
 
 /*
  * vm_pageout_pmap_collect:
  *
  *	Drastic last-resort cleanup that can save the system in a pinch.
  *	Does nothing unless pmap_pagedaemon_waken is set.  Otherwise it
  *	removes all mappings of every page in vm_page_array that is not
  *	wired, held, busy or unmanaged, and then clears
  *	pmap_pagedaemon_waken.  A warning is printed for the first five
  *	collections only.
  */
 static void
 vm_pageout_pmap_collect(void)
 {
 	static int nwarnings;
 	vm_page_t pg;
 	int idx;
 
 	if (pmap_pagedaemon_waken == 0)
 		return;
 
 	if (nwarnings < 5) {
 		printf("collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
 		nwarnings++;
 	}
 
 	vm_page_lock_queues();
 	idx = 0;
 	while (idx < vm_page_array_size) {
 		pg = &vm_page_array[idx];
 		idx++;
 		/* Only touch pages that nobody has pinned or is working on. */
 		if (pg->wire_count == 0 && pg->hold_count == 0 &&
 		    pg->busy == 0 &&
 		    (pg->flags & (PG_BUSY | PG_UNMANAGED)) == 0)
 			pmap_remove_all(pg);
 	}
 	vm_page_unlock_queues();
 
 	pmap_pagedaemon_waken = 0;
 }
 	
 /*
  *	vm_pageout_scan does the dirty work for the pageout daemon.
  */
 static void
 vm_pageout_scan(int pass)
 {
 	vm_page_t m, next;
 	struct vm_page marker;
 	int page_shortage, maxscan, pcount;
 	int addl_page_shortage, addl_page_shortage_init;
 	struct proc *p, *bigproc;
 	struct thread *td;
 	vm_offset_t size, bigsize;
 	vm_object_t object;
 	int actcount;
 	int vnodes_skipped = 0;
 	int maxlaunder;
 	int s;
 
 	mtx_lock(&Giant);
 	/*
 	 * Decrease registered cache sizes.
 	 */
 	EVENTHANDLER_INVOKE(vm_lowmem, 0);
 	/*
 	 * We do this explicitly after the caches have been drained above.
 	 */
 	uma_reclaim();
 	/*
 	 * Do whatever cleanup that the pmap code can.
 	 */
 	vm_pageout_pmap_collect();
 
 	addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit);
 
 	/*
 	 * Calculate the number of pages we want to either free or move
 	 * to the cache.
 	 */
 	page_shortage = vm_paging_target() + addl_page_shortage_init;
 
 	/*
 	 * Initialize our marker
 	 */
 	bzero(&marker, sizeof(marker));
 	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
 	marker.queue = PQ_INACTIVE;
 	marker.wire_count = 1;
 
 	/*
 	 * Start scanning the inactive queue for pages we can move to the
 	 * cache or free.  The scan will stop when the target is reached or
 	 * we have scanned the entire inactive queue.  Note that m->act_count
 	 * is not used to form decisions for the inactive queue, only for the
 	 * active queue.
 	 *
 	 * maxlaunder limits the number of dirty pages we flush per scan.
 	 * For most systems a smaller value (16 or 32) is more robust under
 	 * extreme memory and disk pressure because any unnecessary writes
 	 * to disk can result in extreme performance degredation.  However,
 	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
 	 * used) will die horribly with limited laundering.  If the pageout
 	 * daemon cannot clean enough pages in the first pass, we let it go
 	 * all out in succeeding passes.
 	 */
 	if ((maxlaunder = vm_max_launder) <= 1)
 		maxlaunder = 1;
 	if (pass)
 		maxlaunder = 10000;
 	vm_page_lock_queues();
 rescan0:
 	addl_page_shortage = addl_page_shortage_init;
 	maxscan = cnt.v_inactive_count;
 
 	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
 	     m != NULL && maxscan-- > 0 && page_shortage > 0;
 	     m = next) {
 
 		cnt.v_pdpages++;
 
 		if (m->queue != PQ_INACTIVE) {
 			goto rescan0;
 		}
 
 		next = TAILQ_NEXT(m, pageq);
 
 		/*
 		 * skip marker pages
 		 */
 		if (m->flags & PG_MARKER)
 			continue;
 
 		/*
 		 * A held page may be undergoing I/O, so skip it.
 		 */
 		if (m->hold_count) {
 			vm_pageq_requeue(m);
 			addl_page_shortage++;
 			continue;
 		}
 		/*
 		 * Don't mess with busy pages, keep in the front of the
 		 * queue, most likely are being paged out.
 		 */
 		if (m->busy || (m->flags & PG_BUSY)) {
 			addl_page_shortage++;
 			continue;
 		}
 
 		/*
 		 * If the object is not being used, we ignore previous 
 		 * references.
 		 */
 		if (m->object->ref_count == 0) {
 			vm_page_flag_clear(m, PG_REFERENCED);
 			pmap_clear_reference(m);
 
 		/*
 		 * Otherwise, if the page has been referenced while in the 
 		 * inactive queue, we bump the "activation count" upwards, 
 		 * making it less likely that the page will be added back to 
 		 * the inactive queue prematurely again.  Here we check the 
 		 * page tables (or emulated bits, if any), given the upper 
 		 * level VM system not knowing anything about existing 
 		 * references.
 		 */
 		} else if (((m->flags & PG_REFERENCED) == 0) &&
 			(actcount = pmap_ts_referenced(m))) {
 			vm_page_activate(m);
 			m->act_count += (actcount + ACT_ADVANCE);
 			continue;
 		}
 
 		/*
 		 * If the upper level VM system knows about any page 
 		 * references, we activate the page.  We also set the 
 		 * "activation count" higher than normal so that we will less 
 		 * likely place pages back onto the inactive queue again.
 		 */
 		if ((m->flags & PG_REFERENCED) != 0) {
 			vm_page_flag_clear(m, PG_REFERENCED);
 			actcount = pmap_ts_referenced(m);
 			vm_page_activate(m);
 			m->act_count += (actcount + ACT_ADVANCE + 1);
 			continue;
 		}
 
 		/*
 		 * If the upper level VM system doesn't know anything about 
 		 * the page being dirty, we have to check for it again.  As 
 		 * far as the VM code knows, any partially dirty pages are 
 		 * fully dirty.
 		 */
 		if (m->dirty == 0 && !pmap_is_modified(m)) {
 			/*
 			 * Avoid a race condition: Unless write access is
 			 * removed from the page, another processor could
 			 * modify it before all access is removed by the call
 			 * to vm_page_cache() below.  If vm_page_cache() finds
 			 * that the page has been modified when it removes all
 			 * access, it panics because it cannot cache dirty
 			 * pages.  In principle, we could eliminate just write
 			 * access here rather than all access.  In the expected
 			 * case, when there are no last instant modifications
 			 * to the page, removing all access will be cheaper
 			 * overall.
 			 */
 			if ((m->flags & PG_WRITEABLE) != 0)
 				pmap_remove_all(m);
 		} else {
 			vm_page_dirty(m);
 		}
 
 		object = m->object;
 		if (!VM_OBJECT_TRYLOCK(object))
 			continue;
 		if (m->valid == 0) {
 			/*
 			 * Invalid pages can be easily freed
 			 */
 			vm_page_busy(m);
 			pmap_remove_all(m);
 			vm_page_free(m);
 			cnt.v_dfree++;
 			--page_shortage;
 		} else if (m->dirty == 0) {
 			/*
 			 * Clean pages can be placed onto the cache queue.
 			 * This effectively frees them.
 			 */
 			vm_page_cache(m);
 			--page_shortage;
 		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
 			/*
 			 * Dirty pages need to be paged out, but flushing
 			 * a page is extremely expensive versus freeing
 			 * a clean page.  Rather than artificially limiting
 			 * the number of pages we can flush, we instead give
 			 * dirty pages extra priority on the inactive queue
 			 * by forcing them to be cycled through the queue
 			 * twice before being flushed, after which the
 			 * (now clean) page will cycle through once more
 			 * before being freed.  This significantly extends
 			 * the thrash point for a heavily loaded machine.
 			 */
 			vm_page_flag_set(m, PG_WINATCFLS);
 			vm_pageq_requeue(m);
 		} else if (maxlaunder > 0) {
 			/*
 			 * We always want to try to flush some dirty pages if
 			 * we encounter them, to keep the system stable.
 			 * Normally this number is small, but under extreme
 			 * pressure where there are insufficient clean pages
 			 * on the inactive queue, we may have to go all out.
 			 */
 			int swap_pageouts_ok;
 			struct vnode *vp = NULL;
 			struct mount *mp;
 
 			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
 				swap_pageouts_ok = 1;
 			} else {
 				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
 				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
 				vm_page_count_min());
 										
 			}
 
 			/*
 			 * We don't bother paging objects that are "dead".  
 			 * Those objects are in a "rundown" state.
 			 */
 			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
 				VM_OBJECT_UNLOCK(object);
 				vm_pageq_requeue(m);
 				continue;
 			}
 
 			/*
 			 * The object is already known NOT to be dead.   It
 			 * is possible for the vget() to block the whole
 			 * pageout daemon, but the new low-memory handling
 			 * code should prevent it.
 			 *
 			 * The previous code skipped locked vnodes and, worse,
 			 * reordered pages in the queue.  This results in
 			 * completely non-deterministic operation and, on a
 			 * busy system, can lead to extremely non-optimal
 			 * pageouts.  For example, it can cause clean pages
 			 * to be freed and dirty pages to be moved to the end
 			 * of the queue.  Since dirty pages are also moved to
 			 * the end of the queue once-cleaned, this gives
 			 * way too large a weighting to deferring the freeing
 			 * of dirty pages.
 			 *
 			 * We can't wait forever for the vnode lock, we might
 			 * deadlock due to a vn_read() getting stuck in
 			 * vm_wait while holding this vnode.  We skip the 
 			 * vnode if we can't get it in a reasonable amount
 			 * of time.
 			 */
 			if (object->type == OBJT_VNODE) {
 				vp = object->handle;
 				mp = NULL;
 				if (vp->v_type == VREG)
 					vn_start_write(vp, &mp, V_NOWAIT);
 				vm_page_unlock_queues();
 				VI_LOCK(vp);
 				VM_OBJECT_UNLOCK(object);
 				if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK |
 				    LK_TIMELOCK, curthread)) {
 					VM_OBJECT_LOCK(object);
 					vm_page_lock_queues();
 					++pageout_lock_miss;
 					vn_finished_write(mp);
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					VM_OBJECT_UNLOCK(object);
 					continue;
 				}
 				VM_OBJECT_LOCK(object);
 				vm_page_lock_queues();
 				/*
 				 * The page might have been moved to another
 				 * queue during potential blocking in vget()
 				 * above.  The page might have been freed and
 				 * reused for another vnode.  The object might
 				 * have been reused for another vnode.
 				 */
 				if (m->queue != PQ_INACTIVE ||
 				    m->object != object ||
 				    object->handle != vp) {
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					goto unlock_and_continue;
 				}
 	
 				/*
 				 * The page may have been busied during the
 				 * blocking in vput();  We don't move the
 				 * page back onto the end of the queue so that
 				 * statistics are more correct if we don't.
 				 */
 				if (m->busy || (m->flags & PG_BUSY)) {
 					goto unlock_and_continue;
 				}
 
 				/*
 				 * If the page has become held it might
 				 * be undergoing I/O, so skip it
 				 */
 				if (m->hold_count) {
 					vm_pageq_requeue(m);
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					goto unlock_and_continue;
 				}
 			}
 
 			/*
 			 * If a page is dirty, then it is either being washed
 			 * (but not yet cleaned) or it is still in the
 			 * laundry.  If it is still in the laundry, then we
 			 * start the cleaning operation. 
 			 *
 			 * This operation may cluster, invalidating the 'next'
 			 * pointer.  To prevent an inordinate number of
 			 * restarts we use our marker to remember our place.
 			 *
 			 * decrement page_shortage on success to account for
 			 * the (future) cleaned page.  Otherwise we could wind
 			 * up laundering or cleaning too many pages.
 			 */
 			s = splvm();
 			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
 			splx(s);
 			if (vm_pageout_clean(m) != 0) {
 				--page_shortage;
 				--maxlaunder;
 			}
 			s = splvm();
 			next = TAILQ_NEXT(&marker, pageq);
 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
 			splx(s);
 unlock_and_continue:
 			VM_OBJECT_UNLOCK(object);
 			if (vp) {
 				vm_page_unlock_queues();
 				vput(vp);
 				vn_finished_write(mp);
 				vm_page_lock_queues();
 			}
 			continue;
 		}
 		VM_OBJECT_UNLOCK(object);
 	}
 
 	/*
 	 * Compute the number of pages we want to try to move from the
 	 * active queue to the inactive queue.
 	 */
 	page_shortage = vm_paging_target() +
 		cnt.v_inactive_target - cnt.v_inactive_count;
 	page_shortage += addl_page_shortage;
 
 	/*
 	 * Scan the active queue for things we can deactivate. We nominally
 	 * track the per-page activity counter and use it to locate
 	 * deactivation candidates.
 	 */
 	pcount = cnt.v_active_count;
 	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
 
 	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
 
 		KASSERT(m->queue == PQ_ACTIVE,
 		    ("vm_pageout_scan: page %p isn't active", m));
 
 		next = TAILQ_NEXT(m, pageq);
 		/*
 		 * Don't deactivate pages that are busy.
 		 */
 		if ((m->busy != 0) ||
 		    (m->flags & PG_BUSY) ||
 		    (m->hold_count != 0)) {
 			vm_pageq_requeue(m);
 			m = next;
 			continue;
 		}
 
 		/*
 		 * The count for pagedaemon pages is done after checking the
 		 * page for eligibility...
 		 */
 		cnt.v_pdpages++;
 
 		/*
 		 * Check to see "how much" the page has been used.
 		 */
 		actcount = 0;
 		if (m->object->ref_count != 0) {
 			if (m->flags & PG_REFERENCED) {
 				actcount += 1;
 			}
 			actcount += pmap_ts_referenced(m);
 			if (actcount) {
 				m->act_count += ACT_ADVANCE + actcount;
 				if (m->act_count > ACT_MAX)
 					m->act_count = ACT_MAX;
 			}
 		}
 
 		/*
 		 * Since we have "tested" this bit, we need to clear it now.
 		 */
 		vm_page_flag_clear(m, PG_REFERENCED);
 
 		/*
 		 * Only if an object is currently being used, do we use the
 		 * page activation count stats.
 		 */
 		if (actcount && (m->object->ref_count != 0)) {
 			vm_pageq_requeue(m);
 		} else {
 			m->act_count -= min(m->act_count, ACT_DECLINE);
 			if (vm_pageout_algorithm ||
 			    m->object->ref_count == 0 ||
 			    m->act_count == 0) {
 				page_shortage--;
 				if (m->object->ref_count == 0) {
 					pmap_remove_all(m);
 					if (m->dirty == 0)
 						vm_page_cache(m);
 					else
 						vm_page_deactivate(m);
 				} else {
 					vm_page_deactivate(m);
 				}
 			} else {
 				vm_pageq_requeue(m);
 			}
 		}
 		m = next;
 	}
 	s = splvm();
 
 	/*
 	 * We try to maintain some *really* free pages, this allows interrupt
 	 * code to be guaranteed space.  Since both cache and free queues 
 	 * are considered basically 'free', moving pages from cache to free
 	 * does not affect other calculations.
 	 */
 	while (cnt.v_free_count < cnt.v_free_reserved) {
 		static int cache_rover = 0;
 
 		if ((m = vm_page_select_cache(cache_rover)) == NULL)
 			break;
 		cache_rover = (m->pc + PQ_PRIME2) & PQ_L2_MASK;
 		object = m->object;
 		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 		vm_page_busy(m);
 		vm_page_free(m);
 		VM_OBJECT_UNLOCK(object);
 		cnt.v_dfree++;
 	}
 	splx(s);
 	vm_page_unlock_queues();
 #if !defined(NO_SWAPPING)
 	/*
 	 * Idle process swapout -- run once per second.
 	 */
 	if (vm_swap_idle_enabled) {
 		static long lsec;
 		if (time_second != lsec) {
 			vm_pageout_req_swapout |= VM_SWAP_IDLE;
 			vm_req_vmdaemon();
 			lsec = time_second;
 		}
 	}
 #endif
 		
 	/*
 	 * If we didn't get enough free pages, and we have skipped a vnode
 	 * in a writeable object, wakeup the sync daemon.  And kick swapout
 	 * if we did not get enough free pages.
 	 */
 	if (vm_paging_target() > 0) {
 		if (vnodes_skipped && vm_page_count_min())
 			(void) speedup_syncer();
 #if !defined(NO_SWAPPING)
 		if (vm_swap_enabled && vm_page_count_target()) {
 			vm_req_vmdaemon();
 			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
 		}
 #endif
 	}
 
 	/*
 	 * If we are critically low on one of RAM or swap and low on
 	 * the other, kill the largest process.  However, we avoid
 	 * doing this on the first pass in order to give ourselves a
 	 * chance to flush out dirty vnode-backed pages and to allow
 	 * active pages to be moved to the inactive queue and reclaimed.
 	 *
 	 * We keep the process bigproc locked once we find it to keep anyone
 	 * from messing with it; however, there is a possibility of
 	 * deadlock if process B is bigproc and one of its child processes
 	 * attempts to propagate a signal to B while we are waiting for A's
 	 * lock while walking this list.  To avoid this, we don't block on
 	 * the process lock but just skip a process if it is already locked.
 	 */
 	if (pass != 0 &&
 	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
 	     (swap_pager_full && vm_paging_target() > 0))) {
 		bigproc = NULL;
 		bigsize = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			int breakout;
 
 			if (PROC_TRYLOCK(p) == 0)
 				continue;
 			/*
 			 * If this is a system or protected process, skip it.
 			 */
 			if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
 			    (p->p_flag & P_PROTECTED) ||
 			    ((p->p_pid < 48) && (swap_pager_avail != 0))) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * If the process is in a non-running type state,
 			 * don't touch it.  Check all the threads individually.
 			 */
 			mtx_lock_spin(&sched_lock);
 			breakout = 0;
 			FOREACH_THREAD_IN_PROC(p, td) {
 				if (!TD_ON_RUNQ(td) &&
 				    !TD_IS_RUNNING(td) &&
 				    !TD_IS_SLEEPING(td)) {
 					breakout = 1;
 					break;
 				}
 			}
 			if (breakout) {
 				mtx_unlock_spin(&sched_lock);
 				PROC_UNLOCK(p);
 				continue;
 			}
 			mtx_unlock_spin(&sched_lock);
 			/*
 			 * get the process size
 			 */
 			if (!vm_map_trylock_read(&p->p_vmspace->vm_map)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			size = vmspace_swap_count(p->p_vmspace);
 			vm_map_unlock_read(&p->p_vmspace->vm_map);
 			size += vmspace_resident_count(p->p_vmspace);
 			/*
 			 * if this process is bigger than the biggest one
 			 * remember it.
 			 */
 			if (size > bigsize) {
 				if (bigproc != NULL)
 					PROC_UNLOCK(bigproc);
 				bigproc = p;
 				bigsize = size;
 			} else
 				PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		if (bigproc != NULL) {
-			struct ksegrp *kg;
 			killproc(bigproc, "out of swap space");
 			mtx_lock_spin(&sched_lock);
-			FOREACH_KSEGRP_IN_PROC(bigproc, kg) {
-				sched_nice(kg, PRIO_MIN); /* XXXKSE ??? */
-			}
+			sched_nice(bigproc, PRIO_MIN);
 			mtx_unlock_spin(&sched_lock);
 			PROC_UNLOCK(bigproc);
 			wakeup(&cnt.v_free_count);
 		}
 	}
 	mtx_unlock(&Giant);
 }
 
 /*
  * Periodic statistics pass over the active queue.  When the free,
  * inactive and cache page counts together fall short of the sum of
  * their targets, walk (part of) the active queue and age each page's
  * act_count, so that usage information has already been accumulated
  * by the time real paging starts.
  */
 static void
 vm_pageout_page_stats()
 {
 	static int fullintervalcount = 0;
 	vm_page_t pg, succ;
 	int budget, partial;		/* Number of pages to check */
 	int shortfall;
 	int refs;
 	int spl;
 
 	shortfall =
 	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
 	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
 	if (shortfall <= 0)
 		return;
 
 	spl = splvm();
 	vm_page_lock_queues();
 
 	/*
 	 * Once the accumulated interval reaches
 	 * vm_pageout_full_stats_interval the whole active queue is
 	 * eligible and the accumulator restarts; until then the scan is
 	 * capped at a fraction of the queue scaled by vm_pageout_stats_max.
 	 */
 	budget = cnt.v_active_count;
 	fullintervalcount += vm_pageout_stats_interval;
 	if (fullintervalcount >= vm_pageout_full_stats_interval) {
 		fullintervalcount = 0;
 	} else {
 		partial = (vm_pageout_stats_max * cnt.v_active_count) /
 		    cnt.v_page_count;
 		if (partial < budget)
 			budget = partial;
 	}
 
 	for (pg = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
 	    pg != NULL && budget-- > 0; pg = succ) {
 		KASSERT(pg->queue == PQ_ACTIVE,
 		    ("vm_pageout_page_stats: page %p isn't active", pg));
 
 		/*
 		 * Remember the successor before this page is requeued
 		 * or deactivated.
 		 */
 		succ = TAILQ_NEXT(pg, pageq);
 
 		/*
 		 * Busy or held pages are requeued without being aged.
 		 */
 		if (pg->busy != 0 || (pg->flags & PG_BUSY) ||
 		    pg->hold_count != 0) {
 			vm_pageq_requeue(pg);
 			continue;
 		}
 
 		/*
 		 * Count references: one for the software flag (which is
 		 * consumed here) plus whatever the pmap layer reports.
 		 */
 		refs = 0;
 		if (pg->flags & PG_REFERENCED) {
 			vm_page_flag_clear(pg, PG_REFERENCED);
 			refs = 1;
 		}
 		refs += pmap_ts_referenced(pg);
 
 		if (refs != 0) {
 			/*
 			 * Referenced: raise the activity count, clamped
 			 * at ACT_MAX.
 			 */
 			pg->act_count += ACT_ADVANCE + refs;
 			if (pg->act_count > ACT_MAX)
 				pg->act_count = ACT_MAX;
 			vm_pageq_requeue(pg);
 		} else if (pg->act_count != 0) {
 			/*
 			 * Unreferenced but not yet cold: decay the count.
 			 */
 			pg->act_count -= min(pg->act_count, ACT_DECLINE);
 			vm_pageq_requeue(pg);
 		} else {
 			/*
 			 * Cold page.  Access is turned off so that RSS
 			 * statistics are more accurate.  The normal
 			 * deactivation path does not do this when the
 			 * system is loaded VM-wise, because the cost of
 			 * the many page protect operations would exceed
 			 * the value of doing them.
 			 */
 			pmap_remove_all(pg);
 			vm_page_deactivate(pg);
 		}
 	}
 	vm_page_unlock_queues();
 	splx(spl);
 }
 
 /*
  *	vm_pageout is the high level pageout daemon.
  *
  *	It first derives the paging thresholds kept in `cnt' (and a few
  *	tunables that are still zero) from the page counts, initializes
  *	the swap pager, and then loops forever: each iteration either
  *	sleeps until pages are needed or the statistics interval passes,
  *	and then runs vm_pageout_scan() or vm_pageout_page_stats().
  *	This function never returns.
  */
 static void
 vm_pageout()
 {
 	int error, pass, s;
 
 	/*
 	 * Initialize some paging parameters.
 	 */
 	cnt.v_interrupt_free_min = 2;
 	if (cnt.v_page_count < 2000)
 		vm_pageout_page_count = 8;
 
 	/*
 	 * v_free_reserved needs to include enough for the largest
 	 * swap pager structures plus enough for any pv_entry structs
 	 * when paging.
 	 *
 	 * Note the ordering below: v_free_severe is taken as half of
 	 * the base v_free_min, and only afterwards is v_free_reserved
 	 * added to both v_free_min and v_free_severe.
 	 */
 	if (cnt.v_page_count > 1024)
 		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
 	else
 		cnt.v_free_min = 4;
 	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
 	    cnt.v_interrupt_free_min;
 	cnt.v_free_reserved = vm_pageout_page_count +
 	    cnt.v_pageout_free_min + (cnt.v_page_count / 768) + PQ_L2_SIZE;
 	cnt.v_free_severe = cnt.v_free_min / 2;
 	cnt.v_free_min += cnt.v_free_reserved;
 	cnt.v_free_severe += cnt.v_free_reserved;
 
 	/*
 	 * v_free_target and v_cache_min control pageout hysteresis.  Note
 	 * that these are more a measure of the VM cache queue hysteresis
 	 * than the VM free queue.  Specifically, v_free_target is the
 	 * high water mark (free+cache pages).
 	 *
 	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
 	 * low water mark, while v_free_min is the stop.  v_cache_min must
 	 * be big enough to handle memory needs while the pageout daemon
 	 * is signalled and run to free more pages.
 	 */
 	if (cnt.v_free_count > 6144)
 		cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
 	else
 		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
 
 	/*
 	 * With few free pages (2048 or less) the cache thresholds are
 	 * zeroed and the inactive target is a quarter of the free count.
 	 */
 	if (cnt.v_free_count > 2048) {
 		cnt.v_cache_min = cnt.v_free_target;
 		cnt.v_cache_max = 2 * cnt.v_cache_min;
 		cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
 	} else {
 		cnt.v_cache_min = 0;
 		cnt.v_cache_max = 0;
 		cnt.v_inactive_target = cnt.v_free_count / 4;
 	}
 	/*
 	 * In either case, never aim for more than a third of the free
 	 * count on the inactive queue.
 	 */
 	if (cnt.v_inactive_target > cnt.v_free_count / 3)
 		cnt.v_inactive_target = cnt.v_free_count / 3;
 
 	/* XXX does not really belong here */
 	if (vm_page_max_wired == 0)
 		vm_page_max_wired = cnt.v_free_count / 3;
 
 	/*
 	 * The remaining tunables are given defaults only when they are
 	 * still zero, so a value set before this point is kept.
 	 */
 	if (vm_pageout_stats_max == 0)
 		vm_pageout_stats_max = cnt.v_free_target;
 
 	/*
 	 * Set interval in seconds for stats scan.
 	 */
 	if (vm_pageout_stats_interval == 0)
 		vm_pageout_stats_interval = 5;
 	if (vm_pageout_full_stats_interval == 0)
 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
 
 	/*
 	 * Set maximum free per pass
 	 */
 	if (vm_pageout_stats_free_max == 0)
 		vm_pageout_stats_free_max = 5;
 
 	swap_pager_swap_init();
 	pass = 0;
 	/*
 	 * The pageout daemon is never done, so loop forever.
 	 */
 	while (TRUE) {
 		s = splvm();
 		vm_page_lock_queues();
 		/*
 		 * If we have enough free memory, wakeup waiters.  Do
 		 * not clear vm_pages_needed until we reach our target,
 		 * otherwise we may be woken up over and over again and
 		 * waste a lot of cpu.
 		 */
 		if (vm_pages_needed && !vm_page_count_min()) {
 			if (!vm_paging_needed())
 				vm_pages_needed = 0;
 			wakeup(&cnt.v_free_count);
 		}
 		if (vm_pages_needed) {
 			/*
 			 * Still not done, take a second pass without waiting
 			 * (unlimited dirty cleaning), otherwise sleep a bit
 			 * and try again.
 			 *
 			 * Only passes beyond the second (pass > 1) are
 			 * preceded by the hz/2 sleep; msleep() is handed
 			 * the page queue mutex taken above.
 			 */
 			++pass;
 			if (pass > 1)
 				msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM,
 				       "psleep", hz/2);
 		} else {
 			/*
 			 * Good enough, sleep & handle stats.  Prime the pass
 			 * for the next run.
 			 */
 			if (pass > 1)
 				pass = 1;
 			else
 				pass = 0;
 			error = msleep(&vm_pages_needed, &vm_page_queue_mtx, PVM,
 				    "psleep", vm_pageout_stats_interval * hz);
 			/*
 			 * A non-zero return with no page request pending
 			 * (presumably the sleep timed out -- confirm against
 			 * msleep(9)) means only the statistics pass is due:
 			 * drop the lock and spl, reset pass, and go around.
 			 */
 			if (error && !vm_pages_needed) {
 				vm_page_unlock_queues();
 				splx(s);
 				pass = 0;
 				vm_pageout_page_stats();
 				continue;
 			}
 		}
 		/*
 		 * Count this as a pagedaemon wakeup only when pages were
 		 * actually requested, then scan with the lock released.
 		 */
 		if (vm_pages_needed)
 			cnt.v_pdwakeups++;
 		vm_page_unlock_queues();
 		splx(s);
 		vm_pageout_scan(pass);
 	}
 }
 
 /*
  * Request a run of the pageout daemon: set vm_pages_needed and wake
  * any sleeper on it.  Nothing is done when a request is already
  * pending or when the caller is itself a thread of pageproc.
  *
  * The result is only advisory unless the caller holds the page queue
  * lock.  In particular, a caller must not msleep() on
  * &cnt.v_free_count after calling this function unless it keeps the
  * page queue lock held until that msleep() is performed.
  */
 void
 pagedaemon_wakeup()
 {
 
 	if (vm_pages_needed || curthread->td_proc == pageproc)
 		return;
 
 	vm_pages_needed = 1;
 	wakeup(&vm_pages_needed);
 }
 
 #if !defined(NO_SWAPPING)
 /*
  * Wake the sleeper on vm_daemon_needed, but at most once per `hz'
  * ticks: the wakeup is issued only when more than hz ticks have passed
  * since the previous one, or when `ticks' has fallen below the
  * recorded value.
  */
 static void
 vm_req_vmdaemon()
 {
 	static int last_wakeup = 0;
 
 	if (ticks <= (last_wakeup + hz) && ticks >= last_wakeup)
 		return;
 
 	wakeup(&vm_daemon_needed);
 	last_wakeup = ticks;
 }
 
 /*
  * Main loop of the vm daemon.  Acquires Giant once and then, forever:
  * sleeps on vm_daemon_needed, performs any swapout requested through
  * vm_pageout_req_swapout, and walks allproc deactivating pages of
  * processes whose resident page count has reached their RLIMIT_RSS
  * limit (or of processes that are not PS_INMEM, whose limit is forced
  * to zero).  This function never returns.
  */
 static void
 vm_daemon()
 {
 	struct rlimit rsslim;
 	struct proc *p;
 	struct thread *td;
 	int breakout;
 
 	mtx_lock(&Giant);
 	while (TRUE) {
 		/*
 		 * Wait to be woken on vm_daemon_needed (timeout argument
 		 * of 0 -- presumably "no timeout"; confirm against
 		 * tsleep(9)).
 		 */
 		tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
 		/*
 		 * Hand any pending swapout request flags to
 		 * swapout_procs() and clear them.
 		 */
 		if (vm_pageout_req_swapout) {
 			swapout_procs(vm_pageout_req_swapout);
 			vm_pageout_req_swapout = 0;
 		}
 		/*
 		 * scan the processes for exceeding their rlimits or if
 		 * process is swapped out -- deactivate pages
 		 */
 		sx_slock(&allproc_lock);
 		LIST_FOREACH(p, &allproc, p_list) {
 			vm_pindex_t limit, size;
 
 			/*
 			 * Skip system processes and processes flagged
 			 * P_WEXIT.
 			 */
 			PROC_LOCK(p);
 			if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * if the process is in a non-running type state,
 			 * don't touch it.  A single thread that is neither
 			 * on a run queue, running, nor sleeping is enough
 			 * to skip the whole process.
 			 */
 			mtx_lock_spin(&sched_lock);
 			breakout = 0;
 			FOREACH_THREAD_IN_PROC(p, td) {
 				if (!TD_ON_RUNQ(td) &&
 				    !TD_IS_RUNNING(td) &&
 				    !TD_IS_SLEEPING(td)) {
 					breakout = 1;
 					break;
 				}
 			}
 			mtx_unlock_spin(&sched_lock);
 			if (breakout) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * get a limit: the smaller of the soft and hard
 			 * RLIMIT_RSS values, converted with OFF_TO_IDX().
 			 */
 			lim_rlimit(p, RLIMIT_RSS, &rsslim);
 			limit = OFF_TO_IDX(
 			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
 
 			/*
 			 * let processes that are swapped out really be
 			 * swapped out set the limit to nothing (will force a
 			 * swap-out.)
 			 */
 			if ((p->p_sflag & PS_INMEM) == 0)
 				limit = 0;	/* XXX */
 			PROC_UNLOCK(p);
 
 			/*
 			 * NOTE(review): p->p_vmspace is read here after the
 			 * process lock has been dropped -- confirm this is
 			 * safe under allproc_lock/Giant.
 			 * NOTE(review): if vm_pindex_t is an unsigned type,
 			 * the `limit >= 0' test is always true -- confirm.
 			 */
 			size = vmspace_resident_count(p->p_vmspace);
 			if (limit >= 0 && size >= limit) {
 				vm_pageout_map_deactivate_pages(
 				    &p->p_vmspace->vm_map, limit);
 			}
 		}
 		sx_sunlock(&allproc_lock);
 	}
 }
 #endif			/* !defined(NO_SWAPPING) */