Index: head/sys/kern/kern_acct.c =================================================================== --- head/sys/kern/kern_acct.c (revision 167210) +++ head/sys/kern/kern_acct.c (revision 167211) @@ -1,580 +1,578 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1994 Christopher G. Demetriou * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The routines implemented in this file are described in: * Leffler, et al.: The Design and Implementation of the 4.3BSD * UNIX Operating System (Addison-Wesley, 1989) * on pages 62-63. * * Arguably, to simplify accounting operations, this mechanism should * be replaced by one in which an accounting log file (similar to /dev/klog) * is read by a user process, etc. However, that has its own problems. */ /* * Internal accounting functions. * The former's operation is described in Leffler, et al., and the latter * was provided by UCB with the 4.4BSD-Lite release */ static comp_t encode_comp_t(u_long, u_long); static void acctwatch(void); static void acct_thread(void *); static int acct_disable(struct thread *); /* * Accounting vnode pointer, saved vnode pointer, and flags for each. * acct_sx protects against changes to the active vnode and credentials * while accounting records are being committed to disk. */ static int acct_configured; static int acct_suspended; static struct vnode *acct_vp; static struct ucred *acct_cred; static int acct_flags; static struct sx acct_sx; SX_SYSINIT(acct, &acct_sx, "acct_sx"); /* * State of the accounting kthread. */ static int acct_state; #define ACCT_RUNNING 1 /* Accounting kthread is running. */ #define ACCT_EXITREQ 2 /* Accounting kthread should exit. */ /* * Values associated with enabling and disabling accounting */ static int acctsuspend = 2; /* stop accounting when < 2% free space left */ SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, &acctsuspend, 0, "percentage of free disk space below which accounting stops"); static int acctresume = 4; /* resume when free space has risen to > 4% */ SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, &acctresume, 0, "percentage of free disk space above which accounting resumes"); static int acctchkfreq = 15; /* frequency (in seconds) to check space */ static int sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS) { int error, value; /* Write out the old value. */ error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int)); if (error || req->newptr == NULL) return (error); /* Read in and verify the new value. */ error = SYSCTL_IN(req, &value, sizeof(int)); if (error) return (error); if (value <= 0) return (EINVAL); acctchkfreq = value; return (0); } SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, CTLTYPE_INT|CTLFLAG_RW, &acctchkfreq, 0, sysctl_acct_chkfreq, "I", "frequency for checking the free space"); SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0, "Accounting configured or not"); SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0, "Accounting suspended or not"); /* - * Accounting system call.
Written based on the specification and - * previous implementation done by Mark Tinguely. - * - * MPSAFE + * Accounting system call. Written based on the specification and previous + * implementation done by Mark Tinguely. */ int acct(struct thread *td, struct acct_args *uap) { struct nameidata nd; int error, flags, vfslocked; error = priv_check(td, PRIV_ACCT); if (error) return (error); /* * If accounting is to be started to a file, open that file for * appending and make sure it's a 'normal'. */ if (uap->path != NULL) { NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); flags = FWRITE | O_APPEND; error = vn_open(&nd, &flags, 0, -1); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC error = mac_check_system_acct(td->td_ucred, nd.ni_vp); if (error) { VOP_UNLOCK(nd.ni_vp, 0, td); vn_close(nd.ni_vp, flags, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } #endif VOP_UNLOCK(nd.ni_vp, 0, td); if (nd.ni_vp->v_type != VREG) { vn_close(nd.ni_vp, flags, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (EACCES); } VFS_UNLOCK_GIANT(vfslocked); #ifdef MAC } else { error = mac_check_system_acct(td->td_ucred, NULL); if (error) return (error); #endif } /* * Disallow concurrent access to the accounting vnode while we swap * it out, in order to prevent access after close. */ sx_xlock(&acct_sx); /* * If accounting was previously enabled, kill the old space-watcher, * close the file, and (if no new file was specified, leave). Reset * the suspended state regardless of whether accounting remains * enabled. */ acct_suspended = 0; if (acct_vp != NULL) { vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount); error = acct_disable(td); VFS_UNLOCK_GIANT(vfslocked); } if (uap->path == NULL) { if (acct_state & ACCT_RUNNING) { acct_state |= ACCT_EXITREQ; wakeup(&acct_state); } sx_xunlock(&acct_sx); return (error); } /* * Save the new accounting file vnode, and schedule the new * free space watcher. */ acct_vp = nd.ni_vp; acct_cred = crhold(td->td_ucred); acct_flags = flags; if (acct_state & ACCT_RUNNING) acct_state &= ~ACCT_EXITREQ; else { /* * Try to start up an accounting kthread. We may start more * than one, but if so the extras will commit suicide as * soon as they start up. */ error = kthread_create(acct_thread, NULL, NULL, 0, 0, "accounting"); if (error) { vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount); (void) vn_close(acct_vp, acct_flags, acct_cred, td); VFS_UNLOCK_GIANT(vfslocked); crfree(acct_cred); acct_configured = 0; acct_vp = NULL; acct_cred = NULL; acct_flags = 0; sx_xunlock(&acct_sx); log(LOG_NOTICE, "Unable to start accounting thread\n"); return (error); } } acct_configured = 1; sx_xunlock(&acct_sx); log(LOG_NOTICE, "Accounting enabled\n"); return (error); } /* * Disable currently in-progress accounting by closing the vnode, dropping * our reference to the credential, and clearing the vnode's flags. */ static int acct_disable(struct thread *td) { int error; sx_assert(&acct_sx, SX_XLOCKED); error = vn_close(acct_vp, acct_flags, acct_cred, td); crfree(acct_cred); acct_configured = 0; acct_vp = NULL; acct_cred = NULL; acct_flags = 0; log(LOG_NOTICE, "Accounting disabled\n"); return (error); } /* * Write out process accounting information, on process exit. * Data to be written out is specified in Leffler, et al. * and are enumerated below. (They're also noted in the system * "acct.h" header file.) 
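*
* The records are fixed-size, so a userland reader can consume them
* directly.  A minimal sketch, assuming the standard <sys/acct.h>
* layout (the file path here is only an example):
*
*	struct acct ac;
*	int fd = open("/var/account/acct", O_RDONLY);
*	while (read(fd, &ac, sizeof(ac)) == (ssize_t)sizeof(ac))
*		printf("%.*s uid=%u\n", (int)sizeof(ac.ac_comm),
*		    ac.ac_comm, (unsigned)ac.ac_uid);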
*/ int acct_process(struct thread *td) { struct acct acct; struct timeval ut, st, tmp; struct plimit *newlim, *oldlim; struct proc *p; struct rusage *r; int t, ret, vfslocked; /* * Lockless check of accounting condition before doing the hard * work. */ if (acct_vp == NULL || acct_suspended) return (0); sx_slock(&acct_sx); /* * If accounting isn't enabled, don't bother. Have to check again * once we own the lock in case we raced with disabling of accounting * by another thread. */ if (acct_vp == NULL || acct_suspended) { sx_sunlock(&acct_sx); return (0); } p = td->td_proc; /* * Get process accounting information. */ PROC_LOCK(p); /* (1) The name of the command that ran */ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); /* (2) The amount of user and system time that was used */ calcru(p, &ut, &st); acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); /* (3) The elapsed time the command ran (and its starting time) */ tmp = boottime; timevaladd(&tmp, &p->p_stats->p_start); acct.ac_btime = tmp.tv_sec; microuptime(&tmp); timevalsub(&tmp, &p->p_stats->p_start); acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); /* (4) The average amount of memory used */ r = &p->p_stats->p_ru; tmp = ut; timevaladd(&tmp, &st); t = tmp.tv_sec * hz + tmp.tv_usec / tick; if (t) acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; else acct.ac_mem = 0; /* (5) The number of disk I/O operations done */ acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); /* (6) The UID and GID of the process */ acct.ac_uid = p->p_ucred->cr_ruid; acct.ac_gid = p->p_ucred->cr_rgid; /* (7) The terminal from which the process was started */ SESS_LOCK(p->p_session); if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) acct.ac_tty = dev2udev(p->p_pgrp->pg_session->s_ttyp->t_dev); else acct.ac_tty = NODEV; SESS_UNLOCK(p->p_session); /* (8) The boolean flags that tell how the process terminated, etc. */ acct.ac_flag = p->p_acflag; PROC_UNLOCK(p); /* * Eliminate any file size rlimit. */ newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; lim_copy(newlim, oldlim); newlim->pl_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; p->p_limit = newlim; PROC_UNLOCK(p); lim_free(oldlim); /* * Write the accounting information to the file. */ vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount); VOP_LEASE(acct_vp, td, acct_cred, LEASE_WRITE); ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct), (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED, (int *)0, td); VFS_UNLOCK_GIANT(vfslocked); sx_sunlock(&acct_sx); return (ret); } /* * Encode_comp_t converts from ticks in seconds and microseconds * to ticks in 1/AHZ seconds. The encoding is described in * Leffler, et al., on page 63. */ #define MANTSIZE 13 /* 13 bit mantissa. */ #define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ #define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ static comp_t encode_comp_t(u_long s, u_long us) { int exp, rnd; exp = 0; rnd = 0; s *= AHZ; s += us / (1000000 / AHZ); /* Maximize precision. */ while (s > MAXFRACT) { rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */ s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ exp++; } /* If we need to round up, do it (and handle overflow correctly). */ if (rnd && (++s > MAXFRACT)) { s >>= EXPSIZE; exp++; } /* Clean it up and polish it off. */ exp <<= MANTSIZE; /* Shift the exponent into place */ exp += s; /* and add on the mantissa. 
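*
* Worked example: once s holds 100000 (already in 1/AHZ units), it
* exceeds MAXFRACT (8191), so the loop shifts twice: 100000 >> 3 ==
* 12500, then 12500 >> 3 == 1562 with exp == 2.  The bit discarded
* by the second shift sets rnd, the mantissa rounds up to 1563, and
* the encoded value is (2 << MANTSIZE) + 1563.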
*/ return (exp); } /* * Periodically check the filesystem to see if accounting * should be turned on or off. Beware the case where the vnode * has been vgone()'d out from underneath us, e.g. when the file * system containing the accounting file has been forcibly unmounted. */ /* ARGSUSED */ static void acctwatch(void) { struct statfs sb; int vfslocked; sx_assert(&acct_sx, SX_XLOCKED); /* * If accounting was disabled before our kthread was scheduled, * then acct_vp might be NULL. If so, just ask our kthread to * exit and return. */ if (acct_vp == NULL) { acct_state |= ACCT_EXITREQ; return; } /* * If our vnode is no longer valid, tear it down and signal the * accounting thread to die. */ vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount); if (acct_vp->v_type == VBAD) { (void) acct_disable(NULL); VFS_UNLOCK_GIANT(vfslocked); acct_state |= ACCT_EXITREQ; return; } /* * Stopping here is better than continuing, maybe it will be VBAD * next time around. */ if (VFS_STATFS(acct_vp->v_mount, &sb, curthread) < 0) { VFS_UNLOCK_GIANT(vfslocked); return; } VFS_UNLOCK_GIANT(vfslocked); if (acct_suspended) { if (sb.f_bavail > (int64_t)(acctresume * sb.f_blocks / 100)) { acct_suspended = 0; log(LOG_NOTICE, "Accounting resumed\n"); } } else { if (sb.f_bavail <= (int64_t)(acctsuspend * sb.f_blocks / 100)) { acct_suspended = 1; log(LOG_NOTICE, "Accounting suspended\n"); } } } /* * The main loop for the dedicated kernel thread that periodically calls * acctwatch(). */ static void acct_thread(void *dummy) { u_char pri; /* This is a low-priority kernel thread. */ pri = PRI_MAX_KERN; mtx_lock_spin(&sched_lock); sched_prio(curthread, pri); mtx_unlock_spin(&sched_lock); /* If another accounting kthread is already running, just die. */ sx_xlock(&acct_sx); if (acct_state & ACCT_RUNNING) { sx_xunlock(&acct_sx); kthread_exit(0); } acct_state |= ACCT_RUNNING; /* Loop until we are asked to exit. */ while (!(acct_state & ACCT_EXITREQ)) { /* Perform our periodic checks. */ acctwatch(); /* * We check this flag again before sleeping since the * acctwatch() might have shut down accounting and asked us * to exit. */ if (!(acct_state & ACCT_EXITREQ)) { sx_xunlock(&acct_sx); tsleep(&acct_state, pri, "-", acctchkfreq * hz); sx_xlock(&acct_sx); } } /* * Acknowledge the exit request and shutdown. We clear both the * exit request and running flags. */ acct_state = 0; sx_xunlock(&acct_sx); kthread_exit(0); } Index: head/sys/kern/kern_acl.c =================================================================== --- head/sys/kern/kern_acl.c (revision 167210) +++ head/sys/kern/kern_acl.c (revision 167211) @@ -1,455 +1,431 @@ /*- * Copyright (c) 1999-2006 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL system calls and other functions common across different ACL types. * Type-specific routines go into subr_acl_<type>.c. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uma_zone_t acl_zone; static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * These calls wrap the real vnode operations, and are called by the syscall * code once the syscall has converted the path or file descriptor to a vnode * (unlocked). The aclp pointer is assumed still to point to userland, so * this should not be consumed within the kernel except by syscall code. * Other code should directly invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. */ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; struct mount *mp; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_setacl(td->td_ucred, vp, type, &inkernacl); if (error != 0) goto out; #endif error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_getacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL.
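*
* Deletion is expressed below as VOP_SETACL() with a NULL ACL
* pointer, so a file system implements set and delete through the
* same vnode operation.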
*/ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_deleteacl(td->td_ucred, vp, type); if (error) goto out; #endif error = VOP_SETACL(vp, type, 0, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't * need to lock, as the vacl_ code will get/release any locks required. */ /* * Given a file path, get an ACL for it - * - * MPSAFE */ int __acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, get an ACL for it; don't follow links. - * - * MPSAFE */ int __acl_get_link(struct thread *td, struct __acl_get_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* - * Given a file path, set an ACL for it - * - * MPSAFE + * Given a file path, set an ACL for it. */ int __acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, set an ACL for it; don't follow links. 
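*
* As with the other path-based wrappers here, the path is resolved
* with namei() (NOFOLLOW in this case), the vacl_ helper does the
* real work, and VFS_UNLOCK_GIANT() drops Giant if the file system
* needed it.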
- * - * MPSAFE */ int __acl_set_link(struct thread *td, struct __acl_set_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file descriptor, get an ACL for it - * - * MPSAFE */ int __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file descriptor, set an ACL for it - * - * MPSAFE */ int __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file path, delete an ACL from it. - * - * MPSAFE */ int __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, delete an ACL from it; don't follow links. - * - * MPSAFE */ int __acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, delete an ACL from it. - * - * MPSAFE */ int __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_delete(td, fp->f_vnode, uap->type); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* - * Given a file path, check an ACL for it - * - * MPSAFE + * Given a file path, check an ACL for it. */ int __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, check an ACL for it; don't follow links. 
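*
* Note that vacl_aclcheck() only copies the ACL in and passes it to
* VOP_ACLCHECK() for validation, without taking the vnode lock.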
- * - * MPSAFE */ int __acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* - * Given a file descriptor, check an ACL for it - * - * MPSAFE + * Given a file descriptor, check an ACL for it. */ int __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* ARGUSED */ static void aclinit(void *dummy __unused) { acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL) Index: head/sys/kern/kern_context.c =================================================================== --- head/sys/kern/kern_context.c (revision 167210) +++ head/sys/kern/kern_context.c (revision 167211) @@ -1,136 +1,130 @@ /*- * Copyright (c) 2002 Daniel M. Eischen * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include /* * The first two fields of a ucontext_t are the signal mask and * the machine context. The next field is uc_link; we want to * avoid destroying the link when copying out contexts. 
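*
* Copying UC_COPY_SIZE bytes therefore moves uc_sigmask and
* uc_mcontext while leaving the caller's uc_link, and any fields
* laid out after it, untouched.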
*/ #define UC_COPY_SIZE offsetof(ucontext_t, uc_link) #ifndef _SYS_SYSPROTO_H_ struct getcontext_args { struct __ucontext *ucp; } struct setcontext_args { const struct __ucontext_t *ucp; } struct swapcontext_args { struct __ucontext *oucp; const struct __ucontext_t *ucp; } #endif -/* - * MPSAFE - */ int getcontext(struct thread *td, struct getcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->ucp, UC_COPY_SIZE); } return (ret); } -/* - * MPSAFE - */ int setcontext(struct thread *td, struct setcontext_args *uap) { ucontext_t uc; int ret; if (uap->ucp == NULL) ret = EINVAL; else { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { SIG_CANTMASK(uc.uc_sigmask); PROC_LOCK(td->td_proc); td->td_sigmask = uc.uc_sigmask; PROC_UNLOCK(td->td_proc); } } } return (ret == 0 ? EJUSTRETURN : ret); } int swapcontext(struct thread *td, struct swapcontext_args *uap) { ucontext_t uc; int ret; if (uap->oucp == NULL || uap->ucp == NULL) ret = EINVAL; else { get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET); PROC_LOCK(td->td_proc); uc.uc_sigmask = td->td_sigmask; PROC_UNLOCK(td->td_proc); ret = copyout(&uc, uap->oucp, UC_COPY_SIZE); if (ret == 0) { ret = copyin(uap->ucp, &uc, UC_COPY_SIZE); if (ret == 0) { ret = set_mcontext(td, &uc.uc_mcontext); if (ret == 0) { SIG_CANTMASK(uc.uc_sigmask); PROC_LOCK(td->td_proc); td->td_sigmask = uc.uc_sigmask; PROC_UNLOCK(td->td_proc); } } } } return (ret == 0 ? EJUSTRETURN : ret); } Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 167210) +++ head/sys/kern/kern_descrip.c (revision 167211) @@ -1,2752 +1,2722 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); static uma_zone_t file_zone; /* How to treat 'new' parameter when allocating a fd for do_dup(). */ enum dup_type { DUP_VARIABLE, DUP_FIXED }; static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval); static int fd_first_free(struct filedesc *, int, int); static int fd_last_used(struct filedesc *, int, int); static void fdgrowtable(struct filedesc *, int); static int fdrop_locked(struct file *fp, struct thread *td); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); /* * A process is initially started out with NDFILE descriptors stored within * this structure, selected to be enough for typical applications based on * the historical limit of 20 open files (and the usage of descriptors by * shells). If these descriptors are exhausted, a larger descriptor table * may be allocated, up to a process' resource limit; the internal arrays * are then unused. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * Storage required per open file descriptor. */ #define OFILESIZE (sizeof(struct file *) + sizeof(char)) /* * Basic allocation of descriptors: * one of the above, plus arrays for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; /* * These arrays are used when the number of open files is * <= NDFILE, and are then pointed to by the pointers above. */ struct file *fd_dfiles[NDFILE]; char fd_dfileflags[NDFILE]; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ struct filelist filehead; /* head of list of open files */ int openfiles; /* actual number of open files */ struct sx filelist_lock; /* sx to protect filelist */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* A mutex to protect the association between a proc and filedesc. */ static struct mtx fdesc_mtx; /* * Find the first zero bit in the given bitmap, starting at low and not * exceeding size - 1. 
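*
* As an illustration of the bitmap macros above: with 64-bit
* NDSLOTTYPE entries, descriptor 70 lives in fd_map[NDSLOT(70)] ==
* fd_map[1] at bit NDBIT(70) == 1UL << 6.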
*/ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at low and * not exceeding size - 1. */ static int fd_last_used(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; if (low >= size) return (-1); off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(low); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (low - 1); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused(struct filedesc *fdp, int fd) { FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(!fdisused(fdp, fd), ("fd already used")); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(fdisused(fdp, fd), ("fd is already unused")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("fd is still in use")); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, 0, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getdtablesize(struct thread *td, struct getdtablesize_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); return (0); } /* * Duplicate a file descriptor to a particular value. * * note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int dup2(struct thread *td, struct dup2_args *uap) { return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, td->td_retval)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int dup(struct thread *td, struct dup_args *uap) { return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval)); } /* * The file control system call. 
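*
* For the locking commands, the wrapper below copies the struct
* flock in from userland before calling kern_fcntl() and, for
* F_GETLK, copies it back out afterwards, so kern_fcntl() only ever
* sees kernel addresses in arg.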
*/ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; intptr_t arg; int error; error = 0; switch (uap->cmd) { case F_GETLK: case F_SETLK: case F_SETLKW: error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); arg = (intptr_t)&fl; break; default: arg = uap->arg; break; } if (error) return (error); error = kern_fcntl(td, uap->fd, uap->cmd, arg); if (error) return (error); if (uap->cmd == F_GETLK) error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp; struct proc *p; char *pop; struct vnode *vp; u_int newmin; int error, flg, tmp; int giant_locked; /* * XXXRW: Some fcntl() calls require Giant -- others don't. Try to * avoid grabbing Giant for calls we know don't need it. */ switch (cmd) { case F_DUPFD: case F_GETFD: case F_SETFD: case F_GETFL: giant_locked = 0; break; default: giant_locked = 1; mtx_lock(&Giant); } error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; FILEDESC_LOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } pop = &fdp->fd_ofileflags[fd]; switch (cmd) { case F_DUPFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ FILEDESC_UNLOCK(fdp); newmin = arg; PROC_LOCK(p); if (newmin >= lim_cur(p, RLIMIT_NOFILE) || newmin >= maxfilesperproc) { PROC_UNLOCK(p); error = EINVAL; break; } PROC_UNLOCK(p); error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval); break; case F_GETFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; FILEDESC_UNLOCK(fdp); break; case F_SETFD: /* mtx_assert(&Giant, MA_NOTOWNED); */ *pop = (*pop &~ UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); FILEDESC_UNLOCK(fdp); break; case F_GETFL: /* mtx_assert(&Giant, MA_NOTOWNED); */ FILE_LOCK(fp); td->td_retval[0] = OFLAGS(fp->f_flag); FILE_UNLOCK(fp); FILEDESC_UNLOCK(fdp); break; case F_SETFL: mtx_assert(&Giant, MA_OWNED); FILE_LOCK(fp); fhold_locked(fp); fp->f_flag &= ~FCNTLFLAGS; fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; FILE_UNLOCK(fp); FILEDESC_UNLOCK(fdp); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } FILE_LOCK(fp); fp->f_flag &= ~FNONBLOCK; FILE_UNLOCK(fp); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: mtx_assert(&Giant, MA_OWNED); fhold(fp); FILEDESC_UNLOCK(fdp); error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: mtx_assert(&Giant, MA_OWNED); fhold(fp); FILEDESC_UNLOCK(fdp); tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLKW: mtx_assert(&Giant, MA_OWNED); flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: mtx_assert(&Giant, MA_OWNED); if (fp->f_type != DTYPE_VNODE) { FILEDESC_UNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { if (fp->f_offset < 0 || (flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start)) { FILEDESC_UNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. 
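*
* The filedesc lock therefore cannot be held across the call; take a
* reference on the file first, drop the lock, and check afterwards
* for a race with close (see below).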
*/ fhold(fp); FILEDESC_UNLOCK(fdp); vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); break; default: error = EINVAL; break; } /* Check for race with close */ FILEDESC_LOCK_FAST(fdp); if ((unsigned) fd >= fdp->fd_nfiles || fp != fdp->fd_ofiles[fd]) { FILEDESC_UNLOCK_FAST(fdp); flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } else FILEDESC_UNLOCK_FAST(fdp); fdrop(fp, td); break; case F_GETLK: mtx_assert(&Giant, MA_OWNED); if (fp->f_type != DTYPE_VNODE) { FILEDESC_UNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { FILEDESC_UNLOCK(fdp); error = EINVAL; break; } if (flp->l_whence == SEEK_CUR) { if ((flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && fp->f_offset < OFF_MIN - flp->l_start)) { FILEDESC_UNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. */ fhold(fp); FILEDESC_UNLOCK(fdp); vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; default: FILEDESC_UNLOCK(fdp); error = EINVAL; break; } done2: if (giant_locked) mtx_unlock(&Giant); return (error); } /* * Common code for dup, dup2, and fcntl(F_DUPFD). */ static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval) { struct filedesc *fdp; struct proc *p; struct file *fp; struct file *delfp; int error, holdleaders, maxfd; KASSERT((type == DUP_VARIABLE || type == DUP_FIXED), ("invalid dup type %d", type)); p = td->td_proc; fdp = p->p_fd; /* * Verify we have a valid descriptor to dup from and possibly to * dup to. */ if (old < 0 || new < 0) return (EBADF); PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if (new >= maxfd) return (EMFILE); FILEDESC_LOCK(fdp); if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (type == DUP_FIXED && old == new) { *retval = new; FILEDESC_UNLOCK(fdp); return (0); } fp = fdp->fd_ofiles[old]; fhold(fp); /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. Since the filedesc * lock may be temporarily dropped in the process, we have to look * out for a race. */ if (type == DUP_FIXED) { if (new >= fdp->fd_nfiles) fdgrowtable(fdp, new + 1); if (fdp->fd_ofiles[new] == NULL) fdused(fdp, new); } else { if ((error = fdalloc(td, new, &new)) != 0) { FILEDESC_UNLOCK(fdp); fdrop(fp, td); return (error); } } /* * If the old file changed out from under us then treat it as a * bad file descriptor. Userland should do its own locking to * avoid this case. 
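*
* Recovery is to release any slot allocated above and fail with
* EBADF, as if the source descriptor had never been valid.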
*/ if (fdp->fd_ofiles[old] != fp) { /* we've allocated a descriptor which we won't use */ if (fdp->fd_ofiles[new] == NULL) fdunused(fdp, new); FILEDESC_UNLOCK(fdp); fdrop(fp, td); return (EBADF); } KASSERT(old != new, ("new fd is same as old")); /* * Save info on the descriptor being overwritten. We cannot close * it without introducing an ownership race for the slot, since we * need to drop the filedesc lock to call closef(). * * XXX this duplicates parts of close(). */ delfp = fdp->fd_ofiles[new]; holdleaders = 0; if (delfp != NULL) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } } /* * Duplicate the source descriptor */ fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; /* * If we dup'd over a valid file, we now own the reference to it * and must dispose of it using closef() semantics (as if a * close() were performed on it). * * XXX this duplicates parts of close(). */ if (delfp != NULL) { knote_fdclose(td, new); if (delfp->f_type == DTYPE_MQUEUE) mq_fdclose(td, new, delfp); FILEDESC_UNLOCK(fdp); (void) closef(delfp, td); if (holdleaders) { FILEDESC_LOCK_FAST(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_UNLOCK_FAST(fdp); } } else { FILEDESC_UNLOCK(fdp); } return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. 
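*
* The sign of sio_pgid says which: negative values name a process
* group, positive values a process; the KASSERTs below cross-check
* each entry against that convention.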
*/ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Close a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int close(td, uap) struct thread *td; struct close_args *uap; { return (kern_close(td, uap->fd)); } int kern_close(td, fd) struct thread *td; int fd; { struct filedesc *fdp; struct file *fp; int error; int holdleaders; error = 0; holdleaders = 0; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_LOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; fdunused(fdp, fd); if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } /* * We now hold the fp reference that used to be owned by the descriptor * array. * We have to unlock the FILEDESC *AFTER* knote_fdclose to prevent a * race of the fd getting opened, a knote added, and deleting a knote * for the new fd. */ knote_fdclose(td, fd); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_UNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_LOCK_FAST(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_UNLOCK_FAST(fdp); } return (error); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG(fd, fd); if ((error = fget(td, fd, &fp)) != 0) return (error); AUDIT_ARG(file, td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int nfstat(struct thread *td, struct nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int fpathconf(struct thread *td, struct fpathconf_args *uap) { struct file *fp; struct vnode *vp; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); /* If asynchronous I/O is available, it works for all descriptors.
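*
* _PC_ASYNC_IO is therefore answered below from the global
* async_io_version without consulting the vnode at all.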
*/ if (uap->name == _PC_ASYNC_IO) { td->td_retval[0] = async_io_version; goto out; } vp = fp->f_vnode; if (vp != NULL) { int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_PATHCONF(vp, uap->name, td->td_retval); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (uap->name != _PC_PIPE_BUF) { error = EINVAL; } else { td->td_retval[0] = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Grow the file table to accommodate (at least) nfd descriptors. This may * block and drop the filedesc lock, but it will reacquire it before * returning. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct file **ntable; char *nfileflags; int nnfiles, onfiles; NDSLOTTYPE *nmap; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* compute the size of the new table */ onfiles = fdp->fd_nfiles; nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* allocate a new table and (if required) new bitmaps */ FILEDESC_UNLOCK(fdp); MALLOC(ntable, struct file **, nnfiles * OFILESIZE, M_FILEDESC, M_ZERO | M_WAITOK); nfileflags = (char *)&ntable[nnfiles]; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); else nmap = NULL; FILEDESC_LOCK(fdp); /* * We now have new tables ready to go. Since we dropped the * filedesc lock to call malloc(), watch out for a race. */ onfiles = fdp->fd_nfiles; if (onfiles >= nnfiles) { /* we lost the race, but that's OK */ free(ntable, M_FILEDESC); if (nmap != NULL) free(nmap, M_FILEDESC); return; } bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); bcopy(fdp->fd_ofileflags, nfileflags, onfiles); if (onfiles > NDFILE) free(fdp->fd_ofiles, M_FILEDESC); fdp->fd_ofiles = ntable; fdp->fd_ofileflags = nfileflags; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); fdp->fd_map = nmap; } fdp->fd_nfiles = nnfiles; } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd = -1, maxfd; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); /* * Search the bitmap for a free descriptor. If none is found, try * to grow the file table. Keep at it until we either get a file * descriptor or run into process or system limits; fdgrowtable() * may drop the filedesc lock, so we're in a race. */ for (;;) { fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd < fdp->fd_nfiles) break; fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("free descriptor isn't")); fdp->fd_ofileflags[fd] = 0; /* XXX needed? */ fdused(fdp, fd); *result = fd; return (0); } /* * Check to see whether n user file descriptors * are available to the process p.
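*
* Slots the table could still grow into count as free: if the
* resource limit allows lim descriptors but only fd_nfiles entries
* exist yet, the difference is available without scanning the table.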
*/ int fdavail(struct thread *td, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = td->td_proc->p_fd; struct file **fpp; int i, lim, last; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); PROC_LOCK(p); lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { if (*fpp == NULL && --n <= 0) return (1); } return (0); } /* * Create a new open file structure and allocate * a file descriptor for the process that refers to it. * We add one reference to the file for the descriptor table * and one reference for resultfp. This is to prevent us from being * preempted and having the entry in the descriptor table closed after * we release the FILEDESC lock. */ int falloc(struct thread *td, struct file **resultfp, int *resultfd) { struct proc *p = td->td_proc; struct file *fp, *fq; int error, i; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); sx_xlock(&filelist_lock); if ((openfiles >= maxuserfiles && priv_check_cred(td->td_ucred, PRIV_MAXFILES, SUSER_RUID) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", td->td_ucred->cr_ruid); } sx_xunlock(&filelist_lock); uma_zfree(file_zone, fp); return (ENFILE); } openfiles++; /* * If the process has file descriptor zero open, add the new file * descriptor to the list of open files at that point, otherwise * put it at the front of the list of open files. */ fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep); fp->f_count = 1; if (resultfp) fp->f_count++; fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; FILEDESC_LOCK(p->p_fd); if ((fq = p->p_fd->fd_ofiles[0])) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); } sx_xunlock(&filelist_lock); if ((error = fdalloc(td, 0, &i))) { FILEDESC_UNLOCK(p->p_fd); fdrop(fp, td); if (resultfp) fdrop(fp, td); return (error); } p->p_fd->fd_ofiles[i] = fp; FILEDESC_UNLOCK(p->p_fd); if (resultfp) *resultfp = fp; if (resultfd) *resultfd = i; return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. */ struct filedesc * fdinit(struct filedesc *fdp) { struct filedesc0 *newfdp; newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF); if (fdp != NULL) { FILEDESC_LOCK(fdp); newfdp->fd_fd.fd_cdir = fdp->fd_cdir; if (newfdp->fd_fd.fd_cdir) VREF(newfdp->fd_fd.fd_cdir); newfdp->fd_fd.fd_rdir = fdp->fd_rdir; if (newfdp->fd_fd.fd_rdir) VREF(newfdp->fd_fd.fd_rdir); newfdp->fd_fd.fd_jdir = fdp->fd_jdir; if (newfdp->fd_fd.fd_jdir) VREF(newfdp->fd_fd.fd_jdir); FILEDESC_UNLOCK(fdp); } /* Create the file descriptor table.
*/ newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_holdcnt = 1; newfdp->fd_fd.fd_cmask = CMASK; newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; newfdp->fd_fd.fd_nfiles = NDFILE; newfdp->fd_fd.fd_map = newfdp->fd_dmap; newfdp->fd_fd.fd_lastfile = -1; return (&newfdp->fd_fd); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; mtx_lock(&fdesc_mtx); fdp = p->p_fd; if (fdp != NULL) fdp->fd_holdcnt++; mtx_unlock(&fdesc_mtx); return (fdp); } static void fddrop(struct filedesc *fdp) { int i; mtx_lock(&fdesc_mtx); i = --fdp->fd_holdcnt; mtx_unlock(&fdesc_mtx); if (i > 0) return; mtx_destroy(&fdp->fd_mtx); FREE(fdp, M_FILEDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { FILEDESC_LOCK_FAST(fdp); fdp->fd_refcnt++; FILEDESC_UNLOCK_FAST(fdp); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy. */ void fdunshare(struct proc *p, struct thread *td) { FILEDESC_LOCK_FAST(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; FILEDESC_UNLOCK_FAST(p->p_fd); tmp = fdcopy(p->p_fd); fdfree(td); p->p_fd = tmp; } else FILEDESC_UNLOCK_FAST(p->p_fd); } /* * Copy a filedesc structure. * Passing in a NULL pointer returns a NULL reference; this is to ease * callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return (NULL); newfdp = fdinit(fdp); FILEDESC_LOCK_FAST(fdp); while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_UNLOCK_FAST(fdp); FILEDESC_LOCK(newfdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_UNLOCK(newfdp); FILEDESC_LOCK_FAST(fdp); } /* copy everything except kqueue descriptors */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { if (fdisused(fdp, i) && fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) { newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; fhold(newfdp->fd_ofiles[i]); newfdp->fd_lastfile = i; } else { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; } } FILEDESC_UNLOCK_FAST(fdp); FILEDESC_LOCK(newfdp); for (i = 0; i <= newfdp->fd_lastfile; ++i) if (newfdp->fd_ofiles[i] != NULL) fdused(newfdp, i); FILEDESC_UNLOCK(newfdp); FILEDESC_LOCK_FAST(fdp); if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_UNLOCK_FAST(fdp); return (newfdp); } /* * Release a filedesc structure. */ void fdfree(struct thread *td) { struct filedesc *fdp; struct file **fpp; int i, locked; struct filedesc_to_leader *fdtol; struct file *fp; struct vnode *cdir, *jdir, *rdir, *vp; struct flock lf; /* Certain daemons might not have file descriptors.
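A note on fdcopy() above: it duplicates table slots, not the underlying struct file objects — each copied slot just takes an extra reference via fhold(). This is the kernel side of the familiar fork(2) behaviour in which parent and child share a file offset. A userland sketch (the path /etc/motd is only an example; error handling elided):

	#include <sys/types.h>
	#include <sys/wait.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("/etc/motd", O_RDONLY);
		char c;

		if (fork() == 0) {
			read(fd, &c, 1);	/* child advances the shared offset */
			_exit(0);
		}
		wait(NULL);
		/* Prints 1: both descriptors reference one struct file. */
		printf("offset: %ld\n", (long)lseek(fd, 0, SEEK_CUR));
		return (0);
	}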
*/ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { FILEDESC_LOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0, fpp = fdp->fd_ofiles; i <= fdp->fd_lastfile; i++, fpp++) { if (*fpp == NULL || (*fpp)->f_type != DTYPE_VNODE) continue; fp = *fpp; fhold(fp); FILEDESC_UNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; locked = VFS_LOCK_GIANT(vp->v_mount); (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc-> p_leader, F_UNLCK, &lf, F_POSIX); VFS_UNLOCK_GIANT(locked); FILEDESC_LOCK(fdp); fdrop(fp, td); fpp = fdp->fd_ofiles + i; } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or do_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx, PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader * remains valid in closef(). */ fdtol->fdl_wakeup = 1; msleep(fdtol, &fdp->fd_mtx, PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; td->td_proc->p_fdtol = NULL; FILEDESC_UNLOCK(fdp); if (fdtol != NULL) FREE(fdtol, M_FILEDESC_TO_LEADER); } FILEDESC_LOCK_FAST(fdp); i = --fdp->fd_refcnt; FILEDESC_UNLOCK_FAST(fdp); if (i > 0) return; /* * We are the last reference to the structure, so we can * safely assume it will not change out from under us. */ fpp = fdp->fd_ofiles; for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { if (*fpp) (void) closef(*fpp, td); } FILEDESC_LOCK(fdp); /* XXX This should happen earlier. */ mtx_lock(&fdesc_mtx); td->td_proc->p_fd = NULL; mtx_unlock(&fdesc_mtx); if (fdp->fd_nfiles > NDFILE) FREE(fdp->fd_ofiles, M_FILEDESC); if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) FREE(fdp->fd_map, M_FILEDESC); fdp->fd_nfiles = 0; cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_UNLOCK(fdp); if (cdir) { locked = VFS_LOCK_GIANT(cdir->v_mount); vrele(cdir); VFS_UNLOCK_GIANT(locked); } if (rdir) { locked = VFS_LOCK_GIANT(rdir->v_mount); vrele(rdir); VFS_UNLOCK_GIANT(locked); } if (jdir) { locked = VFS_LOCK_GIANT(jdir->v_mount); vrele(jdir); VFS_UNLOCK_GIANT(locked); } fddrop(fdp); } /* * For setugid programs, we don't want people to use that setugidness * to generate error messages which write to a file that would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static int is_unsafe(struct file *fp) { if (fp->f_type == DTYPE_VNODE) { struct vnode *vp = fp->f_vnode; if ((vp->v_vflag & VV_PROCDEP) != 0) return (1); } return (0); } /* * Make this setugid thing safe, if at all possible. */ void setugidsafety(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors.
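As a side note on the F_UNLCK pass in fdfree() above: it implements the POSIX rule that record locks belong to the process as a whole, so closing any descriptor naming a locked file — or exiting — drops every lock the process holds on that file. A userland sketch of the classic surprise (the path /tmp/lockdemo is only an example; error handling elided):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct flock fl;
		int fd1 = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0644);
		int fd2 = open("/tmp/lockdemo", O_RDWR);

		memset(&fl, 0, sizeof(fl));
		fl.l_type = F_WRLCK;
		fl.l_whence = SEEK_SET;	/* l_start = l_len = 0: whole file */
		fcntl(fd1, F_SETLK, &fl);
		close(fd2);	/* same process, same file: the lock is gone */
		return (0);
	}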
*/ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* * Note: fdp->fd_ofiles may be reallocated out from under us while * we are blocked in a close. Be careful! */ FILEDESC_LOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); FILEDESC_UNLOCK(fdp); (void) closef(fp, td); FILEDESC_LOCK(fdp); } } FILEDESC_UNLOCK(fdp); } /* * If a specific file object occupies a specific file descriptor, * close the file descriptor entry and drop a reference on the file * object. This is a convenience function to handle a subsequent * error in a function that calls falloc() that handles the race that * another thread might have closed the file descriptor out from under * the thread creating the file object. */ void fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) { FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[idx] == fp) { fdp->fd_ofiles[idx] = NULL; fdunused(fdp, idx); FILEDESC_UNLOCK(fdp); fdrop(fp, td); } else { FILEDESC_UNLOCK(fdp); } } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; FILEDESC_LOCK(fdp); /* * We cannot cache fd_ofiles or fd_ofileflags since operations * may block and rip them out from under us. */ for (i = 0; i <= fdp->fd_lastfile; i++) { if (fdp->fd_ofiles[i] != NULL && (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, i, fp); FILEDESC_UNLOCK(fdp); (void) closef(fp, td); FILEDESC_LOCK(fdp); } } FILEDESC_UNLOCK(fdp); } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct nameidata nd; struct filedesc *fdp; struct file *fp; register_t retval; int fd, i, error, flags, devnull; fdp = td->td_proc->p_fd; if (fdp == NULL) return (0); KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); devnull = -1; error = 0; for (i = 0; i < 3; i++) { if (fdp->fd_ofiles[i] != NULL) continue; if (devnull < 0) { int vfslocked; error = falloc(td, &fp, &fd); if (error != 0) break; /* Note extra ref on `fp' held for us by falloc(). */ KASSERT(fd == i, ("oof, we didn't get our fd")); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, "/dev/null", td); flags = FREAD | FWRITE; error = vn_open(&nd, &flags, 0, fd); if (error != 0) { /* * Someone may have closed the entry in the * file descriptor table, so check it hasn't * changed before dropping the reference count. 
*/ FILEDESC_LOCK(fdp); KASSERT(fdp->fd_ofiles[fd] == fp, ("table not shared, how did it change?")); fdp->fd_ofiles[fd] = NULL; fdunused(fdp, fd); FILEDESC_UNLOCK(fdp); fdrop(fp, td); fdrop(fp, td); break; } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); fp->f_flag = flags; fp->f_vnode = nd.ni_vp; if (fp->f_data == NULL) fp->f_data = nd.ni_vp; if (fp->f_ops == &badfileops) fp->f_ops = &vnops; fp->f_type = DTYPE_VNODE; VOP_UNLOCK(nd.ni_vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); devnull = fd; fdrop(fp, td); } else { error = do_dup(td, DUP_FIXED, devnull, i, &retval); if (error != 0) break; } } return (error); } /* * Internal form of close. * Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely that Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { int vfslocked; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table * is shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_UNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_LOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_UNLOCK(fdp); } VFS_UNLOCK_GIANT(vfslocked); } return (fdrop(fp, td)); } /* * Extract the file pointer associated with the specified descriptor for * the current user process. * * If the descriptor doesn't exist, EBADF is returned. * * If the descriptor exists but doesn't match 'flags' then EBADF is * returned for both read and write attempts, as per POSIX. * * If 'hold' is set (non-zero) the file's refcount will be bumped on return. * It should be dropped with fdrop(). * If it is not set, then the refcount will not be bumped; however, the * thread's filedesc struct will be returned locked (for fgetsock). * * If an error occurred the non-zero error is returned and *fpp is set to NULL. * Otherwise *fpp is set and zero is returned.
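From userland the flag check is visible as a plain EBADF; for example (sketch, error handling elided):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("/dev/null", O_RDONLY);
		char c = 0;

		if (write(fd, &c, 1) == -1 && errno == EBADF)
			printf("write on a read-only descriptor: EBADF\n");
		return (0);
	}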
*/ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) { struct filedesc *fdp; struct file *fp; *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); FILEDESC_LOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { FILEDESC_UNLOCK(fdp); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. * * Only one flag, or 0, may be specified. */ if (flags == FREAD && (fp->f_flag & FREAD) == 0) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (hold) { fhold(fp); FILEDESC_UNLOCK(fdp); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, 0, 1)); } int fget_read(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FREAD, 1)); } int fget_write(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FWRITE, 1)); } /* * Like fget() but loads the underlying vnode, or returns an error if * the descriptor does not represent a vnode. Note that pipes use vnodes * but never have VM objects. The returned vnode will be vref()d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) { struct file *fp; int error; *vpp = NULL; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } FILEDESC_UNLOCK(td->td_proc->p_fd); return (error); } int fgetvp(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, 0)); } int fgetvp_read(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FREAD)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FWRITE)); } #endif /* * Like fget() but loads the underlying socket, or returns an error if * the descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX * lock in the future. * * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely * on their file descriptor reference to prevent the socket from being * freed during use. */ int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) { struct file *fp; int error; NET_ASSERT_GIANT(); *spp = NULL; if (fflagp != NULL) *fflagp = 0; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; } else { *spp = fp->f_data; if (fflagp) *fflagp = fp->f_flag; SOCK_LOCK(*spp); soref(*spp); SOCK_UNLOCK(*spp); } FILEDESC_UNLOCK(td->td_proc->p_fd); return (error); } /* * Drop the reference count on the socket and XXX release the SX lock in the * future. The last reference closes the socket. * * XXXRW: fputsock() is deprecated, see comment for fgetsock(). */ void fputsock(struct socket *so) { NET_ASSERT_GIANT(); ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } int fdrop(struct file *fp, struct thread *td) { FILE_LOCK(fp); return (fdrop_locked(fp, td)); } /* * Drop reference on struct file passed in, may call closef if the * reference hits zero. * Expects struct file locked, and will unlock it. 
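The count being dropped here is the same one dup(2) bumps, and only the final drop reaches fo_close() — which is why the familiar userland pattern below works (sketch, error handling elided):

	#include <fcntl.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("/dev/null", O_WRONLY);
		int fd2 = dup(fd);	/* same struct file, f_count bumped */

		close(fd);		/* f_count 2 -> 1; the file stays open */
		write(fd2, "x", 1);	/* still valid */
		close(fd2);		/* last reference: fo_close() runs */
		return (0);
	}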
*/ static int fdrop_locked(struct file *fp, struct thread *td) { int error; FILE_LOCK_ASSERT(fp, MA_OWNED); if (--fp->f_count > 0) { FILE_UNLOCK(fp); return (0); } /* * We might have just dropped the last reference to a file * object that is for a UNIX domain socket whose message * buffers are being examined in unp_gc(). If that is the * case, FWAIT will be set in f_gcflag and we need to wait for * unp_gc() to finish its scan. */ while (fp->f_gcflag & FWAIT) msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0); /* We have the last ref so we can proceed without the file lock. */ FILE_UNLOCK(fp); if (fp->f_count < 0) panic("fdrop: count < 0"); if (fp->f_ops != &badfileops) error = fo_close(fp, td); else error = 0; sx_xlock(&filelist_lock); LIST_REMOVE(fp, f_list); openfiles--; sx_xunlock(&filelist_lock); crfree(fp->f_cred); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } mtx_lock(&Giant); vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; FILE_LOCK(fp); fp->f_flag &= ~FHASLOCK; FILE_UNLOCK(fp); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } FILE_LOCK(fp); fp->f_flag |= FHASLOCK; FILE_UNLOCK(fp); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); mtx_unlock(&Giant); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error) { struct file *wfp; struct file *fp; /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_LOCK(fdp); if (dfd < 0 || dfd >= fdp->fd_nfiles || (wfp = fdp->fd_ofiles[dfd]) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor * (indx) and return. * * For ENXIO steal away the file structure from (dfd) and * store it in (indx). (dfd) is effectively closed by * this operation. * * Any other error code is just returned. */ switch (error) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ FILE_LOCK(wfp); if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { FILE_UNLOCK(wfp); FILEDESC_UNLOCK(fdp); return (EACCES); } fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = wfp; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; if (fp == NULL) fdused(fdp, indx); fhold_locked(wfp); FILE_UNLOCK(wfp); FILEDESC_UNLOCK(fdp); if (fp != NULL) { /* * We now own the reference to fp that the ofiles[] * array used to own. Release it. */ FILE_LOCK(fp); fdrop_locked(fp, td); } return (0); case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. 
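Both the ENODEV and ENXIO cases exist to serve opens of /dev/fd/N, driven by the fdopen() pseudo-device further down; the common ENODEV path behaves like dup(2). From userland (sketch):

	#include <fcntl.h>
	#include <unistd.h>

	int
	main(void)
	{
		/* Effectively dup(0); dupfdopen() runs its ENODEV case. */
		int fd = open("/dev/fd/0", O_RDONLY);

		return (fd == -1);
	}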
*/ fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; fdp->fd_ofiles[dfd] = NULL; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; fdp->fd_ofileflags[dfd] = 0; fdunused(fdp, dfd); if (fp == NULL) fdused(fdp, indx); if (fp != NULL) FILE_LOCK(fp); /* * We now own the reference to fp that the ofiles[] array * used to own. Release it. */ if (fp != NULL) fdrop_locked(fp, td); FILEDESC_UNLOCK(fdp); return (0); default: FILEDESC_UNLOCK(fdp); return (error); } /* NOTREACHED */ } /* * Scan all active processes to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new * mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { fdp = fdhold(p); if (fdp == NULL) continue; nrele = 0; FILEDESC_LOCK_FAST(fdp); if (fdp->fd_cdir == olddp) { vref(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vref(newdp); fdp->fd_rdir = newdp; nrele++; } FILEDESC_UNLOCK_FAST(fdp); fddrop(fdp); while (nrele--) vrele(olddp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); vref(newdp); rootvnode = newdp; } } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; MALLOC(fdtol, struct filedesc_to_leader *, sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_LOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_UNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } /* * Get file structures. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; /* * Note: because the number of file descriptors is calculated * in different ways for sizing vs returning the data, * there is information leakage from the first loop. However, * it is of a similar order of magnitude to the leakage from * global system statistics such as kern.openfiles. */ error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 16; /* A slight overestimate. */ sx_slock(&filelist_lock); LIST_FOREACH(fp, &filehead, f_list) { /* * We should grab the lock, but this is an * estimate, so does it really matter? 
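The cheaper order-of-magnitude figure is also exported directly as the kern.openfiles sysctl (declared below); a userland sketch of reading it (error handling elided):

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int n;
		size_t len = sizeof(n);

		if (sysctlbyname("kern.openfiles", &n, &len, NULL, 0) == 0)
			printf("open files: %d\n", n);
		return (0);
	}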
*/ /* mtx_lock(fp->f_mtxp); */ n += fp->f_count; /* mtx_unlock(f->f_mtxp); */ } sx_sunlock(&filelist_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; PROC_UNLOCK(p); fdp = fdhold(p); if (fdp == NULL) continue; FILEDESC_LOCK_FAST(fdp); for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n]) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = fp->f_msgcount; xf.xf_offset = fp->f_offset; xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_UNLOCK_FAST(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnod"); case DTYPE_SOCKET: return ("sock"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kque"); case DTYPE_CRYPTO: return ("crpt"); case DTYPE_MQUEUE: return ("mque"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n]) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { struct proc *p; if (header) db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File", "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? 
p->p_comm : "-"); } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct file *fp; int header; header = 1; LIST_FOREACH(fp, &filehead, f_list) { db_print_file(fp, header); header = 0; } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &openfiles, 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); sx_init(&filelist_lock, "filelist lock"); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (EBADF); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, }; /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. 
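Roughly, an open of /dev/fd/3 arrives here with unit 3: fdopen() records it in td_dupfd and fails with ENODEV, vn_open() unwinds, and the open path then hands that error to dupfdopen() above, which duplicates descriptor 3 into the slot the open had already allocated — so the caller never observes the intermediate failure.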
*/ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL) Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 167210) +++ head/sys/kern/kern_event.c (revision 167211) @@ -1,1969 +1,1963 @@ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. 
*/ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok); static int kqueue_aquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); static fo_rdwr_t kqueue_read; static fo_rdwr_t kqueue_write; static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static struct fileops kqueueops = { .fo_read = kqueue_read, .fo_write = kqueue_write, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int waitok); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static struct filterops file_filtops = { 1, filt_fileattach, NULL, NULL }; static struct filterops kqread_filtops = { 1, NULL, filt_kqdetach, filt_kqueue }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { 0, filt_procattach, filt_procdetach, filt_proc }; static struct filterops timer_filtops = { 0, filt_timerattach, filt_timerdetach, filt_timer }; static uma_zone_t knote_zone; static int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not KN_INFLUX?? 
*/ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while(0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) #define KN_LIST_LOCK(kn) do { \ if (kn->kn_knlist != NULL) \ kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \ } while (0) #define KN_LIST_UNLOCK(kn) do { \ if (kn->kn_knlist != NULL) \ kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \ } while (0) #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ if (!knl->kl_locked((knl)->kl_lockarg)) \ panic("knlist not locked, but should be"); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ if (knl->kl_locked((knl)->kl_lockarg)) \ panic("knlist locked, but should not be"); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while(0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #define KN_HASHSIZE 64 /* XXX should be tunable */ #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { 0, filt_nullattach, NULL, NULL }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops }, /* EVFILT_READ */ { &file_filtops }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops }, /* EVFILT_VNODE */ { &proc_filtops }, /* EVFILT_PROC */ { &sig_filtops }, /* EVFILT_SIGNAL */ { &timer_filtops }, /* EVFILT_TIMER */ { &file_filtops }, /* EVFILT_NETDEV */ { &fs_filtops }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? 
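For reference, the userland shape of this filter — watching a child for NOTE_EXIT — is sketched below; the zpfind()/immediate logic in filt_procattach() exists precisely so the registration still fires if the child is already a zombie by the time kevent(2) is called (error handling elided):

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct kevent change, ev;
		int kq = kqueue();
		pid_t pid = fork();

		if (pid == 0)
			_exit(0);
		EV_SET(&change, pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, NULL);
		/* Registers and then blocks until the child exits. */
		return (kevent(kq, &change, 1, &ev, 1, NULL) != 1);
	}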
*/ static int filt_procattach(struct knote *kn) { struct proc *p; int immediate; int error; immediate = 0; p = pfind(kn->kn_id); if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { p = zpfind(kn->kn_id); immediate = 1; } else if (p != NULL && (p->p_flag & P_WEXIT)) { immediate = 1; } if (p == NULL) return (ESRCH); if ((error = p_cansee(curthread, p))) return (error); kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } if (immediate == 0) knlist_add(&p->p_klist, kn, 1); /* * Immediately activate any exit notes if the target process is a * zombie. This is necessary to handle the case where the target * process, e.g. a child, dies before the kevent is registered. */ if (immediate && filt_proc(kn, NOTE_EXIT)) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { struct proc *p; p = kn->kn_ptr.p_proc; knlist_remove(&p->p_klist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p = kn->kn_ptr.p_proc; u_int event; /* * mask off extra data */ event = (u_int)hint & NOTE_PCTRLMASK; /* * if the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* * process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { if (!(kn->kn_status & KN_DETACHED)) knlist_remove_inevent(&p->p_klist, kn); kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = p->p_xstat; kn->kn_ptr.p_proc = NULL; return (1); } /* * process forked, and user wants to track the new process, * so attach a new knote to it, and immediately report an * event with the parent's pid. */ if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { struct kevent kev; int error; /* * register knote with new process. */ kev.ident = hint & NOTE_PDATAMASK; /* pid */ kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata; /* preserve udata */ error = kqueue_register(kn->kn_kq, &kev, NULL, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; } return (kn->kn_fflags != 0); } static int timertoticks(intptr_t data) { struct timeval tv; int tticks; tv.tv_sec = data / 1000; tv.tv_usec = (data % 1000) * 1000; tticks = tvtohz(&tv); return tticks; } /* XXX - move to kern_timeout.c? */ static void filt_timerexpire(void *knx) { struct knote *kn = knx; struct callout *calloutp; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) { calloutp = (struct callout *)kn->kn_hook; callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire, kn); } } /* * data contains amount of time to sleep, in milliseconds */ /* XXX - move to kern_timeout.c? 
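From userland, data is that period in milliseconds, and EV_CLEAR is forced on by the attach routine below, so each scan reports the number of expirations since the last one. A sketch (error handling elided):

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct kevent change, ev;
		int i, kq = kqueue();

		EV_SET(&change, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL); /* 500 ms */
		kevent(kq, &change, 1, NULL, 0, NULL);
		for (i = 0; i < 3; i++) {
			kevent(kq, NULL, 0, &ev, 1, NULL);
			printf("%d expiration(s)\n", (int)ev.data);
		}
		return (0);
	}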
*/ static int filt_timerattach(struct knote *kn) { struct callout *calloutp; atomic_add_int(&kq_ncallouts, 1); if (kq_ncallouts >= kq_calloutmax) { atomic_add_int(&kq_ncallouts, -1); return (ENOMEM); } kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */ MALLOC(calloutp, struct callout *, sizeof(*calloutp), M_KQUEUE, M_WAITOK); callout_init(calloutp, CALLOUT_MPSAFE); kn->kn_hook = calloutp; callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire, kn); return (0); } /* XXX - move to kern_timeout.c? */ static void filt_timerdetach(struct knote *kn) { struct callout *calloutp; calloutp = (struct callout *)kn->kn_hook; callout_drain(calloutp); FREE(calloutp, M_KQUEUE); atomic_add_int(&kq_ncallouts, -1); kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */ } /* XXX - move to kern_timeout.c? */ static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } -/* - * MPSAFE - */ int kqueue(struct thread *td, struct kqueue_args *uap) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; int fd, error; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) goto done2; /* An extra reference on `nfp' has been held for us by falloc(). */ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK); TAILQ_INIT(&kq->kq_head); kq->kq_fdp = fdp; knlist_init(&kq->kq_sel.si_note, &kq->kq_lock, NULL, NULL, NULL); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); FILEDESC_LOCK_FAST(fdp); SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_UNLOCK_FAST(fdp); FILE_LOCK(fp); fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; fp->f_data = kq; FILE_UNLOCK(fp); fdrop(fp, td); td->td_retval[0] = fd; done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct kevent_args { int fd; const struct kevent *changelist; int nchanges; struct kevent *eventlist; int nevents; const struct timespec *timeout; }; #endif -/* - * MPSAFE - */ int kevent(struct thread *td, struct kevent_args *uap) { struct timespec ts, *tsp; struct kevent_copyops k_ops = { uap, kevent_copyout, kevent_copyin}; int error; #ifdef KTRACE struct uio ktruio; struct iovec ktriov; struct uio *ktruioin = NULL; struct uio *ktruioout = NULL; #endif if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) { ktriov.iov_base = uap->changelist; ktriov.iov_len = uap->nchanges * sizeof(struct kevent); ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, .uio_td = td }; ktruioin = cloneuio(&ktruio); ktriov.iov_base = uap->eventlist; ktriov.iov_len = uap->nevents * sizeof(struct kevent); ktruioout = cloneuio(&ktruio); } #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, &k_ops, tsp); #ifdef KTRACE if (ktruioin != NULL) { ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent); ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0); ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent); ktrgenio(uap->fd, UIO_READ, ktruioout, error); } #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. 
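Taken together with kern_kevent() above, these helpers implement both halves of the usual userland cycle — a changelist copied in, triggered events copied back out. A minimal sketch that watches stdin for readability (error handling elided):

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct kevent change, ev;
		int kq = kqueue();

		EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);
		kevent(kq, &change, 1, NULL, 0, NULL);		/* copyin side */
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)	/* copyout side */
			printf("%d byte(s) ready on fd %d\n",
			    (int)ev.data, (int)ev.ident);
		return (0);
	}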
*/ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; struct kqueue *kq; struct file *fp; int i, n, nerrors, error; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((error = kqueue_aquire(fp, &kq)) != 0) goto done_norel; nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) goto done; changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, 1); if (error) { if (nevents != 0) { kevp->flags = EV_ERROR; kevp->data = error; (void) k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } else { goto done; } } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; error = 0; goto done; } error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td); done: kqueue_release(kq, 0); done_norel: if (fp != NULL) fdrop(fp, td); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (0); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_aquire) must be held. waitok will * influence if memory allocation should wait. Make sure it is 0 if you * hold any mutexes. 
*/ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; int error, filt, event; int haskqglobal; fp = NULL; kn = NULL; error = 0; haskqglobal = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; tkn = knote_alloc(waitok); /* prevent waiting with locks */ findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); error = fget(td, kev->ident, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, 0) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, waitok); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * if we add some intelligence about what we are doing, * we should be able to support events on ourselves. * We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) kqueue_expand(kq, fops, kev->ident, waitok); KQ_LOCK(kq); if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) { if (fp != NULL) { fdrop(fp, td); fp = NULL; } KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); goto findkn; } if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { KQ_UNLOCK(kq); error = ENOENT; goto done; } /* * kn now contains the matching knote, or NULL if no match */ if (kev->flags & EV_ADD) { if (kn == NULL) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE); kn->kn_status = KN_INFLUX|KN_DETACHED; error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop(kn, td); goto done; } KN_LIST_LOCK(kn); } else { /* * The user may change some filter values after the * initial EV_ADD, but doing so will not reset any * filter which has already been triggered. */ kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); KN_LIST_LOCK(kn); kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kn->kn_kevent.udata = kev->udata; } /* * We can get here with kn->kn_knlist == NULL. * This can happen when the initial attach event decides that * the event is "completed" already, e.g. filt_procattach * is called on a zombie process. It will call filt_proc * which will remove it from the list, and NULL kn_knlist.
*/ event = kn->kn_fop->f_event(kn, 0); KQ_LOCK(kq); if (event) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_INFLUX; KN_LIST_UNLOCK(kn); } else if (kev->flags & EV_DELETE) { kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); goto done; } if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) { kn->kn_status |= KN_DISABLED; } if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { kn->kn_status &= ~KN_DISABLED; if ((kn->kn_status & KN_ACTIVE) && ((kn->kn_status & KN_QUEUED) == 0)) knote_enqueue(kn); } KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (fp != NULL) fdrop(fp, td); if (tkn != NULL) knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_aquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; FILE_LOCK(fp); do { kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) { error = EBADF; break; } *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); error = EBADF; break; } kq->kq_refcnt++; KQ_UNLOCK(kq); } while (0); FILE_UNLOCK(fp); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. * * Not calling hashinit w/ waitok (proper malloc flag) should be safe. * If kqueue_register is called from a non-fd context, there usually/should * be no locks held. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok) { struct klist *list, *tmp_knhash; u_long tmp_knhashmask; int size; int fd; int mflag = waitok ? 
M_WAITOK : M_NOWAIT; KQ_NOTOWNED(kq); if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; MALLOC(list, struct klist *, size * sizeof list, M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knlistsize > fd) { FREE(list, M_KQUEUE); list = NULL; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof list); FREE(kq->kq_knlist, M_KQUEUE); kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof list, (size - kq->kq_knlistsize) * sizeof list); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask); if (tmp_knhash == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { free(tmp_knhash, M_KQUEUE); } KQ_UNLOCK(kq); } } KQ_NOTOWNED(kq); return 0; } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are INFLUX. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct timeval atv, rtv, ttv; struct knote *kn, *marker; int count, timeout, nkev, error; int haskqglobal; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&atv, tsp); if (itimerfix(&atv)) { error = EINVAL; goto done_nl; } if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) timeout = -1; else timeout = atv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&atv); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; timeout = 0; } marker = knote_alloc(1); if (marker == NULL) { error = ENOMEM; goto done_nl; } marker->kn_status = KN_MARKER; KQ_LOCK(kq); goto start; retry: if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timeout = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } start: kevp = keva; if (kq->kq_count == 0) { if (timeout < 0) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", timeout); } if (error == 0) goto retry; /* don't restart after signals... 
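The timeout handling above gives userland three modes: a NULL timespec blocks indefinitely, a zeroed timespec polls (the timeout = -1 / EWOULDBLOCK path, which is translated back into a return of zero events), and anything else bounds the sleep. A polling sketch:

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>

	int
	main(void)
	{
		struct kevent ev;
		struct timespec zero = { 0, 0 };
		int kq = kqueue();

		/* Returns immediately with 0 events instead of sleeping. */
		return (kevent(kq, NULL, 0, &ev, 1, &zero));
	}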
 */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
	while (count) {
		KQ_OWNED(kq);
		kn = TAILQ_FIRST(&kq->kq_head);

		if ((kn->kn_status == KN_MARKER && kn != marker) ||
		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
			kq->kq_state |= KQ_FLUXWAIT;
			error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
			continue;
		}

		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if (kn == marker) {
			KQ_FLUX_WAKEUP(kq);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		KASSERT((kn->kn_status & KN_INFLUX) == 0,
		    ("KN_INFLUX set when not supposed to be"));

		if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			*kevp = kn->kn_kevent;
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			kn = NULL;
		} else {
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
			KN_LIST_LOCK(kn);
			if (kn->kn_fop->f_event(kn, 0) == 0) {
				KQ_LOCK(kq);
				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
				kn->kn_status &=
				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
				kq->kq_count--;
				KN_LIST_UNLOCK(kn);
				continue;
			}
			*kevp = kn->kn_kevent;
			KQ_LOCK(kq);
			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
			if (kn->kn_flags & EV_CLEAR) {
				kn->kn_data = 0;
				kn->kn_fflags = 0;
				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
				kq->kq_count--;
			} else
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
			kn->kn_status &= ~(KN_INFLUX);
			KN_LIST_UNLOCK(kn);
		}

		/* we are returning a copy to the user */
		kevp++;
		nkev++;
		count--;

		if (nkev == KQ_NEVENTS) {
			KQ_UNLOCK_FLUX(kq);
			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
			nkev = 0;
			kevp = keva;
			KQ_LOCK(kq);
			if (error)
				break;
		}
	}
	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
	KQ_OWNED(kq);
	KQ_UNLOCK_FLUX(kq);
	knote_free(marker);
done_nl:
	KQ_NOTOWNED(kq);
	if (nkev != 0)
		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
	td->td_retval[0] = maxevents - count;
	return (error);
}

/*
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
	int flags, struct thread *td)
{
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
	int flags, struct thread *td)
{
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
	struct ucred *active_cred, struct thread *td)
{
	/*
	 * Enabling sigio causes two major problems:
	 * 1) infinite recursion:
	 * Synopsis: a kevent is being used to track signals and has FIOASYNC
	 * set.  On receipt of a signal this will cause a kqueue to recurse
	 * into itself over and over.  Sending the sigio causes the kqueue
	 * to become ready, which in turn posts sigio again, forever.
	 * Solution: this can be solved by setting a flag in the kqueue that
	 * we have a SIGIO in progress.
	 * 2) locking problems:
	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
	 * us above the proc and pgrp locks.
	 * Solution: Post a signal using an async mechanism, being sure to
	 * record a generation count in the delivery so that we do not deliver
	 * a signal to the wrong process.
	 *
	 * Note, these two mechanisms are somewhat mutually exclusive!
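	 *
	 * Purely for illustration (this handler currently returns ENOTTY,
	 * so the fragment below would fail today): were the disabled code
	 * further down enabled, a userland consumer holding a hypothetical
	 * descriptor "kqfd" from kqueue(2) would request SIGIO delivery
	 * like this (<sys/ioctl.h>, <fcntl.h>, <unistd.h> and <err.h>
	 * assumed):
	 *
	 *	int on = 1;
	 *
	 *	if (ioctl(kqfd, FIOASYNC, &on) == -1)
	 *		err(1, "FIOASYNC");
	 *	if (fcntl(kqfd, F_SETOWN, getpid()) == -1)
	 *		err(1, "F_SETOWN");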
	 */
#if 0
	struct kqueue *kq;

	kq = fp->f_data;
	switch (cmd) {
	case FIOASYNC:
		if (*(int *)data) {
			kq->kq_state |= KQ_ASYNC;
		} else {
			kq->kq_state &= ~KQ_ASYNC;
		}
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &kq->kq_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(&kq->kq_sigio);
		return (0);
	}
#endif

	return (ENOTTY);
}

/*ARGSUSED*/
static int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
	struct thread *td)
{
	struct kqueue *kq;
	int revents = 0;
	int error;

	if ((error = kqueue_aquire(fp, &kq)))
		return POLLERR;

	KQ_LOCK(kq);
	if (events & (POLLIN | POLLRDNORM)) {
		if (kq->kq_count) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(td, &kq->kq_sel);
			kq->kq_state |= KQ_SEL;
		}
	}
	kqueue_release(kq, 1);
	KQ_UNLOCK(kq);
	return (revents);
}

/*ARGSUSED*/
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
	struct thread *td)
{
	bzero((void *)st, sizeof *st);
	/*
	 * We no longer return kq_count because the unlocked value is useless.
	 * If you spent all this time getting the count, why not spend your
	 * syscall better by calling kevent?
	 *
	 * XXX - This is needed for libc_r.
	 */
	st->st_mode = S_IFIFO;
	return (0);
}

/*ARGSUSED*/
static int
kqueue_close(struct file *fp, struct thread *td)
{
	struct kqueue *kq = fp->f_data;
	struct filedesc *fdp;
	struct knote *kn;
	int i;
	int error;

	if ((error = kqueue_aquire(fp, &kq)))
		return error;

	KQ_LOCK(kq);

	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
	    ("kqueue already closing"));
	kq->kq_state |= KQ_CLOSING;
	if (kq->kq_refcnt > 1)
		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);

	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
	fdp = kq->kq_fdp;

	KASSERT(knlist_empty(&kq->kq_sel.si_note),
	    ("kqueue's knlist not empty"));

	for (i = 0; i < kq->kq_knlistsize; i++) {
		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
			KASSERT((kn->kn_status & KN_INFLUX) == 0,
			    ("KN_INFLUX set when not supposed to be"));
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
		}
	}
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i <= kq->kq_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
				KASSERT((kn->kn_status & KN_INFLUX) == 0,
				    ("KN_INFLUX set when not supposed to be"));
				kn->kn_status |= KN_INFLUX;
				KQ_UNLOCK(kq);
				if (!(kn->kn_status & KN_DETACHED))
					kn->kn_fop->f_detach(kn);
				knote_drop(kn, td);
				KQ_LOCK(kq);
			}
		}
	}

	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
		kq->kq_state |= KQ_TASKDRAIN;
		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
	}

	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		kq->kq_state &= ~KQ_SEL;
		selwakeuppri(&kq->kq_sel, PSOCK);
	}

	KQ_UNLOCK(kq);

	FILEDESC_LOCK_FAST(fdp);
	SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
	FILEDESC_UNLOCK_FAST(fdp);

	knlist_destroy(&kq->kq_sel.si_note);
	mtx_destroy(&kq->kq_lock);
	kq->kq_fdp = NULL;

	if (kq->kq_knhash != NULL)
		free(kq->kq_knhash, M_KQUEUE);
	if (kq->kq_knlist != NULL)
		free(kq->kq_knlist, M_KQUEUE);

	funsetown(&kq->kq_sigio);
	free(kq, M_KQUEUE);
	fp->f_data = NULL;

	return (0);
}

static void
kqueue_wakeup(struct kqueue *kq)
{
	KQ_OWNED(kq);

	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
		kq->kq_state &= ~KQ_SLEEP;
		wakeup(kq);
	}
	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		kq->kq_state &= ~KQ_SEL;
		selwakeuppri(&kq->kq_sel, PSOCK);
	}
	if (!knlist_empty(&kq->kq_sel.si_note))
		kqueue_schedtask(kq);
	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
		pgsigio(&kq->kq_sigio, SIGIO, 0);
	}
}

/*
 * Walk down a list of knotes, activating them if their event has triggered.
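 *
 * As a concrete sketch (every "foo" name is hypothetical; knlist_init()
 * and knote() are the APIs defined in this file), an event producer owns
 * a knlist and kicks it whenever its state changes.  foo_data_ready() is
 * assumed to run without sc_mtx held, hence islocked is 0:
 *
 *	struct foo_softc {
 *		struct mtx	sc_mtx;
 *		struct knlist	sc_note;
 *	};
 *
 *	static void
 *	foo_attach(struct foo_softc *sc)
 *	{
 *		mtx_init(&sc->sc_mtx, "foo", NULL, MTX_DEF);
 *		knlist_init(&sc->sc_note, &sc->sc_mtx, NULL, NULL, NULL);
 *	}
 *
 *	static void
 *	foo_data_ready(struct foo_softc *sc)
 *	{
 *		knote(&sc->sc_note, 0, 0);
 *	}
 *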
* * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. */ void knote(struct knlist *list, long hint, int islocked) { struct kqueue *kq; struct knote *kn; if (list == NULL) return; KNL_ASSERT_LOCK(list, islocked); if (!islocked) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and set KN_INFLUX), we can eliminate * the kqueue scheduling, but this will introduce four * lock/unlock's for each knote to test. If we do, continue to use * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is * only safe if you want to remove the current item, which we are * not doing. */ SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) { KQ_LOCK(kq); if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; } KQ_UNLOCK(kq); } kq = NULL; } if (!islocked) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED")); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); if (!kqislocked) KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX, ("knlist_remove called w/o knote being KN_INFLUX or already removed")); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) knl->kl_unlock(knl->kl_lockarg); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove all knotes from a specified klist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } /* * remove knote from a specified klist while in f_event handler. 
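 *
 * A sketch of the intended caller (filter and softc names hypothetical,
 * NOTE_REVOKE used only as an example hint): an f_event method that
 * notices its backing object is going away can unhook its knote directly,
 * without recursing into the kqueue locking:
 *
 *	static int
 *	filt_fooevent(struct knote *kn, long hint)
 *	{
 *		struct foo_softc *sc = kn->kn_hook;
 *
 *		if (hint == NOTE_REVOKE) {
 *			kn->kn_flags |= EV_EOF | EV_ONESHOT;
 *			knlist_remove_inevent(&sc->sc_note, kn);
 *			return (1);
 *		}
 *		kn->kn_data = sc->sc_bytes_ready;
 *		return (kn->kn_data > 0);
 *	}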
*/ void knlist_remove_inevent(struct knlist *knl, struct knote *kn) { knlist_remove_kq(knl, kn, 1, (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return SLIST_EMPTY(&knl->kl_list); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static int knlist_mtx_locked(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static int knlist_mtx_locked(void *arg) { return (mtx_owned((struct mtx *)arg)); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), int (*kl_locked)(void *)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_locked == NULL) knl->kl_locked = knlist_mtx_locked; else knl->kl_locked = kl_locked; SLIST_INIT(&knl->kl_list); } void knlist_destroy(struct knlist *knl) { #ifdef INVARIANTS /* * if we run across this error, we need to find the offending * driver and have it call knlist_clear. */ if (!SLIST_EMPTY(&knl->kl_list)) printf("WARNING: destroying knlist w/ knotes on it!\n"); #endif knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL; SLIST_INIT(&knl->kl_list); } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". */ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reaquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if ((kn->kn_status & KN_INFLUX)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn->kn_status |= KN_INFLUX | KN_DETACHED; KQ_UNLOCK(kq); knote_drop(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= (EV_EOF | EV_ONESHOT); KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still KN_INFLUX remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn->kn_status & KN_INFLUX, ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * remove all knotes referencing a specified fd * must be called with FILEDESC lock. This prevents a race where a new fd * comes along and occupies the entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. 
*/ SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn->kn_status & KN_INFLUX) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); influx = 1; KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX")); KQ_OWNED(kq); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return ENOMEM; list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return ENOMEM; list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return 0; } /* * knote must already have been detached using the f_detach method. * no lock need to be held, it is assumed that the KN_INFLUX flag is set * to prevent other removal. */ static void knote_drop(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KQ_NOTOWNED(kq); KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX, ("knote_drop called without KN_INFLUX set in kn_status")); KQ_LOCK(kq); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) static struct knote * knote_alloc(int waitok) { return ((struct knote *)uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO)); } static void knote_free(struct knote *kn) { if (kn != NULL) uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok) { struct kqueue *kq; struct file *fp; int error; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((error = kqueue_aquire(fp, &kq)) != 0) goto noaquire; error = kqueue_register(kq, kev, td, waitok); kqueue_release(kq, 0); noaquire: fdrop(fp, td); return error; } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 167210) +++ head/sys/kern/kern_exec.c (revision 167211) @@ -1,1307 +1,1299 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); static void exec_free_args(struct image_args *); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. 
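 *
 * For reference, userland reads this as, e.g. (<sys/types.h> and
 * <sys/sysctl.h> assumed, error handling omitted):
 *
 *	u_long usrstack;
 *	size_t len = sizeof(usrstack);
 *
 *	sysctlbyname("kern.usrstack", &usrstack, &len, NULL, 0);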
*/ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif -/* - * MPSAFE - */ int execve(td, uap) struct thread *td; struct execve_args /* { char *fname; char **argv; char **envv; } */ *uap; { int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif -/* - * MPSAFE - */ int __mac_execve(td, uap) struct thread *td; struct __mac_execve_args /* { char *fname; char **argv; char **envv; struct mac *mac_p; } */ *uap; { #ifdef MAC int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); return (error); #else return (ENOSYS); #endif } /* * XXX: kern_execve has the astonishing property of not always * returning to the caller. If sufficiently bad things happen during * the call to do_execve(), it can end up calling exit1(); as a result, * callers must avoid doing anything which they might need to undo * (e.g., allocating memory). */ int kern_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; int error; AUDIT_ARG(argv, args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG(envv, args->begin_envv, args->envc, args->endp - args->begin_envv); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); if (thread_single(SINGLE_BOUNDARY)) { PROC_UNLOCK(p); exec_free_args(args); return (ERESTART); /* Try again later. */ } PROC_UNLOCK(p); } error = do_execve(td, args, mac_p); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(SINGLE_EXIT); else thread_single_end(); PROC_UNLOCK(p); } return (error); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. 
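 *
 * For other in-kernel callers the usual pattern (hypothetical variable
 * names, mirroring execve() above) is to build the argument block first
 * and then hand it off; kern_execve() consumes the block either way:
 *
 *	struct image_args args;
 *	int error;
 *
 *	error = exec_copyin_args(&args, fname, UIO_USERSPACE, argv, envv);
 *	if (error == 0)
 *		error = kern_execve(td, &args, NULL);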
- * - * MPSAFE */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL; int credential_changing; int vfslocked; int textset; #ifdef MAC struct label *interplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif vfslocked = 0; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ imgp->proc = p; imgp->execlabel = NULL; imgp->attr = &attr; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX; imgp->auxargs = NULL; imgp->vp = NULL; imgp->object = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; imgp->args = args; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif imgp->image_header = NULL; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ ndp = &nd; NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); interpret: error = namei(ndp); if (error) goto exec_fail; vfslocked = NDHASGIANT(ndp); imgp->vp = ndp->ni_vp; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = imgp->vp->v_vflag & VV_TEXT; imgp->vp->v_vflag |= VV_TEXT; error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) imgp->vp->v_vflag &= ~VV_TEXT; error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. 
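 *
 * Concretely (an illustrative case): exec'ing a script whose first line
 * is "#!/bin/sh" makes the shell image activator set imgp->interpreted
 * and copy "/bin/sh" into imgp->interpreter_name, so the goto below
 * re-runs the namei()/activator pass on the interpreter itself.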
*/ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ imgp->vp->v_vflag &= ~VV_TEXT; /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); #ifdef MAC interplabel = mac_vnode_label_alloc(); mac_copy_vnode_label(ndp->ni_vp->v_label, interplabel); #endif vput(ndp->ni_vp); vm_object_deallocate(imgp->object); imgp->object = NULL; VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ fdunshare(p, td); /* * Malloc things before we need locks. */ newcred = crget(); euip = uifind(attr.va_uid); i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* close files on exec */ VOP_UNLOCK(imgp->vp, 0, td); fdcloseexec(td); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); /* Get a reference to the vnode prior to locking the proc */ VREF(ndp->ni_vp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ PROC_LOCK(p); if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; PROC_UNLOCK(p); newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); PROC_LOCK(p); p->p_sigacts = newsigacts; } else oldsigacts = NULL; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup(p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. 
*/ oldcred = p->p_ucred; credential_changing = 0; credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_execve_will_transition(oldcred, imgp->vp, interplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracevp != NULL && priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, SUSER_ALLOWJAIL)) { mtx_lock(&ktrace_mtx); p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); } #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); setugidsafety(td); VOP_UNLOCK(imgp->vp, 0, td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); if (error != 0) goto done1; PROC_LOCK(p); /* * Set the new credentials. */ crcopy(newcred, oldcred); if (attr.va_mode & VSUID) change_euid(newcred, euip); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_execve_transition(oldcred, newcred, imgp->vp, interplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { crcopy(newcred, oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. * Use tdsignal to deliver signal to current thread, use * psignal may cause the signal to be delivered to wrong thread * because that thread will exit, remember we are going to enter * single thread mode. 
*/ if (p->p_flag & P_TRACED) tdsignal(p, td, SIGTRAP, NULL); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); else exec_setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); vfs_mark_atime(imgp->vp, td); done1: /* * Free any resources malloc'd earlier that we didn't use. */ uifree(euip); if (newcred == NULL) crfree(oldcred); else crfree(newcred); VOP_UNLOCK(imgp->vp, 0, td); /* * Handle deferred decrement of ref counts. */ if (textvp != NULL) { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(textvp->v_mount); vrele(textvp); VFS_UNLOCK_GIANT(tvfslocked); } if (ndp->ni_vp && error != 0) vrele(ndp->ni_vp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td); if (oldargs != NULL) pargs_drop(oldargs); if (newargs != NULL) pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(imgp->vp); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); if (error == 0) { /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); done2: #ifdef MAC mac_execve_exit(imgp); if (interplabel != NULL) mac_vnode_label_free(interplabel); #endif VFS_UNLOCK_GIANT(vfslocked); exec_free_args(args); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. 
exit gracefully */ exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ } return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_LOCK(object); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { if (ma[i]->valid) break; if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy) break; vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { vm_page_lock_queues(); vm_page_free(ma[0]); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(object); return (EIO); } } vm_page_lock_queues(); vm_page_hold(ma[0]); vm_page_unlock_queues(); vm_page_wakeup(ma[0]); VM_OBJECT_UNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock_queues(); vm_page_unhold(m); vm_page_unlock_queues(); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_offset_t stack_addr; vm_map_t map; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* Called with Giant held, do not depend on it! */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Here is as good a place as any to do any resource limit cleanups. * This is needed if a 64 bit binary exec's a 32 bit binary - the * data size limit may need to be changed to a value that makes * sense for the 32 bit binary. 
*/ if (sv->sv_fixlimits != NULL) sv->sv_fixlimits(p); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Allocate a new stack */ stack_addr = sv->sv_usrstack - maxssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); #ifdef __ia64__ /* Allocate a new register stack */ stack_addr = IA64_BACKINGSTORE; error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP); if (error) return (error); #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz; return (0); } /* * Copy out argument and environment strings from the old process * address space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { char *argp, *envp; int error; size_t length; error = 0; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate temporary demand zeroed space for argument and * environment strings: * * o ARG_MAX for argument and environment; * o MAXSHELLCMDLEN for the name of interpreters. */ args->buf = (char *) kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); if (args->buf == NULL) return (ENOMEM); args->begin_argv = args->buf; args->endp = args->begin_argv; args->stringspace = ARG_MAX; args->fname = args->buf + ARG_MAX; /* * Copy the file name. */ error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; /* * extract arguments first */ while ((argp = (caddr_t) (intptr_t) fuword(argv++))) { if (argp == (caddr_t) -1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(argp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { if (envp == (caddr_t)-1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(envp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } static void exec_free_args(struct image_args *args) { if (args->buf) { kmem_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX + MAXSHELLCMDLEN); args->buf = NULL; } } /* * Copy strings out to the new process address space, constructing * new arg and env vector tables. Return a pointer to the base * so that it can be used as the initial stack pointer. 
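 *
 * An illustrative (not to scale) picture of what is assembled below,
 * from high addresses down to the returned base:
 *
 *	struct ps_strings			(at sv_psstrings)
 *	signal trampoline			(szsigcode bytes)
 *	SPARE_USRSPACE pad
 *	argument + environment strings		(destp)
 *	ELF auxargs slots			(only if imgp->auxargs is set)
 *	envv[] pointers, NULL-terminated
 *	argv[] pointers, NULL-terminated	(vectp == returned stack_base)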
*/ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* XXXKSE */ /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred, td); if (error) return (error); #ifdef MAC error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Insure that at least one execute bit is on - otherwise root * will always succeed, and we don't want to happen unless the * file really is executable. * 3) Insure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. 
*/ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1); return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_exit.c =================================================================== --- head/sys/kern/kern_exit.c (revision 167210) +++ head/sys/kern/kern_exit.c (revision 167211) @@ -1,908 +1,899 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include /* Required to be non-static for SysVR4 emulator */ MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); /* Hook for NFS teardown procedure. */ void (*nlminfo_release_p)(struct proc *p); /* * exit -- * Death of process. - * - * MPSAFE */ void sys_exit(struct thread *td, struct sys_exit_args *uap) { exit1(td, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct proc *p, *nq, *q; struct tty *tp; struct vnode *ttyvp; struct vnode *vtmp; #ifdef KTRACE struct vnode *tracevp; struct ucred *tracecred; #endif struct plimit *plim; int locked; /* * Drop Giant if caller has it. Eventually we should warn about * being called with Giant held. */ while (mtx_owned(&Giant)) mtx_unlock(&Giant); p = td->td_proc; if (p == initproc) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); if (p->p_flag & P_HADTHREADS) { retry: /* * First check if some other thread got here before us.. * if so, act apropriatly, (exit or suspend); */ thread_suspend_check(0); /* * Kill off the other threads. This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (thread_single(SINGLE_EXIT)) goto retry; /* * All other activity in this process is now stopped. * Threading support has been turned off. */ } /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have * released their reference to us. 
Note that if they have * requested S_EXIT stops we will block here until they ack * via PIOCCONT. */ _STOPEVENT(p, S_EXIT, rv); /* * Note that we are exiting and do another wakeup of anyone in * PIOCWAIT in case they aren't listening for S_EXIT stops or * decided to wait again after we told them we are exiting. */ p->p_flag |= P_WEXIT; wakeup(&p->p_stype); /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); PROC_UNLOCK(p); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG(exit, WEXITSTATUS(rv), 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader? */ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), M_ZOMBIE, M_WAITOK); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * If this process has an nlminfo data area (for lockd), release it */ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL) (*nlminfo_release_p)(p); /* * Close open files and release open-file table. * This may block! */ fdfree(td); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Remove ourself from our leader's peer list and wake our leader. */ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); vmspace_exit(td); mtx_lock(&Giant); /* XXX TTY */ sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp; sp = p->p_session; if (sp->s_ttyvp) { /* * Controlling process. * Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. */ if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { tp = sp->s_ttyp; if (sp->s_ttyp->t_pgrp) { PGRP_LOCK(sp->s_ttyp->t_pgrp); pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); PGRP_UNLOCK(sp->s_ttyp->t_pgrp); } /* XXX tp should be locked. */ sx_xunlock(&proctree_lock); (void) ttywait(tp); sx_xlock(&proctree_lock); /* * The tty could have been revoked * if we blocked. 
*/ if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); sx_xunlock(&proctree_lock); VOP_LOCK(ttyvp, LK_EXCLUSIVE, td); VOP_REVOKE(ttyvp, REVOKEALL); vput(ttyvp); sx_xlock(&proctree_lock); } } if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); vrele(ttyvp); } /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. * (for logging and informational purposes) */ } SESS_LOCK(p->p_session); sp->s_leader = NULL; SESS_UNLOCK(p->p_session); } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); mtx_unlock(&Giant); #ifdef KTRACE /* * Drain any pending records on the thread and release the trace * file. It might be better if drain-and-clear were atomic. */ ktrprocexit(td); PROC_LOCK(p); mtx_lock(&ktrace_mtx); p->p_traceflag = 0; /* don't trace the vrele() */ tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) { locked = VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(locked); } if (tracecred != NULL) crfree(tracecred); #endif /* * Release reference to text vnode */ if ((vtmp = p->p_textvp) != NULL) { p->p_textvp = NULL; locked = VFS_LOCK_GIANT(vtmp->v_mount); vrele(vtmp); VFS_UNLOCK_GIANT(locked); } /* * Release our limits structure. */ PROC_LOCK(p); plim = p->p_limit; p->p_limit = NULL; PROC_UNLOCK(p); lim_free(plim); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); /* * Reparent all of our children to init. */ sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(initproc); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); proc_reparent(q, initproc); q->p_sigparent = SIGCHLD; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Save exit status and finalize rusage info except for times, * adding in child rusage info later when our time is locked. */ PROC_LOCK(p); p->p_xstat = rv; p->p_xthread = td; p->p_stats->p_ru.ru_nvcsw++; *p->p_ru = p->p_stats->p_ru; /* * Notify interested parties of our demise. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ knlist_clear(&p->p_klist, 1); /* * Notify parent that we're gone. If parent has the PS_NOCLDWAIT * flag set, or if the handler is set to SIG_IGN, notify process * 1 instead (and hope it will handle this situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, initproc); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * If this was the last child of our parent, notify * parent, so in case he was wait(2)ing, he will * continue. 
*/ if (LIST_EMPTY(&pp->p_children)) wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == initproc) psignal(p->p_pptr, SIGCHLD); else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) childproc_exited(p); else /* LINUX thread */ psignal(p->p_pptr, p->p_sigparent); } PROC_UNLOCK(p->p_pptr); PROC_UNLOCK(p); /* * Finally, call machine-dependent code to release the remaining * resources including address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, &proctree_lock.sx_object, "process (pid %d) exiting", p->p_pid); PROC_LOCK(p); PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); /* * The state PRS_ZOMBIE prevents other processes from sending * signals to the process; to avoid a memory leak, we free the * memory for the signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); mtx_lock_spin(&sched_lock); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); sched_exit(p->p_pptr, td); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. */ knlist_destroy(&p->p_klist); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifndef _SYS_SYSPROTO_H_ struct abort2_args { char *why; int nargs; void **args; }; #endif -/* - * MPSAFE. - */ int abort2(struct thread *td, struct abort2_args *uap) { struct proc *p = td->td_proc; struct sbuf *sb; void *uargs[16]; int error, i, sig; error = 0; /* satisfy compiler */ /* * Do it right now so we can log either a proper call of abort2(), or * note that an invalid argument was passed. 512 is big enough to * handle 16 arguments' descriptions with additional comments. */ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN); sbuf_clear(sb); sbuf_printf(sb, "%s(pid %d uid %d) aborted: ", p->p_comm, p->p_pid, td->td_ucred->cr_uid); /* * Since we can't return from abort2(), send SIGKILL in cases where * abort2() was called improperly. */ sig = SIGKILL; /* Prevent DoSes from user space. */ if (uap->nargs < 0 || uap->nargs > 16) goto out; if (uap->args == NULL) goto out; error = copyin(uap->args, uargs, uap->nargs * sizeof(void *)); if (error != 0) goto out; /* * Limit the size of the 'reason' string to 128. It will fit even when * the maximal number of arguments was chosen to be logged. */ if (uap->why != NULL) { error = sbuf_copyin(sb, uap->why, 128); if (error < 0) goto out; } else { sbuf_printf(sb, "(null)"); } if (uap->nargs) { sbuf_printf(sb, "("); for (i = 0; i < uap->nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); sbuf_printf(sb, ")"); } /* * Final stage: arguments were proper, the string has been * successfully copied from userspace, and copying pointers * from user space succeeded.
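/*
 * Editor's note: an illustrative userland sketch, not part of this change.
 * This is how the validation above looks from the caller's side: a
 * well-formed abort2(2) call logs the reason and dies with SIGABRT, while
 * a malformed one (e.g. nargs > 16) dies with SIGKILL.  The reason string
 * and pointer values are arbitrary.
 */
#include <stdlib.h>

int
main(void)
{
	void *args[2];

	args[0] = (void *)0x1234;	/* arbitrary pointers to log */
	args[1] = (void *)0x5678;
	abort2("invariant violated", 2, args);	/* never returns */
}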
*/ sig = SIGABRT; out: if (sig == SIGKILL) { sbuf_trim(sb); sbuf_printf(sb, " (Reason text inaccessible)"); } sbuf_cat(sb, "\n"); sbuf_finish(sb); log(LOG_INFO, "%s", sbuf_data(sb)); sbuf_delete(sb); exit1(td, W_EXITCODE(0, sig)); return (0); } #ifdef COMPAT_43 /* * The dirty work is handled by kern_wait(). - * - * MPSAFE. */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * The dirty work is handled by kern_wait(). - * - * MPSAFE. */ int wait4(struct thread *td, struct wait_args *uap) { struct rusage ru, *rup; int error, status; if (uap->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, uap->pid, &status, uap->options, rup); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { struct proc *p, *q, *t; int error, nfound; AUDIT_ARG(pid, pid); q = td->td_proc; if (pid == 0) { PROC_LOCK(q); pid = -q->p_pgid; PROC_UNLOCK(q); } if (options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE)) return (EINVAL); loop: if (q->p_flag & P_STATCHILD) { PROC_LOCK(q); q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { PROC_LOCK(p); if (pid != WAIT_ANY && p->p_pid != pid && p->p_pgid != -pid) { PROC_UNLOCK(p); continue; } if (p_canwait(td, p)) { PROC_UNLOCK(p); continue; } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); continue; } nfound++; if (p->p_state == PRS_ZOMBIE) { /* * It is possible that the last thread of this * process is still running on another CPU * in thread_exit() after having dropped the process * lock via PROC_UNLOCK() but before it has completed * cpu_throw(). In that case, the other thread must * still hold sched_lock, so simply by acquiring * sched_lock once we will wait long enough for the * thread to exit in that case. */ mtx_lock_spin(&sched_lock); mtx_unlock_spin(&sched_lock); td->td_retval[0] = p->p_pid; if (status) *status = p->p_xstat; /* convert to int */ if (rusage) { *rusage = *p->p_ru; calcru(p, &rusage->ru_utime, &rusage->ru_stime); } PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); /* * If we got the child via a ptrace 'attach', * we need to give it back to the old parent. */ PROC_UNLOCK(p); if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { PROC_LOCK(p); p->p_oppid = 0; proc_reparent(p, t); PROC_UNLOCK(p); tdsignal(t, NULL, SIGCHLD, p->p_ksi); wakeup(t); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return (0); } /* * Remove other references to this process to ensure * we have an exclusive reference. 
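/*
 * Editor's note: a userland companion example, not part of this change.
 * The p_xstat value copied out by kern_wait() above is decoded with the
 * standard <sys/wait.h> macros:
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;
	int status;

	if ((pid = fork()) == 0)
		_exit(7);
	if (waitpid(pid, &status, WUNTRACED) == pid) {
		if (WIFEXITED(status))
			printf("exit status %d\n", WEXITSTATUS(status));
		else if (WIFSIGNALED(status))
			printf("killed by signal %d\n", WTERMSIG(status));
		else if (WIFSTOPPED(status))
			printf("stopped by signal %d\n", WSTOPSIG(status));
	}
	return (0);
}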
*/ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); leavepgrp(p); sx_xunlock(&proctree_lock); /* * As a side effect of this lock, we know that * all other writes to this proc are visible now, so * no more locking is needed for p. */ PROC_LOCK(p); p->p_xstat = 0; /* XXX: why? */ PROC_UNLOCK(p); PROC_LOCK(q); ruadd(&q->p_stats->p_cru, &q->p_crux, p->p_ru, &p->p_rux); PROC_UNLOCK(q); FREE(p->p_ru, M_ZOMBIE); p->p_ru = NULL; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Free credentials, arguments, and sigacts. */ crfree(p->p_ucred); p->p_ucred = NULL; pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * Do any thread-system specific cleanups. */ thread_wait(p); /* * Give vm and machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. */ vm_waitproc(p); #ifdef MAC mac_destroy_proc(p); #endif #ifdef AUDIT audit_proc_free(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("kern_wait: no residual thread!")); uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; sx_xunlock(&allproc_lock); return (0); } mtx_lock_spin(&sched_lock); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || options & WUNTRACED)) { mtx_unlock_spin(&sched_lock); p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; if (status) *status = W_STOPCODE(p->p_xstat); PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); PROC_UNLOCK(p); return (0); } mtx_unlock_spin(&sched_lock); if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; p->p_flag &= ~P_CONTINUED; PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); PROC_UNLOCK(p); if (status) *status = SIGCONT; return (0); } PROC_UNLOCK(p); } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; error = 0; } else error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) return (error); goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; PROC_LOCK(child->p_pptr); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); child->p_pptr = parent; } Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 167210) +++ head/sys/kern/kern_fork.c (revision 167211) @@ -1,857 +1,848 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int fork(td, uap) struct thread *td; struct fork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } -/* - * MPSAFE - */ /* ARGSUSED */ int vfork(td, uap) struct thread *td; struct vfork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } -/* - * MPSAFE - */ int rfork(td, uap) struct thread *td; struct rfork_args *uap; { struct proc *p2; int error; /* Don't allow kernel-only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); AUDIT_ARG(fflags, uap->flags); error = fork1(td, uap->flags, 0, &p2); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. 
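/*
 * Editor's note: an illustrative userland sketch, not part of this change.
 * The modulus described above is exposed as the kern.randompid sysctl;
 * reading it needs no privilege, setting it does, and the handler below
 * clamps out-of-range values:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int mod, wanted = 1000;
	size_t len = sizeof(mod);

	if (sysctlbyname("kern.randompid", &mod, &len, NULL, 0) == 0)
		printf("current modulus: %d\n", mod);
	if (sysctlbyname("kern.randompid", NULL, NULL, &wanted,
	    sizeof(wanted)) != 0)
		perror("set kern.randompid");	/* requires root */
	return (0);
}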
*/ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > PID_MAX - 100) /* out of range */ pid = PID_MAX - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); int fork1(td, flags, pages, procp) struct thread *td; int flags; int pages; struct proc **procp; { struct proc *p1, *p2, *pptr; struct proc *newproc; int ok, trypid; static int curfail, pidchecked = 0; static struct timeval lastfail; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct thread *td2; struct sigacts *newsigacts; int error; /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { if ((p1->p_flag & P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); if (thread_single(SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } vm_forkproc(td, NULL, NULL, flags); /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd); fdfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent). */ if (flags & RFFDG) fdunshare(p1, td); if ((p1->p_flag & P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } *procp = NULL; return (0); } /* * Note 1:1 allows for forking with one thread coming out on the * other side with the expectation that the process is about to * exec. */ if (p1->p_flag & P_HADTHREADS) { /* * Idle the other threads for a second. * Since the user space is copied, it must remain stable. * In addition, all threads (from the user perspective) * need to either be suspended or in the kernel, * where they will try restart in the parent and will * be aborted in the child. */ PROC_LOCK(p1); if (thread_single(SINGLE_NO_EXIT)) { /* Abort. Someone else is single threading before us. */ PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); /* * All other activity in this process * is now suspended at the user boundary, * (or other safe places if we think of any). */ } /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); #ifdef MAC mac_init_proc(newproc); #endif #ifdef AUDIT audit_proc_alloc(newproc); #endif knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. Don't allow * a nonprivileged user to use the last ten processes; don't let root * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. */ sx_xlock(&allproc_lock); if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, SUSER_RUID) != 0) || nprocs >= maxproc) { error = EAGAIN; goto fail; } /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. 
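/*
 * Editor's note: a userland demonstration, not part of this change.  The
 * "can't copy and clear" check at the top of fork1() is directly visible
 * through rfork(2): RFFDG (copy the fd table) and RFCFDG (start with a
 * clean fd table) are mutually exclusive.
 */
#include <sys/types.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	if (rfork(RFPROC | RFFDG | RFCFDG) == -1 && errno == EINVAL)
		printf("RFFDG|RFCFDG rejected with EINVAL, as expected\n");
	return (0);
}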
* * XXXRW: Can we avoid privilege here if it's not needed? */ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, SUSER_RUID | SUSER_ALLOWJAIL); if (error == 0) ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0); else { PROC_LOCK(p1); ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, lim_cur(p1, RLIMIT_NPROC)); PROC_UNLOCK(p1); } if (!ok) { error = EAGAIN; goto fail; } /* * Increment the nprocs resource before blocking can occur. There * are hard-limits as to the number of processes that can run. */ nprocs++; /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= PID_MAX) { trypid = trypid % PID_MAX; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. */ p2 = LIST_FIRST(&allproc); again: for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { while (p2->p_pid == trypid || (p2->p_pgrp != NULL && (p2->p_pgrp->pg_id == trypid || (p2->p_session != NULL && p2->p_session->s_sid == trypid)))) { trypid++; if (trypid >= pidchecked) goto retry; } if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; if (p2->p_pgrp != NULL) { if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; if (p2->p_session != NULL && p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; p2 = LIST_FIRST(&zombproc); goto again; } } sx_sunlock(&proctree_lock); /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; p2 = newproc; p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG(pid, p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); p2->p_ucred = crhold(td->td_ucred); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (flags & RFCFDG) { fd = fdinit(p1->p_fd); fdtol = NULL; } else if (flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((flags & RFTHREAD) != 0) { /* * Shared file descriptor table and * shared process leaders. */ fdtol = p1->p_fdtol; FILEDESC_LOCK_FAST(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_UNLOCK_FAST(p1->p_fd); } else { /* * Shared file descriptor table, and * different process leaders */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. 
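/*
 * Editor's note: a simplified userland model, not part of this change.
 * It mirrors the trypid/pidchecked scan above: "used" stands in for the
 * pid/pgid/sid values found on allproc and zombproc, and the scan restarts
 * whenever bumping trypid invalidates the range already checked.
 */
#include <stdio.h>

#define MODEL_PID_MAX	99999

static int used[] = { 100, 101, 102, 500 };
#define NUSED	(int)(sizeof(used) / sizeof(used[0]))

static int
alloc_pid(int lastpid)
{
	int i, trypid, pidchecked;

	trypid = lastpid + 1;
retry:
	if (trypid >= MODEL_PID_MAX) {
		trypid = trypid % MODEL_PID_MAX;
		if (trypid < 100)
			trypid += 100;	/* skip long-lived daemons */
	}
	pidchecked = MODEL_PID_MAX;
	for (i = 0; i < NUSED; i++) {
		while (used[i] == trypid) {
			trypid++;
			if (trypid >= pidchecked)
				goto retry;	/* checked range invalid */
		}
		/* Remember the smallest in-use id above trypid. */
		if (used[i] > trypid && pidchecked > used[i])
			pidchecked = used[i];
	}
	return (trypid);
}

int
main(void)
{
	/* 100-102 are taken, so the next free pid after 99 is 103. */
	printf("next pid: %d\n", alloc_pid(99));
	return (0);
}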
* Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ td2 = FIRST_THREAD_IN_PROC(p2); /* Allocate and switch to an alternate kstack if specified. */ if (pages != 0) vm_thread_new_altkstack(td2, pages); PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); td2->td_sigstk = td->td_sigstk; td2->td_sigmask = td->td_sigmask; /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = 0; if (p1->p_flag & P_PROFIL) startprofclock(p2); mtx_lock_spin(&sched_lock); p2->p_sflag = PS_INMEM; /* * Allow the scheduler to adjust the priority of the child and * parent while we hold the sched_lock. */ sched_fork(td, td2); mtx_unlock_spin(&sched_lock); td2->td_ucred = crhold(p2->p_ucred); #ifdef AUDIT audit_proc_fork(p1, p2); #endif pargs_hold(p2->p_args); if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; /* * p_limit is copy-on-write. Bump its refcount. */ p2->p_limit = lim_hold(p1->p_limit); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs) */ if (p2->p_textvp) vref(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= td->td_pflags & TDP_ALTSTACK; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); callout_init(&p2->p_itcallout, CALLOUT_MPSAFE); #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. 
*/ mtx_lock(&ktrace_mtx); KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); if (p1->p_traceflag & KTRFAC_INHERIT) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracevp = p1->p_tracevp) != NULL) { VREF(p2->p_tracevp); KASSERT(p1->p_tracecred != NULL, ("ktrace vnode with no cred")); p2->p_tracecred = crhold(p1->p_tracecred); } } mtx_unlock(&ktrace_mtx); #endif /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if (flags & RFNOWAIT) pptr = initproc; else pptr = p1; p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, flags); if (flags == (RFFDG | RFPROC)) { atomic_add_int(&cnt.v_forks, 1); atomic_add_int(&cnt.v_forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { atomic_add_int(&cnt.v_vforks, 1); atomic_add_int(&cnt.v_vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { atomic_add_int(&cnt.v_kthreads, 1); atomic_add_int(&cnt.v_kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { atomic_add_int(&cnt.v_rforks, 1); atomic_add_int(&cnt.v_rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Both processes are set up, now check if any loadable modules want * to adjust anything. * What if they have an error? XXX */ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); /* * Set the child start time and mark the process as being complete. */ microuptime(&p2->p_stats->p_start); mtx_lock_spin(&sched_lock); p2->p_state = PRS_NORMAL; /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ if ((flags & RFSTOPPED) == 0) { TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); } mtx_unlock_spin(&sched_lock); /* * Now can be swapped. */ PROC_LOCK(p1); _PRELE(p1); /* * Tell any interested parties about the new process. */ KNOTE_LOCKED(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); /* * If other threads are waiting, let them continue now. */ if (p1->p_flag & P_HADTHREADS) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } /* * Return child proc pointer to parent. 
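/*
 * Editor's note: a userland demonstration, not part of this change.  The
 * P_PPWAIT msleep() loop above is what gives vfork(2) its semantics: the
 * parent is not released until the child calls _exit() or execve().
 */
#include <sys/types.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;

	if ((pid = vfork()) == 0) {
		/* Child: only _exit() and exec are safe after vfork(). */
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}
	printf("child %d has exec'd or exited; parent released\n", (int)pid);
	return (0);
}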
*/ *procp = p2; return (0); fail: sx_sunlock(&proctree_lock); if (ppsratecheck(&lastfail, &curfail, 1)) printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n", td->td_ucred->cr_ruid); sx_xunlock(&allproc_lock); #ifdef MAC mac_destroy_proc(newproc); #endif #ifdef AUDIT audit_proc_free(newproc); #endif uma_zfree(proc_zone, newproc); if (p1->p_flag & P_HADTHREADS) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(callout, arg, frame) void (*callout)(void *, struct trapframe *); void *arg; struct trapframe *frame; { struct proc *p; struct thread *td; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with sched_lock held but not recursed. */ td = curthread; p = td->td_proc; td->td_oncpu = PCPU_GET(cpuid); KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); sched_lock.mtx_lock = (uintptr_t)td; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)", td, td->td_sched, p->p_pid, p->p_comm); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } td = curthread; mtx_unlock_spin(&sched_lock); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KTHREAD) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", p->p_comm, p->p_pid); kthread_exit(0); } mtx_assert(&Giant, MA_NOTOWNED); EVENTHANDLER_INVOKE(schedtail, p); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. Giant is not held on entry, and must not * be held on return. This function is passed in to fork_exit() as the * first parameter and is called when returning to a new userland process. */ void fork_return(td, frame) struct thread *td; struct trapframe *frame; { userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/kern_jail.c =================================================================== --- head/sys/kern/kern_jail.c (revision 167210) +++ head/sys/kern/kern_jail.c (revision 167211) @@ -1,749 +1,745 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. 
Poul-Henning Kamp * ---------------------------------------------------------------------------- */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, "Jail rules"); int jail_set_hostname_allowed = 1; SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW, &jail_set_hostname_allowed, 0, "Processes in jail can set their hostnames"); int jail_socket_unixiproute_only = 1; SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW, &jail_socket_unixiproute_only, 0, "Processes in jail are limited to creating UNIX/IPv4/route sockets only"); int jail_sysvipc_allowed = 0; SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW, &jail_sysvipc_allowed, 0, "Processes in jail can use System V IPC primitives"); static int jail_enforce_statfs = 2; SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW, &jail_enforce_statfs, 0, "Processes in jail cannot see all mounted file systems"); int jail_allow_raw_sockets = 0; SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW, &jail_allow_raw_sockets, 0, "Prison root can create raw sockets"); int jail_chflags_allowed = 0; SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW, &jail_chflags_allowed, 0, "Processes in jail can alter system file flags"); /* allprison, lastprid, and prisoncount are protected by allprison_mtx. */ struct prisonlist allprison; struct mtx allprison_mtx; int lastprid = 0; int prisoncount = 0; static void init_prison(void *); static void prison_complete(void *context, int pending); static struct prison *prison_find(int); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS); static void init_prison(void *data __unused) { mtx_init(&allprison_mtx, "allprison", NULL, MTX_DEF); LIST_INIT(&allprison); } SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL); /* - * MPSAFE - * * struct jail_args { * struct jail *jail; * }; */ int jail(struct thread *td, struct jail_args *uap) { struct nameidata nd; struct prison *pr, *tpr; struct jail j; struct jail_attach_args jaa; int vfslocked, error, tryprid; error = copyin(uap->jail, &j, sizeof(j)); if (error) return (error); if (j.version != 0) return (EINVAL); MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF); pr->pr_ref = 1; error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0); if (error) goto e_killmtx; NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE, pr->pr_path, td); error = namei(&nd); if (error) goto e_killmtx; vfslocked = NDHASGIANT(&nd); pr->pr_root = nd.ni_vp; VOP_UNLOCK(nd.ni_vp, 0, td); NDFREE(&nd, NDF_ONLY_PNBUF); VFS_UNLOCK_GIANT(vfslocked); error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0); if (error) goto e_dropvnref; pr->pr_ip = j.ip_number; pr->pr_linux = NULL; pr->pr_securelevel = securelevel; /* Determine next pr_id and add prison to allprison list. 
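/*
 * Editor's note: an illustrative userland sketch, not part of this change.
 * The struct copied in above is filled out by a caller roughly like this
 * (this is the version-0, single-IPv4 struct jail of this era; the path
 * and hostname are hypothetical):
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <arpa/inet.h>
#include <stdio.h>

int
main(void)
{
	struct jail j;
	int jid;

	j.version = 0;			/* must be 0, see the check above */
	j.path = "/var/jail/demo";	/* becomes the prison root */
	j.hostname = "demo.example.org";
	j.ip_number = ntohl(inet_addr("192.0.2.10"));	/* host order */
	if ((jid = jail(&j)) == -1)
		perror("jail");
	else
		printf("created and attached to jail %d\n", jid);
	return (0);
}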
*/ mtx_lock(&allprison_mtx); tryprid = lastprid + 1; if (tryprid == JAIL_MAX) tryprid = 1; next: LIST_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_id == tryprid) { tryprid++; if (tryprid == JAIL_MAX) { mtx_unlock(&allprison_mtx); error = EAGAIN; goto e_dropvnref; } goto next; } } pr->pr_id = jaa.jid = lastprid = tryprid; LIST_INSERT_HEAD(&allprison, pr, pr_list); prisoncount++; mtx_unlock(&allprison_mtx); error = jail_attach(td, &jaa); if (error) goto e_dropprref; mtx_lock(&pr->pr_mtx); pr->pr_ref--; mtx_unlock(&pr->pr_mtx); td->td_retval[0] = jaa.jid; return (0); e_dropprref: mtx_lock(&allprison_mtx); LIST_REMOVE(pr, pr_list); prisoncount--; mtx_unlock(&allprison_mtx); e_dropvnref: vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount); vrele(pr->pr_root); VFS_UNLOCK_GIANT(vfslocked); e_killmtx: mtx_destroy(&pr->pr_mtx); FREE(pr, M_PRISON); return (error); } /* - * MPSAFE - * * struct jail_attach_args { * int jid; * }; */ int jail_attach(struct thread *td, struct jail_attach_args *uap) { struct proc *p; struct ucred *newcred, *oldcred; struct prison *pr; int vfslocked, error; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); p = td->td_proc; mtx_lock(&allprison_mtx); pr = prison_find(uap->jid); if (pr == NULL) { mtx_unlock(&allprison_mtx); return (EINVAL); } pr->pr_ref++; mtx_unlock(&pr->pr_mtx); mtx_unlock(&allprison_mtx); vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount); vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY, td); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_check_vnode_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root, 0, td); change_root(pr->pr_root, td); VFS_UNLOCK_GIANT(vfslocked); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; setsugid(p); crcopy(newcred, oldcred); newcred->cr_prison = pr; p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0, td); VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&pr->pr_mtx); pr->pr_ref--; mtx_unlock(&pr->pr_mtx); return (error); } /* * Returns a locked prison instance, or NULL on failure. 
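/*
 * Editor's note: a userland companion example, not part of this change.
 * jail_attach(2) as implemented above is used by a privileged process to
 * place itself inside an existing prison by jid (the jid here is
 * hypothetical, e.g. taken from the jail(2) return value):
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int jid = 1;

	if (jail_attach(jid) == -1) {
		perror("jail_attach");
		return (1);
	}
	/* From here on, the process is chroot'ed into the prison root. */
	execl("/bin/sh", "sh", (char *)NULL);
	return (1);
}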
*/ static struct prison * prison_find(int prid) { struct prison *pr; mtx_assert(&allprison_mtx, MA_OWNED); LIST_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id == prid) { mtx_lock(&pr->pr_mtx); return (pr); } } return (NULL); } void prison_free(struct prison *pr) { mtx_lock(&allprison_mtx); mtx_lock(&pr->pr_mtx); pr->pr_ref--; if (pr->pr_ref == 0) { LIST_REMOVE(pr, pr_list); mtx_unlock(&pr->pr_mtx); prisoncount--; mtx_unlock(&allprison_mtx); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; } mtx_unlock(&pr->pr_mtx); mtx_unlock(&allprison_mtx); } static void prison_complete(void *context, int pending) { struct prison *pr; int vfslocked; pr = (struct prison *)context; vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount); vrele(pr->pr_root); VFS_UNLOCK_GIANT(vfslocked); mtx_destroy(&pr->pr_mtx); if (pr->pr_linux != NULL) FREE(pr->pr_linux, M_PRISON); FREE(pr, M_PRISON); } void prison_hold(struct prison *pr) { mtx_lock(&pr->pr_mtx); pr->pr_ref++; mtx_unlock(&pr->pr_mtx); } u_int32_t prison_getip(struct ucred *cred) { return (cred->cr_prison->pr_ip); } int prison_ip(struct ucred *cred, int flag, u_int32_t *ip) { u_int32_t tmp; if (!jailed(cred)) return (0); if (flag) tmp = *ip; else tmp = ntohl(*ip); if (tmp == INADDR_ANY) { if (flag) *ip = cred->cr_prison->pr_ip; else *ip = htonl(cred->cr_prison->pr_ip); return (0); } if (tmp == INADDR_LOOPBACK) { if (flag) *ip = cred->cr_prison->pr_ip; else *ip = htonl(cred->cr_prison->pr_ip); return (0); } if (cred->cr_prison->pr_ip != tmp) return (1); return (0); } void prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip) { u_int32_t tmp; if (!jailed(cred)) return; if (flag) tmp = *ip; else tmp = ntohl(*ip); if (tmp == INADDR_LOOPBACK) { if (flag) *ip = cred->cr_prison->pr_ip; else *ip = htonl(cred->cr_prison->pr_ip); return; } return; } int prison_if(struct ucred *cred, struct sockaddr *sa) { struct sockaddr_in *sai; int ok; sai = (struct sockaddr_in *)sa; if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only) ok = 1; else if (sai->sin_family != AF_INET) ok = 0; else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr)) ok = 1; else ok = 0; return (ok); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { if (jailed(cred1)) { if (!jailed(cred2)) return (ESRCH); if (cred2->cr_prison != cred1->cr_prison) return (ESRCH); } return (0); } /* * Return 1 if the passed credential is in a jail, otherwise 0. */ int jailed(struct ucred *cred) { return (cred->cr_prison != NULL); } /* * Return the correct hostname for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { if (jailed(cred)) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_host, size); mtx_unlock(&cred->cr_prison->pr_mtx); } else strlcpy(buf, hostname, size); } /* * Determine whether the subject represented by cred can "see" * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; if (!jailed(cred) || jail_enforce_statfs == 0) return (0); pr = cred->cr_prison; if (pr->pr_root->v_mount == mp) return (0); if (jail_enforce_statfs == 2) return (ENOENT); /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. 
* This is an ugly check, but this is the only situation in which the jail's * directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have a situation where the jail's root directory * is "/some/path" and the mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; if (!jailed(cred) || jail_enforce_statfs == 0) return; pr = cred->cr_prison; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear the current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear the current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check whether permission for a specific privilege is granted within jail. * We have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { if (!jailed(cred)) return (0); switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling.
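/*
 * Editor's note: a simplified userland model, not part of this change.
 * The mount visibility test in prison_canseemount() above reduces to a
 * prefix comparison plus a boundary guard:
 */
#include <stdio.h>
#include <string.h>

/* Return 0 if a mount on "mnton" is visible from a jail rooted at "root". */
static int
canseemount(const char *root, const char *mnton)
{
	size_t len;

	if (strcmp(root, "/") == 0)
		return (0);		/* jail rooted at "/" sees all */
	len = strlen(root);
	if (strncmp(root, mnton, len) != 0)
		return (-1);
	/* Reject "/some/pathpath" when the root is "/some/path". */
	if (mnton[len] != '\0' && mnton[len] != '/')
		return (-1);
	return (0);
}

int
main(void)
{
	printf("%d\n", canseemount("/some/path", "/some/path/sub"));	/* 0 */
	printf("%d\n", canseemount("/some/path", "/some/pathpath"));	/* -1 */
	return (0);
}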
*/ case PRIV_SCHED_DIFFCRED: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_LOOKUP: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STICKYFILE: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (jail_chflags_allowed) return (0); else return (EPERM); /* * Allow jailed root to bind reserved ports. */ case PRIV_NETINET_RESERVEDPORT: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (jail_allow_raw_sockets) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges, many system * configuration privileges. */ return (EPERM); } } static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp, *sxp; struct prison *pr; int count, error; if (jailed(req->td->td_ucred)) return (0); retry: mtx_lock(&allprison_mtx); count = prisoncount; mtx_unlock(&allprison_mtx); if (count == 0) return (0); sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO); mtx_lock(&allprison_mtx); if (count != prisoncount) { mtx_unlock(&allprison_mtx); free(sxp, M_TEMP); goto retry; } LIST_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); xp->pr_version = XPRISON_VERSION; xp->pr_id = pr->pr_id; strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path)); strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host)); xp->pr_ip = pr->pr_ip; mtx_unlock(&pr->pr_mtx); xp++; } mtx_unlock(&allprison_mtx); error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count); free(sxp, M_TEMP); if (error) return (error); return (0); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); Index: head/sys/kern/kern_ktrace.c =================================================================== --- head/sys/kern/kern_ktrace.c (revision 167210) +++ head/sys/kern/kern_ktrace.c (revision 167211) @@ -1,1028 +1,1018 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ktrace facility allows the tracing of certain key events in user space * processes, such as system calls, signal delivery, context switches, and * user generated events using utrace(2). It works by streaming event * records and data to a vnode associated with the process using the * ktrace(2) system call. In general, records can be written directly from * the context that generates the event. One important exception to this is * during a context switch, where sleeping is not permitted. To handle this * case, trace events are generated using in-kernel ktr_request records, and * then delivered to disk at a convenient moment -- either immediately, the * next traceable event, at system call return, or at process exit. * * When dealing with multiple threads or processes writing to the same event * log, ordering guarantees are weak: specifically, if an event has multiple * records (i.e., system call enter and return), they may be interlaced with * records from another event. Process and thread ID information is provided * in the record, and user applications can de-interlace events if required. 
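/*
 * Editor's note: a userland demonstration, not part of this change.  The
 * "user generated events" mentioned above are injected with utrace(2) and
 * appear as USER records in kdump(1) output; the call is a no-op unless
 * the process is being traced with the KTRFAC_USER point enabled.
 */
#include <sys/param.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <sys/ktrace.h>

int
main(void)
{
	const char msg[] = "checkpoint A";

	utrace(msg, sizeof(msg));
	return (0);
}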
*/ static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); #ifdef KTRACE #ifndef KTRACE_REQUEST_POOL #define KTRACE_REQUEST_POOL 100 #endif struct ktr_request { struct ktr_header ktr_header; void *ktr_buffer; union { struct ktr_syscall ktr_syscall; struct ktr_sysret ktr_sysret; struct ktr_genio ktr_genio; struct ktr_psig ktr_psig; struct ktr_csw ktr_csw; } ktr_data; STAILQ_ENTRY(ktr_request) ktr_list; }; static int data_lengths[] = { 0, /* none */ offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */ sizeof(struct ktr_sysret), /* KTR_SYSRET */ 0, /* KTR_NAMEI */ sizeof(struct ktr_genio), /* KTR_GENIO */ sizeof(struct ktr_psig), /* KTR_PSIG */ sizeof(struct ktr_csw), /* KTR_CSW */ 0 /* KTR_USER */ }; static STAILQ_HEAD(, ktr_request) ktr_free; static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options"); static u_int ktr_requestpool = KTRACE_REQUEST_POOL; TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool); static u_int ktr_geniosize = PAGE_SIZE; TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize); SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize, 0, "Maximum size of genio event payload"); static int print_message = 1; struct mtx ktrace_mtx; static struct sx ktrace_sx; static void ktrace_init(void *dummy); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS); static u_int ktrace_resize_pool(u_int newsize); static struct ktr_request *ktr_getrequest(int type); static void ktr_submitrequest(struct thread *td, struct ktr_request *req); static void ktr_freerequest(struct ktr_request *req); static void ktr_writerequest(struct thread *td, struct ktr_request *req); static int ktrcanset(struct thread *,struct proc *); static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *); static int ktrops(struct thread *,struct proc *,int,int,struct vnode *); /* * ktrace itself generates events, such as context switches, which we do not * wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine * whether or not it is in a region where tracing of events should be * suppressed. */ static void ktrace_enter(struct thread *td) { KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set")); td->td_pflags |= TDP_INKTRACE; } static void ktrace_exit(struct thread *td) { KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set")); td->td_pflags &= ~TDP_INKTRACE; } static void ktrace_assert(struct thread *td) { KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set")); } static void ktrace_init(void *dummy) { struct ktr_request *req; int i; mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET); sx_init(&ktrace_sx, "ktrace_sx"); STAILQ_INIT(&ktr_free); for (i = 0; i < ktr_requestpool; i++) { req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); } } SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL); static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS) { struct thread *td; u_int newsize, oldsize, wantsize; int error; /* Handle easy read-only case first to avoid warnings from GCC. 
*/ if (!req->newptr) { mtx_lock(&ktrace_mtx); oldsize = ktr_requestpool; mtx_unlock(&ktrace_mtx); return (SYSCTL_OUT(req, &oldsize, sizeof(u_int))); } error = SYSCTL_IN(req, &wantsize, sizeof(u_int)); if (error) return (error); td = curthread; ktrace_enter(td); mtx_lock(&ktrace_mtx); oldsize = ktr_requestpool; newsize = ktrace_resize_pool(wantsize); mtx_unlock(&ktrace_mtx); ktrace_exit(td); error = SYSCTL_OUT(req, &oldsize, sizeof(u_int)); if (error) return (error); if (wantsize > oldsize && newsize < wantsize) return (ENOSPC); return (0); } SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW, &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", ""); static u_int ktrace_resize_pool(u_int newsize) { struct ktr_request *req; int bound; mtx_assert(&ktrace_mtx, MA_OWNED); print_message = 1; bound = newsize - ktr_requestpool; if (bound == 0) return (ktr_requestpool); if (bound < 0) /* Shrink pool down to newsize if possible. */ while (bound++ < 0) { req = STAILQ_FIRST(&ktr_free); if (req == NULL) return (ktr_requestpool); STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); ktr_requestpool--; mtx_unlock(&ktrace_mtx); free(req, M_KTRACE); mtx_lock(&ktrace_mtx); } else /* Grow pool up to newsize. */ while (bound-- > 0) { mtx_unlock(&ktrace_mtx); req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK); mtx_lock(&ktrace_mtx); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); ktr_requestpool++; } return (ktr_requestpool); } static struct ktr_request * ktr_getrequest(int type) { struct ktr_request *req; struct thread *td = curthread; struct proc *p = td->td_proc; int pm; ktrace_enter(td); /* XXX: In caller instead? */ mtx_lock(&ktrace_mtx); if (!KTRCHECK(td, type)) { mtx_unlock(&ktrace_mtx); ktrace_exit(td); return (NULL); } req = STAILQ_FIRST(&ktr_free); if (req != NULL) { STAILQ_REMOVE_HEAD(&ktr_free, ktr_list); req->ktr_header.ktr_type = type; if (p->p_traceflag & KTRFAC_DROP) { req->ktr_header.ktr_type |= KTR_DROP; p->p_traceflag &= ~KTRFAC_DROP; } mtx_unlock(&ktrace_mtx); microtime(&req->ktr_header.ktr_time); req->ktr_header.ktr_pid = p->p_pid; req->ktr_header.ktr_tid = td->td_tid; bcopy(p->p_comm, req->ktr_header.ktr_comm, MAXCOMLEN + 1); req->ktr_buffer = NULL; req->ktr_header.ktr_len = 0; } else { p->p_traceflag |= KTRFAC_DROP; pm = print_message; print_message = 0; mtx_unlock(&ktrace_mtx); if (pm) printf("Out of ktrace request objects.\n"); ktrace_exit(td); } return (req); } /* * Some trace generation environments don't permit direct access to VFS, * such as during a context switch where sleeping is not allowed. Under these * circumstances, queue a request to the thread to be written asynchronously * later. */ static void ktr_enqueuerequest(struct thread *td, struct ktr_request *req) { mtx_lock(&ktrace_mtx); STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list); mtx_unlock(&ktrace_mtx); ktrace_exit(td); } /* * Drain any pending ktrace records from the per-thread queue to disk. This * is used both internally before committing other records, and also on * system call return. We drain all the ones we can find at the time when * drain is requested, but don't keep draining after that as those events * may be approximately "after" the current event. */ static void ktr_drain(struct thread *td) { struct ktr_request *queued_req; STAILQ_HEAD(, ktr_request) local_queue; ktrace_assert(td); sx_assert(&ktrace_sx, SX_XLOCKED); STAILQ_INIT(&local_queue); /* XXXRW: needed?
*/ if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) { mtx_lock(&ktrace_mtx); STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr); mtx_unlock(&ktrace_mtx); while ((queued_req = STAILQ_FIRST(&local_queue))) { STAILQ_REMOVE_HEAD(&local_queue, ktr_list); ktr_writerequest(td, queued_req); ktr_freerequest(queued_req); } } } /* * Submit a trace record for immediate commit to disk -- to be used only * where entering VFS is OK. First drain any pending records that may have * been cached in the thread. */ static void ktr_submitrequest(struct thread *td, struct ktr_request *req) { ktrace_assert(td); sx_xlock(&ktrace_sx); ktr_drain(td); ktr_writerequest(td, req); ktr_freerequest(req); sx_xunlock(&ktrace_sx); ktrace_exit(td); } static void ktr_freerequest(struct ktr_request *req) { if (req->ktr_buffer != NULL) free(req->ktr_buffer, M_KTRACE); mtx_lock(&ktrace_mtx); STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list); mtx_unlock(&ktrace_mtx); } -/* - * MPSAFE - */ void ktrsyscall(code, narg, args) int code, narg; register_t args[]; { struct ktr_request *req; struct ktr_syscall *ktp; size_t buflen; char *buf = NULL; buflen = sizeof(register_t) * narg; if (buflen > 0) { buf = malloc(buflen, M_KTRACE, M_WAITOK); bcopy(args, buf, buflen); } req = ktr_getrequest(KTR_SYSCALL); if (req == NULL) { if (buf != NULL) free(buf, M_KTRACE); return; } ktp = &req->ktr_data.ktr_syscall; ktp->ktr_code = code; ktp->ktr_narg = narg; if (buflen > 0) { req->ktr_header.ktr_len = buflen; req->ktr_buffer = buf; } ktr_submitrequest(curthread, req); } -/* - * MPSAFE - */ void ktrsysret(code, error, retval) int code, error; register_t retval; { struct ktr_request *req; struct ktr_sysret *ktp; req = ktr_getrequest(KTR_SYSRET); if (req == NULL) return; ktp = &req->ktr_data.ktr_sysret; ktp->ktr_code = code; ktp->ktr_error = error; ktp->ktr_retval = retval; /* what about val2 ? */ ktr_submitrequest(curthread, req); } /* * When a process exits, drain per-process asynchronous trace records. */ void ktrprocexit(struct thread *td) { ktrace_enter(td); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); ktrace_exit(td); } /* * When a thread returns, drain any asynchronous records generated by the * system call. 
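/*
 * Editor's note: a standalone userland model, not part of this change.
 * It shows the drain pattern used by ktr_drain() above: splice the shared
 * queue onto a local queue while the lock is held, then process the local
 * queue without the lock.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct rec {
	int			id;
	STAILQ_ENTRY(rec)	link;
};
STAILQ_HEAD(recq, rec);

static struct recq pending = STAILQ_HEAD_INITIALIZER(pending);

static void
drain(void)
{
	struct recq local = STAILQ_HEAD_INITIALIZER(local);
	struct rec *r;

	/* In the kernel, this splice happens with ktrace_mtx held. */
	STAILQ_CONCAT(&local, &pending);	/* leaves "pending" empty */
	while ((r = STAILQ_FIRST(&local)) != NULL) {
		STAILQ_REMOVE_HEAD(&local, link);
		printf("write record %d\n", r->id);	/* ktr_writerequest */
		free(r);				/* ktr_freerequest */
	}
}

int
main(void)
{
	struct rec *r;
	int i;

	for (i = 0; i < 3; i++) {
		r = malloc(sizeof(*r));
		r->id = i;
		STAILQ_INSERT_TAIL(&pending, r, link);
	}
	drain();
	return (0);
}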
*/ void ktruserret(struct thread *td) { ktrace_enter(td); sx_xlock(&ktrace_sx); ktr_drain(td); sx_xunlock(&ktrace_sx); ktrace_exit(td); } void ktrnamei(path) char *path; { struct ktr_request *req; int namelen; char *buf = NULL; namelen = strlen(path); if (namelen > 0) { buf = malloc(namelen, M_KTRACE, M_WAITOK); bcopy(path, buf, namelen); } req = ktr_getrequest(KTR_NAMEI); if (req == NULL) { if (buf != NULL) free(buf, M_KTRACE); return; } if (namelen > 0) { req->ktr_header.ktr_len = namelen; req->ktr_buffer = buf; } ktr_submitrequest(curthread, req); } void ktrgenio(fd, rw, uio, error) int fd; enum uio_rw rw; struct uio *uio; int error; { struct ktr_request *req; struct ktr_genio *ktg; int datalen; char *buf; if (error) { free(uio, M_IOV); return; } uio->uio_offset = 0; uio->uio_rw = UIO_WRITE; datalen = imin(uio->uio_resid, ktr_geniosize); buf = malloc(datalen, M_KTRACE, M_WAITOK); error = uiomove(buf, datalen, uio); free(uio, M_IOV); if (error) { free(buf, M_KTRACE); return; } req = ktr_getrequest(KTR_GENIO); if (req == NULL) { free(buf, M_KTRACE); return; } ktg = &req->ktr_data.ktr_genio; ktg->ktr_fd = fd; ktg->ktr_rw = rw; req->ktr_header.ktr_len = datalen; req->ktr_buffer = buf; ktr_submitrequest(curthread, req); } void ktrpsig(sig, action, mask, code) int sig; sig_t action; sigset_t *mask; int code; { struct ktr_request *req; struct ktr_psig *kp; req = ktr_getrequest(KTR_PSIG); if (req == NULL) return; kp = &req->ktr_data.ktr_psig; kp->signo = (char)sig; kp->action = action; kp->mask = *mask; kp->code = code; ktr_enqueuerequest(curthread, req); } void ktrcsw(out, user) int out, user; { struct ktr_request *req; struct ktr_csw *kc; req = ktr_getrequest(KTR_CSW); if (req == NULL) return; kc = &req->ktr_data.ktr_csw; kc->out = out; kc->user = user; ktr_enqueuerequest(curthread, req); } #endif /* KTRACE */ /* Interface and common routines */ /* * ktrace system call - * - * MPSAFE */ #ifndef _SYS_SYSPROTO_H_ struct ktrace_args { char *fname; int ops; int facs; int pid; }; #endif /* ARGSUSED */ int ktrace(td, uap) struct thread *td; register struct ktrace_args *uap; { #ifdef KTRACE register struct vnode *vp = NULL; register struct proc *p; struct pgrp *pg; int facs = uap->facs & ~KTRFAC_ROOT; int ops = KTROP(uap->ops); int descend = uap->ops & KTRFLAG_DESCEND; int nfound, ret = 0; int flags, error = 0, vfslocked; struct nameidata nd; struct ucred *cred; /* * Need something to (un)trace. */ if (ops != KTROP_CLEARFILE && facs == 0) return (EINVAL); ktrace_enter(td); if (ops != KTROP_CLEAR) { /* * an operation which requires a file argument. */ NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE, uap->fname, td); flags = FREAD | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, 0, -1); if (error) { ktrace_exit(td); return (error); } vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); ktrace_exit(td); return (EACCES); } VFS_UNLOCK_GIANT(vfslocked); } /* * Clear all uses of the tracefile. 
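 *
 * KTROP_CLEARFILE detaches the trace vnode from every process tracing
 * to it (permission permitting); the matching vrele() calls are
 * deferred until after the allproc walk so that no process lock is
 * held across VFS operations.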
*/ if (ops == KTROP_CLEARFILE) { int vrele_count; vrele_count = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_tracevp == vp) { if (ktrcanset(td, p)) { mtx_lock(&ktrace_mtx); cred = p->p_tracecred; p->p_tracecred = NULL; p->p_tracevp = NULL; p->p_traceflag = 0; mtx_unlock(&ktrace_mtx); vrele_count++; crfree(cred); } else error = EPERM; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); if (vrele_count > 0) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); while (vrele_count-- > 0) vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } goto done; } /* * do it */ sx_slock(&proctree_lock); if (uap->pid < 0) { /* * by process group */ pg = pgfind(-uap->pid); if (pg == NULL) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } /* * ktrops() may call vrele(). Lock pg_members * by the proctree_lock rather than pg_mtx. */ PGRP_UNLOCK(pg); nfound = 0; LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p_cansee(td, p) != 0) { PROC_UNLOCK(p); continue; } PROC_UNLOCK(p); nfound++; if (descend) ret |= ktrsetchildren(td, p, ops, facs, vp); else ret |= ktrops(td, p, ops, facs, vp); } if (nfound == 0) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } } else { /* * by pid */ p = pfind(uap->pid); if (p == NULL) { sx_sunlock(&proctree_lock); error = ESRCH; goto done; } error = p_cansee(td, p); /* * The slock of the proctree lock will keep this process * from going away, so unlocking the proc here is ok. */ PROC_UNLOCK(p); if (error) { sx_sunlock(&proctree_lock); goto done; } if (descend) ret |= ktrsetchildren(td, p, ops, facs, vp); else ret |= ktrops(td, p, ops, facs, vp); } sx_sunlock(&proctree_lock); if (!ret) error = EPERM; done: if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); (void) vn_close(vp, FWRITE, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); } ktrace_exit(td); return (error); #else /* !KTRACE */ return (ENOSYS); #endif /* KTRACE */ } /* * utrace system call - * - * MPSAFE */ /* ARGSUSED */ int utrace(td, uap) struct thread *td; register struct utrace_args *uap; { #ifdef KTRACE struct ktr_request *req; void *cp; int error; if (!KTRPOINT(td, KTR_USER)) return (0); if (uap->len > KTR_USER_MAXLEN) return (EINVAL); cp = malloc(uap->len, M_KTRACE, M_WAITOK); error = copyin(uap->addr, cp, uap->len); if (error) { free(cp, M_KTRACE); return (error); } req = ktr_getrequest(KTR_USER); if (req == NULL) { free(cp, M_KTRACE); return (ENOMEM); } req->ktr_buffer = cp; req->ktr_header.ktr_len = uap->len; ktr_submitrequest(td, req); return (0); #else /* !KTRACE */ return (ENOSYS); #endif /* KTRACE */ } #ifdef KTRACE static int ktrops(td, p, ops, facs, vp) struct thread *td; struct proc *p; int ops, facs; struct vnode *vp; { struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; PROC_LOCK(p); if (!ktrcanset(td, p)) { PROC_UNLOCK(p); return (0); } mtx_lock(&ktrace_mtx); if (ops == KTROP_SET) { if (p->p_tracevp != vp) { /* * if trace file already in use, relinquish below */ tracevp = p->p_tracevp; VREF(vp); p->p_tracevp = vp; } if (p->p_tracecred != td->td_ucred) { tracecred = p->p_tracecred; p->p_tracecred = crhold(td->td_ucred); } p->p_traceflag |= facs; if (priv_check_cred(td->td_ucred, PRIV_KTRACE, SUSER_ALLOWJAIL) == 0) p->p_traceflag |= KTRFAC_ROOT; } else { /* KTROP_CLEAR */ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { /* no more tracing */ p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; } } mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) { int vfslocked; vfslocked = 
VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(vfslocked); } if (tracecred != NULL) crfree(tracecred); return (1); } static int ktrsetchildren(td, top, ops, facs, vp) struct thread *td; struct proc *top; int ops, facs; struct vnode *vp; { register struct proc *p; register int ret = 0; p = top; sx_assert(&proctree_lock, SX_LOCKED); for (;;) { ret |= ktrops(td, p, ops, facs, vp); /* * If this process has children, descend to them next, * otherwise do any siblings, and if done with this level, * follow back up the tree (but not past top). */ if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) return (ret); if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } /*NOTREACHED*/ } static void ktr_writerequest(struct thread *td, struct ktr_request *req) { struct ktr_header *kth; struct vnode *vp; struct proc *p; struct ucred *cred; struct uio auio; struct iovec aiov[3]; struct mount *mp; int datalen, buflen, vrele_count; int error, vfslocked; /* * We hold the vnode and credential for use in I/O in case ktrace is * disabled on the process as we write out the request. * * XXXRW: This is not ideal: we could end up performing a write after * the vnode has been closed. */ mtx_lock(&ktrace_mtx); vp = td->td_proc->p_tracevp; if (vp != NULL) VREF(vp); cred = td->td_proc->p_tracecred; if (cred != NULL) crhold(cred); mtx_unlock(&ktrace_mtx); /* * If vp is NULL, the vp has been cleared out from under this * request, so just drop it. Make sure the credential and vnode are * in sync: we should have both or neither. */ if (vp == NULL) { KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL")); return; } KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL")); kth = &req->ktr_header; datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP]; buflen = kth->ktr_len; auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; aiov[0].iov_base = (caddr_t)kth; aiov[0].iov_len = sizeof(struct ktr_header); auio.uio_resid = sizeof(struct ktr_header); auio.uio_iovcnt = 1; auio.uio_td = td; if (datalen != 0) { aiov[1].iov_base = (caddr_t)&req->ktr_data; aiov[1].iov_len = datalen; auio.uio_resid += datalen; auio.uio_iovcnt++; kth->ktr_len += datalen; } if (buflen != 0) { KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write")); aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer; aiov[auio.uio_iovcnt].iov_len = buflen; auio.uio_resid += buflen; auio.uio_iovcnt++; } vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); (void)VOP_LEASE(vp, td, cred, LEASE_WRITE); #ifdef MAC error = mac_check_vnode_write(cred, NOCRED, vp); if (error == 0) #endif error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); if (!error) return; /* * If error encountered, give up tracing on this vnode. We defer * all the vrele()'s on the vnode until after we are finished walking * the various lists to avoid needlessly holding locks. */ log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", error); vrele_count = 0; /* * First, clear this vnode from being used by any processes in the * system. * XXX - If one process gets an EPERM writing to the vnode, should * we really do this? Other processes might have suitable * credentials for the operation. 
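 *
 * The tracing credential, by contrast, is dropped inside the loop:
 * each crfree() happens after PROC_UNLOCK(), so no process lock is
 * held across it.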
*/ cred = NULL; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_tracevp == vp) { mtx_lock(&ktrace_mtx); p->p_tracevp = NULL; p->p_traceflag = 0; cred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); vrele_count++; } PROC_UNLOCK(p); if (cred != NULL) { crfree(cred); cred = NULL; } } sx_sunlock(&allproc_lock); /* * We can't clear any pending requests in threads that have cached * them but not yet committed them, as those are per-thread. The * thread will have to clear it itself on system call return. */ vfslocked = VFS_LOCK_GIANT(vp->v_mount); while (vrele_count-- > 0) vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } /* * Return true if caller has permission to set the ktracing state * of target. Essentially, the target can't possess any * more permissions than the caller. KTRFAC_ROOT signifies that * root previously set the tracing status on the target process, and * so, only root may further change it. */ static int ktrcanset(td, targetp) struct thread *td; struct proc *targetp; { PROC_LOCK_ASSERT(targetp, MA_OWNED); if (targetp->p_traceflag & KTRFAC_ROOT && priv_check_cred(td->td_ucred, PRIV_KTRACE, SUSER_ALLOWJAIL)) return (0); if (p_candebug(td, targetp) != 0) return (0); return (1); } #endif /* KTRACE */ Index: head/sys/kern/kern_linker.c =================================================================== --- head/sys/kern/kern_linker.c (revision 167210) +++ head/sys/kern/kern_linker.c (revision 167211) @@ -1,2022 +1,1995 @@ /*- * Copyright (c) 1997-2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "linker_if.h" #ifdef HWPMC_HOOKS #include #endif #ifdef KLD_DEBUG int kld_debug = 0; #endif #define KLD_LOCK() sx_xlock(&kld_sx) #define KLD_UNLOCK() sx_xunlock(&kld_sx) #define KLD_LOCKED() sx_xlocked(&kld_sx) #define KLD_LOCK_ASSERT() do { \ if (!cold) \ sx_assert(&kld_sx, SX_XLOCKED); \ } while (0) /* * static char *linker_search_path(const char *name, struct mod_depend * *verinfo); */ static const char *linker_basename(const char *path); /* * Find a currently loaded file given its filename. */ static linker_file_t linker_find_file_by_name(const char* _filename); /* * Find a currently loaded file given its file id. */ static linker_file_t linker_find_file_by_id(int _fileid); /* Metadata from the static kernel */ SET_DECLARE(modmetadata_set, struct mod_metadata); MALLOC_DEFINE(M_LINKER, "linker", "kernel linker"); linker_file_t linker_kernel_file; static struct sx kld_sx; /* kernel linker lock */ static linker_class_list_t classes; static linker_file_list_t linker_files; static int next_file_id = 1; static int linker_no_more_classes = 0; #define LINKER_GET_NEXT_FILE_ID(a) do { \ linker_file_t lftmp; \ \ KLD_LOCK_ASSERT(); \ retry: \ TAILQ_FOREACH(lftmp, &linker_files, link) { \ if (next_file_id == lftmp->id) { \ next_file_id++; \ goto retry; \ } \ } \ (a) = next_file_id; \ } while(0) /* XXX wrong name; we're looking at version provision tags here, not modules */ typedef TAILQ_HEAD(, modlist) modlisthead_t; struct modlist { TAILQ_ENTRY(modlist) link; /* chain together all modules */ linker_file_t container; const char *name; int version; }; typedef struct modlist *modlist_t; static modlisthead_t found_modules; static int linker_file_add_dependency(linker_file_t file, linker_file_t dep); static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char* name, int deps); static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, struct mod_depend *verinfo, struct linker_file **lfpp); static modlist_t modlist_lookup2(const char *name, struct mod_depend *verinfo); static char * linker_strdup(const char *str) { char *result; if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL) strcpy(result, str); return (result); } static void linker_init(void *arg) { sx_init(&kld_sx, "kernel linker"); TAILQ_INIT(&classes); TAILQ_INIT(&linker_files); } SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0) static void linker_stop_class_add(void *arg) { linker_no_more_classes = 1; } SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL) int linker_add_class(linker_class_t lc) { /* * We disallow any class registration past SI_ORDER_ANY * of SI_SUB_KLD. We bump the reference count to keep the * ops from being freed. 
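 *
 * Classes are never deregistered, so the extra reference taken on the
 * compiled kobj ops is intentionally permanent.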
*/ if (linker_no_more_classes == 1) return (EPERM); kobj_class_compile((kobj_class_t) lc); ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */ TAILQ_INSERT_TAIL(&classes, lc, link); return (0); } static void linker_file_sysinit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", lf->filename)); if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0) return; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); } static void linker_file_sysuninit(linker_file_t lf) { struct sysinit **start, **stop, **sipp, **xipp, *save; KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", lf->filename)); if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop, NULL) != 0) return; /* * Perform a reverse bubble sort of the system initialization objects * by their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the operation * which ensures continued function. */ for (sipp = start; sipp < stop; sipp++) { for (xipp = sipp + 1; xipp < stop; xipp++) { if ((*sipp)->subsystem > (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order >= (*xipp)->order)) continue; /* skip */ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. 
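 *
 * (This mirrors linker_file_sysinit(), but with the sort comparison
 * inverted, so SYSUNINITs run in reverse subsystem/order sequence.)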
*/ mtx_lock(&Giant); for (sipp = start; sipp < stop; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s) */ /* Call function */ (*((*sipp)->func)) ((*sipp)->udata); } mtx_unlock(&Giant); } static void linker_file_register_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_register_sysctls: registering SYSCTLs for %s\n", lf->filename)); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; mtx_lock(&Giant); for (oidp = start; oidp < stop; oidp++) sysctl_register_oid(*oidp); mtx_unlock(&Giant); } static void linker_file_unregister_sysctls(linker_file_t lf) { struct sysctl_oid **start, **stop, **oidp; KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs" " for %s\n", lf->filename)); if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0) return; mtx_lock(&Giant); for (oidp = start; oidp < stop; oidp++) sysctl_unregister_oid(*oidp); mtx_unlock(&Giant); } static int linker_file_register_modules(linker_file_t lf) { struct mod_metadata **start, **stop, **mdp; const moduledata_t *moddata; int first_error, error; KLD_DPF(FILE, ("linker_file_register_modules: registering modules" " in %s\n", lf->filename)); if (linker_file_lookup_set(lf, "modmetadata_set", &start, &stop, NULL) != 0) { /* * This fallback should be unnecessary, but if we get booted * from boot2 instead of loader and we are missing our * metadata then we have to try the best we can. */ if (lf == linker_kernel_file) { start = SET_BEGIN(modmetadata_set); stop = SET_LIMIT(modmetadata_set); } else return (0); } first_error = 0; for (mdp = start; mdp < stop; mdp++) { if ((*mdp)->md_type != MDT_MODULE) continue; moddata = (*mdp)->md_data; KLD_DPF(FILE, ("Registering module %s in %s\n", moddata->name, lf->filename)); error = module_register(moddata, lf); if (error) { printf("Module %s failed to register: %d\n", moddata->name, error); if (first_error == 0) first_error = error; } } return (first_error); } static void linker_init_kernel_modules(void) { linker_file_register_modules(linker_kernel_file); } SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules, 0) static int linker_load_file(const char *filename, linker_file_t *result) { linker_class_t lc; linker_file_t lf; int foundfile, error; /* Refuse to load modules if securelevel raised */ if (securelevel > 0) return (EPERM); KLD_LOCK_ASSERT(); lf = linker_find_file_by_name(filename); if (lf) { KLD_DPF(FILE, ("linker_load_file: file %s is already loaded," " incrementing refs\n", filename)); *result = lf; lf->refs++; return (0); } foundfile = 0; error = 0; /* * We do not need to protect (lock) classes here because there is * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY) * and there is no class deregistration mechanism at this time. */ TAILQ_FOREACH(lc, &classes, link) { KLD_DPF(FILE, ("linker_load_file: trying to load %s\n", filename)); error = LINKER_LOAD_FILE(lc, filename, &lf); /* * If we got something other than ENOENT, then it exists but * we cannot load it for some other reason. */ if (error != ENOENT) foundfile = 1; if (lf) { error = linker_file_register_modules(lf); if (error == EEXIST) { linker_file_unload(lf, LINKER_UNLOAD_FORCE); return (error); } KLD_UNLOCK(); linker_file_register_sysctls(lf); linker_file_sysinit(lf); KLD_LOCK(); lf->flags |= LINKER_FILE_LINKED; *result = lf; return (0); } } /* * Less than ideal, but tells the user whether it failed to load or * the module was not found.
*/ if (foundfile) { /* * Format not recognized or otherwise unloadable. * When loading a module that is statically built into * the kernel EEXIST percolates back up as the return * value. Preserve this so that apps like sysinstall * can recognize this special case and not post bogus * dialog boxes. */ if (error != EEXIST) error = ENOEXEC; } else error = ENOENT; /* Nothing found */ return (error); } int linker_reference_module(const char *modname, struct mod_depend *verinfo, linker_file_t *result) { modlist_t mod; int error; KLD_LOCK(); if ((mod = modlist_lookup2(modname, verinfo)) != NULL) { *result = mod->container; (*result)->refs++; KLD_UNLOCK(); return (0); } error = linker_load_module(NULL, modname, NULL, verinfo, result); KLD_UNLOCK(); return (error); } int linker_release_module(const char *modname, struct mod_depend *verinfo, linker_file_t lf) { modlist_t mod; int error; KLD_LOCK(); if (lf == NULL) { KASSERT(modname != NULL, ("linker_release_module: no file or name")); mod = modlist_lookup2(modname, verinfo); if (mod == NULL) { KLD_UNLOCK(); return (ESRCH); } lf = mod->container; } else KASSERT(modname == NULL && verinfo == NULL, ("linker_release_module: both file and name")); error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL); KLD_UNLOCK(); return (error); } static linker_file_t linker_find_file_by_name(const char *filename) { linker_file_t lf; char *koname; koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); sprintf(koname, "%s.ko", filename); KLD_LOCK_ASSERT(); TAILQ_FOREACH(lf, &linker_files, link) { if (strcmp(lf->filename, koname) == 0) break; if (strcmp(lf->filename, filename) == 0) break; } free(koname, M_LINKER); return (lf); } static linker_file_t linker_find_file_by_id(int fileid) { linker_file_t lf; KLD_LOCK_ASSERT(); TAILQ_FOREACH(lf, &linker_files, link) if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED) break; return (lf); } int linker_file_foreach(linker_predicate_t *predicate, void *context) { linker_file_t lf; int retval = 0; KLD_LOCK(); TAILQ_FOREACH(lf, &linker_files, link) { retval = predicate(lf, context); if (retval != 0) break; } KLD_UNLOCK(); return (retval); } linker_file_t linker_make_file(const char *pathname, linker_class_t lc) { linker_file_t lf; const char *filename; KLD_LOCK_ASSERT(); filename = linker_basename(pathname); KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename)); lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK); if (lf == NULL) return (NULL); lf->refs = 1; lf->userrefs = 0; lf->flags = 0; lf->filename = linker_strdup(filename); LINKER_GET_NEXT_FILE_ID(lf->id); lf->ndeps = 0; lf->deps = NULL; STAILQ_INIT(&lf->common); TAILQ_INIT(&lf->modules); TAILQ_INSERT_TAIL(&linker_files, lf, link); return (lf); } int linker_file_unload(linker_file_t file, int flags) { module_t mod, next; modlist_t ml, nextml; struct common_symbol *cp; int error, i; /* Refuse to unload modules if securelevel raised. */ if (securelevel > 0) return (EPERM); #ifdef MAC error = mac_check_kld_unload(curthread->td_ucred); if (error) return (error); #endif KLD_LOCK_ASSERT(); KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); /* Easy case of just dropping a reference. */ if (file->refs > 1) { file->refs--; return (0); } KLD_DPF(FILE, ("linker_file_unload: file is unloading," " informing modules\n")); /* * Inform any modules associated with this file. 
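 *
 * Each module may veto the unload; the first non-zero return from
 * module_unload() aborts the entire file unload.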
*/ MOD_XLOCK; for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { next = module_getfnext(mod); MOD_XUNLOCK; /* * Give the module a chance to veto the unload. */ if ((error = module_unload(mod, flags)) != 0) { KLD_DPF(FILE, ("linker_file_unload: module %p" " vetoes unload\n", mod)); return (error); } MOD_XLOCK; module_release(mod); } MOD_XUNLOCK; TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) { if (ml->container == file) { TAILQ_REMOVE(&found_modules, ml, link); free(ml, M_LINKER); } } /* * Don't try to run SYSUNINITs if we are unloaded due to a * link error. */ if (file->flags & LINKER_FILE_LINKED) { linker_file_sysuninit(file); linker_file_unregister_sysctls(file); } TAILQ_REMOVE(&linker_files, file, link); if (file->deps) { for (i = 0; i < file->ndeps; i++) linker_file_unload(file->deps[i], flags); free(file->deps, M_LINKER); file->deps = NULL; } while ((cp = STAILQ_FIRST(&file->common)) != NULL) { STAILQ_REMOVE_HEAD(&file->common, link); free(cp, M_LINKER); } LINKER_UNLOAD(file); if (file->filename) { free(file->filename, M_LINKER); file->filename = NULL; } kobj_delete((kobj_t) file, M_LINKER); return (0); } static int linker_file_add_dependency(linker_file_t file, linker_file_t dep) { linker_file_t *newdeps; KLD_LOCK_ASSERT(); newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *), M_LINKER, M_WAITOK | M_ZERO); if (newdeps == NULL) return (ENOMEM); if (file->deps) { bcopy(file->deps, newdeps, file->ndeps * sizeof(linker_file_t *)); free(file->deps, M_LINKER); } file->deps = newdeps; file->deps[file->ndeps] = dep; file->ndeps++; return (0); } /* * Locate a linker set and its contents. This is a helper function to avoid * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **. * This function is used in this file so we can avoid having lots of (void **) * casts. */ int linker_file_lookup_set(linker_file_t file, const char *name, void *firstp, void *lastp, int *countp) { int error, locked; locked = KLD_LOCKED(); if (!locked) KLD_LOCK(); error = LINKER_LOOKUP_SET(file, name, firstp, lastp, countp); if (!locked) KLD_UNLOCK(); return (error); } caddr_t linker_file_lookup_symbol(linker_file_t file, const char *name, int deps) { caddr_t sym; int locked; locked = KLD_LOCKED(); if (!locked) KLD_LOCK(); sym = linker_file_lookup_symbol_internal(file, name, deps); if (!locked) KLD_UNLOCK(); return (sym); } static caddr_t linker_file_lookup_symbol_internal(linker_file_t file, const char *name, int deps) { c_linker_sym_t sym; linker_symval_t symval; caddr_t address; size_t common_size = 0; int i; KLD_LOCK_ASSERT(); KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n", file, name, deps)); if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) { LINKER_SYMBOL_VALUES(file, sym, &symval); if (symval.value == 0) /* * For commons, first look them up in the * dependencies and only allocate space if not found * there. */ common_size = symval.size; else { KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol" ".value=%p\n", symval.value)); return (symval.value); } } if (deps) { for (i = 0; i < file->ndeps; i++) { address = linker_file_lookup_symbol_internal( file->deps[i], name, 0); if (address) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " deps value=%p\n", address)); return (address); } } } if (common_size > 0) { /* * This is a common symbol which was not found in the * dependencies. We maintain a simple common symbol table in * the file object. 
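 *
 * Each entry is a single allocation, laid out as:
 *
 *	[struct common_symbol][common storage][name + '\0']
 *	                      ^ cp->address   ^ cp->name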
*/ struct common_symbol *cp; STAILQ_FOREACH(cp, &file->common, link) { if (strcmp(cp->name, name) == 0) { KLD_DPF(SYM, ("linker_file_lookup_symbol:" " old common value=%p\n", cp->address)); return (cp->address); } } /* * Round the symbol size up to align. */ common_size = (common_size + sizeof(int) - 1) & -sizeof(int); cp = malloc(sizeof(struct common_symbol) + common_size + strlen(name) + 1, M_LINKER, M_WAITOK | M_ZERO); cp->address = (caddr_t)(cp + 1); cp->name = cp->address + common_size; strcpy(cp->name, name); bzero(cp->address, common_size); STAILQ_INSERT_TAIL(&file->common, cp, link); KLD_DPF(SYM, ("linker_file_lookup_symbol: new common" " value=%p\n", cp->address)); return (cp->address); } KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); return (0); } #ifdef DDB /* * DDB Helpers. DDB has to look across multiple files with their own symbol * tables and string tables. * * Note that we do not obey list locking protocols here. We really don't need * DDB to hang because somebody's got the lock held. We'll take the chance * that the files list is inconsistent instead. */ int linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0) return (0); } return (ENOENT); } int linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp) { linker_file_t lf; c_linker_sym_t best, es; u_long diff, bestdiff, off; best = 0; off = (uintptr_t)value; bestdiff = off; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0) continue; if (es != 0 && diff < bestdiff) { best = es; bestdiff = diff; } if (bestdiff == 0) break; } if (best) { *sym = best; *diffp = bestdiff; return (0); } else { *sym = 0; *diffp = off; return (ENOENT); } } int linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval) { linker_file_t lf; TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0) return (0); } return (ENOENT); } #endif /* * Syscalls. */ -/* - * MPSAFE - */ int kern_kldload(struct thread *td, const char *file, int *fileid) { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif const char *kldname, *modname; linker_file_t lf; int error; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0) return (error); /* * If file does not contain a qualified name or any dot in it * (kldname.ko, or kldname.ver.ko) treat it as an interface * name.
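 *
 * Illustrative (hypothetical) examples: "foo.ko", "foo.2.ko" or any
 * name containing '/' is taken as a KLD file name; a bare "foo" is
 * taken as a module name to be resolved through the search path.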
*/ if (index(file, '/') || index(file, '.')) { kldname = file; modname = NULL; } else { kldname = NULL; modname = file; } KLD_LOCK(); error = linker_load_module(kldname, modname, NULL, NULL, &lf); if (error) goto unlock; #ifdef HWPMC_HOOKS pkm.pm_file = lf->filename; pkm.pm_address = (uintptr_t) lf->address; PMC_CALL_HOOK(td, PMC_FN_KLD_LOAD, (void *) &pkm); #endif lf->userrefs++; if (fileid != NULL) *fileid = lf->id; unlock: KLD_UNLOCK(); return (error); } int kldload(struct thread *td, struct kldload_args *uap) { char *pathname = NULL; int error, fileid; td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL); if (error == 0) { error = kern_kldload(td, pathname, &fileid); if (error == 0) td->td_retval[0] = fileid; } free(pathname, M_TEMP); return (error); } -/* - * MPSAFE - */ int kern_kldunload(struct thread *td, int fileid, int flags) { #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; #endif linker_file_t lf; int error = 0; if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0) return (error); KLD_LOCK(); lf = linker_find_file_by_id(fileid); if (lf) { KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); if (lf->userrefs == 0) { /* * XXX: maybe LINKER_UNLOAD_FORCE should override ? */ printf("kldunload: attempt to unload file that was" " loaded by the kernel\n"); error = EBUSY; } else { #ifdef HWPMC_HOOKS /* Save data needed by hwpmc(4) before unloading. */ pkm.pm_address = (uintptr_t) lf->address; pkm.pm_size = lf->size; #endif lf->userrefs--; error = linker_file_unload(lf, flags); if (error) lf->userrefs++; } } else error = ENOENT; #ifdef HWPMC_HOOKS if (error == 0) PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm); #endif KLD_UNLOCK(); return (error); } -/* - * MPSAFE - */ int kldunload(struct thread *td, struct kldunload_args *uap) { return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL)); } -/* - * MPSAFE - */ int kldunloadf(struct thread *td, struct kldunloadf_args *uap) { if (uap->flags != LINKER_UNLOAD_NORMAL && uap->flags != LINKER_UNLOAD_FORCE) return (EINVAL); return (kern_kldunload(td, uap->fileid, uap->flags)); } -/* - * MPSAFE - */ int kldfind(struct thread *td, struct kldfind_args *uap) { char *pathname; const char *filename; linker_file_t lf; int error; #ifdef MAC error = mac_check_kld_stat(td->td_ucred); if (error) return (error); #endif td->td_retval[0] = -1; pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0) goto out; filename = linker_basename(pathname); KLD_LOCK(); lf = linker_find_file_by_name(filename); if (lf) td->td_retval[0] = lf->id; else error = ENOENT; KLD_UNLOCK(); out: free(pathname, M_TEMP); return (error); } -/* - * MPSAFE - */ int kldnext(struct thread *td, struct kldnext_args *uap) { linker_file_t lf; int error = 0; #ifdef MAC error = mac_check_kld_stat(td->td_ucred); if (error) return (error); #endif KLD_LOCK(); if (uap->fileid == 0) lf = TAILQ_FIRST(&linker_files); else { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) { error = ENOENT; goto out; } lf = TAILQ_NEXT(lf, link); } /* Skip partially loaded files. 
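 * (i.e. those without LINKER_FILE_LINKED set)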
*/ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED)) lf = TAILQ_NEXT(lf, link); if (lf) td->td_retval[0] = lf->id; else td->td_retval[0] = 0; out: KLD_UNLOCK(); return (error); } -/* - * MPSAFE - */ int kldstat(struct thread *td, struct kldstat_args *uap) { struct kld_file_stat stat; linker_file_t lf; int error, namelen; /* * Check the version of the user's structure. */ error = copyin(uap->stat, &stat, sizeof(struct kld_file_stat)); if (error) return (error); if (stat.version != sizeof(struct kld_file_stat)) return (EINVAL); #ifdef MAC error = mac_check_kld_stat(td->td_ucred); if (error) return (error); #endif KLD_LOCK(); lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) { KLD_UNLOCK(); return (ENOENT); } namelen = strlen(lf->filename) + 1; if (namelen > MAXPATHLEN) namelen = MAXPATHLEN; bcopy(lf->filename, &stat.name[0], namelen); stat.refs = lf->refs; stat.id = lf->id; stat.address = lf->address; stat.size = lf->size; KLD_UNLOCK(); td->td_retval[0] = 0; return (copyout(&stat, uap->stat, sizeof(struct kld_file_stat))); } -/* - * MPSAFE - */ int kldfirstmod(struct thread *td, struct kldfirstmod_args *uap) { linker_file_t lf; module_t mp; int error = 0; #ifdef MAC error = mac_check_kld_stat(td->td_ucred); if (error) return (error); #endif KLD_LOCK(); lf = linker_find_file_by_id(uap->fileid); if (lf) { MOD_SLOCK; mp = TAILQ_FIRST(&lf->modules); if (mp != NULL) td->td_retval[0] = module_getid(mp); else td->td_retval[0] = 0; MOD_SUNLOCK; } else error = ENOENT; KLD_UNLOCK(); return (error); } -/* - * MPSAFE - */ int kldsym(struct thread *td, struct kldsym_args *uap) { char *symstr = NULL; c_linker_sym_t sym; linker_symval_t symval; linker_file_t lf; struct kld_sym_lookup lookup; int error = 0; #ifdef MAC error = mac_check_kld_stat(td->td_ucred); if (error) return (error); #endif if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0) return (error); if (lookup.version != sizeof(lookup) || uap->cmd != KLDSYM_LOOKUP) return (EINVAL); symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0) goto out; KLD_LOCK(); if (uap->fileid != 0) { lf = linker_find_file_by_id(uap->fileid); if (lf == NULL) error = ENOENT; else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t) symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); } else error = ENOENT; } else { TAILQ_FOREACH(lf, &linker_files, link) { if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 && LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) { lookup.symvalue = (uintptr_t)symval.value; lookup.symsize = symval.size; error = copyout(&lookup, uap->data, sizeof(lookup)); break; } } if (lf == NULL) error = ENOENT; } KLD_UNLOCK(); out: free(symstr, M_TEMP); return (error); } /* * Preloaded module support */ static modlist_t modlist_lookup(const char *name, int ver) { modlist_t mod; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) == 0 && (ver == 0 || mod->version == ver)) return (mod); } return (NULL); } static modlist_t modlist_lookup2(const char *name, struct mod_depend *verinfo) { modlist_t mod, bestmod; int ver; if (verinfo == NULL) return (modlist_lookup(name, 0)); bestmod = NULL; TAILQ_FOREACH(mod, &found_modules, link) { if (strcmp(mod->name, name) != 0) continue; ver = mod->version; if (ver == verinfo->md_ver_preferred) return (mod); if (ver >= verinfo->md_ver_minimum && ver <= verinfo->md_ver_maximum && (bestmod == NULL || ver > 
bestmod->version)) bestmod = mod; } return (bestmod); } static modlist_t modlist_newmodule(const char *modname, int version, linker_file_t container) { modlist_t mod; mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO); if (mod == NULL) panic("no memory for module list"); mod->container = container; mod->name = modname; mod->version = version; TAILQ_INSERT_TAIL(&found_modules, mod, link); return (mod); } static void linker_addmodules(linker_file_t lf, struct mod_metadata **start, struct mod_metadata **stop, int preload) { struct mod_metadata *mp, **mdp; const char *modname; int ver; for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((struct mod_version *)mp->md_data)->mv_version; if (modlist_lookup(modname, ver) != NULL) { printf("module %s already present!\n", modname); /* XXX what can we do? this is a build error. :-( */ continue; } modlist_newmodule(modname, ver, lf); } } static void linker_preload(void *arg) { caddr_t modptr; const char *modname, *nmodname; char *modtype; linker_file_t lf, nlf; linker_class_t lc; int error; linker_file_list_t loaded_files; linker_file_list_t depended_files; struct mod_metadata *mp, *nmp; struct mod_metadata **start, **stop, **mdp, **nmdp; struct mod_depend *verinfo; int nver; int resolves; modlist_t mod; struct sysinit **si_start, **si_stop; TAILQ_INIT(&loaded_files); TAILQ_INIT(&depended_files); TAILQ_INIT(&found_modules); error = 0; modptr = NULL; while ((modptr = preload_search_next_name(modptr)) != NULL) { modname = (char *)preload_search_info(modptr, MODINFO_NAME); modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); if (modname == NULL) { printf("Preloaded module at %p does not have a" " name!\n", modptr); continue; } if (modtype == NULL) { printf("Preloaded module at %p does not have a type!\n", modptr); continue; } if (bootverbose) printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); lf = NULL; TAILQ_FOREACH(lc, &classes, link) { error = LINKER_LINK_PRELOAD(lc, modname, &lf); if (!error) break; lf = NULL; } if (lf) TAILQ_INSERT_TAIL(&loaded_files, lf, loaded); } /* * First get a list of stuff in the kernel. */ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start, &stop, NULL) == 0) linker_addmodules(linker_kernel_file, start, stop, 1); /* * This is a once-off kinky bubble sort to resolve relocation * dependency requirements. */ restart: TAILQ_FOREACH(lf, &loaded_files, loaded) { error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); /* * First, look to see if we would successfully link with this * stuff. */ resolves = 1; /* unless we know otherwise */ if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop) /* it's a self reference */ continue; /* * ok, the module isn't here yet, we * are not finished */ if (modlist_lookup2(modname, verinfo) == NULL) resolves = 0; } } /* * OK, if we found our modules, we can link. So, "provide" * the modules inside and add it to the end of the link order * list. 
*/ if (resolves) { if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; nver = ((struct mod_version *) mp->md_data)->mv_version; if (modlist_lookup(modname, nver) != NULL) { printf("module %s already" " present!\n", modname); TAILQ_REMOVE(&loaded_files, lf, loaded); linker_file_unload(lf, LINKER_UNLOAD_FORCE); /* we changed tailq next ptr */ goto restart; } modlist_newmodule(modname, nver, lf); } } TAILQ_REMOVE(&loaded_files, lf, loaded); TAILQ_INSERT_TAIL(&depended_files, lf, loaded); /* * Since we provided modules, we need to restart the * sort so that the previous files that depend on us * have a chance. Also, we've busted the tailq next * pointer with the REMOVE. */ goto restart; } } /* * At this point, we check to see what could not be resolved.. */ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) { TAILQ_REMOVE(&loaded_files, lf, loaded); printf("KLD file %s is missing dependencies\n", lf->filename); linker_file_unload(lf, LINKER_UNLOAD_FORCE); } /* * We made it. Finish off the linking in the order we determined. */ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) { if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) panic("cannot add dependency"); } lf->userrefs++; /* so we can (try to) kldunload it */ error = linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, NULL); if (!error) { for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; mod = modlist_lookup2(modname, verinfo); /* Don't count self-dependencies */ if (lf == mod->container) continue; mod->container->refs++; error = linker_file_add_dependency(lf, mod->container); if (error) panic("cannot add dependency"); } } /* * Now do relocation etc using the symbol search paths * established by the dependencies */ error = LINKER_LINK_PRELOAD_FINISH(lf); if (error) { TAILQ_REMOVE(&depended_files, lf, loaded); printf("KLD file %s - could not finalize loading\n", lf->filename); linker_file_unload(lf, LINKER_UNLOAD_FORCE); continue; } linker_file_register_modules(lf); if (linker_file_lookup_set(lf, "sysinit_set", &si_start, &si_stop, NULL) == 0) sysinit_add(si_start, si_stop); linker_file_register_sysctls(lf); lf->flags |= LINKER_FILE_LINKED; } /* woohoo! we made it! */ } SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0) /* * Search for a not-loaded module by name. * * Modules may be found in the following locations: * * - preloaded (result is just the module name) - on disk (result is full path * to module) * * If the module name is qualified in any way (contains path, etc.) then we * simply return a copy of it. * * The search path can be manipulated via sysctl. Note that we use the ';' * character as a separator to be consistent with the bootloader. */ static char linker_hintfile[] = "linker.hints"; static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules"; SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path, sizeof(linker_path), "module load search path"); TUNABLE_STR("module_path", linker_path, sizeof(linker_path)); static char *linker_ext_list[] = { "", ".ko", NULL }; /* * Check if the file actually exists, either with or without an extension * listed in the linker_ext_list.
(probably should be generic for the rest of the * kernel) */ static char * linker_lookup_file(const char *path, int pathlen, const char *name, int namelen, struct vattr *vap) { struct nameidata nd; struct thread *td = curthread; /* XXX */ char *result, **cpp, *sep; int error, len, extlen, reclen, flags, vfslocked; enum vtype type; extlen = 0; for (cpp = linker_ext_list; *cpp; cpp++) { len = strlen(*cpp); if (len > extlen) extlen = len; } extlen++; /* trailing '\0' */ sep = (path[pathlen - 1] != '/') ? "/" : ""; reclen = pathlen + strlen(sep) + namelen + extlen + 1; result = malloc(reclen, M_LINKER, M_WAITOK); for (cpp = linker_ext_list; *cpp; cpp++) { snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep, namelen, name, *cpp); /* * Attempt to open the file, and return the path if * we succeed and it's a regular file. */ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, result, td); flags = FREAD; error = vn_open(&nd, &flags, 0, -1); if (error == 0) { vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); type = nd.ni_vp->v_type; if (vap) VOP_GETATTR(nd.ni_vp, vap, td->td_ucred, td); VOP_UNLOCK(nd.ni_vp, 0, td); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); if (type == VREG) return (result); } } free(result, M_LINKER); return (NULL); } #define INT_ALIGN(base, ptr) ptr = \ (base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1)) /* * Look up the KLD which contains the requested module in the "linker.hints" * file. If a version specification is available, then try to find the best * KLD. Otherwise just find the latest one. */ static char * linker_hints_lookup(const char *path, int pathlen, const char *modname, int modnamelen, struct mod_depend *verinfo) { struct thread *td = curthread; /* XXX */ struct ucred *cred = td ? td->td_ucred : NULL; struct nameidata nd; struct vattr vattr, mattr; u_char *hints = NULL; u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep; int error, ival, bestver, *intp, reclen, found, flags, clen, blen; int vfslocked = 0; result = NULL; bestver = found = 0; sep = (path[pathlen - 1] != '/') ?
"/" : ""; reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen + strlen(sep) + 1; pathbuf = malloc(reclen, M_LINKER, M_WAITOK); snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep, linker_hintfile); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, pathbuf, td); flags = FREAD; error = vn_open(&nd, &flags, 0, -1); if (error) goto bad; vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp->v_type != VREG) goto bad; best = cp = NULL; error = VOP_GETATTR(nd.ni_vp, &vattr, cred, td); if (error) goto bad; /* * XXX: we need to limit this number to some reasonable value */ if (vattr.va_size > 100 * 1024) { printf("hints file too large %ld\n", (long)vattr.va_size); goto bad; } hints = malloc(vattr.va_size, M_TEMP, M_WAITOK); if (hints == NULL) goto bad; error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0, UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td); if (error) goto bad; VOP_UNLOCK(nd.ni_vp, 0, td); vn_close(nd.ni_vp, FREAD, cred, td); VFS_UNLOCK_GIANT(vfslocked); nd.ni_vp = NULL; if (reclen != 0) { printf("can't read %d\n", reclen); goto bad; } intp = (int *)hints; ival = *intp++; if (ival != LINKER_HINTS_VERSION) { printf("hints file version mismatch %d\n", ival); goto bad; } bufend = hints + vattr.va_size; recptr = (u_char *)intp; clen = blen = 0; while (recptr < bufend && !found) { intp = (int *)recptr; reclen = *intp++; ival = *intp++; cp = (char *)intp; switch (ival) { case MDT_VERSION: clen = *cp++; if (clen != modnamelen || bcmp(cp, modname, clen) != 0) break; cp += clen; INT_ALIGN(hints, cp); ival = *(int *)cp; cp += sizeof(int); clen = *cp++; if (verinfo == NULL || ival == verinfo->md_ver_preferred) { found = 1; break; } if (ival >= verinfo->md_ver_minimum && ival <= verinfo->md_ver_maximum && ival > bestver) { bestver = ival; best = cp; blen = clen; } break; default: break; } recptr += reclen + sizeof(int); } /* * Finally check if KLD is in the place */ if (found) result = linker_lookup_file(path, pathlen, cp, clen, &mattr); else if (best) result = linker_lookup_file(path, pathlen, best, blen, &mattr); /* * KLD is newer than hints file. What we should do now? */ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >)) printf("warning: KLD '%s' is newer than the linker.hints" " file\n", result); bad: free(pathbuf, M_LINKER); if (hints) free(hints, M_TEMP); if (nd.ni_vp != NULL) { VOP_UNLOCK(nd.ni_vp, 0, td); vn_close(nd.ni_vp, FREAD, cred, td); VFS_UNLOCK_GIANT(vfslocked); } /* * If nothing found or hints is absent - fallback to the old * way by using "kldname[.ko]" as module name. */ if (!found && !bestver && result == NULL) result = linker_lookup_file(path, pathlen, modname, modnamelen, NULL); return (result); } /* * Lookup KLD which contains requested module in the all directories. */ static char * linker_search_module(const char *modname, int modnamelen, struct mod_depend *verinfo) { char *cp, *ep, *result; /* * traverse the linker path */ for (cp = linker_path; *cp; cp = ep + 1) { /* find the end of this component */ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++); result = linker_hints_lookup(cp, ep - cp, modname, modnamelen, verinfo); if (result != NULL) return (result); if (*ep == 0) break; } return (NULL); } /* * Search for module in all directories listed in the linker_path. */ static char * linker_search_kld(const char *name) { char *cp, *ep, *result; int len; /* qualified at all? 
*/ if (index(name, '/')) return (linker_strdup(name)); /* traverse the linker path */ len = strlen(name); for (ep = linker_path; *ep; ep++) { cp = ep; /* find the end of this component */ for (; *ep != 0 && *ep != ';'; ep++); result = linker_lookup_file(cp, ep - cp, name, len, NULL); if (result != NULL) return (result); } return (NULL); } static const char * linker_basename(const char *path) { const char *filename; filename = rindex(path, '/'); if (filename == NULL) return path; if (filename[1]) filename++; return (filename); } #ifdef HWPMC_HOOKS struct hwpmc_context { int nobjects; int nmappings; struct pmckern_map_in *kobase; }; static int linker_hwpmc_list_object(linker_file_t lf, void *arg) { struct hwpmc_context *hc; hc = arg; /* If we run out of mappings, fail. */ if (hc->nobjects >= hc->nmappings) return (1); /* Save the info for this linker file. */ hc->kobase[hc->nobjects].pm_file = lf->filename; hc->kobase[hc->nobjects].pm_address = (uintptr_t)lf->address; hc->nobjects++; return (0); } /* * Inform hwpmc about the set of kernel modules currently loaded. */ void * linker_hwpmc_list_objects(void) { struct hwpmc_context hc; hc.nmappings = 15; /* a reasonable default */ retry: /* allocate nmappings+1 entries */ MALLOC(hc.kobase, struct pmckern_map_in *, (hc.nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER, M_WAITOK | M_ZERO); hc.nobjects = 0; if (linker_file_foreach(linker_hwpmc_list_object, &hc) != 0) { hc.nmappings = hc.nobjects; FREE(hc.kobase, M_LINKER); goto retry; } KASSERT(hc.nobjects > 0, ("linker_hwpmc_list_objects: no kernel " "objects?")); /* The last entry of the malloced area consists of all zeros. */ KASSERT(hc.kobase[hc.nobjects].pm_file == NULL, ("linker_hwpmc_list_objects: last object not NULL")); return ((void *)hc.kobase); } #endif /* * Find a file which contains the given module and load it; if "parent" is * not NULL, register a reference to it. */ static int linker_load_module(const char *kldname, const char *modname, struct linker_file *parent, struct mod_depend *verinfo, struct linker_file **lfpp) { linker_file_t lfdep; const char *filename; char *pathname; int error; KLD_LOCK_ASSERT(); if (modname == NULL) { /* * We have to load a KLD */ KASSERT(verinfo == NULL, ("linker_load_module: verinfo" " is not NULL")); pathname = linker_search_kld(kldname); } else { if (modlist_lookup2(modname, verinfo) != NULL) return (EEXIST); if (kldname != NULL) pathname = linker_strdup(kldname); else if (rootvnode == NULL) pathname = NULL; else /* * Need to find a KLD with the required module */ pathname = linker_search_module(modname, strlen(modname), verinfo); } if (pathname == NULL) return (ENOENT); /* * Can't load more than one file with the same basename XXX: * Actually it should be possible to have multiple KLDs with * the same basename but different path because they can * provide different versions of the same modules. */ filename = linker_basename(pathname); if (linker_find_file_by_name(filename)) error = EEXIST; else do { error = linker_load_file(pathname, &lfdep); if (error) break; if (modname && verinfo && modlist_lookup2(modname, verinfo) == NULL) { linker_file_unload(lfdep, LINKER_UNLOAD_FORCE); error = ENOENT; break; } if (parent) { error = linker_file_add_dependency(parent, lfdep); if (error) break; } if (lfpp) *lfpp = lfdep; } while (0); free(pathname, M_LINKER); return (error); } /* * This routine is responsible for finding dependencies of userland initiated * kldload(2)'s of files.
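 *
 * In outline: the file first gains a dependency on the kernel itself,
 * its MDT_VERSION records are checked against modules already present,
 * and each MDT_DEPEND record is then resolved, loading further KLDs
 * as needed.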
*/ int linker_load_dependencies(linker_file_t lf) { linker_file_t lfdep; struct mod_metadata **start, **stop, **mdp, **nmdp; struct mod_metadata *mp, *nmp; struct mod_depend *verinfo; modlist_t mod; const char *modname, *nmodname; int ver, error = 0, count; /* * All files are dependent on /kernel. */ KLD_LOCK_ASSERT(); if (linker_kernel_file) { linker_kernel_file->refs++; error = linker_file_add_dependency(lf, linker_kernel_file); if (error) return (error); } if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop, &count) != 0) return (0); for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_VERSION) continue; modname = mp->md_cval; ver = ((struct mod_version *)mp->md_data)->mv_version; mod = modlist_lookup(modname, ver); if (mod != NULL) { printf("interface %s.%d already present in the KLD" " '%s'!\n", modname, ver, mod->container->filename); return (EEXIST); } } for (mdp = start; mdp < stop; mdp++) { mp = *mdp; if (mp->md_type != MDT_DEPEND) continue; modname = mp->md_cval; verinfo = mp->md_data; nmodname = NULL; for (nmdp = start; nmdp < stop; nmdp++) { nmp = *nmdp; if (nmp->md_type != MDT_VERSION) continue; nmodname = nmp->md_cval; if (strcmp(modname, nmodname) == 0) break; } if (nmdp < stop)/* early exit, it's a self reference */ continue; mod = modlist_lookup2(modname, verinfo); if (mod) { /* woohoo, it's loaded already */ lfdep = mod->container; lfdep->refs++; error = linker_file_add_dependency(lf, lfdep); if (error) break; continue; } error = linker_load_module(NULL, modname, lf, verinfo, NULL); if (error) { printf("KLD %s: depends on %s - not available\n", lf->filename, modname); break; } } if (error) return (error); linker_addmodules(lf, start, stop, 0); return (error); } static int sysctl_kern_function_list_iterate(const char *name, void *opaque) { struct sysctl_req *req; req = opaque; return (SYSCTL_OUT(req, name, strlen(name) + 1)); } /* * Export a nul-separated, double-nul-terminated list of all function names * in the kernel. */ static int sysctl_kern_function_list(SYSCTL_HANDLER_ARGS) { linker_file_t lf; int error; #ifdef MAC error = mac_check_kld_stat(req->td->td_ucred); if (error) return (error); #endif error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); KLD_LOCK(); TAILQ_FOREACH(lf, &linker_files, link) { error = LINKER_EACH_FUNCTION_NAME(lf, sysctl_kern_function_list_iterate, req); if (error) { KLD_UNLOCK(); return (error); } } KLD_UNLOCK(); return (SYSCTL_OUT(req, "", 1)); } SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLFLAG_RD, NULL, 0, sysctl_kern_function_list, "", "kernel function list"); Index: head/sys/kern/kern_module.c =================================================================== --- head/sys/kern/kern_module.c (revision 167210) +++ head/sys/kern/kern_module.c (revision 167211) @@ -1,506 +1,491 @@ /*- * Copyright (c) 1997 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MODULE, "module", "module data structures"); typedef TAILQ_HEAD(, module) modulelist_t; struct module { TAILQ_ENTRY(module) link; /* chain together all modules */ TAILQ_ENTRY(module) flink; /* all modules in a file */ struct linker_file *file; /* file which contains this module */ int refs; /* reference count */ int id; /* unique id number */ char *name; /* module name */ modeventhand_t handler; /* event handler */ void *arg; /* argument for handler */ modspecific_t data; /* module specific data */ }; #define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) static modulelist_t modules; struct sx modules_sx; static int nextid = 1; static void module_shutdown(void *, int); static int modevent_nop(module_t mod, int what, void *arg) { switch(what) { case MOD_LOAD: return (0); case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } } static void module_init(void *arg) { sx_init(&modules_sx, "module subsystem sx lock"); TAILQ_INIT(&modules); EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0) static void module_shutdown(void *arg1, int arg2) { module_t mod; if (arg2 & RB_NOSYNC) return; mtx_lock(&Giant); MOD_SLOCK; TAILQ_FOREACH(mod, &modules, link) MOD_EVENT(mod, MOD_SHUTDOWN); MOD_SUNLOCK; mtx_unlock(&Giant); } void module_register_init(const void *arg) { const moduledata_t *data = (const moduledata_t *)arg; int error; module_t mod; mtx_lock(&Giant); MOD_SLOCK; mod = module_lookupbyname(data->name); if (mod == NULL) panic("module_register_init: module named %s not found\n", data->name); MOD_SUNLOCK; error = MOD_EVENT(mod, MOD_LOAD); if (error) { MOD_EVENT(mod, MOD_UNLOAD); MOD_XLOCK; module_release(mod); MOD_XUNLOCK; printf("module_register_init: MOD_LOAD (%s, %p, %p) error" " %d\n", data->name, (void *)data->evhand, data->priv, error); } mtx_unlock(&Giant); } int module_register(const moduledata_t *data, linker_file_t container) { size_t namelen; module_t newmod; MOD_XLOCK; newmod = module_lookupbyname(data->name); if (newmod != NULL) { MOD_XUNLOCK; printf("module_register: module %s already exists!\n", data->name); return (EEXIST); } namelen = strlen(data->name) + 1; newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK); if (newmod == NULL) { MOD_XUNLOCK; return (ENOMEM); } newmod->refs = 1; newmod->id = nextid++; newmod->name = (char *)(newmod + 1); strcpy(newmod->name, data->name); newmod->handler = data->evhand ? 
data->evhand : modevent_nop; newmod->arg = data->priv; bzero(&newmod->data, sizeof(newmod->data)); TAILQ_INSERT_TAIL(&modules, newmod, link); if (container) TAILQ_INSERT_TAIL(&container->modules, newmod, flink); newmod->file = container; MOD_XUNLOCK; return (0); } void module_reference(module_t mod) { MOD_XLOCK_ASSERT; MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); mod->refs++; } void module_release(module_t mod) { MOD_XLOCK_ASSERT; if (mod->refs <= 0) panic("module_release: bad reference count"); MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); mod->refs--; if (mod->refs == 0) { TAILQ_REMOVE(&modules, mod, link); if (mod->file) TAILQ_REMOVE(&mod->file->modules, mod, flink); MOD_XUNLOCK; free(mod, M_MODULE); MOD_XLOCK; } } module_t module_lookupbyname(const char *name) { module_t mod; int err; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) { err = strcmp(mod->name, name); if (err == 0) return (mod); } return (NULL); } module_t module_lookupbyid(int modid) { module_t mod; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) if (mod->id == modid) return(mod); return (NULL); } int module_unload(module_t mod, int flags) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_QUIESCE); if (error == EOPNOTSUPP || error == EINVAL) error = 0; if (error == 0 || flags == LINKER_UNLOAD_FORCE) error = MOD_EVENT(mod, MOD_UNLOAD); mtx_unlock(&Giant); return (error); } int module_getid(module_t mod) { MOD_LOCK_ASSERT; return (mod->id); } module_t module_getfnext(module_t mod) { MOD_LOCK_ASSERT; return (TAILQ_NEXT(mod, flink)); } void module_setspecific(module_t mod, modspecific_t *datap) { MOD_XLOCK_ASSERT; mod->data = *datap; } linker_file_t module_file(module_t mod) { return (mod->file); } /* * Syscalls. */ -/* - * MPSAFE - */ int modnext(struct thread *td, struct modnext_args *uap) { module_t mod; int error = 0; td->td_retval[0] = -1; MOD_SLOCK; if (uap->modid == 0) { mod = TAILQ_FIRST(&modules); if (mod) td->td_retval[0] = mod->id; else error = ENOENT; goto done2; } mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; goto done2; } if (TAILQ_NEXT(mod, link)) td->td_retval[0] = TAILQ_NEXT(mod, link)->id; else td->td_retval[0] = 0; done2: MOD_SUNLOCK; return (error); } -/* - * MPSAFE - */ int modfnext(struct thread *td, struct modfnext_args *uap) { module_t mod; int error; td->td_retval[0] = -1; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; } else { error = 0; if (TAILQ_NEXT(mod, flink)) td->td_retval[0] = TAILQ_NEXT(mod, flink)->id; else td->td_retval[0] = 0; } MOD_SUNLOCK; return (error); } struct module_stat_v1 { int version; /* set to sizeof(struct module_stat) */ char name[MAXMODNAME]; int refs; int id; }; -/* - * MPSAFE - */ int modstat(struct thread *td, struct modstat_args *uap) { module_t mod; modspecific_t data; int error = 0; int id, namelen, refs, version; struct module_stat *stat; char *name; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; data = mod->data; MOD_SUNLOCK; stat = uap->stat; /* * Check the version of the user's structure. 
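 * Userland presets stat->version to the size of the structure it was
 * compiled against, so both the old v1 layout (without the data member)
 * and the current one can be served.  An illustrative sketch of the
 * documented modstat(2) usage (not code from this file):
 *
 *	#include <sys/param.h>
 *	#include <sys/module.h>
 *
 *	struct module_stat ms;
 *
 *	ms.version = sizeof(ms);
 *	if (modstat(modid, &ms) == -1)
 *		err(1, "modstat");
 *
 * Any other version value is rejected below with EINVAL.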
*/ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct module_stat_v1) && version != sizeof(struct module_stat)) return (EINVAL); namelen = strlen(mod->name) + 1; if (namelen > MAXMODNAME) namelen = MAXMODNAME; if ((error = copyout(name, &stat->name[0], namelen)) != 0) return (error); if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0) return (error); if ((error = copyout(&id, &stat->id, sizeof(int))) != 0) return (error); /* * >v1 stat includes module data. */ if (version == sizeof(struct module_stat)) if ((error = copyout(&data, &stat->data, sizeof(data))) != 0) return (error); td->td_retval[0] = 0; return (error); } -/* - * MPSAFE - */ int modfind(struct thread *td, struct modfind_args *uap) { int error = 0; char name[MAXMODNAME]; module_t mod; if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0) return (error); MOD_SLOCK; mod = module_lookupbyname(name); if (mod == NULL) error = ENOENT; else td->td_retval[0] = module_getid(mod); MOD_SUNLOCK; return (error); } #ifdef COMPAT_IA32 #include #include #include #include typedef union modspecific32 { int intval; u_int32_t uintval; int longval; u_int32_t ulongval; } modspecific32_t; struct module_stat32 { int version; char name[MAXMODNAME]; int refs; int id; modspecific32_t data; }; -/* - * MPSAFE - */ int freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap) { module_t mod; modspecific32_t data32; int error = 0; int id, namelen, refs, version; struct module_stat32 *stat32; char *name; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; CP(mod->data, data32, intval); CP(mod->data, data32, uintval); CP(mod->data, data32, longval); CP(mod->data, data32, ulongval); MOD_SUNLOCK; stat32 = uap->stat; if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0) return (error); if (version != sizeof(struct module_stat_v1) && version != sizeof(struct module_stat32)) return (EINVAL); namelen = strlen(mod->name) + 1; if (namelen > MAXMODNAME) namelen = MAXMODNAME; if ((error = copyout(name, &stat32->name[0], namelen)) != 0) return (error); if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0) return (error); if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0) return (error); /* * >v1 stat includes module data. */ if (version == sizeof(struct module_stat32)) if ((error = copyout(&data32, &stat32->data, sizeof(data32))) != 0) return (error); td->td_retval[0] = 0; return (error); } #endif Index: head/sys/kern/kern_ntptime.c =================================================================== --- head/sys/kern/kern_ntptime.c (revision 167210) +++ head/sys/kern/kern_ntptime.c (revision 167211) @@ -1,978 +1,972 @@ /*- *********************************************************************** * * * Copyright (c) David L. Mills 1993-2001 * * * * Permission to use, copy, modify, and distribute this software and * * its documentation for any purpose and without fee is hereby * * granted, provided that the above copyright notice appears in all * * copies and that both the copyright notice and this permission * * notice appear in supporting documentation, and that the name * * University of Delaware not be used in advertising or publicity * * pertaining to distribution of the software without specific, * * written prior permission. The University of Delaware makes no * * representations about the suitability this software for any * * purpose. 
It is provided "as is" without express or implied * * warranty. * * * **********************************************************************/ /* * Adapted from the original sources for FreeBSD and timecounters by: * Poul-Henning Kamp . * * The 32bit version of the "LP" macros seems a bit past its "sell by" * date so I have retained only the 64bit version and included it directly * in this file. * * Only minor changes done to interface with the timecounters over in * sys/kern/kern_clock.c. Some of the comments below may be (even more) * confusing and/or plain wrong in that context. */ #include __FBSDID("$FreeBSD$"); #include "opt_ntp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Single-precision macros for 64-bit machines */ typedef int64_t l_fp; #define L_ADD(v, u) ((v) += (u)) #define L_SUB(v, u) ((v) -= (u)) #define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32) #define L_NEG(v) ((v) = -(v)) #define L_RSHIFT(v, n) \ do { \ if ((v) < 0) \ (v) = -(-(v) >> (n)); \ else \ (v) = (v) >> (n); \ } while (0) #define L_MPY(v, a) ((v) *= (a)) #define L_CLR(v) ((v) = 0) #define L_ISNEG(v) ((v) < 0) #define L_LINT(v, a) ((v) = (int64_t)(a) << 32) #define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32) /* * Generic NTP kernel interface * * These routines constitute the Network Time Protocol (NTP) interfaces * for user and daemon application programs. The ntp_gettime() routine * provides the time, maximum error (synch distance) and estimated error * (dispersion) to client user application programs. The ntp_adjtime() * routine is used by the NTP daemon to adjust the system clock to an * externally derived time. The time offset and related variables set by * this routine are used by other routines in this module to adjust the * phase and frequency of the clock discipline loop which controls the * system clock. * * When the kernel time is reckoned directly in nanoseconds (NTP_NANO * defined), the time at each tick interrupt is derived directly from * the kernel time variable. When the kernel time is reckoned in * microseconds, (NTP_NANO undefined), the time is derived from the * kernel time variable together with a variable representing the * leftover nanoseconds at the last tick interrupt. In either case, the * current nanosecond time is reckoned from these values plus an * interpolated value derived by the clock routines in another * architecture-specific module. The interpolation can use either a * dedicated counter or a processor cycle counter (PCC) implemented in * some architectures. * * Note that all routines must run at priority splclock or higher. */ /* * Phase/frequency-lock loop (PLL/FLL) definitions * * The nanosecond clock discipline uses two variable types, time * variables and frequency variables. Both types are represented as 64- * bit fixed-point quantities with the decimal point between two 32-bit * halves. On a 32-bit machine, each half is represented as a single * word and mathematical operations are done using multiple-precision * arithmetic. On a 64-bit machine, ordinary computer arithmetic is * used. * * A time variable is a signed 64-bit fixed-point number in ns and * fraction. It represents the remaining time offset to be amortized * over succeeding tick interrupts. The maximum time offset is about * 0.5 s and the resolution is about 2.3e-10 ns. 
* * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |s s s| ns | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | fraction | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * A frequency variable is a signed 64-bit fixed-point number in ns/s * and fraction. It represents the ns and fraction to be added to the * kernel time variable at each second. The maximum frequency offset is * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s. * * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |s s s s s s s s s s s s s| ns/s | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | fraction | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ /* * The following variables establish the state of the PLL/FLL and the * residual time and frequency offset of the local clock. */ #define SHIFT_PLL 4 /* PLL loop gain (shift) */ #define SHIFT_FLL 2 /* FLL loop gain (shift) */ static int time_state = TIME_OK; /* clock state */ static int time_status = STA_UNSYNC; /* clock status bits */ static long time_tai; /* TAI offset (s) */ static long time_monitor; /* last time offset scaled (ns) */ static long time_constant; /* poll interval (shift) (s) */ static long time_precision = 1; /* clock precision (ns) */ static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */ static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */ static long time_reftime; /* time at last adjustment (s) */ static l_fp time_offset; /* time offset (ns) */ static l_fp time_freq; /* frequency offset (ns/s) */ static l_fp time_adj; /* tick adjust (ns/s) */ static int64_t time_adjtime; /* correction from adjtime(2) (usec) */ #ifdef PPS_SYNC /* * The following variables are used when a pulse-per-second (PPS) signal * is available and connected via a modem control lead. They establish * the engineering parameters of the clock discipline loop when * controlled by the PPS signal. 
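 *
 * Editor's note: the *_FAVG constants below are log2 interval lengths,
 * so pps_shift at PPS_FAVG (2) yields a 1 << 2 = 4 s calibration
 * interval, while PPS_FAVGMAX (15) allows intervals up to 32768 s.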
*/ #define PPS_FAVG 2 /* min freq avg interval (s) (shift) */ #define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */ #define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */ #define PPS_PAVG 4 /* phase avg interval (s) (shift) */ #define PPS_VALID 120 /* PPS signal watchdog max (s) */ #define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */ #define PPS_POPCORN 2 /* popcorn spike threshold (shift) */ static struct timespec pps_tf[3]; /* phase median filter */ static l_fp pps_freq; /* scaled frequency offset (ns/s) */ static long pps_fcount; /* frequency accumulator */ static long pps_jitter; /* nominal jitter (ns) */ static long pps_stabil; /* nominal stability (scaled ns/s) */ static long pps_lastsec; /* time at last calibration (s) */ static int pps_valid; /* signal watchdog counter */ static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */ static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */ static int pps_intcnt; /* wander counter */ /* * PPS signal quality monitors */ static long pps_calcnt; /* calibration intervals */ static long pps_jitcnt; /* jitter limit exceeded */ static long pps_stbcnt; /* stability limit exceeded */ static long pps_errcnt; /* calibration errors */ #endif /* PPS_SYNC */ /* * End of phase/frequency-lock loop (PLL/FLL) definitions */ static void ntp_init(void); static void hardupdate(long offset); static void ntp_gettime1(struct ntptimeval *ntvp); static void ntp_gettime1(struct ntptimeval *ntvp) { struct timespec atv; /* nanosecond time */ GIANT_REQUIRED; nanotime(&atv); ntvp->time.tv_sec = atv.tv_sec; ntvp->time.tv_nsec = atv.tv_nsec; ntvp->maxerror = time_maxerror; ntvp->esterror = time_esterror; ntvp->tai = time_tai; ntvp->time_state = time_state; /* * Status word error decode. If any of these conditions occur, * an error is returned, instead of the status word. Most * applications will care only about the fact that the system clock * may not be trusted, not about the details. * * Hardware or software error */ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || /* * PPS signal lost when either time or frequency synchronization * requested */ (time_status & (STA_PPSFREQ | STA_PPSTIME) && !(time_status & STA_PPSSIGNAL)) || /* * PPS jitter exceeded when time synchronization requested */ (time_status & STA_PPSTIME && time_status & STA_PPSJITTER) || /* * PPS wander exceeded or calibration error when frequency * synchronization requested */ (time_status & STA_PPSFREQ && time_status & (STA_PPSWANDER | STA_PPSERROR))) ntvp->time_state = TIME_ERROR; } /* * ntp_gettime() - NTP user application interface * * See the timex.h header file for synopsis and API description. Note * that the TAI offset is returned in the ntptimeval.tai structure * member.
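 *
 * A minimal userland consumer might look like this (an illustrative
 * sketch of the ntp_gettime(2) interface, not code from this file):
 *
 *	#include <sys/timex.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	struct ntptimeval ntv;
 *
 *	if (ntp_gettime(&ntv) == TIME_ERROR)
 *		warnx("clock not synchronized");
 *	printf("%ld.%09ld +/- %ld us\n", (long)ntv.time.tv_sec,
 *	    ntv.time.tv_nsec, ntv.maxerror);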
*/ #ifndef _SYS_SYSPROTO_H_ struct ntp_gettime_args { struct ntptimeval *ntvp; }; #endif /* ARGSUSED */ int ntp_gettime(struct thread *td, struct ntp_gettime_args *uap) { struct ntptimeval ntv; mtx_lock(&Giant); ntp_gettime1(&ntv); mtx_unlock(&Giant); td->td_retval[0] = ntv.time_state; return (copyout(&ntv, uap->ntvp, sizeof(ntv))); } static int ntp_sysctl(SYSCTL_HANDLER_ARGS) { struct ntptimeval ntv; /* temporary structure */ ntp_gettime1(&ntv); return (sysctl_handle_opaque(oidp, &ntv, sizeof(ntv), req)); } SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, ""); SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, sizeof(struct ntptimeval), ntp_sysctl, "S,ntptimeval", ""); #ifdef PPS_SYNC SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, ""); SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, ""); SYSCTL_INT(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD, &time_monitor, 0, ""); SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", ""); SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", ""); #endif /* * ntp_adjtime() - NTP daemon application interface * * See the timex.h header file for synopsis and API description. Note * that the timex.constant structure member has a dual purpose to set * the time constant and to set the TAI offset. */ #ifndef _SYS_SYSPROTO_H_ struct ntp_adjtime_args { struct timex *tp; }; #endif -/* - * MPSAFE - */ int ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap) { struct timex ntv; /* temporary structure */ long freq; /* frequency (ns/s) */ int modes; /* mode bits from structure */ int s; /* caller priority */ int error; error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv)); if (error) return (error); /* * Update selected clock variables - only the superuser can * change anything. Note that there is no error checking here on * the assumption the superuser should know what it is doing. * Note that either the time constant or the TAI offset is loaded * from the ntv.constant member, depending on the mode bits. If * the STA_PLL bit in the status word is cleared, the state and * status words are reset to the initial values at boot. */ mtx_lock(&Giant); modes = ntv.modes; if (modes) error = priv_check(td, PRIV_NTP_ADJTIME); if (error) goto done2; s = splclock(); if (modes & MOD_MAXERROR) time_maxerror = ntv.maxerror; if (modes & MOD_ESTERROR) time_esterror = ntv.esterror; if (modes & MOD_STATUS) { if (time_status & STA_PLL && !(ntv.status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; #ifdef PPS_SYNC pps_shift = PPS_FAVG; #endif /* PPS_SYNC */ } time_status &= STA_RONLY; time_status |= ntv.status & ~STA_RONLY; } if (modes & MOD_TIMECONST) { if (ntv.constant < 0) time_constant = 0; else if (ntv.constant > MAXTC) time_constant = MAXTC; else time_constant = ntv.constant; } if (modes & MOD_TAI) { if (ntv.constant > 0) /* XXX zero & negative numbers ?
*/ time_tai = ntv.constant; } #ifdef PPS_SYNC if (modes & MOD_PPSMAX) { if (ntv.shift < PPS_FAVG) pps_shiftmax = PPS_FAVG; else if (ntv.shift > PPS_FAVGMAX) pps_shiftmax = PPS_FAVGMAX; else pps_shiftmax = ntv.shift; } #endif /* PPS_SYNC */ if (modes & MOD_NANO) time_status |= STA_NANO; if (modes & MOD_MICRO) time_status &= ~STA_NANO; if (modes & MOD_CLKB) time_status |= STA_CLK; if (modes & MOD_CLKA) time_status &= ~STA_CLK; if (modes & MOD_FREQUENCY) { freq = (ntv.freq * 1000LL) >> 16; if (freq > MAXFREQ) L_LINT(time_freq, MAXFREQ); else if (freq < -MAXFREQ) L_LINT(time_freq, -MAXFREQ); else { /* * ntv.freq is [PPM * 2^16] = [us/s * 2^16] * time_freq is [ns/s * 2^32] */ time_freq = ntv.freq * 1000LL * 65536LL; } #ifdef PPS_SYNC pps_freq = time_freq; #endif /* PPS_SYNC */ } if (modes & MOD_OFFSET) { if (time_status & STA_NANO) hardupdate(ntv.offset); else hardupdate(ntv.offset * 1000); } /* * Retrieve all clock variables. Note that the TAI offset is * returned only by ntp_gettime(); */ if (time_status & STA_NANO) ntv.offset = L_GINT(time_offset); else ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */ ntv.freq = L_GINT((time_freq / 1000LL) << 16); ntv.maxerror = time_maxerror; ntv.esterror = time_esterror; ntv.status = time_status; ntv.constant = time_constant; if (time_status & STA_NANO) ntv.precision = time_precision; else ntv.precision = time_precision / 1000; ntv.tolerance = MAXFREQ * SCALE_PPM; #ifdef PPS_SYNC ntv.shift = pps_shift; ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16); if (time_status & STA_NANO) ntv.jitter = pps_jitter; else ntv.jitter = pps_jitter / 1000; ntv.stabil = pps_stabil; ntv.calcnt = pps_calcnt; ntv.errcnt = pps_errcnt; ntv.jitcnt = pps_jitcnt; ntv.stbcnt = pps_stbcnt; #endif /* PPS_SYNC */ splx(s); error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv)); if (error) goto done2; /* * Status word error decode. See comments in * ntp_gettime() routine. */ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) || (time_status & (STA_PPSFREQ | STA_PPSTIME) && !(time_status & STA_PPSSIGNAL)) || (time_status & STA_PPSTIME && time_status & STA_PPSJITTER) || (time_status & STA_PPSFREQ && time_status & (STA_PPSWANDER | STA_PPSERROR))) { td->td_retval[0] = TIME_ERROR; } else { td->td_retval[0] = time_state; } done2: mtx_unlock(&Giant); return (error); } /* * second_overflow() - called after ntp_tick_adjust() * * This routine is ordinarily called immediately following the above * routine ntp_tick_adjust(). While these two routines are normally * combined, they are separated here only for the purposes of * simulation. */ void ntp_update_second(int64_t *adjustment, time_t *newsec) { int tickrate; l_fp ftemp; /* 32/64-bit temporary */ /* * On rollover of the second both the nanosecond and microsecond * clocks are updated and the state machine cranked as * necessary. The phase adjustment to be used for the next * second is calculated and the maximum error is increased by * the tolerance. */ time_maxerror += MAXFREQ / 1000; /* * Leap second processing. If in leap-insert state at * the end of the day, the system clock is set back one * second; if in leap-delete state, the system clock is * set ahead one second. The nano_time() routine or * external clock driver will insure that reported time * is always monotonic. */ switch (time_state) { /* * No warning. */ case TIME_OK: if (time_status & STA_INS) time_state = TIME_INS; else if (time_status & STA_DEL) time_state = TIME_DEL; break; /* * Insert second 23:59:60 following second * 23:59:59. 
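 * As a worked example (editor's note): at the end of the UTC day
 * (*newsec) % 86400 reaches 0, so *newsec is stepped back one second
 * below; the final second repeats, which NTP-aware consumers report as
 * 23:59:60, and the TAI offset grows by one.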
*/ case TIME_INS: if (!(time_status & STA_INS)) time_state = TIME_OK; else if ((*newsec) % 86400 == 0) { (*newsec)--; time_state = TIME_OOP; time_tai++; } break; /* * Delete second 23:59:59. */ case TIME_DEL: if (!(time_status & STA_DEL)) time_state = TIME_OK; else if (((*newsec) + 1) % 86400 == 0) { (*newsec)++; time_tai--; time_state = TIME_WAIT; } break; /* * Insert second in progress. */ case TIME_OOP: time_state = TIME_WAIT; break; /* * Wait for status bits to clear. */ case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; } /* * Compute the total time adjustment for the next second * in ns. The offset is reduced by a factor depending on * whether the PPS signal is operating. Note that the * value is in effect scaled by the clock frequency, * since the adjustment is added at each tick interrupt. */ ftemp = time_offset; #ifdef PPS_SYNC /* XXX even if PPS signal dies we should finish adjustment ? */ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) L_RSHIFT(ftemp, pps_shift); else L_RSHIFT(ftemp, SHIFT_PLL + time_constant); #else L_RSHIFT(ftemp, SHIFT_PLL + time_constant); #endif /* PPS_SYNC */ time_adj = ftemp; L_SUB(time_offset, ftemp); L_ADD(time_adj, time_freq); /* * Apply any correction from adjtime(2). If more than one second * off we slew at a rate of 5ms/s (5000 PPM) else 500us/s (500PPM) * until the last second is slewed the final < 500 usecs. */ if (time_adjtime != 0) { if (time_adjtime > 1000000) tickrate = 5000; else if (time_adjtime < -1000000) tickrate = -5000; else if (time_adjtime > 500) tickrate = 500; else if (time_adjtime < -500) tickrate = -500; else tickrate = time_adjtime; time_adjtime -= tickrate; L_LINT(ftemp, tickrate * 1000); L_ADD(time_adj, ftemp); } *adjustment = time_adj; #ifdef PPS_SYNC if (pps_valid > 0) pps_valid--; else time_status &= ~STA_PPSSIGNAL; #endif /* PPS_SYNC */ } /* * ntp_init() - initialize variables and structures * * This routine must be called after the kernel variables hz and tick * are set or changed and before the next tick interrupt. In this * particular implementation, these values are assumed set elsewhere in * the kernel. The design allows the clock frequency and tick interval * to be changed while the system is running. So, this routine should * probably be integrated with the code that does that. */ static void ntp_init() { /* * The following variables are initialized only at startup. Only * those structures not cleared by the compiler need to be * initialized, and these only in the simulator. In the actual * kernel, any nonzero values here will quickly evaporate. */ L_CLR(time_offset); L_CLR(time_freq); #ifdef PPS_SYNC pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0; pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0; pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0; pps_fcount = 0; L_CLR(pps_freq); #endif /* PPS_SYNC */ } SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL) /* * hardupdate() - local clock update * * This routine is called by ntp_adjtime() to update the local clock * phase and frequency. The implementation is of an adaptive-parameter, * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new * time and frequency offset estimates for each call. If the kernel PPS * discipline code is configured (PPS_SYNC), the PPS signal itself * determines the new time offset, instead of the calling argument. * Presumably, calls to ntp_adjtime() occur only when the caller * believes the local clock is valid within some bound (+-128 ms with * NTP). 
If the caller's time is far different than the PPS time, an * argument will ensue, and it's not clear who will lose. * * For uncompensated quartz crystal oscillators and nominal update * intervals less than 256 s, operation should be in phase-lock mode, * where the loop is disciplined to phase. For update intervals greater * than 1024 s, operation should be in frequency-lock mode, where the * loop is disciplined to frequency. Between 256 s and 1024 s, the mode * is selected by the STA_MODE status bit. */ static void hardupdate(offset) long offset; /* clock offset (ns) */ { long mtemp; l_fp ftemp; /* * Select how the phase is to be controlled and from which * source. If the PPS signal is present and enabled to * discipline the time, the PPS offset is used; otherwise, the * argument offset is used. */ if (!(time_status & STA_PLL)) return; if (!(time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)) { if (offset > MAXPHASE) time_monitor = MAXPHASE; else if (offset < -MAXPHASE) time_monitor = -MAXPHASE; else time_monitor = offset; L_LINT(time_offset, time_monitor); } /* * Select how the frequency is to be controlled and in which * mode (PLL or FLL). If the PPS signal is present and enabled * to discipline the frequency, the PPS frequency is used; * otherwise, the argument offset is used to compute it. */ if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) { time_reftime = time_second; return; } if (time_status & STA_FREQHOLD || time_reftime == 0) time_reftime = time_second; mtemp = time_second - time_reftime; L_LINT(ftemp, time_monitor); L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1); L_MPY(ftemp, mtemp); L_ADD(time_freq, ftemp); time_status &= ~STA_MODE; if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { L_LINT(ftemp, (time_monitor << 4) / mtemp); L_RSHIFT(ftemp, SHIFT_FLL + 4); L_ADD(time_freq, ftemp); time_status |= STA_MODE; } time_reftime = time_second; if (L_GINT(time_freq) > MAXFREQ) L_LINT(time_freq, MAXFREQ); else if (L_GINT(time_freq) < -MAXFREQ) L_LINT(time_freq, -MAXFREQ); } #ifdef PPS_SYNC /* * hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS interrupt in order to discipline * the CPU clock oscillator to the PPS signal. There are two independent * first-order feedback loops, one for the phase, the other for the * frequency. The phase loop measures and grooms the PPS phase offset * and leaves it in a handy spot for the seconds overflow routine. The * frequency loop averages successive PPS phase differences and * calculates the PPS frequency offset, which is also processed by the * seconds overflow routine. The code requires the caller to capture the * time and architecture-dependent hardware counter values in * nanoseconds at the on-time PPS signal transition. * * Note that, on some Unix systems this routine runs at an interrupt * priority level higher than the timer interrupt routine hardclock(). * Therefore, the variables used are distinct from the hardclock() * variables, except for the actual time and frequency variables, which * are determined by this routine and updated atomically. */ void hardpps(tsp, nsec) struct timespec *tsp; /* time at PPS */ long nsec; /* hardware counter at PPS */ { long u_sec, u_nsec, v_nsec; /* temps */ l_fp ftemp; /* * The signal is first processed by a range gate and frequency * discriminator. The range gate rejects noise spikes outside * the range +-500 us. 
The frequency discriminator rejects input * signals with apparent frequency outside the range 1 +-500 * PPM. If two hits occur in the same second, we ignore the * later hit; if not and a hit occurs outside the range gate, * keep the later hit for later comparison, but do not process * it. */ time_status |= STA_PPSSIGNAL | STA_PPSJITTER; time_status &= ~(STA_PPSWANDER | STA_PPSERROR); pps_valid = PPS_VALID; u_sec = tsp->tv_sec; u_nsec = tsp->tv_nsec; if (u_nsec >= (NANOSECOND >> 1)) { u_nsec -= NANOSECOND; u_sec++; } v_nsec = u_nsec - pps_tf[0].tv_nsec; if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND - MAXFREQ) return; pps_tf[2] = pps_tf[1]; pps_tf[1] = pps_tf[0]; pps_tf[0].tv_sec = u_sec; pps_tf[0].tv_nsec = u_nsec; /* * Compute the difference between the current and previous * counter values. If the difference exceeds 0.5 s, assume it * has wrapped around, so correct 1.0 s. If the result exceeds * the tick interval, the sample point has crossed a tick * boundary during the last second, so correct the tick. Very * intricate. */ u_nsec = nsec; if (u_nsec > (NANOSECOND >> 1)) u_nsec -= NANOSECOND; else if (u_nsec < -(NANOSECOND >> 1)) u_nsec += NANOSECOND; pps_fcount += u_nsec; if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ) return; time_status &= ~STA_PPSJITTER; /* * A three-stage median filter is used to help denoise the PPS * time. The median sample becomes the time offset estimate; the * difference between the other two samples becomes the time * dispersion (jitter) estimate. */ if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) { if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) { v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */ u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec; } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) { v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */ u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec; } else { v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */ u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec; } } else { if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) { v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */ u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec; } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) { v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */ u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec; } else { v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */ u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec; } } /* * Nominal jitter is due to PPS signal noise and interrupt * latency. If it exceeds the popcorn threshold, the sample is * discarded. Otherwise, if so enabled, the time offset is * updated. We can tolerate a modest loss of data here without * much degrading time accuracy. */ if (u_nsec > (pps_jitter << PPS_POPCORN)) { time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { time_monitor = -v_nsec; L_LINT(time_offset, time_monitor); } pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG; u_sec = pps_tf[0].tv_sec - pps_lastsec; if (u_sec < (1 << pps_shift)) return; /* * At the end of the calibration interval the difference between * the first and last counter values becomes the scaled * frequency. It will later be divided by the length of the * interval to determine the frequency update. If the frequency * exceeds a sanity threshold, or if the actual calibration * interval is not equal to the expected length, the data are * discarded. We can tolerate a modest loss of data here without * much degrading frequency accuracy.
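 *
 * As a worked example (editor's note): with pps_shift at 4 the interval
 * is 1 << 4 = 16 s, so an accumulated difference of -pps_fcount =
 * 3200 ns scales below to 3200 >> 4 = 200 ns/s of frequency adjustment,
 * well inside the MAXFREQ << 4 sanity bound.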
*/ pps_calcnt++; v_nsec = -pps_fcount; pps_lastsec = pps_tf[0].tv_sec; pps_fcount = 0; u_nsec = MAXFREQ << pps_shift; if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 << pps_shift)) { time_status |= STA_PPSERROR; pps_errcnt++; return; } /* * Here the raw frequency offset and wander (stability) is * calculated. If the wander is less than the wander threshold * for four consecutive averaging intervals, the interval is * doubled; if it is greater than the threshold for four * consecutive intervals, the interval is halved. The scaled * frequency offset is converted to frequency offset. The * stability metric is calculated as the average of recent * frequency changes, but is used only for performance * monitoring. */ L_LINT(ftemp, v_nsec); L_RSHIFT(ftemp, pps_shift); L_SUB(ftemp, pps_freq); u_nsec = L_GINT(ftemp); if (u_nsec > PPS_MAXWANDER) { L_LINT(ftemp, PPS_MAXWANDER); pps_intcnt--; time_status |= STA_PPSWANDER; pps_stbcnt++; } else if (u_nsec < -PPS_MAXWANDER) { L_LINT(ftemp, -PPS_MAXWANDER); pps_intcnt--; time_status |= STA_PPSWANDER; pps_stbcnt++; } else { pps_intcnt++; } if (pps_intcnt >= 4) { pps_intcnt = 4; if (pps_shift < pps_shiftmax) { pps_shift++; pps_intcnt = 0; } } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) { pps_intcnt = -4; if (pps_shift > PPS_FAVG) { pps_shift--; pps_intcnt = 0; } } if (u_nsec < 0) u_nsec = -u_nsec; pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG; /* * The PPS frequency is recalculated and clamped to the maximum * MAXFREQ. If enabled, the system clock frequency is updated as * well. */ L_ADD(pps_freq, ftemp); u_nsec = L_GINT(pps_freq); if (u_nsec > MAXFREQ) L_LINT(pps_freq, MAXFREQ); else if (u_nsec < -MAXFREQ) L_LINT(pps_freq, -MAXFREQ); if (time_status & STA_PPSFREQ) time_freq = pps_freq; } #endif /* PPS_SYNC */ #ifndef _SYS_SYSPROTO_H_ struct adjtime_args { struct timeval *delta; struct timeval *olddelta; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int adjtime(struct thread *td, struct adjtime_args *uap) { struct timeval delta, olddelta, *deltap; int error; if (uap->delta) { error = copyin(uap->delta, &delta, sizeof(delta)); if (error) return (error); deltap = δ } else deltap = NULL; error = kern_adjtime(td, deltap, &olddelta); if (uap->olddelta && error == 0) error = copyout(&olddelta, uap->olddelta, sizeof(olddelta)); return (error); } int kern_adjtime(struct thread *td, struct timeval *delta, struct timeval *olddelta) { struct timeval atv; int error; if ((error = priv_check(td, PRIV_ADJTIME))) return (error); mtx_lock(&Giant); if (olddelta) { atv.tv_sec = time_adjtime / 1000000; atv.tv_usec = time_adjtime % 1000000; if (atv.tv_usec < 0) { atv.tv_usec += 1000000; atv.tv_sec--; } *olddelta = atv; } if (delta) time_adjtime = (int64_t)delta->tv_sec * 1000000 + delta->tv_usec; mtx_unlock(&Giant); return (error); } Index: head/sys/kern/kern_prot.c =================================================================== --- head/sys/kern/kern_prot.c (revision 167210) +++ head/sys/kern/kern_prot.c (revision 167211) @@ -1,2144 +1,2050 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 * The Regents of the University of California. * (c) UNIX System Laboratories, Inc. * Copyright (c) 2000-2001 Robert N. M. Watson. * All rights reserved. * * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 */ /* * System calls related to processes and protection */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_CRED, "cred", "credentials"); SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy"); #ifndef _SYS_SYSPROTO_H_ struct getpid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getpid(struct thread *td, struct getpid_args *uap) { struct proc *p = td->td_proc; td->td_retval[0] = p->p_pid; #if defined(COMPAT_43) PROC_LOCK(p); td->td_retval[1] = p->p_pptr->p_pid; PROC_UNLOCK(p); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct getppid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getppid(struct thread *td, struct getppid_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pptr->p_pid; PROC_UNLOCK(p); return (0); } /* * Get process group ID; note that POSIX getpgrp takes no parameter. */ #ifndef _SYS_SYSPROTO_H_ struct getpgrp_args { int dummy; }; #endif -/* - * MPSAFE - */ int getpgrp(struct thread *td, struct getpgrp_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* Get an arbitrary pid's process group id */ #ifndef _SYS_SYSPROTO_H_ struct getpgid_args { pid_t pid; }; #endif -/* - * MPSAFE - */ int getpgid(struct thread *td, struct getpgid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return (0); } /* * Get an arbitrary pid's session id.
*/ #ifndef _SYS_SYSPROTO_H_ struct getsid_args { pid_t pid; }; #endif -/* - * MPSAFE - */ int getsid(struct thread *td, struct getsid_args *uap) { struct proc *p; int error; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); error = p_cansee(td, p); if (error) { PROC_UNLOCK(p); return (error); } } td->td_retval[0] = p->p_session->s_sid; PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct getuid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getuid(struct thread *td, struct getuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_ruid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_uid; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct geteuid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int geteuid(struct thread *td, struct geteuid_args *uap) { td->td_retval[0] = td->td_ucred->cr_uid; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getgid(struct thread *td, struct getgid_args *uap) { td->td_retval[0] = td->td_ucred->cr_rgid; #if defined(COMPAT_43) td->td_retval[1] = td->td_ucred->cr_groups[0]; #endif return (0); } /* * Get effective group ID. The "egid" is groups[0], and could be obtained * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ #ifndef _SYS_SYSPROTO_H_ struct getegid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getegid(struct thread *td, struct getegid_args *uap) { td->td_retval[0] = td->td_ucred->cr_groups[0]; return (0); } #ifndef _SYS_SYSPROTO_H_ struct getgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif -/* - * MPSAFE - */ int getgroups(struct thread *td, register struct getgroups_args *uap) { gid_t groups[NGROUPS]; u_int ngrp; int error; ngrp = MIN(uap->gidsetsize, NGROUPS); error = kern_getgroups(td, &ngrp, groups); if (error) return (error); if (uap->gidsetsize > 0) error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t)); if (error == 0) td->td_retval[0] = ngrp; return (error); } int kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups) { struct ucred *cred; cred = td->td_ucred; if (*ngrp == 0) { *ngrp = cred->cr_ngroups; return (0); } if (*ngrp < cred->cr_ngroups) return (EINVAL); *ngrp = cred->cr_ngroups; bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t)); return (0); } #ifndef _SYS_SYSPROTO_H_ struct setsid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setsid(register struct thread *td, struct setsid_args *uap) { struct pgrp *pgrp; int error; struct proc *p = td->td_proc; struct pgrp *newpgrp; struct session *newsess; error = 0; pgrp = NULL; MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); error = EPERM; } else { (void)enterpgrp(p, p->p_pid, newpgrp, newsess); td->td_retval[0] = p->p_pid; newpgrp = NULL; newsess = NULL; } sx_xunlock(&proctree_lock); if (newpgrp != NULL) FREE(newpgrp, M_PGRP); if (newsess != NULL) FREE(newsess, M_SESSION); return (error); } /* * set process group (setpgid/old setpgrp) * * caller does setpgid(targpid, targpgid) * * pid must be caller or child of caller (ESRCH) * if a child * pid must be in same session (EPERM) * pid can't have done an exec (EACCES) * if pgid != pid * there must 
exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ #ifndef _SYS_SYSPROTO_H_ struct setpgid_args { int pid; /* target process id */ int pgid; /* target pgrp id */ }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setpgid(struct thread *td, register struct setpgid_args *uap) { struct proc *curp = td->td_proc; register struct proc *targp; /* target process */ register struct pgrp *pgrp; /* target pgrp */ int error; struct pgrp *newpgrp; if (uap->pgid < 0) return (EINVAL); error = 0; MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO); sx_xlock(&proctree_lock); if (uap->pid != 0 && uap->pid != curp->p_pid) { if ((targp = pfind(uap->pid)) == NULL) { error = ESRCH; goto done; } if (!inferior(targp)) { PROC_UNLOCK(targp); error = ESRCH; goto done; } if ((error = p_cansee(td, targp))) { PROC_UNLOCK(targp); goto done; } if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) { PROC_UNLOCK(targp); error = EPERM; goto done; } if (targp->p_flag & P_EXEC) { PROC_UNLOCK(targp); error = EACCES; goto done; } PROC_UNLOCK(targp); } else targp = curp; if (SESS_LEADER(targp)) { error = EPERM; goto done; } if (uap->pgid == 0) uap->pgid = targp->p_pid; if ((pgrp = pgfind(uap->pgid)) == NULL) { if (uap->pgid == targp->p_pid) { error = enterpgrp(targp, uap->pgid, newpgrp, NULL); if (error == 0) newpgrp = NULL; } else error = EPERM; } else { if (pgrp == targp->p_pgrp) { PGRP_UNLOCK(pgrp); goto done; } if (pgrp->pg_id != targp->p_pid && pgrp->pg_session != curp->p_session) { PGRP_UNLOCK(pgrp); error = EPERM; goto done; } PGRP_UNLOCK(pgrp); error = enterthispgrp(targp, pgrp); } done: sx_xunlock(&proctree_lock); KASSERT((error == 0) || (newpgrp != NULL), ("setpgid failed and newpgrp is NULL")); if (newpgrp != NULL) FREE(newpgrp, M_PGRP); return (error); } /* * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD * compatible. It says that setting the uid/gid to euid/egid is a special * case of "appropriate privilege". Once the rules are expanded out, this * basically means that setuid(nnn) sets all three id's, in all permitted * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) * does not set the saved id - this is dangerous for traditional BSD * programs. For this reason, we *really* do not want to set * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. */ #define POSIX_APPENDIX_B_4_2_2 #ifndef _SYS_SYSPROTO_H_ struct setuid_args { uid_t uid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setuid(struct thread *td, struct setuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t uid; struct uidinfo *uip; int error; uid = uap->uid; AUDIT_ARG(uid, uid); newcred = crget(); uip = uifind(uid); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setuid(p, oldcred, uid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setuid(geteuid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setuid(xx)" sets all * three id's (assuming you have privs). * * Notes on the logic. We do things in three steps. * 1: We determine if the euid is going to change, and do EPERM * right away. We unconditionally change the euid later if this * test is satisfied, simplifying that part of the logic. * 2: We determine if the real and/or saved uids are going to * change. 
Determined by compile options. * 3: Change euid last. (after tests in #2 for "appropriate privs") */ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */ #ifdef _POSIX_SAVED_IDS uid != oldcred->cr_svuid && /* allow setuid(saved uid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETUID, SUSER_ALLOWJAIL)) != 0) goto fail; /* * Copy credentials so other references do not see our changes. */ crcopy(newcred, oldcred); #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or uid == euid) * If so, we are changing the real uid and/or saved uid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ uid == oldcred->cr_uid || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETUID, SUSER_ALLOWJAIL) == 0) #endif { /* * Set the real uid and transfer proc count to new user. */ if (uid != oldcred->cr_ruid) { change_ruid(newcred, uip); setsugid(p); } /* * Set saved uid * * XXX always set saved uid even if not _POSIX_SAVED_IDS, as * the security of seteuid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (uid != oldcred->cr_svuid) { change_svuid(newcred, uid); setsugid(p); } } /* * In all permitted cases, we are changing the euid. */ if (uid != oldcred->cr_uid) { change_euid(newcred, uip); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); uifree(uip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(uip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct seteuid_args { uid_t euid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int seteuid(struct thread *td, struct seteuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid; struct uidinfo *euip; int error; euid = uap->euid; AUDIT_ARG(euid, euid); newcred = crget(); euip = uifind(euid); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_seteuid(p, oldcred, euid); if (error) goto fail; #endif if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, SUSER_ALLOWJAIL)) != 0) goto fail; /* * Everything's okay, do it. Copy credentials so other references do * not see our changes. */ crcopy(newcred, oldcred); if (oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgid_args { gid_t gid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setgid(struct thread *td, struct setgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t gid; int error; gid = uap->gid; AUDIT_ARG(gid, gid); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setgid(p, oldcred, gid); if (error) goto fail; #endif /* * See if we have "permission" by POSIX 1003.1 rules. * * Note that setgid(getegid()) is a special case of * "appropriate privileges" in appendix B.4.2.2. We need * to use this clause to be compatible with traditional BSD * semantics. Basically, it means that "setgid(xx)" sets all * three id's (assuming you have privs). * * For notes on the logic here, see setuid() above.
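 *
 * Concretely (editor's illustration): an unprivileged call to
 * setgid(getgid()) passes the check below via the rgid clause and,
 * since _POSIX_SAVED_IDS is not defined, goes on to set the real,
 * saved and effective gids alike.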
*/ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */ #ifdef _POSIX_SAVED_IDS gid != oldcred->cr_svgid && /* allow setgid(saved gid) */ #endif #ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */ #endif (error = priv_check_cred(oldcred, PRIV_CRED_SETGID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); #ifdef _POSIX_SAVED_IDS /* * Do we have "appropriate privileges" (are we root or gid == egid) * If so, we are changing the real gid and saved gid. */ if ( #ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ gid == oldcred->cr_groups[0] || #endif /* We are using privs. */ priv_check_cred(oldcred, PRIV_CRED_SETGID, SUSER_ALLOWJAIL) == 0) #endif { /* * Set real gid */ if (oldcred->cr_rgid != gid) { change_rgid(newcred, gid); setsugid(p); } /* * Set saved gid * * XXX always set saved gid even if not _POSIX_SAVED_IDS, as * the security of setegid() depends on it. B.4.2.2 says it * is important that we should do this. */ if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } /* * In all permitted cases, we are changing the egid. * Copy credentials so other references do not see our changes. */ if (oldcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setegid_args { gid_t egid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setegid(struct thread *td, struct setegid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid; int error; egid = uap->egid; AUDIT_ARG(egid, egid); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setegid(p, oldcred, egid); if (error) goto fail; #endif if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setgroups_args { u_int gidsetsize; gid_t *gidset; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setgroups(struct thread *td, struct setgroups_args *uap) { gid_t groups[NGROUPS]; int error; if (uap->gidsetsize > NGROUPS) return (EINVAL); error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t)); if (error) return (error); return (kern_setgroups(td, uap->gidsetsize, groups)); } int kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; int error; if (ngrp > NGROUPS) return (EINVAL); AUDIT_ARG(groupset, groups, ngrp); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setgroups(p, oldcred, ngrp, groups); if (error) goto fail; #endif error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, SUSER_ALLOWJAIL); if (error) goto fail; /* * XXX A little bit lazy here. We could test if anything has * changed before crcopy() and setting P_SUGID. */ crcopy(newcred, oldcred); if (ngrp < 1) { /* * setgroups(0, NULL) is a legitimate way of clearing the * groups vector on non-BSD systems (which generally do not * have the egid in the groups[0]).
We risk security holes * when running non-BSD software if we do not do the same. */ newcred->cr_ngroups = 1; } else { bcopy(groups, newcred->cr_groups, ngrp * sizeof(gid_t)); newcred->cr_ngroups = ngrp; } setsugid(p); p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setreuid_args { uid_t ruid; uid_t euid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setreuid(register struct thread *td, struct setreuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; AUDIT_ARG(euid, euid); AUDIT_ARG(ruid, ruid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setreuid(p, oldcred, ruid, euid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid) || (euid != (uid_t)-1 && euid != oldcred->cr_uid && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) && newcred->cr_svuid != newcred->cr_uid) { change_svuid(newcred, newcred->cr_uid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setregid_args { gid_t rgid; gid_t egid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setregid(register struct thread *td, struct setregid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid; int error; egid = uap->egid; rgid = uap->rgid; AUDIT_ARG(egid, egid); AUDIT_ARG(rgid, rgid); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setregid(p, oldcred, rgid, egid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid) || (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) && newcred->cr_svgid != newcred->cr_groups[0]) { change_svgid(newcred, newcred->cr_groups[0]); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } /* * setresuid(ruid, euid, suid) is like setreuid except control over the * saved uid is explicit. 
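 *
 * This is the usual primitive for permanently dropping privilege, e.g.
 * (an illustrative sketch, with pw obtained from getpwnam(3), not code
 * from this file):
 *
 *	if (setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid) == -1)
 *		err(1, "setresuid");
 *
 * With all three uids replaced there is no saved uid left from which
 * the process could later regain its former identity, unlike a bare
 * seteuid(), which is reversible by design.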
*/ #ifndef _SYS_SYSPROTO_H_ struct setresuid_args { uid_t ruid; uid_t euid; uid_t suid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setresuid(register struct thread *td, struct setresuid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; uid_t euid, ruid, suid; struct uidinfo *euip, *ruip; int error; euid = uap->euid; ruid = uap->ruid; suid = uap->suid; AUDIT_ARG(euid, euid); AUDIT_ARG(ruid, ruid); AUDIT_ARG(suid, suid); newcred = crget(); euip = uifind(euid); ruip = uifind(ruid); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setresuid(p, oldcred, ruid, euid, suid); if (error) goto fail; #endif if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid && ruid != oldcred->cr_svuid && ruid != oldcred->cr_uid) || (euid != (uid_t)-1 && euid != oldcred->cr_ruid && euid != oldcred->cr_svuid && euid != oldcred->cr_uid) || (suid != (uid_t)-1 && suid != oldcred->cr_ruid && suid != oldcred->cr_svuid && suid != oldcred->cr_uid)) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (euid != (uid_t)-1 && oldcred->cr_uid != euid) { change_euid(newcred, euip); setsugid(p); } if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) { change_ruid(newcred, ruip); setsugid(p); } if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) { change_svuid(newcred, suid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); uifree(ruip); uifree(euip); crfree(newcred); return (error); } /* * setresgid(rgid, egid, sgid) is like setregid except control over the * saved gid is explicit. */ #ifndef _SYS_SYSPROTO_H_ struct setresgid_args { gid_t rgid; gid_t egid; gid_t sgid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setresgid(register struct thread *td, struct setresgid_args *uap) { struct proc *p = td->td_proc; struct ucred *newcred, *oldcred; gid_t egid, rgid, sgid; int error; egid = uap->egid; rgid = uap->rgid; sgid = uap->sgid; AUDIT_ARG(egid, egid); AUDIT_ARG(rgid, rgid); AUDIT_ARG(sgid, sgid); newcred = crget(); PROC_LOCK(p); oldcred = p->p_ucred; #ifdef MAC error = mac_check_proc_setresgid(p, oldcred, rgid, egid, sgid); if (error) goto fail; #endif if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid && rgid != oldcred->cr_svgid && rgid != oldcred->cr_groups[0]) || (egid != (gid_t)-1 && egid != oldcred->cr_rgid && egid != oldcred->cr_svgid && egid != oldcred->cr_groups[0]) || (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid && sgid != oldcred->cr_svgid && sgid != oldcred->cr_groups[0])) && (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, SUSER_ALLOWJAIL)) != 0) goto fail; crcopy(newcred, oldcred); if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) { change_egid(newcred, egid); setsugid(p); } if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) { change_rgid(newcred, rgid); setsugid(p); } if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) { change_svgid(newcred, sgid); setsugid(p); } p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); fail: PROC_UNLOCK(p); crfree(newcred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getresuid_args { uid_t *ruid; uid_t *euid; uid_t *suid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getresuid(register struct thread *td, struct getresuid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->ruid) error1 = copyout(&cred->cr_ruid, uap->ruid, sizeof(cred->cr_ruid)); if (uap->euid) error2 = copyout(&cred->cr_uid, 
uap->euid, sizeof(cred->cr_uid)); if (uap->suid) error3 = copyout(&cred->cr_svuid, uap->suid, sizeof(cred->cr_svuid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct getresgid_args { gid_t *rgid; gid_t *egid; gid_t *sgid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getresgid(register struct thread *td, struct getresgid_args *uap) { struct ucred *cred; int error1 = 0, error2 = 0, error3 = 0; cred = td->td_ucred; if (uap->rgid) error1 = copyout(&cred->cr_rgid, uap->rgid, sizeof(cred->cr_rgid)); if (uap->egid) error2 = copyout(&cred->cr_groups[0], uap->egid, sizeof(cred->cr_groups[0])); if (uap->sgid) error3 = copyout(&cred->cr_svgid, uap->sgid, sizeof(cred->cr_svgid)); return (error1 ? error1 : error2 ? error2 : error3); } #ifndef _SYS_SYSPROTO_H_ struct issetugid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int issetugid(register struct thread *td, struct issetugid_args *uap) { struct proc *p = td->td_proc; /* * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, * we use P_SUGID because we consider changing the owners as * "tainting" as well. * This is significant for procs that start as root and "become" * a user without an exec - programs cannot know *everything* * that libc *might* have put in their data segment. */ PROC_LOCK(p); td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0; PROC_UNLOCK(p); return (0); } -/* - * MPSAFE - */ int __setugid(struct thread *td, struct __setugid_args *uap) { #ifdef REGRESSION struct proc *p; p = td->td_proc; switch (uap->flag) { case 0: PROC_LOCK(p); p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); return (0); case 1: PROC_LOCK(p); p->p_flag |= P_SUGID; PROC_UNLOCK(p); return (0); default: return (EINVAL); } #else /* !REGRESSION */ return (ENOSYS); #endif /* REGRESSION */ } /* * Check if gid is a member of the group set. - * - * MPSAFE (cred must be held) */ int groupmember(gid_t gid, struct ucred *cred) { register gid_t *gp; gid_t *egp; egp = &(cred->cr_groups[cred->cr_ngroups]); for (gp = cred->cr_groups; gp < egp; gp++) if (*gp == gid) return (1); return (0); } /* * Test the active securelevel against a given level. securelevel_gt() * implements (securelevel > level). securelevel_ge() implements * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. - * - * MPSAFE */ int securelevel_gt(struct ucred *cr, int level) { int active_securelevel; active_securelevel = securelevel; KASSERT(cr != NULL, ("securelevel_gt: null cr")); if (cr->cr_prison != NULL) active_securelevel = imax(cr->cr_prison->pr_securelevel, active_securelevel); return (active_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { int active_securelevel; active_securelevel = securelevel; KASSERT(cr != NULL, ("securelevel_ge: null cr")); if (cr->cr_prison != NULL) active_securelevel = imax(cr->cr_prison->pr_securelevel, active_securelevel); return (active_securelevel >= level ? EPERM : 0); } /* * 'see_other_uids' determines whether or not visibility of processes * and sockets with credentials holding different real uids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. 
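 *
 * Illustrative use (hedged; the MIB name follows from the SYSCTL_INT
 * declaration below): an administrator can hide other users' processes
 * from unprivileged ps(1) and friends with
 *
 *	sysctl security.bsd.see_other_uids=0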
*/ static int see_other_uids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW, &see_other_uids, 0, "Unprivileged processes may see subjects/objects with different real uid"); /*- * Determine if u1 "can see" the subject specified by u2, according to the * 'see_other_uids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_seeotheruids(struct ucred *u1, struct ucred *u2) { if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) { if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, SUSER_ALLOWJAIL) != 0) return (ESRCH); } return (0); } /* * 'see_other_gids' determines whether or not visibility of processes * and sockets with credentials holding different real gids is possible * using a variety of system MIBs. * XXX: data declarations should be together near the beginning of the file. */ static int see_other_gids = 1; SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW, &see_other_gids, 0, "Unprivileged processes may see subjects/objects with different real gid"); /* * Determine if u1 can "see" the subject specified by u2, according to the * 'see_other_gids' policy. * Returns: 0 for permitted, ESRCH otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ static int cr_seeothergids(struct ucred *u1, struct ucred *u2) { int i, match; if (!see_other_gids) { match = 0; for (i = 0; i < u1->cr_ngroups; i++) { if (groupmember(u1->cr_groups[i], u2)) match = 1; if (match) break; } if (!match) { if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, SUSER_ALLOWJAIL) != 0) return (ESRCH); } } return (0); } /*- * Determine if u1 "can see" the subject specified by u2. * Returns: 0 for permitted, an errno value otherwise * Locks: none * References: *u1 and *u2 must not change during the call * u1 may equal u2, in which case only one reference is required */ int cr_cansee(struct ucred *u1, struct ucred *u2) { int error; if ((error = prison_check(u1, u2))) return (error); #ifdef MAC if ((error = mac_check_cred_visible(u1, u2))) return (error); #endif if ((error = cr_seeotheruids(u1, u2))) return (error); if ((error = cr_seeothergids(u1, u2))) return (error); return (0); } /*- * Determine if td "can see" the subject specified by p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect p->p_ucred must be held. td really * should be curthread. * References: td and p must be valid for the lifetime of the call */ int p_cansee(struct thread *td, struct proc *p) { /* Wrap cr_cansee() for all functionality. */ KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); return (cr_cansee(td->td_ucred, p->p_ucred)); } /* * 'conservative_signals' prevents the delivery of a broad class of * signals by unprivileged processes to processes that have changed their * credentials since the last invocation of execve(). This can prevent * the leakage of cached information or retained privileges as a result * of a common class of signal-related vulnerabilities. However, this * may interfere with some applications that expect to be able to * deliver these signals to peer processes after having given up * privilege. 
*/ static int conservative_signals = 1; SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW, &conservative_signals, 0, "Unprivileged processes prevented from " "sending certain signals to processes whose credentials have changed"); /*- * Determine whether cred may deliver the specified signal to proc. * Returns: 0 for permitted, an errno value otherwise. * Locks: A lock must be held for proc. * References: cred and proc must be valid for the lifetime of the call. */ int cr_cansignal(struct ucred *cred, struct proc *proc, int signum) { int error; PROC_LOCK_ASSERT(proc, MA_OWNED); /* * Jail semantics limit the scope of signalling to proc in the * same jail as cred, if cred is in jail. */ error = prison_check(cred, proc->p_ucred); if (error) return (error); #ifdef MAC if ((error = mac_check_proc_signal(cred, proc, signum))) return (error); #endif if ((error = cr_seeotheruids(cred, proc->p_ucred))) return (error); if ((error = cr_seeothergids(cred, proc->p_ucred))) return (error); /* * UNIX signal semantics depend on the status of the P_SUGID * bit on the target process. If the bit is set, then additional * restrictions are placed on the set of available signals. */ if (conservative_signals && (proc->p_flag & P_SUGID)) { switch (signum) { case 0: case SIGKILL: case SIGINT: case SIGTERM: case SIGALRM: case SIGSTOP: case SIGTTIN: case SIGTTOU: case SIGTSTP: case SIGHUP: case SIGUSR1: case SIGUSR2: /* * Generally, permit job and terminal control * signals. */ break; default: /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, SUSER_ALLOWJAIL); if (error) return (error); } } /* * Generally, the target credential's ruid or svuid must match the * subject credential's ruid or euid. */ if (cred->cr_ruid != proc->p_ucred->cr_ruid && cred->cr_ruid != proc->p_ucred->cr_svuid && cred->cr_uid != proc->p_ucred->cr_ruid && cred->cr_uid != proc->p_ucred->cr_svuid) { /* Not permitted without privilege. */ error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, SUSER_ALLOWJAIL); if (error) return (error); } return (0); } /*- * Determine whether td may deliver the specified signal to p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must be * held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansignal(struct thread *td, struct proc *p, int signum) { KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); /* * UNIX signalling semantics require that processes in the same * session always be able to deliver SIGCONT to one another, * overriding the remaining protections. */ /* XXX: This will require an additional lock of some sort. */ if (signum == SIGCONT && td->td_proc->p_session == p->p_session) return (0); /* * Some compat layers use SIGTHR and higher signals for * communication between different kernel threads of the same * process, so that they expect that it's always possible to * deliver them, even for suid applications where cr_cansignal() can * deny such ability for security consideration. It should be * pretty safe to do since the only way to create two processes * with the same p_leader is via rfork(2). */ if (td->td_proc->p_leader != NULL && signum >= SIGTHR && signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader) return (0); return (cr_cansignal(td->td_ucred, p, signum)); } /*- * Determine whether td may reschedule p. 
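 * (e.g., on behalf of setpriority(2) or rtprio(2), both of which call
 * this check)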
* Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_cansched(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_check_proc_sched(td->td_ucred, p))) return (error); #endif if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_seeothergids(td->td_ucred, p->p_ucred))) return (error); if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid && td->td_ucred->cr_uid != p->p_ucred->cr_ruid) { error = priv_check_cred(td->td_ucred, PRIV_SCHED_DIFFCRED, SUSER_ALLOWJAIL); if (error) return (error); } return (0); } /* * The 'unprivileged_proc_debug' flag may be used to disable a variety of * unprivileged inter-process debugging services, including some procfs * functionality, ptrace(), and ktrace(). In the past, inter-process * debugging has been involved in a variety of security problems, and sites * not requiring the service might choose to disable it when hardening * systems. * * XXX: Should modifying and reading this variable require locking? * XXX: data declarations should be together near the beginning of the file. */ static int unprivileged_proc_debug = 1; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW, &unprivileged_proc_debug, 0, "Unprivileged processes may use process debugging facilities"); /*- * Determine whether td may debug p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_candebug(struct thread *td, struct proc *p) { int credentialchanged, error, grpsubset, i, uidsubset; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if (!unprivileged_proc_debug) { error = priv_check_cred(td->td_ucred, PRIV_DEBUG_UNPRIV, SUSER_ALLOWJAIL); if (error) return (error); } if (td->td_proc == p) return (0); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_check_proc_debug(td->td_ucred, p))) return (error); #endif if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); if ((error = cr_seeothergids(td->td_ucred, p->p_ucred))) return (error); /* * Is p's group set a subset of td's effective group set? This * includes p's egid, group access list, rgid, and svgid. */ grpsubset = 1; for (i = 0; i < p->p_ucred->cr_ngroups; i++) { if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) { grpsubset = 0; break; } } grpsubset = grpsubset && groupmember(p->p_ucred->cr_rgid, td->td_ucred) && groupmember(p->p_ucred->cr_svgid, td->td_ucred); /* * Are the uids present in p's credential equal to td's * effective uid? This includes p's euid, svuid, and ruid. */ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid && td->td_ucred->cr_uid == p->p_ucred->cr_svuid && td->td_ucred->cr_uid == p->p_ucred->cr_ruid); /* * Has the credential of the process changed since the last exec()? 
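 * We take P_SUGID, as set by setsugid(), as the indicator.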
*/ credentialchanged = (p->p_flag & P_SUGID); /* * If p's gids aren't a subset, or the uids aren't a subset, * or the credential has changed, require appropriate privilege * for td to debug p. */ if (!grpsubset || !uidsubset) { error = priv_check_cred(td->td_ucred, PRIV_DEBUG_DIFFCRED, SUSER_ALLOWJAIL); if (error) return (error); } if (credentialchanged) { error = priv_check_cred(td->td_ucred, PRIV_DEBUG_SUGID, SUSER_ALLOWJAIL); if (error) return (error); } /* Can't trace init when securelevel > 0. */ if (p == initproc) { error = securelevel_gt(td->td_ucred, 0); if (error) return (error); } /* * Can't trace a process that's currently exec'ing. * * XXX: Note, this is not a security policy decision, it's a * basic correctness/functionality decision. Therefore, this check * should be moved to the callers of p_candebug(). */ if ((p->p_flag & P_INEXEC) != 0) return (EAGAIN); return (0); } /*- * Determine whether the subject represented by cred can "see" a socket. * Returns: 0 for permitted, ENOENT otherwise. */ int cr_canseesocket(struct ucred *cred, struct socket *so) { int error; error = prison_check(cred, so->so_cred); if (error) return (ENOENT); #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_visible(cred, so); SOCK_UNLOCK(so); if (error) return (error); #endif if (cr_seeotheruids(cred, so->so_cred)) return (ENOENT); if (cr_seeothergids(cred, so->so_cred)) return (ENOENT); return (0); } /*- * Determine whether td can wait for the exit of p. * Returns: 0 for permitted, an errno value otherwise * Locks: Sufficient locks to protect various components of td and p * must be held. td must be curthread, and a lock must * be held for p. * References: td and p must be valid for the lifetime of the call */ int p_canwait(struct thread *td, struct proc *p) { int error; KASSERT(td == curthread, ("%s: td not curthread", __func__)); PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = prison_check(td->td_ucred, p->p_ucred))) return (error); #ifdef MAC if ((error = mac_check_proc_wait(td->td_ucred, p))) return (error); #endif #if 0 /* XXXMAC: This could have odd effects on some shells. */ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred))) return (error); #endif return (0); } /* * Allocate a zeroed cred structure. - * MPSAFE */ struct ucred * crget(void) { register struct ucred *cr; MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK | M_ZERO); refcount_init(&cr->cr_ref, 1); #ifdef MAC mac_init_cred(cr); #endif return (cr); } /* * Claim another reference to a ucred structure. - * MPSAFE */ struct ucred * crhold(struct ucred *cr) { refcount_acquire(&cr->cr_ref); return (cr); } /* - * Free a cred structure. - * Throws away space when ref count gets to 0. - * MPSAFE + * Free a cred structure. Throws away space when ref count gets to 0. */ void crfree(struct ucred *cr) { KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref)); KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred")); if (refcount_release(&cr->cr_ref)) { /* * Some callers of crget(), such as nfs_statfs(), * allocate a temporary credential, but don't * allocate a uidinfo structure. */ if (cr->cr_uidinfo != NULL) uifree(cr->cr_uidinfo); if (cr->cr_ruidinfo != NULL) uifree(cr->cr_ruidinfo); /* * Free a prison, if any. */ if (jailed(cr)) prison_free(cr->cr_prison); #ifdef MAC mac_destroy_cred(cr); #endif FREE(cr, M_CRED); } } /* * Check to see if this ucred is shared. - * MPSAFE */ int crshared(struct ucred *cr) { return (cr->cr_ref > 1); } /* * Copy a ucred's contents from a template. Does not block.
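 *
 * The calling pattern used throughout this file (a sketch mirroring the
 * set*id() syscalls above) is:
 *
 *	newcred = crget();
 *	PROC_LOCK(p);
 *	oldcred = p->p_ucred;
 *	crcopy(newcred, oldcred);
 *	...modify newcred...
 *	p->p_ucred = newcred;
 *	PROC_UNLOCK(p);
 *	crfree(oldcred);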
- * MPSAFE */ void crcopy(struct ucred *dest, struct ucred *src) { KASSERT(crshared(dest) == 0, ("crcopy of shared ucred")); bcopy(&src->cr_startcopy, &dest->cr_startcopy, (unsigned)((caddr_t)&src->cr_endcopy - (caddr_t)&src->cr_startcopy)); uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); if (jailed(dest)) prison_hold(dest->cr_prison); #ifdef MAC mac_copy_cred(src, dest); #endif } /* * Dup cred struct to a new held one. - * MPSAFE */ struct ucred * crdup(struct ucred *cr) { struct ucred *newcr; newcr = crget(); crcopy(newcr, cr); return (newcr); } /* * Fill in a struct xucred based on a struct ucred. - * MPSAFE */ void cru2x(struct ucred *cr, struct xucred *xcr) { bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = cr->cr_uid; xcr->cr_ngroups = cr->cr_ngroups; bcopy(cr->cr_groups, xcr->cr_groups, sizeof(cr->cr_groups)); } /* - * small routine to swap a thread's current ucred for the correct one - * taken from the process. - * MPSAFE + * small routine to swap a thread's current ucred for the correct one taken + * from the process. */ void cred_update_thread(struct thread *td) { struct proc *p; struct ucred *cred; p = td->td_proc; cred = td->td_ucred; PROC_LOCK(p); td->td_ucred = crhold(p->p_ucred); PROC_UNLOCK(p); if (cred != NULL) crfree(cred); } /* * Get login name, if available. */ #ifndef _SYS_SYSPROTO_H_ struct getlogin_args { char *namebuf; u_int namelen; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getlogin(struct thread *td, struct getlogin_args *uap) { int error; char login[MAXLOGNAME]; struct proc *p = td->td_proc; if (uap->namelen > MAXLOGNAME) uap->namelen = MAXLOGNAME; PROC_LOCK(p); SESS_LOCK(p->p_session); bcopy(p->p_session->s_login, login, uap->namelen); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); error = copyout(login, uap->namebuf, uap->namelen); return(error); } /* * Set login name. */ #ifndef _SYS_SYSPROTO_H_ struct setlogin_args { char *namebuf; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setlogin(struct thread *td, struct setlogin_args *uap) { struct proc *p = td->td_proc; int error; char logintmp[MAXLOGNAME]; error = priv_check_cred(td->td_ucred, PRIV_PROC_SETLOGIN, SUSER_ALLOWJAIL); if (error) return (error); error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL); if (error == ENAMETOOLONG) error = EINVAL; else if (!error) { PROC_LOCK(p); SESS_LOCK(p->p_session); (void) memcpy(p->p_session->s_login, logintmp, sizeof(logintmp)); SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); } return (error); } void setsugid(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_SUGID; if (!(p->p_pfsflags & PF_ISUGID)) p->p_stops = 0; } /*- * Change a process's effective uid. * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_euid(struct ucred *newcred, struct uidinfo *euip) { newcred->cr_uid = euip->ui_uid; uihold(euip); uifree(newcred->cr_uidinfo); newcred->cr_uidinfo = euip; } /*- * Change a process's effective gid. * Side effects: newcred->cr_gid will be modified. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_egid(struct ucred *newcred, gid_t egid) { newcred->cr_groups[0] = egid; } /*- * Change a process's real uid. * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo * will be updated, and the old and new cr_ruidinfo proc * counts will be updated. 
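 * (The per-uid process count tracked by chgproccnt() follows the
 * real uid, which is why the count moves from the old cr_ruidinfo
 * to the new one here.)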
* References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_ruid(struct ucred *newcred, struct uidinfo *ruip) { (void)chgproccnt(newcred->cr_ruidinfo, -1, 0); newcred->cr_ruid = ruip->ui_uid; uihold(ruip); uifree(newcred->cr_ruidinfo); newcred->cr_ruidinfo = ruip; (void)chgproccnt(newcred->cr_ruidinfo, 1, 0); } /*- * Change a process's real gid. * Side effects: newcred->cr_rgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_rgid(struct ucred *newcred, gid_t rgid) { newcred->cr_rgid = rgid; } /*- * Change a process's saved uid. * Side effects: newcred->cr_svuid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svuid(struct ucred *newcred, uid_t svuid) { newcred->cr_svuid = svuid; } /*- * Change a process's saved gid. * Side effects: newcred->cr_svgid will be updated. * References: newcred must be an exclusive credential reference for the * duration of the call. */ void change_svgid(struct ucred *newcred, gid_t svgid) { newcred->cr_svgid = svgid; } Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 167210) +++ head/sys/kern/kern_resource.c (revision 167211) @@ -1,1285 +1,1260 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct mtx uihashtbl_mtx; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); static int donice(struct thread *td, struct proc *chgp, int n); static struct uidinfo *uilookup(uid_t uid); /* * Resource controls and accounting. */ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif -/* - * MPSAFE - */ int getpriority(td, uap) struct thread *td; register struct getpriority_args *uap; { struct proc *p; struct pgrp *pg; int error, low; error = 0; low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) low = td->td_proc->p_nice; else { p = pfind(uap->who); if (p == NULL) break; if (p_cansee(td, p) == 0) low = p->p_nice; PROC_UNLOCK(p); } break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (!p_cansee(td, p)) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* Do not bother to check PRS_NEW processes */ if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); if (!p_cansee(td, p) && p->p_ucred->cr_uid == uap->who) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (low == PRIO_MAX + 1 && error == 0) error = ESRCH; td->td_retval[0] = low; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif -/* - * MPSAFE - */ int setpriority(td, uap) struct thread *td; struct setpriority_args *uap; { struct proc *curp, *p; struct pgrp *pg; int found = 0, error = 0; curp = td->td_proc; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) { PROC_LOCK(curp); error = donice(td, curp, uap->prio); PROC_UNLOCK(curp); } else { p = pfind(uap->who); if (p == 0) break; if (p_cansee(td, p) == 0) error = donice(td, p, uap->prio); PROC_UNLOCK(p); } found++; break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (!p_cansee(td, p)) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_ucred->cr_uid == uap->who && !p_cansee(td, p)) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (found == 0 && error == 0) error = ESRCH; return (error); } /* * Set "nice" 
for a (whole) process. */ static int donice(struct thread *td, struct proc *p, int n) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = p_cansched(td, p))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); mtx_lock_spin(&sched_lock); sched_nice(p, n); mtx_unlock_spin(&sched_lock); return (0); } /* * Set realtime priority for LWP. - * - * MPSAFE */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_thread_args { int function; lwpid_t lwpid; struct rtprio *rtp; }; #endif int rtprio_thread(struct thread *td, struct rtprio_thread_args *uap) { struct proc *curp; struct proc *p; struct rtprio rtp; struct thread *td1; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; curp = td->td_proc; /* * Though lwpid is unique, only current process is supported * since there is no efficient way to look up a LWP yet. */ p = curp; PROC_LOCK(p); switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; mtx_lock_spin(&sched_lock); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else td1 = thread_find(p, uap->lwpid); if (td1 != NULL) pri_to_rtp(td1, &rtp); else error = ESRCH; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ if (priv_check(td, PRIV_SCHED_RTPRIO) != 0) { /* can't set realtime priority */ /* * Realtime priority has to be restricted for reasons which should be * obvious. However, for idle priority, there is a potential for * system deadlock if an idleprio process gains a lock on a resource * that other processes need (and the idleprio process can't run * due to a CPU-bound normal process). Fix me! XXX */ #if 0 if (RTP_PRIO_IS_REALTIME(rtp.type)) { #else if (rtp.type != RTP_PRIO_NORMAL) { #endif error = EPERM; break; } } mtx_lock_spin(&sched_lock); if (uap->lwpid == 0 || uap->lwpid == td->td_tid) td1 = td; else td1 = thread_find(p, uap->lwpid); if (td1 != NULL) error = rtp_to_pri(&rtp, td1); else error = ESRCH; mtx_unlock_spin(&sched_lock); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } /* * Set realtime priority. - * - * MPSAFE */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif int rtprio(td, uap) struct thread *td; /* curthread */ register struct rtprio_args *uap; { struct proc *curp; struct proc *p; struct thread *tdp; struct rtprio rtp; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; curp = td->td_proc; if (uap->pid == 0) { p = curp; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; mtx_lock_spin(&sched_lock); /* * Return OUR priority if no pid specified, * or if one is, report the highest priority * in the process. There isn't much more you can do as * there is only room to return a single priority. * XXXKSE: maybe need a new interface to report * priorities of multiple system scope threads. * Note: specifying our own pid is not the same * as leaving it zero. 
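 * (That is: pid == 0 reports only the calling thread's priority,
 * while pid == getpid() scans every thread in the process and
 * reports the best priority found.)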
*/ if (uap->pid == 0) { pri_to_rtp(td, &rtp); } else { struct rtprio rtp2; rtp.type = RTP_PRIO_IDLE; rtp.prio = RTP_PRIO_MAX; FOREACH_THREAD_IN_PROC(p, tdp) { pri_to_rtp(tdp, &rtp2); if (rtp2.type < rtp.type || (rtp2.type == rtp.type && rtp2.prio < rtp.prio)) { rtp.type = rtp2.type; rtp.prio = rtp2.prio; } } } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ if (priv_check(td, PRIV_SCHED_RTPRIO) != 0) { /* can't set someone else's */ if (uap->pid) { error = EPERM; break; } /* can't set realtime priority */ /* * Realtime priority has to be restricted for reasons which should be * obvious. However, for idle priority, there is a potential for * system deadlock if an idleprio process gains a lock on a resource * that other processes need (and the idleprio process can't run * due to a CPU-bound normal process). Fix me! XXX */ #if 0 if (RTP_PRIO_IS_REALTIME(rtp.type)) { #else if (rtp.type != RTP_PRIO_NORMAL) { #endif error = EPERM; break; } } /* * If we are setting our own priority, set just our * thread but if we are doing another process, * do all the threads on that process. If we * specify our own pid we do the latter. */ mtx_lock_spin(&sched_lock); if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { FOREACH_THREAD_IN_PROC(p, td) { if ((error = rtp_to_pri(&rtp, td)) != 0) break; } } mtx_unlock_spin(&sched_lock); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } int rtp_to_pri(struct rtprio *rtp, struct thread *td) { u_char newpri; mtx_assert(&sched_lock, MA_OWNED); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: newpri = PRI_MIN_REALTIME + rtp->prio; break; case RTP_PRIO_NORMAL: newpri = PRI_MIN_TIMESHARE + rtp->prio; break; case RTP_PRIO_IDLE: newpri = PRI_MIN_IDLE + rtp->prio; break; default: return (EINVAL); } sched_class(td, rtp->type); /* XXX fix */ sched_user_prio(td, newpri); if (curthread == td) sched_prio(curthread, td->td_user_pri); /* XXX dubious */ return (0); } void pri_to_rtp(struct thread *td, struct rtprio *rtp) { mtx_assert(&sched_lock, MA_OWNED); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; break; case PRI_TIMESHARE: rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE; break; case PRI_IDLE: rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE; break; default: break; } rtp->type = td->td_pri_class; } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif -/* - * MPSAFE - */ int osetrlimit(td, uap) struct thread *td; register struct osetrlimit_args *uap; { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; error = kern_setrlimit(td, uap->which, &lim); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif -/* - * MPSAFE - */ int ogetrlimit(td, uap) struct thread *td; register struct ogetrlimit_args *uap; { struct orlimit olim; struct rlimit rl; struct proc *p; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); p = td->td_proc; PROC_LOCK(p); lim_rlimit(p, uap->which, &rl); PROC_UNLOCK(p); /* * XXX would be more correct to convert only RLIM_INFINITY to the * old RLIM_INFINITY 
and fail with EOVERFLOW for other larger * values. Most 64->32 and 32->16 conversions, including not * unimportant ones of uids are even more broken than what we * do here (they blindly truncate). We don't do this correctly * here since we have little experience with EOVERFLOW yet. * Elsewhere, getuid() can't fail... */ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur; olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max; error = copyout(&olim, uap->rlp, sizeof(olim)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif -/* - * MPSAFE - */ int setrlimit(td, uap) struct thread *td; register struct __setrlimit_args *uap; { struct rlimit alim; int error; if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit)))) return (error); error = kern_setrlimit(td, uap->which, &alim); return (error); } int kern_setrlimit(td, which, limp) struct thread *td; u_int which; struct rlimit *limp; { struct plimit *newlim, *oldlim; struct proc *p; register struct rlimit *alimp; rlim_t oldssiz; int error; if (which >= RLIM_NLIMITS) return (EINVAL); /* * Preserve historical bugs by treating negative limits as unsigned. */ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; oldssiz = 0; p = td->td_proc; newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; alimp = &oldlim->pl_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = priv_check_cred(td->td_ucred, PRIV_PROC_SETRLIMIT, SUSER_ALLOWJAIL))) { PROC_UNLOCK(p); lim_free(newlim); return (error); } if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; lim_copy(newlim, oldlim); alimp = &newlim->pl_rlimit[which]; switch (which) { case RLIMIT_CPU: mtx_lock_spin(&sched_lock); p->p_cpulimit = limp->rlim_cur; mtx_unlock_spin(&sched_lock); break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) limp->rlim_cur = maxdsiz; if (limp->rlim_max > maxdsiz) limp->rlim_max = maxdsiz; break; case RLIMIT_STACK: if (limp->rlim_cur > maxssiz) limp->rlim_cur = maxssiz; if (limp->rlim_max > maxssiz) limp->rlim_max = maxssiz; oldssiz = alimp->rlim_cur; break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; if (limp->rlim_cur < 1) limp->rlim_cur = 1; if (limp->rlim_max < 1) limp->rlim_max = 1; break; } *alimp = *limp; p->p_limit = newlim; PROC_UNLOCK(p); lim_free(oldlim); if (which == RLIMIT_STACK) { /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. */ if (limp->rlim_cur != oldssiz) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > oldssiz) { prot = p->p_sysent->sv_stackprot; size = limp->rlim_cur - oldssiz; addr = p->p_sysent->sv_usrstack - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = oldssiz - limp->rlim_cur; addr = p->p_sysent->sv_usrstack - oldssiz; } addr = trunc_page(addr); size = round_page(size); (void)vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE); } } /* * The data size limit may need to be changed to a value * that makes sense for the 32 bit binary. 
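 * (sv_fixlimits is a per-ABI hook; the assumption here is that a
 * 32-bit compat sysentvec uses it to clamp limits such as
 * RLIMIT_DATA to what a 32-bit address space can express.)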
*/ if (p->p_sysent->sv_fixlimits != NULL) p->p_sysent->sv_fixlimits(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getrlimit(td, uap) struct thread *td; register struct __getrlimit_args *uap; { struct rlimit rlim; struct proc *p; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); p = td->td_proc; PROC_LOCK(p); lim_rlimit(p, uap->which, &rlim); PROC_UNLOCK(p); error = copyout(&rlim, uap->rlp, sizeof(struct rlimit)); return (error); } /* * Transform the running time and tick information for children of proc p * into user and system time usage. */ void calccru(p, up, sp) struct proc *p; struct timeval *up; struct timeval *sp; { PROC_LOCK_ASSERT(p, MA_OWNED); calcru1(p, &p->p_crux, up, sp); } /* * Transform the running time and tick information in proc p into user * and system time usage. If appropriate, include the current time slice * on this CPU. */ void calcru(struct proc *p, struct timeval *up, struct timeval *sp) { struct rusage_ext rux; struct thread *td; uint64_t u; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_lock_spin(&sched_lock); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ if (curthread->td_proc == p) { td = curthread; u = cpu_ticks(); p->p_rux.rux_runtime += u - PCPU_GET(switchtime); PCPU_SET(switchtime, u); p->p_rux.rux_uticks += td->td_uticks; td->td_uticks = 0; p->p_rux.rux_iticks += td->td_iticks; td->td_iticks = 0; p->p_rux.rux_sticks += td->td_sticks; td->td_sticks = 0; } /* Work on a copy of p_rux so we can let go of sched_lock */ rux = p->p_rux; mtx_unlock_spin(&sched_lock); calcru1(p, &rux, up, sp); /* Update the result from the p_rux copy */ p->p_rux.rux_uu = rux.rux_uu; p->p_rux.rux_su = rux.rux_su; p->p_rux.rux_tu = rux.rux_tu; } static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp) { /* {user, system, interrupt, total} {ticks, usec}: */ u_int64_t ut, uu, st, su, it, tt, tu; ut = ruxp->rux_uticks; st = ruxp->rux_sticks; it = ruxp->rux_iticks; tt = ut + st + it; if (tt == 0) { /* Avoid divide by zero */ st = 1; tt = 1; } tu = cputick2usec(ruxp->rux_runtime); if ((int64_t)tu < 0) { /* XXX: this should be an assert /phk */ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n", (intmax_t)tu, p->p_pid, p->p_comm); tu = ruxp->rux_tu; } if (tu >= ruxp->rux_tu) { /* * The normal case, time increased. * Enforce monotonicity of bucketed numbers. */ uu = (tu * ut) / tt; if (uu < ruxp->rux_uu) uu = ruxp->rux_uu; su = (tu * st) / tt; if (su < ruxp->rux_su) su = ruxp->rux_su; } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) { /* * When we calibrate the cputicker, it is not uncommon to * see the presumably fixed frequency increase slightly over * time as a result of thermal stabilization and NTP * discipline (of the reference clock). We therefore ignore * a bit of backwards slop because we expect to catch up * shortly. We use a 3 microsecond limit to catch low * counts and a 1% limit for high counts. */ uu = ruxp->rux_uu; su = ruxp->rux_su; tu = ruxp->rux_tu; } else { /* tu < ruxp->rux_tu */ /* * What happened here was likely that a laptop, which ran at * a reduced clock frequency at boot, kicked into high gear.
* The wisdom of spamming this message in that case is * dubious, but it might also be indicative of something * serious, so let's keep it and hope laptops can be made * more truthful about their CPU speed via ACPI. */ printf("calcru: runtime went backwards from %ju usec " "to %ju usec for pid %d (%s)\n", (uintmax_t)ruxp->rux_tu, (uintmax_t)tu, p->p_pid, p->p_comm); uu = (tu * ut) / tt; su = (tu * st) / tt; } ruxp->rux_uu = uu; ruxp->rux_su = su; ruxp->rux_tu = tu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif -/* - * MPSAFE - */ int getrusage(td, uap) register struct thread *td; register struct getrusage_args *uap; { struct rusage ru; int error; error = kern_getrusage(td, uap->who, &ru); if (error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_getrusage(td, who, rup) struct thread *td; int who; struct rusage *rup; { struct proc *p; p = td->td_proc; PROC_LOCK(p); switch (who) { case RUSAGE_SELF: *rup = p->p_stats->p_ru; calcru(p, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_CHILDREN: *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); break; default: PROC_UNLOCK(p); return (EINVAL); } PROC_UNLOCK(p); return (0); } void ruadd(ru, rux, ru2, rux2) struct rusage *ru; struct rusage_ext *rux; struct rusage *ru2; struct rusage_ext *rux2; { register long *ip, *ip2; register int i; rux->rux_runtime += rux2->rux_runtime; rux->rux_uticks += rux2->rux_uticks; rux->rux_sticks += rux2->rux_sticks; rux->rux_iticks += rux2->rux_iticks; rux->rux_uu += rux2->rux_uu; rux->rux_su += rux2->rux_su; rux->rux_tu += rux2->rux_tu; if (ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } /* * Allocate a new resource limits structure and initialize its * reference count and mutex pointer. */ struct plimit * lim_alloc() { struct plimit *limp; limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK); refcount_init(&limp->pl_refcnt, 1); return (limp); } struct plimit * lim_hold(limp) struct plimit *limp; { refcount_acquire(&limp->pl_refcnt); return (limp); } void lim_free(limp) struct plimit *limp; { KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow")); if (refcount_release(&limp->pl_refcnt)) free((void *)limp, M_PLIMIT); } /* * Make a copy of the plimit structure. * We share these structures copy-on-write after fork. */ void lim_copy(dst, src) struct plimit *dst, *src; { KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit")); bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit)); } /* * Return the hard limit for a particular system resource. The * which parameter specifies the index into the rlimit array. */ rlim_t lim_max(struct proc *p, int which) { struct rlimit rl; lim_rlimit(p, which, &rl); return (rl.rlim_max); } /* * Return the current (soft) limit for a particular system resource. * The which parameter specifies the index into the rlimit array. */ rlim_t lim_cur(struct proc *p, int which) { struct rlimit rl; lim_rlimit(p, which, &rl); return (rl.rlim_cur); } /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'.
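 * The caller must hold the process lock; the pattern used by
 * getrlimit() above is:
 *
 *	PROC_LOCK(p);
 *	lim_rlimit(p, uap->which, &rlim);
 *	PROC_UNLOCK(p);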
*/ void lim_rlimit(struct proc *p, int which, struct rlimit *rlp) { PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = p->p_limit->pl_rlimit[which]; } /* * Find the uidinfo structure for a uid. This structure is used to * track the total resource consumption (process count, socket buffer * size, etc.) for the uid and impose limits. */ void uihashinit() { uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF); } /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_mtx must be locked. */ static struct uidinfo * uilookup(uid) uid_t uid; { struct uihashhead *uipp; struct uidinfo *uip; mtx_assert(&uihashtbl_mtx, MA_OWNED); uipp = UIHASH(uid); LIST_FOREACH(uip, uipp, ui_hash) if (uip->ui_uid == uid) break; return (uip); } /* * Find or allocate a struct uidinfo for a particular uid. * Increase refcount on uidinfo struct returned. * uifree() should be called on a struct uidinfo when released. */ struct uidinfo * uifind(uid) uid_t uid; { struct uidinfo *old_uip, *uip; mtx_lock(&uihashtbl_mtx); uip = uilookup(uid); if (uip == NULL) { mtx_unlock(&uihashtbl_mtx); uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO); mtx_lock(&uihashtbl_mtx); /* * There's a chance someone created our uidinfo while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate uidinfo. */ if ((old_uip = uilookup(uid)) != NULL) { /* Someone else beat us to it. */ free(uip, M_UIDINFO); uip = old_uip; } else { uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep); uip->ui_uid = uid; LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash); } } uihold(uip); mtx_unlock(&uihashtbl_mtx); return (uip); } /* * Place another refcount on a uidinfo struct. */ void uihold(uip) struct uidinfo *uip; { UIDINFO_LOCK(uip); uip->ui_ref++; UIDINFO_UNLOCK(uip); } /*- * Since uidinfo structs have a long lifetime, we use an * opportunistic refcounting scheme to avoid locking the lookup hash * for each release. * * If the refcount hits 0, we need to free the structure, * which means we need to lock the hash. * Optimal case: * After locking the struct and lowering the refcount, if we find * that we don't need to free, simply unlock and return. * Suboptimal case: * If refcount lowering results in a need to free, bump the count * back up, lose the lock and acquire the locks in the proper * order to try again. */ void uifree(uip) struct uidinfo *uip; { /* Prepare for optimal case. */ UIDINFO_LOCK(uip); if (--uip->ui_ref != 0) { UIDINFO_UNLOCK(uip); return; } /* Prepare for suboptimal case. */ uip->ui_ref++; UIDINFO_UNLOCK(uip); mtx_lock(&uihashtbl_mtx); UIDINFO_LOCK(uip); /* * We must subtract one from the count again because we backed out * our initial subtraction before dropping the lock. * Since another thread may have added a reference after we dropped the * initial lock we have to test for zero again. */ if (--uip->ui_ref == 0) { LIST_REMOVE(uip, ui_hash); mtx_unlock(&uihashtbl_mtx); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %jd\n", uip->ui_uid, (intmax_t)uip->ui_sbsize); if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); UIDINFO_UNLOCK(uip); FREE(uip, M_UIDINFO); return; } mtx_unlock(&uihashtbl_mtx); UIDINFO_UNLOCK(uip); } /* * Change the count associated with the number of processes * a given user is using.
When 'max' is 0, don't enforce a limit */ int chgproccnt(uip, diff, max) struct uidinfo *uip; int diff; int max; { UIDINFO_LOCK(uip); /* Don't allow them to exceed max, but allow subtraction. */ if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) { UIDINFO_UNLOCK(uip); return (0); } uip->ui_proccnt += diff; if (uip->ui_proccnt < 0) printf("negative proccnt for uid = %d\n", uip->ui_uid); UIDINFO_UNLOCK(uip); return (1); } /* * Change the total socket buffer size a user has used. */ int chgsbsize(uip, hiwat, to, max) struct uidinfo *uip; u_int *hiwat; u_int to; rlim_t max; { rlim_t new; UIDINFO_LOCK(uip); new = uip->ui_sbsize + to - *hiwat; /* Don't allow them to exceed max, but allow subtraction. */ if (to > *hiwat && new > max) { UIDINFO_UNLOCK(uip); return (0); } uip->ui_sbsize = new; UIDINFO_UNLOCK(uip); *hiwat = to; if (new < 0) printf("negative sbsize for uid = %d\n", uip->ui_uid); return (1); } Index: head/sys/kern/kern_shutdown.c =================================================================== --- head/sys/kern/kern_shutdown.c (revision 167210) +++ head/sys/kern/kern_shutdown.c (revision 167211) @@ -1,645 +1,641 @@ /*- * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_kdb.h" #include "opt_mac.h" #include "opt_panic.h" #include "opt_show_busybufs.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* smp_active */ #include #include #include #include #include #include #include #include #include #include #include #include #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW, &debugger_on_panic, 0, "Run debugger on kernel panic"); #ifdef KDB_TRACE int trace_on_panic = 1; #else int trace_on_panic = 0; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RW, &trace_on_panic, 0, "Print stack trace on kernel panic"); #endif /* KDB */ int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr; int dumping; /* system is dumping */ int rebooting; /* system is rebooting */ static struct dumperinfo dumper; /* our selected dumper */ /* Context information for dump-debuggers. */ static struct pcb dumppcb; /* Registers. */ static lwpid_t dumptid; /* Thread ID. */ static void boot(int) __dead2; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL) /* - * The system call that results in a reboot - * - * MPSAFE + * The system call that results in a reboot. */ /* ARGSUSED */ int reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_check_system_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { mtx_lock(&Giant); boot(uap->opt); mtx_unlock(&Giant); } return (error); } /* * Called by events that want to shut down.. 
e.g. on a PC */ static int shutdown_howto = 0; void shutdown_nice(int howto) { shutdown_howto = howto; /* Send a signal to init(8) and have it shut down the world */ if (initproc != NULL) { PROC_LOCK(initproc); psignal(initproc, SIGINT); PROC_UNLOCK(initproc); } else { /* No init(8) running, so simply reboot */ boot(RB_NOSYNC); } return; } static int waittime = -1; static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } static void doadump(void) { /* * Sometimes people have to call this from the kernel debugger. * (if 'panic' cannot dump) * Give them a clue as to why they can't dump. */ if (dumper.dumper == NULL) { printf("Cannot dump. No dump device defined.\n"); return; } savectx(&dumppcb); dumptid = curthread->td_tid; dumping++; dumpsys(&dumper); } static int isbufbusy(struct buf *bp) { if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 && BUF_REFCNT(bp) > 0) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ static void boot(int howto) { static int first_buf_printf = 1; #if defined(SMP) /* * Bind us to CPU 0 so that all shutdown code runs there. Some * systems don't shut down properly (i.e., ACPI power off) if we * run on another processor. */ mtx_lock_spin(&sched_lock); sched_bind(curthread, 0); mtx_unlock_spin(&sched_lock); KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0")); #endif /* We're in the process of rebooting. */ rebooting = 1; /* collect extra flags that shutdown_nice might have set */ howto |= shutdown_howto; /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { register struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif waittime = 0; sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; sync(curthread, NULL); #ifdef PREEMPTION /* * Drop Giant and spin for a while to allow * interrupt threads to run. */ DROP_GIANT(); DELAY(50000 * iter); PICKUP_GIANT(); #else /* * Drop Giant and context switch several times to * allow interrupt threads to run. */ DROP_GIANT(); for (subiter = 0; subiter < 50 * iter; subiter++) { mtx_lock_spin(&sched_lock); mi_switch(SW_VOL, NULL); mtx_unlock_spin(&sched_lock); DELAY(1000); } PICKUP_GIANT(); #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus.
We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; #if defined(SHOW_BUSYBUFS) || defined(DIAGNOSTIC) printf( "%d: bufobj:%p, flags:%0x, blkno:%ld, lblkno:%ld\n", nbusy, bp->b_bufobj, bp->b_flags, (long)bp->b_blkno, (long)bp->b_lblkno); #endif } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == 0) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } print_uptime(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); /* XXX This doesn't disable interrupts any more. Reconsider? */ splhigh(); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(); /* Now that we're going to really halt the system... */ EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: howto &= ~RB_HALT; break; } } } /* * Check to see if the system paniced, pause and then reboot * according to the specified delay. */ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (PANIC_REBOOT_WAIT_TIME != 0) { if (PANIC_REBOOT_WAIT_TIME != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", PANIC_REBOOT_WAIT_TIME); for (loop = PANIC_REBOOT_WAIT_TIME * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* cpu_boot(howto); */ /* doesn't do anything at the moment */ cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #ifdef SMP static u_int panic_cpu = NOCPU; #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. - * - * MPSAFE */ void panic(const char *fmt, ...) { struct thread *td = curthread; int bootopt, newpanic; va_list ap; static char buf[256]; #ifdef SMP /* * We don't want multiple CPU's to panic at the same time, so we * use panic_cpu as a simple spinlock. We have to keep checking * panic_cpu if we are spinning in case the panic on the first * CPU is canceled. 
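 *
 * This is, in effect, a test-and-test-and-set lock: the inner
 * read-only loop spins without issuing atomic operations while
 * another CPU owns panic_cpu, and the CAS below is retried only
 * once the owner releases it (which can happen when a
 * RESTARTABLE_PANICS panic is aborted and the owner stores NOCPU
 * back with atomic_store_rel_int()).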
*/ if (panic_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&panic_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (panic_cpu != NOCPU) ; /* nothing */ #endif bootopt = RB_AUTOBOOT | RB_DUMP; newpanic = 0; if (panicstr) bootopt |= RB_NOSYNC; else { panicstr = fmt; newpanic = 1; } va_start(ap, fmt); if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } va_end(ap); #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif #ifdef KDB if (newpanic && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter("panic"); #ifdef RESTARTABLE_PANICS /* See if the user aborted the panic, in which case we continue. */ if (panicstr == NULL) { #ifdef SMP atomic_store_rel_int(&panic_cpu, NOCPU); #endif return; } #endif #endif mtx_lock_spin(&sched_lock); td->td_flags |= TDF_INPANIC; mtx_unlock_spin(&sched_lock); if (!sync_on_panic) bootopt |= RB_NOSYNC; boot(bootopt); } /* * Support for poweroff delay. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, ""); static void poweroff_wait(void *junk, int howto) { if (!(howto & RB_POWEROFF) || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. */ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, ""); void kproc_shutdown(void *arg, int howto) { struct proc *p; char procname[MAXCOMLEN + 1]; int error; if (panicstr) return; p = (struct proc *)arg; strlcpy(procname, p->p_comm, sizeof(procname)); printf("Waiting (max %d seconds) for system process `%s' to stop...", kproc_shutdown_wait, procname); error = kthread_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } /* Registration of dumpers */ int set_dumper(struct dumperinfo *di) { if (di == NULL) { bzero(&dumper, sizeof dumper); return (0); } if (dumper.dumper != NULL) return (EBUSY); dumper = *di; return (0); } #if defined(__powerpc__) void dumpsys(struct dumperinfo *di __unused) { printf("Kernel dumps not implemented on this architecture\n"); } #endif Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 167210) +++ head/sys/kern/kern_sig.c (revision 167211) @@ -1,3366 +1,3299 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ static int coredump(struct thread *); static char *expand_name(const char *, uid_t, pid_t); static int killpg1(struct thread *td, int sig, int pgid, int all); static int issignal(struct thread *p); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static void sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); #ifdef KSE static int do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *); #endif static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo); SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals that overflowed"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals
failed to be allocated"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ #define SA_PROC 0x80 /* deliverable to any thread */ static int sigproptbl[NSIG] = { SA_KILL|SA_PROC, /* SIGHUP */ SA_KILL|SA_PROC, /* SIGINT */ SA_KILL|SA_CORE|SA_PROC, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE|SA_PROC, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL|SA_PROC, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL|SA_PROC, /* SIGPIPE */ SA_KILL|SA_PROC, /* SIGALRM */ SA_KILL|SA_PROC, /* SIGTERM */ SA_IGNORE|SA_PROC, /* SIGURG */ SA_STOP|SA_PROC, /* SIGSTOP */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTSTP */ SA_IGNORE|SA_CONT|SA_PROC, /* SIGCONT */ SA_IGNORE|SA_PROC, /* SIGCHLD */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTIN */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTOU */ SA_IGNORE|SA_PROC, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL|SA_PROC, /* SIGVTALRM */ SA_KILL|SA_PROC, /* SIGPROF */ SA_IGNORE|SA_PROC, /* SIGWINCH */ SA_IGNORE|SA_PROC, /* SIGINFO */ SA_KILL|SA_PROC, /* SIGUSR1 */ SA_KILL|SA_PROC, /* SIGUSR2 */ }; static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. 
* Return: * 0 - signal not found * others - signal number */ int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_kill, signo)) { count++; SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if ((si->ksi_flags & KSI_TRAP) != 0) { if (ret != 0) SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } if (ret != 0) return (ret); out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); } void sigqueue_collect_set(sigqueue_t *sq, sigset_t *set) { ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); TAILQ_FOREACH(ksi, &sq->sq_list, ksi_link) SIGADDSET(*set, ksi->ksi_signo); SIGSETOR(*set, sq->sq_kill); } void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp) { sigset_t tmp, set; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); /* * make a copy, this allows setp to point to src or dst * sq_signals without trouble. 
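 *
 * For illustration (hypothetical caller, not taken from this file),
 * the copy is what makes a call such as
 *
 *	sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue,
 *	    &p->p_sigqueue.sq_signals);
 *
 * safe: the SIGSETNAND() on src->sq_signals below would otherwise
 * clear bits out of *setp while it is still being consulted.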
*/ set = *setp; p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_signals; SIGSETAND(tmp, set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); /* Finally, rescan src queue and set pending bits for it */ sigqueue_collect_set(src, &src->sq_signals); } void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } void sigqueue_delete_set(sigqueue_t *sq, sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_signals, *set); /* Finally, rescan queue and set pending bits for it */ sigqueue_collect_set(sq, &sq->sq_signals); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ void sigqueue_delete_set_proc(struct proc *p, sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); mtx_unlock_spin(&sched_lock); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine the signal that should be delivered to process p (the current * process), or 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). * * MP SAFE. */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); mtx_assert(&sched_lock, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { struct proc *p; #ifdef KSE sigset_t set, saved; #else sigset_t set; #endif p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If our mask changed we may have to move signals that were * previously masked by all threads to our sigqueue. */ set = p->p_sigqueue.sq_signals; #ifdef KSE if (p->p_flag & P_SA) saved = p->p_sigqueue.sq_signals; #endif SIGSETNAND(set, td->td_sigmask); if (!
SIGISEMPTY(set)) sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set); if (SIGPENDING(td)) { mtx_lock_spin(&sched_lock); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; mtx_unlock_spin(&sched_lock); } #ifdef KSE if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) { /* pending set changed */ p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } } #endif } int sigonstack(size_t sp) { struct thread *td = curthread; return ((td->td_pflags & TDP_ALTSTACK) ? #if defined(COMPAT_43) ((td->td_sigstk.ss_size == 0) ? (td->td_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)) #else ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < NSIG) return (sigproptbl[_SIG_IDX(sig)]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction - * - * MPSAFE */ int kern_sigaction(td, sig, act, oact, flags) struct thread *td; register int sig; struct sigaction *act, *oact; int flags; { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; oact->sa_flags = 0; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) oact->sa_flags |= SA_SIGINFO; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (act->sa_flags & SA_SIGINFO) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!(act->sa_flags & SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (act->sa_flags & SA_ONSTACK) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (act->sa_flags & SA_RESETHAND) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (act->sa_flags & SA_NODEFER) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. 
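 *
 * For reference, the userland feature being guarded looks like
 * this (hypothetical daemon code, not part of this file); with it,
 * children are reaped automatically instead of lingering as
 * zombies:
 *
 *	struct sigaction sa;
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_handler = SIG_DFL;
 *	sa.sa_flags = SA_NOCLDWAIT;
 *	sigaction(SIGCHLD, &sa, NULL);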
*/ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { #ifdef KSE if ((p->p_flag & P_SA) && SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) { p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } #endif /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif -/* - * MPSAFE - */ int sigaction(td, uap) struct thread *td; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif -/* - * MPSAFE - */ int freebsd4_sigaction(td, uap) struct thread *td; register struct freebsd4_sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMPAT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif -/* - * MPSAFE - */ int osigaction(td, uap) struct thread *td; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ?
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(td, uap) struct thread *td; struct osigreturn_args *uap; { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(ps->ps_sigignore, i); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); SIGDELSET(ps->ps_sigcatch, sig); if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); sigqueue_delete_proc(p, sig); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(td, how, set, oset, old) struct thread *td; int how; sigset_t *set, *oset; int old; { int error; PROC_LOCK(td->td_proc); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); SIGSETOR(td->td_sigmask, *set); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); break; case SIG_SETMASK: SIG_CANTMASK(*set); if (old) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; signotify(td); break; default: error = EINVAL; break; } } PROC_UNLOCK(td->td_proc); return (error); } /* * sigprocmask() - MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sigprocmask(td, uap) register struct thread *td; struct sigprocmask_args *uap; { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? 
&oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * osigprocmask() - MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(td, uap) register struct thread *td; struct osigprocmask_args *uap; { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ -/* - * MPSAFE - */ int sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } -/* - * MPSAFE - */ + int sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } -/* - * MPSAFE - */ int sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t savedmask; struct proc *p; int error, sig, hz, i, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; sig = 0; SIG_CANTMASK(waitset); PROC_LOCK(p); ps = p->p_sigacts; savedmask = td->td_sigmask; if (timeout) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); ets = rts; timespecadd(&ets, timeout); } } restart: for (i = 1; i <= _SIG_MAXSIG; ++i) { if (!SIGISMEMBER(waitset, i)) continue; if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) { if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) { #ifdef KSE if (p->p_flag & P_SA) { p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } #endif sigqueue_move(&p->p_sigqueue, &td->td_sigqueue, i); } else continue; } SIGFILLSET(td->td_sigmask); SIG_CANTMASK(td->td_sigmask); SIGDELSET(td->td_sigmask, i); mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); if (sig) goto out; else { /* * Because cursig() may have stopped current thread, * after it is resumed, things may have already been * changed, it should rescan any pending signals. */ goto restart; } } if (error) goto out; /* * POSIX says this must be checked after looking for pending * signals. 
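 *
 * The ordering is visible from userland: if a signal in the wait
 * set is already pending, sigtimedwait() returns it even when the
 * timeout is malformed; EINVAL surfaces only once nothing is
 * pending.  A zero timeout therefore acts as a poll (illustrative
 * userland sketch):
 *
 *	struct timespec zero = { 0, 0 };
 *	sig = sigtimedwait(&set, &info, &zero);
 *
 * which fails with EAGAIN when nothing is pending.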
*/ if (timeout) { if (!timevalid) { error = EINVAL; goto out; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; goto out; } ts = ets; timespecsub(&ts, &rts); TIMESPEC_TO_TIMEVAL(&tv, &ts); hz = tvtohz(&tv); } else hz = 0; td->td_sigmask = savedmask; SIGSETNAND(td->td_sigmask, waitset); signotify(td); error = msleep(&ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", hz); if (timeout) { if (error == ERESTART) { /* A timeout cannot be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* We will recalculate the timeout ourselves. */ error = 0; } } goto restart; out: td->td_sigmask = savedmask; signotify(td); if (sig) { ksiginfo_init(ksi); sigqueue_get(&td->td_sigqueue, sig, ksi); ksi->ksi_signo = sig; if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); error = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, 0); } #endif if (sig == SIGKILL) sigexit(td, sig); } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif -/* - * MPSAFE - */ int sigpending(td, uap) struct thread *td; struct sigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif -/* - * MPSAFE - */ int osigpending(td, uap) struct thread *td; struct osigpending_args *uap; { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int osigvec(td, uap) struct thread *td; register struct osigvec_args *uap; { struct sigvec vec; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ?
&osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif -/* - * MPSAFE - */ int osigblock(td, uap) register struct thread *td; struct osigblock_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETOR(td->td_sigmask, set); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif -/* - * MPSAFE - */ int osigsetmask(td, uap) struct thread *td; struct osigsetmask_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETLO(td->td_sigmask, set); signotify(td); PROC_UNLOCK(p); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set * in the meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int sigsuspend(td, uap) struct thread *td; struct sigsuspend_args *uap; { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; SIG_CANTMASK(mask); td->td_sigmask = mask; signotify(td); while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int osigsuspend(td, uap) struct thread *td; struct osigsuspend_args *uap; { struct proc *p = td->td_proc; sigset_t mask; PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; OSIG2SIG(uap->mask, mask); SIG_CANTMASK(mask); SIGSETLO(td->td_sigmask, mask); signotify(td); while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... 
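 * (restarting the syscall would simply re-enter the wait and defeat
 * the purpose of the call).  The classic race-free wait that this
 * enables in userland, sketched with a hypothetical flag set by the
 * signal handler:
 *
 *	sigprocmask(SIG_BLOCK, &mask, &omask);
 *	while (!got_signal)
 *		sigsuspend(&omask);
 *	sigprocmask(SIG_SETMASK, &omask, NULL);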
*/ return (EINTR); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int osigstack(td, uap) struct thread *td; register struct osigstack_args *uap; { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int sigaltstack(td, uap) struct thread *td; register struct sigaltstack_args *uap; { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(td, sig, pgid, all) register struct thread *td; int sig, pgid, all; { register struct proc *p; struct pgrp *pgrp; int nfound = 0; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW ) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (nfound ? 
0 : ESRCH); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int kill(td, uap) register struct thread *td; register struct kill_args *uap; { register struct proc *p; int error; AUDIT_ARG(signum, uap->signum); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); if (uap->pid > 0) { /* kill single process */ if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } AUDIT_ARG(process, p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } AUDIT_ARG(pid, uap->pid); switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0)); } /* NOTREACHED */ } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int okillpg(td, uap) struct thread *td; register struct okillpg_args *uap; { AUDIT_ARG(signum, uap->signum); AUDIT_ARG(pid, uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); return (killpg1(td, uap->signum, uap->pgid, 0)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sigqueue(struct thread *td, struct sigqueue_args *uap) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (uap->pid <= 0) return (EINVAL); if ((p = pfind(uap->pid)) == NULL) { if ((p = zpfind(uap->pid)) == NULL) return (ESRCH); } error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum != 0) { ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value.sival_ptr = uap->value; error = tdsignal(p, NULL, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(pgid, sig) int pgid, sig; { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(pgrp, sig, checkctty) struct pgrp *pgrp; int sig, checkctty; { register struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (checkctty == 0 || p->p_flag & P_CONTROLT) psignal(p, sig); PROC_UNLOCK(p); } } } /* - * Send a signal caused by a trap to the current thread. - * If it will be caught immediately, deliver it with correct code. - * Otherwise, post it normally. - * - * MPSAFE + * Send a signal caused by a trap to the current thread. If it will be + * caught immediately, deliver it with correct code. Otherwise, post it + * normally. 
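 *
 * The "correct code" reaches a catching process via SA_SIGINFO; an
 * illustrative userland handler (hypothetical, not part of this
 * file) would see, e.g., FPE_INTDIV for an integer divide fault:
 *
 *	static void
 *	handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		if (si->si_code == FPE_INTDIV)
 *			_exit(1);
 *	}
 *
 * installed with sa.sa_flags = SA_SIGINFO and sigaction(SIGFPE, ...).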
*/ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; #ifdef KSE int error; #endif int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); #ifdef KSE if (td->td_pflags & TDP_SA) { if (td->td_mailbox == NULL) thread_user_enter(td); PROC_LOCK(p); SIGDELSET(td->td_sigmask, sig); mtx_lock_spin(&sched_lock); /* * Force scheduling an upcall, so UTS has a chance to * process the signal before the thread runs again in * userland. */ if (td->td_upcall) td->td_upcall->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); } else { PROC_LOCK(p); } #else PROC_LOCK(p); #endif ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { p->p_stats->p_ru.ru_nsignals++; #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif #ifdef KSE if (!(td->td_pflags & TDP_SA)) (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); #else (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); #endif #ifdef KSE else if (td->td_mailbox == NULL) { mtx_unlock(&ps->ps_mtx); /* UTS caused a sync signal */ p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ sigexit(td, sig); } else { mtx_unlock(&ps->ps_mtx); SIGADDSET(td->td_sigmask, sig); PROC_UNLOCK(p); error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig, sizeof(siginfo_t)); PROC_LOCK(p); /* UTS memory corrupted */ if (error) sigexit(td, SIGSEGV); mtx_lock(&ps->ps_mtx); } #endif SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread is * masking the signal or the process is ignoring it. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if the current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); mtx_unlock_spin(&sched_lock); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it.
* o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. - * - * MPSAFE */ void psignal(struct proc *p, int sig) { (void) tdsignal(p, NULL, sig, NULL); } int psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td = NULL; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue")); /* * ksi_code and other fields should be set before * calling this function. */ ksi->ksi_signo = sigev->sigev_signo; ksi->ksi_value = sigev->sigev_value; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = thread_find(p, sigev->sigev_notify_thread_id); if (td == NULL) return (ESRCH); } return (tdsignal(p, td, ksi->ksi_signo, ksi)); } -/* - * MPSAFE - */ int tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { #ifdef KSE sigset_t saved; int ret; if (p->p_flag & P_SA) saved = p->p_sigqueue.sq_signals; ret = do_tdsignal(p, td, sig, ksi); if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) { if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) { /* pending set changed */ p->p_flag |= P_SIGEVENT; wakeup(&p->p_siglist); } } return (ret); } static int do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { #endif sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) #ifdef KSE panic("do_tdsignal(): invalid signal %d", sig); #else panic("tdsignal(): invalid signal %d", sig); #endif #ifdef KSE KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue")); #else KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue")); #endif /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); /* * If the signal is blocked and not destined for this thread, then * assign it to the process so that we can find it later in the first * thread that unblocks it. Otherwise, assign it to this thread now. */ if (td == NULL) { td = sigtd(p, sig, prop); if (SIGISMEMBER(td->td_sigmask, sig)) sigqueue = &p->p_sigqueue; else sigqueue = &td->td_sigqueue; } else { KASSERT(td->td_proc == p, ("invalid thread")); sigqueue = &td->td_sigqueue; } /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SA_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SA_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. 
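 *
 * ("Orphaned" means pg_jobc == 0: no member of the group has a
 * parent in a different group of the same session, so no job
 * control shell remains that could ever send the SIGCONT needed
 * to resume a stopped member; POSIX requires such stop signals
 * to be discarded.)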
*/ if ((prop & SA_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* * SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. All the threads should be * either winding down or already on the suspended queue. */ if (p->p_flag & P_TRACED) { /* * The traced process is already stopped, * so no further action is necessary. * No signal can restart us. */ goto out; } if (sig == SIGKILL) { /* * SIGKILL sets the process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SA_CONT) { /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, the process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; if (p->p_numthreads == p->p_suspcount) { p->p_flag |= P_CONTINUED; p->p_xstat = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); } if (action == SIG_DFL) { sigqueue_delete(sigqueue, sig); } else if (action == SIG_CATCH) { #ifdef KSE /* * The process wants to catch it so it needs * to run at least one thread, but which one? * It would seem that the answer would be to * run an upcall in the next KSE to run, and * deliver the signal that way. In a NON KSE * process, we need to make sure that the * single thread is runnable asap. * XXXKSE for now however, make them all run. */ #else /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ #endif goto runfast; } /* * The signal is not ignored or caught. */ mtx_lock_spin(&sched_lock); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); goto out; } if (prop & SA_STOP) { /* * Already stopped, don't need to stop again * (if we did, the shell could get confused). * Just make sure the signal STOP bit is set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check().
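 *
 * The simulated wakeup works because an interruptible sleep
 * returns the value passed to sleepq_abort(); a hypothetical
 * driver sleep would observe it as
 *
 *	error = msleep(chan, &sc_lock, PCATCH, "drvwait", 0);
 *	if (error == EINTR || error == ERESTART)
 *		return (error);
 *
 * where intrval above selects EINTR or ERESTART according to the
 * handler's SA_RESTART setting.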
*/ mtx_lock_spin(&sched_lock); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) sleepq_abort(td, intrval); mtx_unlock_spin(&sched_lock); goto out; /* * Mutexes are short-lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { mtx_lock_spin(&sched_lock); tdsigwakeup(td, sig, action, intrval); mtx_unlock_spin(&sched_lock); goto out; } MPASS(action == SIG_DFL); if (prop & SA_STOP) { if (p->p_flag & P_PPWAIT) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * Only a thread sending a signal to another * process can reach here; a thread sending a * signal to its own process never suspends * itself, so in that case p_numthreads can * never equal p_suspcount. */ thread_stopped(p); mtx_unlock_spin(&sched_lock); sigqueue_delete_proc(p, p->p_xstat); } else mtx_unlock_spin(&sched_lock); goto out; } else goto runfast; /* NOTREACHED */ } else { /* Not in "NORMAL" state; discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: mtx_lock_spin(&sched_lock); tdsigwakeup(td, sig, action, intrval); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); out: /* If we jump here, sched_lock should not be owned. */ mtx_assert(&sched_lock, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in, etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); prop = sigprop(sig); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. */ if (action == SIG_DFL && (prop & SA_KILL)) { if (p->p_nice > 0) sched_nice(td->td_proc, 0); if (td->td_priority > PUSER) sched_prio(td, PUSER); } if (TD_ON_SLEEPQ(td)) { /* * If the thread is sleeping uninterruptibly, * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) return; /* * If SIGCONT is default (or ignored) and the process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { mtx_unlock_spin(&sched_lock); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); mtx_lock_spin(&sched_lock); return; } /* * Give low-priority threads a better chance to run. */ if (td->td_priority > PUSER) sched_prio(td, PUSER); sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon.
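	 * (On SMP the kick is a forward_signal() IPI so a currently
	 * running thread notices its AST promptly, as the code below
	 * does.)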
*/ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } } static void sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td2) { if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR) && !TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } else { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } } } int ptracestop(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Stopping for traced signal"); mtx_lock_spin(&sched_lock); td->td_flags |= TDF_XSIG; mtx_unlock_spin(&sched_lock); td->td_xsig = sig; while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) { if (p->p_flag & P_SINGLE_EXIT) { mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_XSIG; mtx_unlock_spin(&sched_lock); return (sig); } /* * Just make wait() work; the last stopped thread * will win. */ p->p_xstat = sig; p->p_xthread = td; p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE); mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 0); stopme: thread_stopped(p); thread_suspend_one(td); PROC_UNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL, NULL); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); if (!(p->p_flag & P_TRACED)) break; if (td->td_flags & TDF_DBSUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; mtx_lock_spin(&sched_lock); goto stopme; } } return (td->td_xsig); } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt the current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(td) struct thread *td; { struct proc *p; struct sigacts *ps; sigset_t sigpending; int sig, prop, newsig; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETNAND(sigpending, td->td_sigmask); if (p->p_flag & P_PPWAIT) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); sig = sig_ffs(&sigpending); if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); #ifdef KSE if (td->td_pflags & TDP_SA) SIGADDSET(td->td_sigmask, sig); #endif continue; } if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { /* * If traced, always stop. */ mtx_unlock(&ps->ps_mtx); newsig = ptracestop(td, sig); mtx_lock(&ps->ps_mtx); #ifdef KSE if (td->td_pflags & TDP_SA) SIGADDSET(td->td_sigmask, sig); #endif if (sig != newsig) { ksiginfo_t ksi; /* * Clear the old signal. * XXX shrug off debugger, it causes siginfo to * be thrown away.
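			 * (As a sketch, the replacement signal typically
			 * comes from the debugger resuming us with
			 *	ptrace(PT_CONTINUE, pid, (caddr_t)1, newsig);
			 * where newsig == 0 means "swallow the signal
			 * entirely".)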
*/ sigqueue_get(&td->td_sigqueue, sig, &ksi); /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; * otherwise we just look for signals again. */ if (newsig == 0) continue; sig = newsig; /* * Put the new signal into td_sigqueue. If the * signal is being masked, look for other signals. */ SIGADDSET(td->td_sigqueue.sq_signals, sig); #ifdef KSE if (td->td_pflags & TDP_SA) SIGDELSET(td->td_sigmask, sig); #endif if (SIGISMEMBER(td->td_sigmask, sig)) continue; signotify(td); } /* * If the traced bit got turned off, go back up * to the top to rescan signals. This ensures * that p_sig* and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) continue; } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process * with default action, stop here, * then clear the signal. However, * if process is member of an orphaned * process group, ignore tty stop signals. */ if (prop & SA_STOP) { if (p->p_flag & P_TRACED || (p->p_pgrp->pg_jobc == 0 && prop & SA_TTYSTOP)) break; /* == ignore */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; mtx_lock_spin(&sched_lock); sig_suspend_threads(td, p, 0); thread_stopped(p); thread_suspend_one(td); PROC_UNLOCK(p); DROP_GIANT(); mi_switch(SW_INVOL, NULL); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SA_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ } /* NOTREACHED */ } -/* - * MPSAFE - */ void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { mtx_unlock_spin(&sched_lock); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); } } /* * Take the action for the specified signal * from the current set of pending signals. 
*/ void postsig(sig) register int sig; { struct thread *td = curthread; register struct proc *p = td->td_proc; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; int code; KASSERT(sig != 0, ("postsig")); PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); sigqueue_get(&td->td_sigqueue, sig, &ksi); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, 0); #endif if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } #ifdef KSE if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) { #else if (action == SIG_DFL) { #endif /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); sigexit(td, sig); /* NOTREACHED */ } else { #ifdef KSE if (td->td_pflags & TDP_SA) { if (sig == SIGKILL) { mtx_unlock(&ps->ps_mtx); sigexit(td, sig); } } #endif /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig), ("postsig action")); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } p->p_stats->p_ru.ru_nsignals++; if (p->p_sig != sig) { code = 0; } else { code = p->p_code; p->p_code = 0; p->p_sig = 0; } #ifdef KSE if (td->td_pflags & TDP_SA) thread_signal_add(curthread, &ksi); else (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); #else (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); #endif } } /* * Kill the current process for stated reason. */ void killproc(p, why) struct proc *p; char *why; { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. - * - * MPSAFE */ void sigexit(td, sig) struct thread *td; int sig; { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. 
* * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) { p->p_sig = sig; /* * Log signals which would cause core dumps. * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX: TODO: write out the ruid as well as the euid. * Note that coredump() drops the proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, W_EXITCODE(0, sig)); /* NOTREACHED */ } /* * Send queued SIGCHLD to the parent when the child process's * state changes. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int status) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up the parent sleeping in kern_wait(); also send * SIGCHLD to the parent, but SIGCHLD alone does not guarantee * that the parent will awaken, because the parent may have * masked the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, status); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xstat); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason; int status = p->p_xstat; /* convert to int */ reason = CLD_EXITED; if (WCOREDUMP(status)) reason = CLD_DUMPED; else if (WIFSIGNALED(status)) reason = CLD_KILLED; /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } static char corefilename[MAXPATHLEN] = {"%N.core"}; SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, sizeof(corefilename), "process corefile name format string"); /* * expand_name(name, uid, pid) * Expand the name described in corefilename, using name, uid, and pid. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; core dumps can be disabled * completely by using "/dev/null", or all core files can be stored in * "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above).
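 *
 * An illustrative setting (the names are hypothetical):
 *	sysctl kern.corefile="/cores/%U/%N-%P.core"
 * would send a dump of pid 1234 of "httpd" running as uid 80 to
 *	/cores/80/httpd-1234.core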
*/ static char * expand_name(name, uid, pid) const char *name; uid_t uid; pid_t pid; { const char *format, *appendstr; char *temp; char buf[11]; /* Buffer for pid/uid -- max 4B */ size_t i, l, n; format = corefilename; temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); if (temp == NULL) return (NULL); for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': appendstr = "%"; break; case 'N': /* process name */ appendstr = name; break; case 'P': /* process id */ sprintf(buf, "%u", pid); appendstr = buf; break; case 'U': /* user id */ sprintf(buf, "%u", uid); appendstr = buf; break; default: appendstr = ""; log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); } l = strlen(appendstr); if ((n + l) >= MAXPATHLEN) goto toolong; memcpy(temp + n, appendstr, l); n += l; break; default: temp[n++] = format[i]; } } if (format[i] != '\0') goto toolong; return (temp); toolong: log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n", (long)pid, name, (u_long)uid); free(temp, M_TEMP); return (NULL); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; register struct vnode *vp; register struct ucred *cred = td->td_ucred; struct flock lf; struct nameidata nd; struct vattr vattr; int error, error1, flags, locked; struct mount *mp; char *name; /* name of corefile */ off_t limit; int vfslocked; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(p, RLIMIT_CORE); PROC_UNLOCK(p); if (limit == 0) return (EFBIG); restart: name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid); if (name == NULL) return (EINVAL); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td); flags = O_CREAT | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, -1); free(name, M_TEMP); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* Don't dump to non-regular files or files with links. 
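	 * (Refusing symlinks and files with va_nlink != 1 keeps a planted
	 * link at the corefile path from redirecting the dump over some
	 * other file.)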
*/ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) { VOP_UNLOCK(vp, 0, td); error = EFAULT; goto close; } VOP_UNLOCK(vp, 0, td); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { lf.l_type = F_UNLCK; if (locked) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); if ((error = vn_close(vp, FWRITE, cred, td)) != 0) goto out; if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; VFS_UNLOCK_GIANT(vfslocked); goto restart; } VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VOP_LEASE(vp, td, cred, LEASE_WRITE); VOP_SETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); error = p->p_sysent->sv_coredump ? p->p_sysent->sv_coredump(td, vp, limit) : ENOSYS; if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } close: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; out: VFS_UNLOCK_GIANT(vfslocked); return (error); } /* - * Nonexistent system call-- signal process (may want to handle it). - * Flag error in case process won't see signal immediately (blocked or ignored). + * Nonexistent system call-- signal process (may want to handle it). Flag + * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int nosys(td, args) struct thread *td; struct nosys_args *args; { struct proc *p = td->td_proc; PROC_LOCK(p); psignal(p, SIGSYS); PROC_UNLOCK(p); return (ENOSYS); } /* - * Send a SIGIO or SIGURG signal to a process or process group using - * stored credentials rather than those of the current process. + * Send a SIGIO or SIGURG signal to a process or process group using stored + * credentials rather than those of the current process. */ void pgsigio(sigiop, sig, checkctty) struct sigio **sigiop; int sig, checkctty; { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(&p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(&p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. 
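 *
 * The userland view, as a sketch: a process registering
 *	struct kevent kev;
 *	EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 * hangs a knote off p_klist whose kn_id is compared against the
 * masked hint below.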
*/ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); ps->ps_refcnt = 1; mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt--; if (ps->ps_refcnt == 0) { mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } else mtx_unlock(&ps->ps_mtx); } struct sigacts * sigacts_hold(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt++; mtx_unlock(&ps->ps_mtx); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { int shared; mtx_lock(&ps->ps_mtx); shared = ps->ps_refcnt > 1; mtx_unlock(&ps->ps_mtx); return (shared); } Index: head/sys/kern/kern_sysctl.c =================================================================== --- head/sys/kern/kern_sysctl.c (revision 167210) +++ head/sys/kern/kern_sysctl.c (revision 167211) @@ -1,1616 +1,1610 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); /* * Locking - this locks the sysctl tree in memory. */ static struct sx sysctllock; #define SYSCTL_LOCK() sx_xlock(&sysctllock) #define SYSCTL_UNLOCK() sx_xunlock(&sysctllock) #define SYSCTL_INIT() sx_init(&sysctllock, "sysctl lock") static int sysctl_root(SYSCTL_HANDLER_ARGS); struct sysctl_oid_list sysctl__children; /* root list */ static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SLIST_FOREACH(oidp, list, oid_link) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. */ void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p; struct sysctl_oid *q; /* * First check if another oid with the same name already * exists in the parent's list. */ p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { printf("can't re-use a leaf (%s)!\n", p->oid_name); return; } } /* * If this oid has a number OID_AUTO, give it a number which * is greater than any current oid. * NOTE: DO NOT change the starting value here, change it in * <sys/sysctl.h>, and make sure it is at least 256 to * accommodate e.g. net.inet.raw as a static sysctl node. */ if (oidp->oid_number == OID_AUTO) { static int newoid = CTL_AUTO_START; oidp->oid_number = newoid++; if (newoid == 0x7fffffff) panic("out of oids"); } #if 0 else if (oidp->oid_number >= CTL_AUTO_START) { /* do not panic; this happens when unregistering sysctl sets */ printf("static sysctl oid too high: %d", oidp->oid_number); } #endif /* * Insert the oid into the parent's list in order. */ q = NULL; SLIST_FOREACH(p, parent, oid_link) { if (oidp->oid_number < p->oid_number) break; q = p; } if (q) SLIST_INSERT_AFTER(q, oidp, oid_link); else SLIST_INSERT_HEAD(parent, oidp, oid_link); } void sysctl_unregister_oid(struct sysctl_oid *oidp) { struct sysctl_oid *p; int error; error = ENOENT; if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { SLIST_FOREACH(p, oidp->oid_parent, oid_link) { if (p == oidp) { SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link); error = 0; break; } } } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) printf("%s: failed to unregister sysctl\n", __func__); } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now...
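	 * (The effect is two-phase: pass one deregisters everything
	 * without freeing; on failure all entries are re-registered and
	 * EBUSY is returned, otherwise pass two deletes for real.)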
*/ TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end * or from the place where the error occurred; * e contains the entry that was not unregistered. */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) return(EBUSY); /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if(e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); free(e, M_SYSCTLOID); return (0); } else return (ENOENT); } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0, just deregister; otherwise free up entries as well * recurse - if != 0, traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p; int error; if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("can't remove non-dynamic nodes!\n"); return (EINVAL); } /* * WARNING: the normal method to do this is through * sysctl_ctx_free(). Use recursion only as a last-resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) { if (!recurse) return (ENOTEMPTY); error = sysctl_remove_oid(p, del, recurse); if (error) return (error); } if (del) free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID); } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { if (oidp->oid_descr) free((void *)(uintptr_t)(const void *)oidp->oid_descr, M_SYSCTLOID); free((void *)(uintptr_t)(const void *)oidp->oid_name, M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init().
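 *
 * A typical dynamic registration, as a sketch (the "foo" names are
 * hypothetical):
 *	static struct sysctl_ctx_list ctx;
 *	static int foo_verbose;
 *	sysctl_ctx_init(&ctx);
 *	SYSCTL_ADD_INT(&ctx, SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
 *	    "foo_verbose", CTLFLAG_RW, &foo_verbose, 0, "foo verbosity");
 * with a matching sysctl_ctx_free(&ctx) at unload time.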
*/ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, int arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr) { struct sysctl_oid *oidp; ssize_t len; char *newname; /* You have to hook up somewhere.. */ if (parent == NULL) return(NULL); /* Check if the node already exists, otherwise create it */ oidp = sysctl_find_oidname(name, parent); if (oidp != NULL) { if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); return (oidp); } else { printf("can't re-use a leaf (%s)!\n", name); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; SLIST_NEXT(oidp, oid_link) = NULL; oidp->oid_number = number; oidp->oid_refcnt = 1; len = strlen(name); newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK); bcopy(name, newname, len + 1); newname[len] = '\0'; oidp->oid_name = newname; oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; if ((kind & CTLTYPE) == CTLTYPE_NODE) { /* Allocate space for children */ SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list), M_SYSCTLOID, M_WAITOK)); SLIST_INIT(SYSCTL_CHILDREN(oidp)); } else { oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; } oidp->oid_fmt = fmt; if (descr) { int len = strlen(descr) + 1; oidp->oid_descr = malloc(len, M_SYSCTLOID, M_WAITOK); if (oidp->oid_descr) strcpy((char *)(uintptr_t)(const void *)oidp->oid_descr, descr); } /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); return (oidp); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; if (oid->oid_parent == parent) return (0); oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) return (EEXIST); sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); return (0); } /* * Register the kernel's oids on startup. */ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; SYSCTL_INIT(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); } SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0); /* * "Staff-functions" * * These functions implement a presently undocumented interface * used by the sysctl program to walk the tree, and get the type * so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (Be aware, though, that the proper interface isn't as obvious as it * may seem; there are various conflicting requirements.) * * {0,0} printf the entire MIB-tree. * {0,1,...} return the name of the "..." OID. * {0,2,...} return the next OID. * {0,3} return the OID of the name in "new" * {0,4,...} return the kind & format info for the "..." OID. * {0,5,...} return the description of the "..." OID. */ #ifdef SYSCTL_DEBUG static void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; struct sysctl_oid *oidp; SLIST_FOREACH(oidp, l, oid_link) { for (k=0; k<i; k++) printf(" "); printf("%d %s ", oidp->oid_number, oidp->oid_name); printf("%c%c", oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ?
'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( oidp->oid_arg1, i+2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_QUAD: printf(" Quad\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); return (ENOENT); } SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error = 0; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; char buf[10]; while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) return (error); namelen--; name++; continue; } lsp2 = 0; SLIST_FOREACH(oid, lsp, oid_link) { if (oid->oid_number != *name) continue; if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) return (error); namelen--; name++; if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oid->oid_handler) break; lsp2 = (struct sysctl_oid_list *)oid->oid_arg1; break; } lsp = lsp2; } return (SYSCTL_OUT(req, "", 1)); } static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); static int sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; *len = level; SLIST_FOREACH(oidp, lsp, oid_link) { *next = oidp->oid_number; *oidpp = oidp; if (oidp->oid_kind & CTLFLAG_SKIP) continue; if (!namelen) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) /* We really should call the handler here...*/ return (0); lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, len, level+1, oidpp)) return (0); goto emptynode; } if (oidp->oid_number < *name) continue; if (oidp->oid_number > *name) { if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (0); if (oidp->oid_handler) return (0); lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); goto next; } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) continue; if (oidp->oid_handler) continue; lsp = (struct sysctl_oid_list *)oidp->oid_arg1; if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, len, level+1, oidpp)) return (0); next: namelen = 1; emptynode: *len = level; } return (1); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int i, j, error; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; int newoid[CTL_MAXNAME]; i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid); if (i) return (ENOENT); error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); static int name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp) { int i; struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; 
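	/*
	 * Resolves one dotted component per iteration; e.g. (an
	 * illustrative lookup) "kern.ostype" matches "kern" in the
	 * root list and "ostype" in its child list, yielding the OID
	 * { CTL_KERN, KERN_OSTYPE }.
	 */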
char *p; if (!*name) return (ENOENT); p = name + strlen(name) - 1 ; if (*p == '.') *p = '\0'; *len = 0; for (p = name; *p && *p != '.'; p++) ; i = *p; if (i == '.') *p = '\0'; oidp = SLIST_FIRST(lsp); while (oidp && *len < CTL_MAXNAME) { if (strcmp(name, oidp->oid_name)) { oidp = SLIST_NEXT(oidp, oid_link); continue; } *oid++ = oidp->oid_number; (*len)++; if (!i) { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = (struct sysctl_oid_list *)oidp->oid_arg1; oidp = SLIST_FIRST(lsp); name = p+1; for (p = name; *p && *p != '.'; p++) ; i = *p; if (i == '.') *p = '\0'; } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len; struct sysctl_oid *op = 0; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; error = name2oid(p, oid, &len, &op); free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error; error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) return (error); if (!oid->oid_fmt) return (ENOENT); error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) return (error); error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); return (error); } static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error; error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) return (error); if (!oid->oid_descr) return (ENOENT); error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); return (error); } static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, ""); /* * Default "handler" functions. */ /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on sysctl_handle_int(), convert milliseconds into ticks. */ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)oidp->oid_arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)oidp->oid_arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. arg1 points to it. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data.
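	 * (Copying first means SYSCTL_OUT() works from a stable local
	 * value rather than re-reading a variable that may change while
	 * the request is being serviced.)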
*/ if (!arg1) return (EINVAL); tmplong = *(long *)arg1; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } else #endif error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { int error=0; char *tmparg; size_t outlen; /* * Attempt to get a coherent snapshot by copying to a * temporary kernel buffer. */ retry: outlen = strlen((char *)arg1)+1; tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK); if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) { free(tmparg, M_SYSCTLTMP); goto retry; } error = SYSCTL_OUT(req, tmparg, outlen); free(tmparg, M_SYSCTLTMP); if (error || !req->newptr) return (error); if ((req->newlen - req->newidx) >= arg2) { error = EINVAL; } else { arg2 = (req->newlen - req->newidx); error = SYSCTL_IN(req, arg1, arg2); ((char *)arg1)[arg2] = '\0'; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. */ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Transfer functions to/from kernel space. 
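 * These back kernel_sysctl() and kernel_sysctlbyname(), so in-kernel
 * code can read a leaf with e.g. (a sketch; "kern.foo" is a stand-in
 * name for a real integer leaf):
 *	int val; size_t len = sizeof(val);
 *	kernel_sysctlbyname(curthread, "kern.foo", &val, &len,
 *	    NULL, 0, NULL, 0);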
* XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_LOCKED; SYSCTL_LOCK(); error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); SYSCTL_UNLOCK(); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; oid[0] = 0; /* sysctl internal magic */ oid[1] = 3; /* name2oid */ oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. */ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { int error = 0; size_t i, len, origidx; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; error = copyout(p, (char *)req->oldptr + origidx, i); } if (error) return (error); if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t i, wiredlen; char *cp, dummy; wiredlen = (len > 0 && len < req->oldlen) ? 
len : req->oldlen; ret = 0; if (req->lock == REQ_LOCKED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } /* * Touch all the wired pages to avoid PTE modified * bit emulation traps on Alpha while holding locks * in the sysctl handler. */ for (i = (wiredlen + PAGE_SIZE - 1) / PAGE_SIZE, cp = req->oldptr; i > 0; i--, cp += PAGE_SIZE) { copyin(cp, &dummy, 1); copyout(&dummy, cp, 1); } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid *oid; int indx; oid = SLIST_FIRST(&sysctl__children); indx = 0; while (oid && indx < CTL_MAXNAME) { if (oid->oid_number == name[indx]) { indx++; if (oid->oid_kind & CTLFLAG_NOLOCK) req->lock = REQ_UNLOCKED; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; return (0); } oid = SLIST_FIRST( (struct sysctl_oid_list *)oid->oid_arg1); } else if (indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; return (0); } else { return (ENOTDIR); } } else { oid = SLIST_NEXT(oid, oid_link); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; int error, indx, lvl; error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) return (error); if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) return (EISDIR); } /* Is this sysctl writable? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) return (EPERM); KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) return (error); } /* Is this sysctl writable by only privileged users? 
*/ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { if (oid->oid_kind & CTLFLAG_PRISON) error = priv_check_cred(req->td->td_ucred, PRIV_SYSCTL_WRITEJAIL, SUSER_ALLOWJAIL); else error = priv_check(req->td, PRIV_SYSCTL_WRITE); if (error) return (error); } if (!oid->oid_handler) return (EINVAL); if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_check_system_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) return (error); #endif error = oid->oid_handler(oid, arg1, arg2, req); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif -/* - * MPSAFE - */ int __sysctl(struct thread *td, struct sysctl_args *uap) { int error, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); mtx_lock(&Giant); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) goto done2; if (uap->oldlenp) { int i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) error = i; } done2: mtx_unlock(&Giant); return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. */ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; if (old) { if (!useracc(old, req.oldlen, VM_PROT_WRITE)) return (EFAULT); req.oldptr= old; } if (new != NULL) { if (!useracc(new, req.newlen, VM_PROT_READ)) return (EFAULT); req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_LOCKED; SYSCTL_LOCK(); do { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); } while (error == EAGAIN); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); SYSCTL_UNLOCK(); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } #ifdef COMPAT_43 #include #include #define KINFO_PROC (0<<8) #define KINFO_RT (1<<8) #define KINFO_VNODE (2<<8) #define KINFO_FILE (3<<8) #define KINFO_METER (4<<8) #define KINFO_LOADAVG (5<<8) #define KINFO_CLOCKRATE (6<<8) /* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ #define KINFO_BSDI_SYSINFO (101<<8) /* * XXX this is bloat, but I hope it's better here than on the potentially * limited kernel stack... 
-Peter */ static struct { int bsdi_machine; /* "i386" on BSD/386 */ /* ^^^ this is an offset to the string, relative to the struct start */ char *pad0; long pad1; long pad2; long pad3; u_long pad4; u_long pad5; u_long pad6; int bsdi_ostype; /* "BSD/386" on BSD/386 */ int bsdi_osrelease; /* "1.1" on BSD/386 */ long pad7; long pad8; char *pad9; long pad10; long pad11; int pad12; long pad13; quad_t pad14; long pad15; struct timeval pad16; /* we dont set this, because BSDI's uname used gethostname() instead */ int bsdi_hostname; /* hostname on BSD/386 */ /* the actual string data is appended here */ } bsdi_si; /* * this data is appended to the end of the bsdi_si structure during copyout. * The "char *" offsets are relative to the base of the bsdi_si struct. * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings * should not exceed the length of the buffer here... (or else!! :-) */ static char bsdi_strings[80]; /* It had better be less than this! */ #ifndef _SYS_SYSPROTO_H_ struct getkerninfo_args { int op; char *where; size_t *size; int arg; }; #endif -/* - * MPSAFE - */ int ogetkerninfo(struct thread *td, struct getkerninfo_args *uap) { int error, name[6]; size_t size; u_int needed = 0; mtx_lock(&Giant); switch (uap->op & 0xff00) { case KINFO_RT: name[0] = CTL_NET; name[1] = PF_ROUTE; name[2] = 0; name[3] = (uap->op & 0xff0000) >> 16; name[4] = uap->op & 0xff; name[5] = uap->arg; error = userland_sysctl(td, name, 6, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_VNODE: name[0] = CTL_KERN; name[1] = KERN_VNODE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_PROC: name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = uap->op & 0xff; name[3] = uap->arg; error = userland_sysctl(td, name, 4, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_FILE: name[0] = CTL_KERN; name[1] = KERN_FILE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_METER: name[0] = CTL_VM; name[1] = VM_TOTAL; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_LOADAVG: name[0] = CTL_VM; name[1] = VM_LOADAVG; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_CLOCKRATE: name[0] = CTL_KERN; name[1] = KERN_CLOCKRATE; error = userland_sysctl(td, name, 2, uap->where, uap->size, 0, 0, 0, &size, 0); break; case KINFO_BSDI_SYSINFO: { /* * this is pretty crude, but it's just enough for uname() * from BSDI's 1.x libc to work. * * *size gives the size of the buffer before the call, and * the amount of data copied after a successful call. * If successful, the return value is the amount of data * available, which can be larger than *size. * * BSDI's 2.x product apparently fails with ENOMEM if *size * is too small. */ u_int left; char *s; bzero((char *)&bsdi_si, sizeof(bsdi_si)); bzero(bsdi_strings, sizeof(bsdi_strings)); s = bsdi_strings; bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, ostype); s += strlen(s) + 1; bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, osrelease); s += strlen(s) + 1; bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); strcpy(s, machine); s += strlen(s) + 1; needed = sizeof(bsdi_si) + (s - bsdi_strings); if ((uap->where == NULL) || (uap->size == NULL)) { /* process is asking how much buffer to supply.. 
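			 * (i.e. the classic two-step: call once with a
			 * NULL buffer to learn the needed size, allocate,
			 * then call again to fetch the data)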
*/ size = needed; error = 0; break; } if ((error = copyin(uap->size, &size, sizeof(size))) != 0) break; /* if too much buffer supplied, trim it down */ if (size > needed) size = needed; /* how much of the buffer is remaining */ left = size; if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) break; /* is there any point in continuing? */ if (left > sizeof(bsdi_si)) { left -= sizeof(bsdi_si); error = copyout(&bsdi_strings, uap->where + sizeof(bsdi_si), left); } break; } default: error = EOPNOTSUPP; break; } if (error == 0) { td->td_retval[0] = needed ? needed : size; if (uap->size) { error = copyout(&size, uap->size, sizeof(size)); } } mtx_unlock(&Giant); return (error); } #endif /* COMPAT_43 */ Index: head/sys/kern/kern_time.c =================================================================== --- head/sys/kern/kern_time.c (revision 167210) +++ head/sys/kern/kern_time.c (revision 167211) @@ -1,1531 +1,1510 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAX_CLOCKS (CLOCK_MONOTONIC+1) static struct kclock posix_clocks[MAX_CLOCKS]; static uma_zone_t itimer_zone = NULL; /* * Time of day and interval timer support. * * These routines provide the kernel entry points to get and set * the time-of-day and per-process interval timers. Subroutines * here provide support for adding and subtracting timeval structures * and decrementing interval timers, optionally reloading the interval * timers when they expire. 
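 *
 * E.g. (an illustrative arming; the values are arbitrary):
 *	struct itimerval itv = { { 1, 0 }, { 0, 500000 } };
 *	setitimer(ITIMER_REAL, &itv, NULL);
 * first fires after 0.5 s, then reloads a 1 s interval; the reload is
 * the "optionally reloading" case handled below.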
*/ static int settime(struct thread *, struct timeval *); static void timevalfix(struct timeval *); static void no_lease_updatetime(int); static void itimer_start(void); static int itimer_init(void *, int, int); static void itimer_fini(void *, int); static void itimer_enter(struct itimer *); static void itimer_leave(struct itimer *); static struct itimer *itimer_find(struct proc *, int); static void itimers_alloc(struct proc *); static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp); static void itimers_event_hook_exit(void *arg, struct proc *p); static int realtimer_create(struct itimer *); static int realtimer_gettime(struct itimer *, struct itimerspec *); static int realtimer_settime(struct itimer *, int, struct itimerspec *, struct itimerspec *); static int realtimer_delete(struct itimer *); static void realtimer_clocktime(clockid_t, struct timespec *); static void realtimer_expire(void *); static int kern_timer_create(struct thread *, clockid_t, struct sigevent *, int *, int); static int kern_timer_delete(struct thread *, int); int register_posix_clock(int, struct kclock *); void itimer_fire(struct itimer *it); int itimespecfix(struct timespec *ts); #define CLOCK_CALL(clock, call, arglist) \ ((*posix_clocks[clock].call) arglist) SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL); static void no_lease_updatetime(deltat) int deltat; { } void (*lease_updatetime)(int) = no_lease_updatetime; static int settime(struct thread *td, struct timeval *tv) { struct timeval delta, tv1, tv2; static struct timeval maxtime, laststep; struct timespec ts; int s; s = splclock(); microtime(&tv1); delta = *tv; timevalsub(&delta, &tv1); /* * If the system is secure, we do not allow the time to be * set to a value earlier than 1 second less than the highest * time we have yet seen. The worst a miscreant can do in * this circumstance is "freeze" time. He couldn't go * back to the past. * * We similarly do not allow the clock to be stepped more * than one second, nor more than once per second. This allows * a miscreant to make the clock march double-time, but no worse. */ if (securelevel_gt(td->td_ucred, 1) != 0) { if (delta.tv_sec < 0 || delta.tv_usec < 0) { /* * Update maxtime to latest time we've seen. */ if (tv1.tv_sec > maxtime.tv_sec) maxtime = tv1; tv2 = *tv; timevalsub(&tv2, &maxtime); if (tv2.tv_sec < -1) { tv->tv_sec = maxtime.tv_sec - 1; printf("Time adjustment clamped to -1 second\n"); } } else { if (tv1.tv_sec == laststep.tv_sec) { splx(s); return (EPERM); } if (delta.tv_sec > 1) { tv->tv_sec = tv1.tv_sec + 1; printf("Time adjustment clamped to +1 second\n"); } laststep = *tv; } } ts.tv_sec = tv->tv_sec; ts.tv_nsec = tv->tv_usec * 1000; mtx_lock(&Giant); tc_setclock(&ts); (void) splsoftclock(); lease_updatetime(delta.tv_sec); splx(s); resettodr(); mtx_unlock(&Giant); return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_gettime_args { clockid_t clock_id; struct timespec *tp; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int clock_gettime(struct thread *td, struct clock_gettime_args *uap) { struct timespec ats; int error; error = kern_clock_gettime(td, uap->clock_id, &ats); if (error == 0) error = copyout(&ats, uap->tp, sizeof(ats)); return (error); } int kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval sys, user; struct proc *p; p = td->td_proc; switch (clock_id) { case CLOCK_REALTIME: /* Default to precise. 
*/ case CLOCK_REALTIME_PRECISE: nanotime(ats); break; case CLOCK_REALTIME_FAST: getnanotime(ats); break; case CLOCK_VIRTUAL: PROC_LOCK(p); calcru(p, &user, &sys); PROC_UNLOCK(p); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_PROF: PROC_LOCK(p); calcru(p, &user, &sys); PROC_UNLOCK(p); timevaladd(&user, &sys); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_MONOTONIC: /* Default to precise. */ case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: nanouptime(ats); break; case CLOCK_UPTIME_FAST: case CLOCK_MONOTONIC_FAST: getnanouptime(ats); break; case CLOCK_SECOND: ats->tv_sec = time_second; ats->tv_nsec = 0; break; default: return (EINVAL); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_settime_args { clockid_t clock_id; const struct timespec *tp; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int clock_settime(struct thread *td, struct clock_settime_args *uap) { struct timespec ats; int error; if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0) return (error); return (kern_clock_settime(td, uap->clock_id, &ats)); } int kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval atv; int error; #ifdef MAC error = mac_check_system_settime(td->td_ucred); if (error) return (error); #endif if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0) return (error); if (clock_id != CLOCK_REALTIME) return (EINVAL); if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000) return (EINVAL); /* XXX Don't convert nsec->usec and back */ TIMESPEC_TO_TIMEVAL(&atv, ats); error = settime(td, &atv); return (error); } #ifndef _SYS_SYSPROTO_H_ struct clock_getres_args { clockid_t clock_id; struct timespec *tp; }; #endif int clock_getres(struct thread *td, struct clock_getres_args *uap) { struct timespec ts; int error; if (uap->tp == NULL) return (0); error = kern_clock_getres(td, uap->clock_id, &ts); if (error == 0) error = copyout(&ts, uap->tp, sizeof(ts)); return (error); } int kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts) { ts->tv_sec = 0; switch (clock_id) { case CLOCK_REALTIME: case CLOCK_REALTIME_FAST: case CLOCK_REALTIME_PRECISE: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_FAST: case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_FAST: case CLOCK_UPTIME_PRECISE: /* * Round up the result of the division cheaply by adding 1. * Rounding up is especially important if rounding down * would give 0. Perfect rounding is unimportant. */ ts->tv_nsec = 1000000000 / tc_getfrequency() + 1; break; case CLOCK_VIRTUAL: case CLOCK_PROF: /* Accurately round up here because we can do so cheaply. 
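 * For example, with hz = 1000 this gives (1000000000 + 999) / 1000 =
 * 1000000 ns exactly, while with hz = 133 it gives 7518797 ns instead
 * of the truncated 7518796, so the reported resolution is never
 * optimistic.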
*/ ts->tv_nsec = (1000000000 + hz - 1) / hz; break; case CLOCK_SECOND: ts->tv_sec = 1; ts->tv_nsec = 0; break; default: return (EINVAL); } return (0); } static int nanowait; int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt) { struct timespec ts, ts2, ts3; struct timeval tv; int error; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) return (EINVAL); if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0)) return (0); getnanouptime(&ts); timespecadd(&ts, rqt); TIMESPEC_TO_TIMEVAL(&tv, rqt); for (;;) { error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", tvtohz(&tv)); getnanouptime(&ts2); if (error != EWOULDBLOCK) { if (error == ERESTART) error = EINTR; if (rmt != NULL) { timespecsub(&ts, &ts2); if (ts.tv_sec < 0) timespecclear(&ts); *rmt = ts; } return (error); } if (timespeccmp(&ts2, &ts, >=)) return (0); ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); } } #ifndef _SYS_SYSPROTO_H_ struct nanosleep_args { struct timespec *rqtp; struct timespec *rmtp; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int nanosleep(struct thread *td, struct nanosleep_args *uap) { struct timespec rmt, rqt; int error; error = copyin(uap->rqtp, &rqt, sizeof(rqt)); if (error) return (error); if (uap->rmtp && !useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE)) return (EFAULT); error = kern_nanosleep(td, &rqt, &rmt); if (error && uap->rmtp) { int error2; error2 = copyout(&rmt, uap->rmtp, sizeof(rmt)); if (error2) error = error2; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct gettimeofday_args { struct timeval *tp; struct timezone *tzp; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int gettimeofday(struct thread *td, struct gettimeofday_args *uap) { struct timeval atv; struct timezone rtz; int error = 0; if (uap->tp) { microtime(&atv); error = copyout(&atv, uap->tp, sizeof (atv)); } if (error == 0 && uap->tzp != NULL) { rtz.tz_minuteswest = tz_minuteswest; rtz.tz_dsttime = tz_dsttime; error = copyout(&rtz, uap->tzp, sizeof (rtz)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct settimeofday_args { struct timeval *tv; struct timezone *tzp; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int settimeofday(struct thread *td, struct settimeofday_args *uap) { struct timeval atv, *tvp; struct timezone atz, *tzp; int error; if (uap->tv) { error = copyin(uap->tv, &atv, sizeof(atv)); if (error) return (error); tvp = &atv; } else tvp = NULL; if (uap->tzp) { error = copyin(uap->tzp, &atz, sizeof(atz)); if (error) return (error); tzp = &atz; } else tzp = NULL; return (kern_settimeofday(td, tvp, tzp)); } int kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp) { int error; #ifdef MAC error = mac_check_system_settime(td->td_ucred); if (error) return (error); #endif error = priv_check(td, PRIV_SETTIMEOFDAY); if (error) return (error); /* Verify all parameters before changing time. */ if (tv) { if (tv->tv_usec < 0 || tv->tv_usec >= 1000000) return (EINVAL); error = settime(td, tv); } if (tzp && error == 0) { tz_minuteswest = tzp->tz_minuteswest; tz_dsttime = tzp->tz_dsttime; } return (error); } /* * Get value of an interval timer. The process virtual and * profiling virtual time timers are kept in the p_stats area, since * they can be swapped out. These are kept internally in the * way they are specified externally: in time until they expire. 
* * The real time interval timer is kept in the process table slot * for the process, and its value (it_value) is kept as an * absolute time rather than as a delta, so that it is easy to keep * periodic real-time signals from drifting. * * Virtual time timers are processed in the hardclock() routine of * kern_clock.c. The real time timer is processed by a timeout * routine, called from the softclock() routine. Since a callout * may be delayed in real time due to interrupt processing in the system, * it is possible for the real time timeout routine (realitexpire, given below) * to be delayed in real time past when it is supposed to occur. It * does not suffice, therefore, to reload the real timer's .it_value from the * real time timer's .it_interval. Rather, we compute the next time in * absolute time the timer should go off. */ #ifndef _SYS_SYSPROTO_H_ struct getitimer_args { u_int which; struct itimerval *itv; }; #endif -/* - * MPSAFE - */ int getitimer(struct thread *td, struct getitimer_args *uap) { struct itimerval aitv; int error; error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); return (copyout(&aitv, uap->itv, sizeof (struct itimerval))); } int kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv) { struct proc *p = td->td_proc; struct timeval ctv; if (which > ITIMER_PROF) return (EINVAL); if (which == ITIMER_REAL) { /* * Convert from absolute to relative time in .it_value * part of real time timer. If time for real time timer * has passed return 0, else return difference between * current time and time for the timer to go off. */ PROC_LOCK(p); *aitv = p->p_realtimer; PROC_UNLOCK(p); if (timevalisset(&aitv->it_value)) { getmicrouptime(&ctv); if (timevalcmp(&aitv->it_value, &ctv, <)) timevalclear(&aitv->it_value); else timevalsub(&aitv->it_value, &ctv); } } else { mtx_lock_spin(&sched_lock); *aitv = p->p_stats->p_timer[which]; mtx_unlock_spin(&sched_lock); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct setitimer_args { u_int which; struct itimerval *itv, *oitv; }; #endif -/* - * MPSAFE - */ int setitimer(struct thread *td, struct setitimer_args *uap) { struct itimerval aitv, oitv; int error; if (uap->itv == NULL) { uap->itv = uap->oitv; return (getitimer(td, (struct getitimer_args *)uap)); } if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval)))) return (error); error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); return (copyout(&oitv, uap->oitv, sizeof(struct itimerval))); } int kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv, struct itimerval *oitv) { struct proc *p = td->td_proc; struct timeval ctv; if (aitv == NULL) return (kern_getitimer(td, which, oitv)); if (which > ITIMER_PROF) return (EINVAL); if (itimerfix(&aitv->it_value)) return (EINVAL); if (!timevalisset(&aitv->it_value)) timevalclear(&aitv->it_interval); else if (itimerfix(&aitv->it_interval)) return (EINVAL); if (which == ITIMER_REAL) { PROC_LOCK(p); if (timevalisset(&p->p_realtimer.it_value)) callout_stop(&p->p_itcallout); getmicrouptime(&ctv); if (timevalisset(&aitv->it_value)) { callout_reset(&p->p_itcallout, tvtohz(&aitv->it_value), realitexpire, p); timevaladd(&aitv->it_value, &ctv); } *oitv = p->p_realtimer; p->p_realtimer = *aitv; PROC_UNLOCK(p); if (timevalisset(&oitv->it_value)) { if (timevalcmp(&oitv->it_value, &ctv, <)) timevalclear(&oitv->it_value); else timevalsub(&oitv->it_value, &ctv); } } else { mtx_lock_spin(&sched_lock); *oitv = p->p_stats->p_timer[which];
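/*
 * Note that the read of the old value above and the store of the new
 * value below both happen under sched_lock, so the hardclock() path
 * that processes these timers never sees a half-updated entry.
 */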
p->p_stats->p_timer[which] = *aitv; mtx_unlock_spin(&sched_lock); } return (0); } /* * Real interval timer expired: * send process whose timer expired an alarm signal. * If time is not set up to reload, then just return. * Else compute next time timer should go off which is > current time. * This is where delay in processing this timeout causes multiple * SIGALRM calls to be compressed into one. * tvtohz() always adds 1 to allow for the time until the next clock * interrupt being strictly less than 1 clock tick, but we don't want * that here since we want to appear to be in sync with the clock * interrupt even when we're delayed. */ void realitexpire(void *arg) { struct proc *p; struct timeval ctv, ntv; p = (struct proc *)arg; PROC_LOCK(p); psignal(p, SIGALRM); if (!timevalisset(&p->p_realtimer.it_interval)) { timevalclear(&p->p_realtimer.it_value); if (p->p_flag & P_WEXIT) wakeup(&p->p_itcallout); PROC_UNLOCK(p); return; } for (;;) { timevaladd(&p->p_realtimer.it_value, &p->p_realtimer.it_interval); getmicrouptime(&ctv); if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) { ntv = p->p_realtimer.it_value; timevalsub(&ntv, &ctv); callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1, realitexpire, p); PROC_UNLOCK(p); return; } } /*NOTREACHED*/ } /* * Check that a proposed value to load into the .it_value or * .it_interval part of an interval timer is acceptable, and * fix it to have at least minimal value (i.e. if it is less * than the resolution of the clock, round it up.) */ int itimerfix(struct timeval *tv) { if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) return (EINVAL); if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) tv->tv_usec = tick; return (0); } /* * Decrement an interval timer by a specified number * of microseconds, which must be less than a second, * i.e. < 1000000. If the timer expires, then reload * it. In this case, carry over (usec - old value) to * reduce the value reloaded into the timer so that * the timer does not drift. This routine assumes * that it is called in a context where the timers * on which it is operating cannot change in value. */ int itimerdecr(struct itimerval *itp, int usec) { if (itp->it_value.tv_usec < usec) { if (itp->it_value.tv_sec == 0) { /* expired, and already in next interval */ usec -= itp->it_value.tv_usec; goto expire; } itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } itp->it_value.tv_usec -= usec; usec = 0; if (timevalisset(&itp->it_value)) return (1); /* expired, exactly at end of interval */ expire: if (timevalisset(&itp->it_interval)) { itp->it_value = itp->it_interval; itp->it_value.tv_usec -= usec; if (itp->it_value.tv_usec < 0) { itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } } else itp->it_value.tv_usec = 0; /* sec is already 0 */ return (0); } /* * Add and subtract routines for timevals. * N.B.: subtract routine doesn't deal with * results which are before the beginning, * it just gets very confused in this case. * Caveat emptor. */ void timevaladd(struct timeval *t1, const struct timeval *t2) { t1->tv_sec += t2->tv_sec; t1->tv_usec += t2->tv_usec; timevalfix(t1); } void timevalsub(struct timeval *t1, const struct timeval *t2) { t1->tv_sec -= t2->tv_sec; t1->tv_usec -= t2->tv_usec; timevalfix(t1); } static void timevalfix(struct timeval *t1) { if (t1->tv_usec < 0) { t1->tv_sec--; t1->tv_usec += 1000000; } if (t1->tv_usec >= 1000000) { t1->tv_sec++; t1->tv_usec -= 1000000; } } /* * ratecheck(): simple time-based rate-limit checking. 
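 * A minimal caller sketch (the names here are illustrative only):
 *
 *	static struct timeval lasttv;
 *	static const struct timeval onesec = { 1, 0 };
 *
 *	if (ratecheck(&lasttv, &onesec))
 *		printf("noisy event, reported at most once per second\n");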
*/ int ratecheck(struct timeval *lasttime, const struct timeval *mininterval) { struct timeval tv, delta; int rv = 0; getmicrouptime(&tv); /* NB: 10ms precision */ delta = tv; timevalsub(&delta, lasttime); /* * check for 0,0 is so that the message will be seen at least once, * even if interval is huge. */ if (timevalcmp(&delta, mininterval, >=) || (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { *lasttime = tv; rv = 1; } return (rv); } /* * ppsratecheck(): packets (or events) per second limitation. * * Return 0 if the limit is to be enforced (e.g. the caller * should drop a packet because of the rate limitation). * * maxpps of 0 always causes zero to be returned. maxpps of -1 * always causes 1 to be returned; this effectively defeats rate * limiting. * * Note that we maintain the struct timeval for compatibility * with other bsd systems. We reuse the storage and just monitor * clock ticks for minimal overhead. */ int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) { int now; /* * Reset the last time and counter if this is the first call * or more than a second has passed since the last update of * lasttime. */ now = ticks; if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) { lasttime->tv_sec = now; *curpps = 1; return (maxpps != 0); } else { (*curpps)++; /* NB: ignore potential overflow */ return (maxpps < 0 || *curpps < maxpps); } } static void itimer_start(void) { struct kclock rt_clock = { .timer_create = realtimer_create, .timer_delete = realtimer_delete, .timer_settime = realtimer_settime, .timer_gettime = realtimer_gettime, .event_hook = NULL }; itimer_zone = uma_zcreate("itimer", sizeof(struct itimer), NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0); register_posix_clock(CLOCK_REALTIME, &rt_clock); register_posix_clock(CLOCK_MONOTONIC, &rt_clock); p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L); p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX); p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX); EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit, (void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec, (void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY); } int register_posix_clock(int clockid, struct kclock *clk) { if ((unsigned)clockid >= MAX_CLOCKS) { printf("%s: invalid clockid\n", __func__); return (0); } posix_clocks[clockid] = *clk; return (1); } static int itimer_init(void *mem, int size, int flags) { struct itimer *it; it = (struct itimer *)mem; mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF); return (0); } static void itimer_fini(void *mem, int size) { struct itimer *it; it = (struct itimer *)mem; mtx_destroy(&it->it_mtx); } static void itimer_enter(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); it->it_usecount++; } static void itimer_leave(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); KASSERT(it->it_usecount > 0, ("invalid it_usecount")); if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0) wakeup(it); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_create_args { clockid_t clock_id; struct sigevent * evp; int * timerid; }; #endif int ktimer_create(struct thread *td, struct ktimer_create_args *uap) { struct sigevent *evp1, ev; int id; int error; if (uap->evp != NULL) { error = copyin(uap->evp, &ev, sizeof(ev)); if (error != 0) return (error); evp1 = &ev; } else evp1 = NULL; error = kern_timer_create(td, uap->clock_id, evp1, &id, -1); if (error == 0) { error = copyout(&id, uap->timerid, sizeof(int)); if (error != 0) kern_timer_delete(td, id); } return (error); 
} static int kern_timer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp, int *timerid, int preset_id) { struct proc *p = td->td_proc; struct itimer *it; int id; int error; if (clock_id < 0 || clock_id >= MAX_CLOCKS) return (EINVAL); if (posix_clocks[clock_id].timer_create == NULL) return (EINVAL); if (evp != NULL) { if (evp->sigev_notify != SIGEV_NONE && evp->sigev_notify != SIGEV_SIGNAL && evp->sigev_notify != SIGEV_THREAD_ID) return (EINVAL); if ((evp->sigev_notify == SIGEV_SIGNAL || evp->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(evp->sigev_signo)) return (EINVAL); } if (p->p_itimers == NULL) itimers_alloc(p); it = uma_zalloc(itimer_zone, M_WAITOK); it->it_flags = 0; it->it_usecount = 0; it->it_active = 0; timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); it->it_overrun = 0; it->it_overrun_last = 0; it->it_clockid = clock_id; it->it_timerid = -1; it->it_proc = p; ksiginfo_init(&it->it_ksi); it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT; error = CLOCK_CALL(clock_id, timer_create, (it)); if (error != 0) goto out; PROC_LOCK(p); if (preset_id != -1) { KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id")); id = preset_id; if (p->p_itimers->its_timers[id] != NULL) { PROC_UNLOCK(p); error = 0; goto out; } } else { /* * Find a free timer slot, skipping those reserved * for setitimer(). */ for (id = 3; id < TIMER_MAX; id++) if (p->p_itimers->its_timers[id] == NULL) break; if (id == TIMER_MAX) { PROC_UNLOCK(p); error = EAGAIN; goto out; } } it->it_timerid = id; p->p_itimers->its_timers[id] = it; if (evp != NULL) it->it_sigev = *evp; else { it->it_sigev.sigev_notify = SIGEV_SIGNAL; switch (clock_id) { default: case CLOCK_REALTIME: it->it_sigev.sigev_signo = SIGALRM; break; case CLOCK_VIRTUAL: it->it_sigev.sigev_signo = SIGVTALRM; break; case CLOCK_PROF: it->it_sigev.sigev_signo = SIGPROF; break; } it->it_sigev.sigev_value.sival_int = id; } if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { it->it_ksi.ksi_signo = it->it_sigev.sigev_signo; it->it_ksi.ksi_code = SI_TIMER; it->it_ksi.ksi_value = it->it_sigev.sigev_value; it->it_ksi.ksi_timerid = id; } PROC_UNLOCK(p); *timerid = id; return (0); out: ITIMER_LOCK(it); CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); uma_zfree(itimer_zone, it); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_delete_args { int timerid; }; #endif int ktimer_delete(struct thread *td, struct ktimer_delete_args *uap) { return (kern_timer_delete(td, uap->timerid)); } static struct itimer * itimer_find(struct proc *p, int timerid) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_itimers == NULL) || (timerid >= TIMER_MAX) || (it = p->p_itimers->its_timers[timerid]) == NULL) { return (NULL); } ITIMER_LOCK(it); if ((it->it_flags & ITF_DELETING) != 0) { ITIMER_UNLOCK(it); it = NULL; } return (it); } static int kern_timer_delete(struct thread *td, int timerid) { struct proc *p = td->td_proc; struct itimer *it; PROC_LOCK(p); it = itimer_find(p, timerid); if (it == NULL) { PROC_UNLOCK(p); return (EINVAL); } PROC_UNLOCK(p); it->it_flags |= ITF_DELETING; while (it->it_usecount > 0) { it->it_flags |= ITF_WANTED; msleep(it, &it->it_mtx, PPAUSE, "itimer", 0); } it->it_flags &= ~ITF_WANTED; CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); PROC_LOCK(p); if (KSI_ONQ(&it->it_ksi)) sigqueue_take(&it->it_ksi); p->p_itimers->its_timers[timerid] = NULL; PROC_UNLOCK(p); uma_zfree(itimer_zone, it); return (0); } #ifndef _SYS_SYSPROTO_H_ 
struct ktimer_settime_args { int timerid; int flags; const struct itimerspec * value; struct itimerspec * ovalue; }; #endif int ktimer_settime(struct thread *td, struct ktimer_settime_args *uap) { struct proc *p = td->td_proc; struct itimer *it; struct itimerspec val, oval, *ovalp; int error; error = copyin(uap->value, &val, sizeof(val)); if (error != 0) return (error); if (uap->ovalue != NULL) ovalp = &oval; else ovalp = NULL; PROC_LOCK(p); if (uap->timerid < 3 || (it = itimer_find(p, uap->timerid)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_settime, (it, uap->flags, &val, ovalp)); itimer_leave(it); ITIMER_UNLOCK(it); } if (error == 0 && uap->ovalue != NULL) error = copyout(ovalp, uap->ovalue, sizeof(*ovalp)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_gettime_args { int timerid; struct itimerspec * value; }; #endif int ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap) { struct proc *p = td->td_proc; struct itimer *it; struct itimerspec val; int error; PROC_LOCK(p); if (uap->timerid < 3 || (it = itimer_find(p, uap->timerid)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_gettime, (it, &val)); itimer_leave(it); ITIMER_UNLOCK(it); } if (error == 0) error = copyout(&val, uap->value, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_getoverrun_args { int timerid; }; #endif int ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap) { struct proc *p = td->td_proc; struct itimer *it; int error; PROC_LOCK(p); if (uap->timerid < 3 || (it = itimer_find(p, uap->timerid)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { td->td_retval[0] = it->it_overrun_last; ITIMER_UNLOCK(it); PROC_UNLOCK(p); error = 0; } return (error); } static int realtimer_create(struct itimer *it) { callout_init_mtx(&it->it_callout, &it->it_mtx, 0); return (0); } static int realtimer_delete(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); ITIMER_UNLOCK(it); callout_drain(&it->it_callout); ITIMER_LOCK(it); return (0); } static int realtimer_gettime(struct itimer *it, struct itimerspec *ovalue) { struct timespec cts; mtx_assert(&it->it_mtx, MA_OWNED); realtimer_clocktime(it->it_clockid, &cts); *ovalue = it->it_time; if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) { timespecsub(&ovalue->it_value, &cts); if (ovalue->it_value.tv_sec < 0 || (ovalue->it_value.tv_sec == 0 && ovalue->it_value.tv_nsec == 0)) { ovalue->it_value.tv_sec = 0; ovalue->it_value.tv_nsec = 1; } } return (0); } static int realtimer_settime(struct itimer *it, int flags, struct itimerspec *value, struct itimerspec *ovalue) { struct timespec cts, ts; struct timeval tv; struct itimerspec val; mtx_assert(&it->it_mtx, MA_OWNED); val = *value; if (itimespecfix(&val.it_value)) return (EINVAL); if (timespecisset(&val.it_value)) { if (itimespecfix(&val.it_interval)) return (EINVAL); } else { timespecclear(&val.it_interval); } if (ovalue != NULL) realtimer_gettime(it, ovalue); it->it_time = val; if (timespecisset(&val.it_value)) { realtimer_clocktime(it->it_clockid, &cts); ts = val.it_value; if ((flags & TIMER_ABSTIME) == 0) { /* Convert to absolute time. */ timespecadd(&it->it_time.it_value, &cts); } else { timespecsub(&ts, &cts); /* * We don't care if ts is negative, tvtohz will * fix it.
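 * (tvtohz() clamps a zero or negative delta to one tick, so an
 * absolute expiry that is already in the past still fires on the
 * next tick rather than never; e.g. arming TIMER_ABSTIME for a time
 * 0.5 s before the current clock value leaves ts = -0.5 s here and
 * the callout below is simply scheduled one tick out).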
*/ } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } else { callout_stop(&it->it_callout); } return (0); } static void realtimer_clocktime(clockid_t id, struct timespec *ts) { if (id == CLOCK_REALTIME) getnanotime(ts); else /* CLOCK_MONOTONIC */ getnanouptime(ts); } int itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); it = itimer_find(p, timerid); if (it != NULL) { ksi->ksi_overrun = it->it_overrun; it->it_overrun_last = it->it_overrun; it->it_overrun = 0; ITIMER_UNLOCK(it); return (0); } return (EINVAL); } int itimespecfix(struct timespec *ts) { if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000) ts->tv_nsec = tick * 1000; return (0); } /* Timeout callback for realtime timer */ static void realtimer_expire(void *arg) { struct timespec cts, ts; struct timeval tv; struct itimer *it; struct proc *p; it = (struct itimer *)arg; p = it->it_proc; realtimer_clocktime(it->it_clockid, &cts); /* Only fire if time is reached. */ if (timespeccmp(&cts, &it->it_time.it_value, >=)) { if (timespecisset(&it->it_time.it_interval)) { timespecadd(&it->it_time.it_value, &it->it_time.it_interval); while (timespeccmp(&cts, &it->it_time.it_value, >=)) { if (it->it_overrun < INT_MAX) it->it_overrun++; else it->it_ksi.ksi_errno = ERANGE; timespecadd(&it->it_time.it_value, &it->it_time.it_interval); } } else { /* single shot timer ? */ timespecclear(&it->it_time.it_value); } if (timespecisset(&it->it_time.it_value)) { ts = it->it_time.it_value; timespecsub(&ts, &cts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } ITIMER_UNLOCK(it); itimer_fire(it); ITIMER_LOCK(it); } else if (timespecisset(&it->it_time.it_value)) { ts = it->it_time.it_value; timespecsub(&ts, &cts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } } void itimer_fire(struct itimer *it) { struct proc *p = it->it_proc; int ret; if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { PROC_LOCK(p); if (!KSI_ONQ(&it->it_ksi)) { it->it_ksi.ksi_errno = 0; ret = psignal_event(p, &it->it_sigev, &it->it_ksi); if (__predict_false(ret != 0)) { it->it_overrun++; /* * Broken userland code, thread went * away, disarm the timer. */ if (ret == ESRCH) { ITIMER_LOCK(it); timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); callout_stop(&it->it_callout); ITIMER_UNLOCK(it); } } } else { if (it->it_overrun < INT_MAX) it->it_overrun++; else it->it_ksi.ksi_errno = ERANGE; } PROC_UNLOCK(p); } } static void itimers_alloc(struct proc *p) { struct itimers *its; int i; its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO); LIST_INIT(&its->its_virtual); LIST_INIT(&its->its_prof); TAILQ_INIT(&its->its_worklist); for (i = 0; i < TIMER_MAX; i++) its->its_timers[i] = NULL; PROC_LOCK(p); if (p->p_itimers == NULL) { p->p_itimers = its; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); free(its, M_SUBPROC); } } static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { itimers_event_hook_exit(arg, p); } /* Clean up timers when some process events are being triggered. 
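 * Slots 0-2 of its_timers shadow the setitimer() timers (see the
 * reserved preset ids in kern_timer_create() above); ids from 3 up
 * belong to timer_create(), which is why the exec case below starts
 * deleting at 3 while the exit case deletes everything.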
*/ static void itimers_event_hook_exit(void *arg, struct proc *p) { struct itimers *its; struct itimer *it; int event = (int)(intptr_t)arg; int i; if (p->p_itimers != NULL) { its = p->p_itimers; for (i = 0; i < MAX_CLOCKS; ++i) { if (posix_clocks[i].event_hook != NULL) CLOCK_CALL(i, event_hook, (p, i, event)); } /* * According to SUSv3, XSI interval timers should be inherited * by the new image. */ if (event == ITIMER_EV_EXEC) i = 3; else if (event == ITIMER_EV_EXIT) i = 0; else panic("unhandled event"); for (; i < TIMER_MAX; ++i) { if ((it = its->its_timers[i]) != NULL) kern_timer_delete(curthread, i); } if (its->its_timers[0] == NULL && its->its_timers[1] == NULL && its->its_timers[2] == NULL) { free(its, M_SUBPROC); p->p_itimers = NULL; } } } Index: head/sys/kern/kern_xxx.c =================================================================== --- head/sys/kern/kern_xxx.c (revision 167210) +++ head/sys/kern/kern_xxx.c (revision 167211) @@ -1,314 +1,290 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct gethostname_args { char *hostname; u_int len; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int ogethostname(td, uap) struct thread *td; struct gethostname_args *uap; { int name[2]; int error; size_t len = uap->len; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; mtx_lock(&Giant); error = userland_sysctl(td, name, 2, uap->hostname, &len, 1, 0, 0, 0, 0); mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sethostname_args { char *hostname; u_int len; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int osethostname(td, uap) struct thread *td; register struct sethostname_args *uap; { int name[2]; int error; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; mtx_lock(&Giant); error = userland_sysctl(td, name, 2, 0, 0, 0, uap->hostname, uap->len, 0, 0); mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogethostid_args { int dummy; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int ogethostid(td, uap) struct thread *td; struct ogethostid_args *uap; { *(long *)(td->td_retval) = hostid; return (0); } #endif /* COMPAT_43 */ #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct osethostid_args { long hostid; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int osethostid(td, uap) struct thread *td; struct osethostid_args *uap; { int error; error = priv_check(td, PRIV_SETHOSTID); if (error) return (error); mtx_lock(&Giant); hostid = uap->hostid; mtx_unlock(&Giant); return (0); } -/* - * MPSAFE - */ int oquota(td, uap) struct thread *td; struct oquota_args *uap; { return (ENOSYS); } #endif /* COMPAT_43 */ /* * This is the FreeBSD-1.1 compatible uname(2) interface. These * days it is done in libc as a wrapper around a bunch of sysctls. * This must maintain the old 1.1 binary ABI. */
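/*
 * Hence the compile-time check below: the old 1.1 binaries hard-code
 * the 32-byte utsname field layout, so growing SYS_NMLN would silently
 * shift every field such a binary reads.
 */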
*/ #if SYS_NMLN != 32 #error "FreeBSD-1.1 uname syscall has been broken" #endif #ifndef _SYS_SYSPROTO_H_ struct uname_args { struct utsname *name; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int uname(td, uap) struct thread *td; struct uname_args *uap; { int name[2], error; size_t len; char *s, *us; name[0] = CTL_KERN; name[1] = KERN_OSTYPE; len = sizeof (uap->name->sysname); mtx_lock(&Giant); error = userland_sysctl(td, name, 2, uap->name->sysname, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0); name[1] = KERN_HOSTNAME; len = sizeof uap->name->nodename; error = userland_sysctl(td, name, 2, uap->name->nodename, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0); name[1] = KERN_OSRELEASE; len = sizeof uap->name->release; error = userland_sysctl(td, name, 2, uap->name->release, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->release + sizeof(uap->name->release) - 1, 0); /* name = KERN_VERSION; len = sizeof uap->name->version; error = userland_sysctl(td, name, 2, uap->name->version, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->version + sizeof(uap->name->version) - 1, 0); */ /* * this stupid hackery to make the version field look like FreeBSD 1.1 */ for(s = version; *s && *s != '#'; s++); for(us = uap->name->version; *s && *s != ':'; s++) { error = subyte( us++, *s); if (error) goto done2; } error = subyte( us++, 0); if (error) goto done2; name[0] = CTL_HW; name[1] = HW_MACHINE; len = sizeof uap->name->machine; error = userland_sysctl(td, name, 2, uap->name->machine, &len, 1, 0, 0, 0, 0); if (error) goto done2; subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0); done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdomainname_args { char *domainname; int len; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int getdomainname(td, uap) struct thread *td; struct getdomainname_args *uap; { int domainnamelen; int error; mtx_lock(&Giant); domainnamelen = strlen(domainname) + 1; if ((u_int)uap->len > domainnamelen) uap->len = domainnamelen; error = copyout(domainname, uap->domainname, uap->len); mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct setdomainname_args { char *domainname; int len; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int setdomainname(td, uap) struct thread *td; struct setdomainname_args *uap; { int error, domainnamelen; error = priv_check(td, PRIV_SETDOMAINNAME); if (error) return (error); mtx_lock(&Giant); if ((u_int)uap->len > sizeof (domainname) - 1) { error = EINVAL; goto done2; } domainnamelen = uap->len; error = copyin(uap->domainname, domainname, uap->len); domainname[domainnamelen] = 0; done2: mtx_unlock(&Giant); return (error); } Index: head/sys/kern/p1003_1b.c =================================================================== --- head/sys/kern/p1003_1b.c (revision 167210) +++ head/sys/kern/p1003_1b.c (revision 167211) @@ -1,344 +1,320 @@ /*- * Copyright (c) 1996, 1997, 1998 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* p1003_1b: Real Time common code. */ #include __FBSDID("$FreeBSD$"); #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B"); /* The system calls return ENOSYS if an entry is called that is * not run-time supported. I am also logging since some programs * start to use this when they shouldn't. That will be removed if annoying. */ int syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap) { log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", td->td_proc->p_comm, td->td_proc->p_pid, s); /* a " return nosys(p, uap); " here causes a core dump. 
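 * Each SYSCALL_NOT_PRESENT_GEN(foo) used below presumably expands to a
 * stub of roughly this shape (sketch only; the real macro lives in a
 * header):
 *
 *	int foo(struct thread *td, struct foo_args *uap)
 *	{
 *		return (syscall_not_present(td, "foo",
 *		    (struct nosys_args *)uap));
 *	}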
*/ return ENOSYS; } #if !defined(_KPOSIX_PRIORITY_SCHEDULING) /* Not configured but loadable via a module: */ static int sched_attach(void) { return 0; } SYSCALL_NOT_PRESENT_GEN(sched_setparam) SYSCALL_NOT_PRESENT_GEN(sched_getparam) SYSCALL_NOT_PRESENT_GEN(sched_setscheduler) SYSCALL_NOT_PRESENT_GEN(sched_getscheduler) SYSCALL_NOT_PRESENT_GEN(sched_yield) SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max) SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min) SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval) #else /* Configured in kernel version: */ static struct ksched *ksched; static int sched_attach(void) { int ret = ksched_attach(&ksched); if (ret == 0) p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1); return ret; } -/* - * MPSAFE - */ int sched_setparam(struct thread *td, struct sched_setparam_args *uap) { struct thread *targettd; struct proc *targetp; int e; struct sched_param sched_param; e = copyin(uap->param, &sched_param, sizeof(sched_param)); if (e) return (e); if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) return (ESRCH); targettd = FIRST_THREAD_IN_PROC(targetp); } e = p_cansched(td, targetp); if (e == 0) { e = ksched_setparam(ksched, targettd, (const struct sched_param *)&sched_param); } PROC_UNLOCK(targetp); return (e); } -/* - * MPSAFE - */ int sched_getparam(struct thread *td, struct sched_getparam_args *uap) { int e; struct sched_param sched_param; struct thread *targettd; struct proc *targetp; if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) { return (ESRCH); } targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ } e = p_cansee(td, targetp); if (e == 0) { e = ksched_getparam(ksched, targettd, &sched_param); } PROC_UNLOCK(targetp); if (e == 0) e = copyout(&sched_param, uap->param, sizeof(sched_param)); return (e); } -/* - * MPSAFE - */ int sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap) { int e; struct sched_param sched_param; struct thread *targettd; struct proc *targetp; /* Don't allow non root user to set a scheduler policy. 
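 * (priv_check() returns 0 when the thread holds the privilege and an
 * errno value otherwise, so the early return below hands the failure
 * straight back to the caller).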
*/ e = priv_check(td, PRIV_SCHED_SET); if (e) return (e); e = copyin(uap->param, &sched_param, sizeof(sched_param)); if (e) return (e); if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) return (ESRCH); targettd = FIRST_THREAD_IN_PROC(targetp); } e = p_cansched(td, targetp); if (e == 0) { e = ksched_setscheduler(ksched, targettd, uap->policy, (const struct sched_param *)&sched_param); } PROC_UNLOCK(targetp); return (e); } -/* - * MPSAFE - */ int sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap) { int e, policy; struct thread *targettd; struct proc *targetp; if (uap->pid == 0) { targetp = td->td_proc; targettd = td; PROC_LOCK(targetp); } else { targetp = pfind(uap->pid); if (targetp == NULL) { e = ESRCH; goto done2; } targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */ } e = p_cansee(td, targetp); if (e == 0) { e = ksched_getscheduler(ksched, targettd, &policy); td->td_retval[0] = policy; } PROC_UNLOCK(targetp); done2: return (e); } -/* - * MPSAFE - */ int sched_yield(struct thread *td, struct sched_yield_args *uap) { return (ksched_yield(ksched)); } -/* - * MPSAFE - */ int sched_get_priority_max(struct thread *td, struct sched_get_priority_max_args *uap) { int error, prio; error = ksched_get_priority_max(ksched, uap->policy, &prio); td->td_retval[0] = prio; return (error); } -/* - * MPSAFE - */ int sched_get_priority_min(struct thread *td, struct sched_get_priority_min_args *uap) { int error, prio; error = ksched_get_priority_min(ksched, uap->policy, &prio); td->td_retval[0] = prio; return (error); } -/* - * MPSAFE - */ int sched_rr_get_interval(struct thread *td, struct sched_rr_get_interval_args *uap) { struct timespec timespec; int error; error = kern_sched_rr_get_interval(td, uap->pid, &timespec); if (error == 0) error = copyout(&timespec, uap->interval, sizeof(timespec)); return (error); } int kern_sched_rr_get_interval(struct thread *td, pid_t pid, struct timespec *ts) { int e; struct thread *targettd; struct proc *targetp; if (pid == 0) { targettd = td; targetp = td->td_proc; PROC_LOCK(targetp); } else { targetp = td->td_proc; PROC_LOCK(targetp); targettd = thread_find(targetp, pid); if (targettd == NULL) { PROC_UNLOCK(targetp); return (ESRCH); } } e = p_cansee(td, targetp); if (e == 0) e = ksched_rr_get_interval(ksched, targettd, ts); PROC_UNLOCK(targetp); return (e); } #endif static void p31binit(void *notused) { (void) sched_attach(); p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); } SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL); Index: head/sys/kern/subr_prof.c =================================================================== --- head/sys/kern/subr_prof.c (revision 167210) +++ head/sys/kern/subr_prof.c (revision 167211) @@ -1,591 +1,588 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #ifdef GPROF #include #include #undef MCOUNT static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer"); static void kmstartup(void *); SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL) struct gmonparam _gmonparam = { GMON_PROF_OFF }; #ifdef GUPROF void nullfunc_loop_profiled() { int i; for (i = 0; i < CALIB_SCALE; i++) nullfunc_profiled(); } #define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */ void nullfunc_profiled() { } #endif /* GUPROF */ /* * Update the histograms to support extending the text region arbitrarily. * This is done slightly naively (no sparse regions), so will waste slight * amounts of memory, but will overall work nicely enough to allow profiling * of KLDs. */ void kmupetext(uintfptr_t nhighpc) { struct gmonparam np; /* slightly large */ struct gmonparam *p = &_gmonparam; char *cp; GIANT_REQUIRED; bcopy(p, &np, sizeof(*p)); np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER)); if (np.highpc <= p->highpc) return; np.textsize = np.highpc - p->lowpc; np.kcountsize = np.textsize / HISTFRACTION; np.hashfraction = HASHFRACTION; np.fromssize = np.textsize / HASHFRACTION; np.tolimit = np.textsize * ARCDENSITY / 100; if (np.tolimit < MINARCS) np.tolimit = MINARCS; else if (np.tolimit > MAXARCS) np.tolimit = MAXARCS; np.tossize = np.tolimit * sizeof(struct tostruct); cp = malloc(np.kcountsize + np.fromssize + np.tossize, M_GPROF, M_WAITOK); /* * Check for something else extending highpc while we slept. */ if (np.highpc <= p->highpc) { free(cp, M_GPROF); return; } np.tos = (struct tostruct *)cp; cp += np.tossize; np.kcount = (HISTCOUNTER *)cp; cp += np.kcountsize; np.froms = (u_short *)cp; #ifdef GUPROF /* Reinitialize pointers to overhead counters. 
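 * (the counters live inside the kcount array that was just carved out
 * of the fresh allocation above, so the cached pointers must be
 * recomputed against the new buffer before it is swapped in).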
*/ np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime)); np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount)); np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount)); #endif critical_enter(); bcopy(p->tos, np.tos, p->tossize); bzero((char *)np.tos + p->tossize, np.tossize - p->tossize); bcopy(p->kcount, np.kcount, p->kcountsize); bzero((char *)np.kcount + p->kcountsize, np.kcountsize - p->kcountsize); bcopy(p->froms, np.froms, p->fromssize); bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize); cp = (char *)p->tos; bcopy(&np, p, sizeof(*p)); critical_exit(); free(cp, M_GPROF); } static void kmstartup(dummy) void *dummy; { char *cp; struct gmonparam *p = &_gmonparam; #ifdef GUPROF int cputime_overhead; int empty_loop_time; int i; int mcount_overhead; int mexitcount_overhead; int nullfunc_loop_overhead; int nullfunc_loop_profiled_time; uintfptr_t tmp_addr; #endif /* * Round lowpc and highpc to multiples of the density we're using * so the rest of the scaling (here and in gprof) stays in ints. */ p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER)); p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); p->textsize = p->highpc - p->lowpc; printf("Profiling kernel, textsize=%lu [%jx..%jx]\n", p->textsize, (uintmax_t)p->lowpc, (uintmax_t)p->highpc); p->kcountsize = p->textsize / HISTFRACTION; p->hashfraction = HASHFRACTION; p->fromssize = p->textsize / HASHFRACTION; p->tolimit = p->textsize * ARCDENSITY / 100; if (p->tolimit < MINARCS) p->tolimit = MINARCS; else if (p->tolimit > MAXARCS) p->tolimit = MAXARCS; p->tossize = p->tolimit * sizeof(struct tostruct); cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize, M_GPROF, M_WAITOK | M_ZERO); p->tos = (struct tostruct *)cp; cp += p->tossize; p->kcount = (HISTCOUNTER *)cp; cp += p->kcountsize; p->froms = (u_short *)cp; p->histcounter_type = FUNCTION_ALIGNMENT / HISTFRACTION * NBBY; #ifdef GUPROF /* Signed counters. */ p->histcounter_type = -p->histcounter_type; /* Initialize pointers to overhead counters. */ p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime)); p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount)); p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount)); /* * Disable interrupts to avoid interference while we calibrate * things. */ critical_enter(); /* * Determine overheads. * XXX this needs to be repeated for each useful timer/counter. */ cputime_overhead = 0; startguprof(p); for (i = 0; i < CALIB_SCALE; i++) cputime_overhead += cputime(); empty_loop(); startguprof(p); empty_loop(); empty_loop_time = cputime(); nullfunc_loop_profiled(); /* * Start profiling. There won't be any normal function calls since * interrupts are disabled, but we will call the profiling routines * directly to determine their overheads. 
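 * Each probe is run CALIB_SCALE times and the totals are averaged
 * afterwards by CALIB_DOSCALE(), which adds CALIB_SCALE / 3 before
 * dividing so the result is rounded slightly up rather than
 * truncated.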
*/ p->state = GMON_PROF_HIRES; startguprof(p); nullfunc_loop_profiled(); startguprof(p); for (i = 0; i < CALIB_SCALE; i++) MCOUNT_OVERHEAD(profil); mcount_overhead = KCOUNT(p, PC_TO_I(p, profil)); startguprof(p); for (i = 0; i < CALIB_SCALE; i++) MEXITCOUNT_OVERHEAD(); MEXITCOUNT_OVERHEAD_GETLABEL(tmp_addr); mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr)); p->state = GMON_PROF_OFF; stopguprof(p); critical_exit(); nullfunc_loop_profiled_time = 0; for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled; tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end; tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER)) nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr)); #define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE) #define c2n(count, freq) ((int)((count) * 1000000000LL / freq)) printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n", CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)), CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)), CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)), CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)), CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate))); cputime_overhead -= empty_loop_time; mcount_overhead -= empty_loop_time; mexitcount_overhead -= empty_loop_time; /*- * Profiling overheads are determined by the times between the * following events: * MC1: mcount() is called * MC2: cputime() (called from mcount()) latches the timer * MC3: mcount() completes * ME1: mexitcount() is called * ME2: cputime() (called from mexitcount()) latches the timer * ME3: mexitcount() completes. * The times between the events vary slightly depending on instruction * combination and cache misses, etc. Attempt to determine the * minimum times. These can be subtracted from the profiling times * without much risk of reducing the profiling times below what they * would be when profiling is not configured. Abbreviate: * ab = minimum time between MC1 and MC3 * a = minimum time between MC1 and MC2 * b = minimum time between MC2 and MC3 * cd = minimum time between ME1 and ME3 * c = minimum time between ME1 and ME2 * d = minimum time between ME2 and ME3. * These satisfy the relations: * ab <= mcount_overhead (just measured) * a + b <= ab * cd <= mexitcount_overhead (just measured) * c + d <= cd * a + d <= nullfunc_loop_profiled_time (just measured) * a >= 0, b >= 0, c >= 0, d >= 0. * Assume that ab and cd are equal to the minimums.
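 * Working the assignments below through: mexitcount_post_overhead is
 * charged a quarter of (mcount_overhead - nullfunc_loop_overhead),
 * evidently because only the sum a + d is measured, and the pre/post
 * pairs are then back-computed so that p->mcount_pre_overhead +
 * p->mcount_post_overhead equals p->mcount_overhead +
 * p->cputime_overhead (and likewise for mexitcount), i.e. the charged
 * pieces add back up to the measured per-call cost plus one timer
 * latch.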
*/ p->cputime_overhead = CALIB_DOSCALE(cputime_overhead); p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead); p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead - cputime_overhead); nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time; p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead - nullfunc_loop_overhead) / 4); p->mexitcount_pre_overhead = p->mexitcount_overhead + p->cputime_overhead - p->mexitcount_post_overhead; p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead) - p->mexitcount_post_overhead; p->mcount_post_overhead = p->mcount_overhead + p->cputime_overhead - p->mcount_pre_overhead; printf( "Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n", c2n(p->cputime_overhead, p->profrate), c2n(p->mcount_overhead, p->profrate), c2n(p->mcount_pre_overhead, p->profrate), c2n(p->mcount_post_overhead, p->profrate), c2n(p->cputime_overhead, p->profrate), c2n(p->mexitcount_overhead, p->profrate), c2n(p->mexitcount_pre_overhead, p->profrate), c2n(p->mexitcount_post_overhead, p->profrate)); printf( "Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n", p->cputime_overhead, p->mcount_overhead, p->mcount_pre_overhead, p->mcount_post_overhead, p->cputime_overhead, p->mexitcount_overhead, p->mexitcount_pre_overhead, p->mexitcount_post_overhead); #endif /* GUPROF */ } /* * Return kernel profiling information. */ static int sysctl_kern_prof(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; struct gmonparam *gp = &_gmonparam; int error; int state; /* all sysctl names at this level are terminal */ if (namelen != 1) return (ENOTDIR); /* overloaded */ switch (name[0]) { case GPROF_STATE: state = gp->state; error = sysctl_handle_int(oidp, &state, 0, req); if (error) return (error); if (!req->newptr) return (0); if (state == GMON_PROF_OFF) { gp->state = state; PROC_LOCK(&proc0); stopprofclock(&proc0); PROC_UNLOCK(&proc0); stopguprof(gp); } else if (state == GMON_PROF_ON) { gp->state = GMON_PROF_OFF; stopguprof(gp); gp->profrate = profhz; PROC_LOCK(&proc0); startprofclock(&proc0); PROC_UNLOCK(&proc0); gp->state = state; #ifdef GUPROF } else if (state == GMON_PROF_HIRES) { gp->state = GMON_PROF_OFF; PROC_LOCK(&proc0); stopprofclock(&proc0); PROC_UNLOCK(&proc0); startguprof(gp); gp->state = state; #endif } else if (state != gp->state) return (EINVAL); return (0); case GPROF_COUNT: return (sysctl_handle_opaque(oidp, gp->kcount, gp->kcountsize, req)); case GPROF_FROMS: return (sysctl_handle_opaque(oidp, gp->froms, gp->fromssize, req)); case GPROF_TOS: return (sysctl_handle_opaque(oidp, gp->tos, gp->tossize, req)); case GPROF_GMONPARAM: return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req)); default: return (EOPNOTSUPP); } /* NOTREACHED */ } SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, ""); #endif /* GPROF */ /* * Profiling system call. * * The scale factor is a fixed point number with 16 bits of fraction, so that * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. 
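 * For example, a scale of 0x8000 (binary 0.5) maps a pc that is
 * 0x1234 bytes past the offset to index (0x1234 * 0x8000) >> 16 =
 * 0x91a; the & ~1 in the PC_TO_INDEX() macro below then keeps the
 * index aligned to the 16-bit counters, so at that scale each counter
 * covers four bytes of text.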
*/ #ifndef _SYS_SYSPROTO_H_ struct profil_args { caddr_t samples; size_t size; size_t offset; u_int scale; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int profil(td, uap) struct thread *td; register struct profil_args *uap; { struct uprof *upp; struct proc *p; if (uap->scale > (1 << 16)) return (EINVAL); p = td->td_proc; if (uap->scale == 0) { PROC_LOCK(p); stopprofclock(p); PROC_UNLOCK(p); return (0); } PROC_LOCK(p); upp = &td->td_proc->p_stats->p_prof; mtx_lock_spin(&sched_lock); upp->pr_off = uap->offset; upp->pr_scale = uap->scale; upp->pr_base = uap->samples; upp->pr_size = uap->size; mtx_unlock_spin(&sched_lock); startprofclock(p); PROC_UNLOCK(p); return (0); } /* * Scale is a fixed-point number with the binary point 16 bits * into the value, and is <= 1.0. pc is at most 32 bits, so the * intermediate result is at most 48 bits. */ #define PC_TO_INDEX(pc, prof) \ ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) /* * Collect user-level profiling statistics; called on a profiling tick, * when a process is running in user-mode. This routine may be called * from an interrupt context. We try to update the user profiling buffers * cheaply with fuswintr() and suswintr(). If that fails, we revert to * an AST that will vector us to trap() with a context in which copyin * and copyout will work. Trap will then call addupc_task(). * * Note that we may (rarely) not get around to the AST soon enough, and * lose profile ticks when the next tick overwrites this one, but in this * case the system is overloaded and the profile is probably already * inaccurate. */ void addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks) { struct uprof *prof; caddr_t addr; u_int i; int v; if (ticks == 0) return; prof = &td->td_proc->p_stats->p_prof; mtx_lock_spin(&sched_lock); if (pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) { mtx_unlock_spin(&sched_lock); return; /* out of range; ignore */ } addr = prof->pr_base + i; mtx_unlock_spin(&sched_lock); if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) { td->td_profil_addr = pc; td->td_profil_ticks = ticks; td->td_pflags |= TDP_OWEUPC; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_ASTPENDING; mtx_unlock_spin(&sched_lock); } } /* * Much like before, but we can afford to take faults here. If the * update fails, we simply turn off profiling. */ void addupc_task(struct thread *td, uintfptr_t pc, u_int ticks) { struct proc *p = td->td_proc; struct uprof *prof; caddr_t addr; u_int i; u_short v; int stop = 0; if (ticks == 0) return; PROC_LOCK(p); if (!(p->p_flag & P_PROFIL)) { PROC_UNLOCK(p); return; } p->p_profthreads++; prof = &p->p_stats->p_prof; if (pc < prof->pr_off || (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) { goto out; } addr = prof->pr_base + i; PROC_UNLOCK(p); if (copyin(addr, &v, sizeof(v)) == 0) { v += ticks; if (copyout(&v, addr, sizeof(v)) == 0) { PROC_LOCK(p); goto out; } } stop = 1; PROC_LOCK(p); out: if (--p->p_profthreads == 0) { if (p->p_flag & P_STOPPROF) { wakeup(&p->p_profthreads); stop = 0; } } if (stop) stopprofclock(p); PROC_UNLOCK(p); } #if (defined(__amd64__) || defined(__i386__)) && \ defined(__GNUCLIKE_CTOR_SECTION_HANDLING) /* * Support for "--test-coverage --profile-arcs" in GCC. * * We need to call all the functions in the .ctor section, in order * to get all the counter-arrays strung into a list. * * XXX: the .ctors call __bb_init_func which is located in over in * XXX: i386/i386/support.s for historical reasons. 
There is probably * XXX: no reason for that to be assembler anymore, but doing it right * XXX: in MI C code requires one to reverse-engineer the type-selection * XXX: inside GCC. Have fun. * * XXX: Worrisome perspective: Calling the .ctors may make C++ in the * XXX: kernel feasible. Don't. */ typedef void (*ctor_t)(void); extern ctor_t _start_ctors, _stop_ctors; static void tcov_init(void *foo __unused) { ctor_t *p, q; for (p = &_start_ctors; p < &_stop_ctors; p++) { q = *p; q(); } } SYSINIT(tcov_init, SI_SUB_KPROF, SI_ORDER_SECOND, tcov_init, NULL) /* * GCC contains magic to recognize calls to for instance execve() and * puts in calls to this function to preserve the profile counters. * XXX: Put zinging punchline here. */ void __bb_fork_func(void); void __bb_fork_func(void) { } #endif Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 167210) +++ head/sys/kern/subr_trap.c (revision 167211) @@ -1,280 +1,278 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #ifdef __i386__ #include "opt_npx.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include /* - * Define the code needed before returning to user mode, for - * trap and syscall. - * - * MPSAFE + * Define the code needed before returning to user mode, for trap and + * syscall. 
*/ void userret(struct thread *td, struct trapframe *frame) { struct proc *p = td->td_proc; CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); #ifdef DIAGNOSTIC /* Check that we called signotify() enough. */ PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || (td->td_flags & TDF_ASTPENDING) == 0)) printf("failed to set signal flags properly for ast()\n"); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); #endif #ifdef KTRACE KTRUSERRET(td); #endif /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * We need to check to see if we have to exit or wait due to a * single threading requirement or some other STOP condition. * Don't bother doing all the work if the stop bits are not set * at this time.. If we miss it, we miss it.. no big deal. */ if (P_SHOULDSTOP(p)) { PROC_LOCK(p); thread_suspend_check(0); /* Can suspend or kill */ PROC_UNLOCK(p); } #ifdef KSE /* * Do special thread processing, e.g. upcall tweaking and such. */ if (p->p_flag & P_SA) thread_userret(td, frame); #endif /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio); } /* * Let the scheduler adjust our priority etc. */ sched_userret(td); KASSERT(td->td_locks == 0, ("userret: Returning with %d locks held.", td->td_locks)); } /* * Process an asynchronous software trap. * This is relatively easy. * This function will return with preemption disabled. */ void ast(struct trapframe *framep) { struct thread *td; struct proc *p; struct rlimit rlim; int sflag; int flags; int sig; #if defined(DEV_NPX) && !defined(SMP) int ucode; ksiginfo_t ksi; #endif td = curthread; p = td->td_proc; CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); mtx_assert(&sched_lock, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; #ifdef KSE if ((p->p_flag & P_SA) && (td->td_mailbox == NULL)) thread_user_enter(td); #endif /* * This updates the p_sflag's for the checks below in one * "atomic" operation with turning off the astpending flag. * If another AST is triggered while we are handling the * AST's saved in sflag, the astpending flag will be set and * ast() will be called again. */ mtx_lock_spin(&sched_lock); flags = td->td_flags; sflag = p->p_sflag; if (p->p_sflag & (PS_ALRMPEND | PS_PROFPEND | PS_XCPU)) p->p_sflag &= ~(PS_ALRMPEND | PS_PROFPEND | PS_XCPU); #ifdef MAC if (p->p_sflag & PS_MACPEND) p->p_sflag &= ~PS_MACPEND; #endif td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDRESCHED | TDF_INTERRUPT); cnt.v_trap++; mtx_unlock_spin(&sched_lock); /* * XXXKSE While the fact that we owe a user profiling * tick is stored per thread in this code, the statistics * themselves are still stored per process. * This should probably change, by which I mean that * possibly the location of both might change. 
*/ if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) { addupc_task(td, td->td_profil_addr, td->td_profil_ticks); td->td_profil_ticks = 0; td->td_pflags &= ~TDP_OWEUPC; } if (sflag & PS_ALRMPEND) { PROC_LOCK(p); psignal(p, SIGVTALRM); PROC_UNLOCK(p); } #if defined(DEV_NPX) && !defined(SMP) if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) { atomic_clear_int(&PCPU_GET(curpcb)->pcb_flags, PCB_NPXTRAP); ucode = npxtrap(); if (ucode != -1) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGFPE; ksi.ksi_code = ucode; trapsignal(td, &ksi); } } #endif if (sflag & PS_PROFPEND) { PROC_LOCK(p); psignal(p, SIGPROF); PROC_UNLOCK(p); } if (sflag & PS_XCPU) { PROC_LOCK(p); lim_rlimit(p, RLIMIT_CPU, &rlim); mtx_lock_spin(&sched_lock); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { mtx_unlock_spin(&sched_lock); killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim.rlim_max) p->p_cpulimit += 5; mtx_unlock_spin(&sched_lock); psignal(p, SIGXCPU); } PROC_UNLOCK(p); } #ifdef MAC if (sflag & PS_MACPEND) mac_thread_userret(td); #endif if (flags & TDF_NEEDRESCHED) { #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1); #endif mtx_lock_spin(&sched_lock); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL, NULL); mtx_unlock_spin(&sched_lock); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1); #endif } if (flags & TDF_NEEDSIGCHK) { PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) postsig(sig); mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } userret(td, framep); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/sys_generic.c =================================================================== --- head/sys/kern/sys_generic.c (revision 167210) +++ head/sys/kern/sys_generic.c (revision 167211) @@ -1,1182 +1,1147 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollscan(struct thread *, struct pollfd *, u_int); static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); /* * Read system call. */ #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif -/* - * MPSAFE - */ int read(td, uap) struct thread *td; struct read_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_readv(td, uap->fd, &auio); return(error); } /* * Positioned read system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif -/* - * MPSAFE - */ int pread(td, uap) struct thread *td; struct pread_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_preadv(td, uap->fd, &auio, uap->offset); return(error); } /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif -/* - * MPSAFE - */ int readv(struct thread *td, struct readv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_readv(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_read(td, fd, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Scatter positioned read system call. 
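* From userland this is preadv(2), roughly: ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset). The offset is only meaningful on seekable descriptors; kern_preadv() below rejects non-seekable ones with ESPIPE.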
*/ #ifndef _SYS_SYSPROTO_H_ struct preadv_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif -/* - * MPSAFE - */ int preadv(struct thread *td, struct preadv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_preadv(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_preadv(td, fd, auio, offset) struct thread *td; int fd; struct uio *auio; off_t offset; { struct file *fp; int error; error = fget_read(td, fd, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for readv and preadv that reads data in * from a file using the passed in uio, offset, and flags. */ static int dofileread(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif /* Finish zero length reads right here */ if (auio->uio_resid == 0) { td->td_retval[0] = 0; return(0); } auio->uio_rw = UIO_READ; auio->uio_offset = offset; auio->uio_td = td; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_READ, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Write system call */ #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif -/* - * MPSAFE - */ int write(td, uap) struct thread *td; struct write_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_writev(td, uap->fd, &auio); return(error); } /* - * Positioned write system call + * Positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif -/* - * MPSAFE - */ int pwrite(td, uap) struct thread *td; struct pwrite_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_pwritev(td, uap->fd, &auio, uap->offset); return(error); } /* - * Gather write system call + * Gather write system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif -/* - * MPSAFE - */ int writev(struct thread *td, struct writev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_writev(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_write(td, fd, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* - * Gather positioned write system call + * Gather positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwritev_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif -/* - * MPSAFE - */ int pwritev(struct thread *td, struct pwritev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_pwritev(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_pwritev(td, fd, auio, offset) struct thread *td; struct uio *auio; int fd; off_t offset; { struct file *fp; int error; error = fget_write(td, fd, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for writev and pwritev that writes data to * a file using the passed in uio, offset, and flags. */ static int dofilewrite(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif auio->uio_rw = UIO_WRITE; auio->uio_td = td; auio->uio_offset = offset; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Socket layer is responsible for issuing SIGPIPE. */ if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_WRITE, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* - * Ioctl system call + * Ioctl system call. */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif -/* - * MPSAFE - */ /* ARGSUSED */ int ioctl(struct thread *td, struct ioctl_args *uap) { u_long com; int arg, error; u_int size; caddr_t data; if (uap->com > 0xffffffff) { printf( "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", td->td_proc->p_pid, td->td_proc->p_comm, uap->com); uap->com &= 0xffffffff; } com = uap->com; /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. 
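* For example, FIONBIO is _IOW('f', 126, int): the IOC_IN direction bit is set and IOCPARM_LEN() recovers sizeof(int), so the code below knows to copyin() an int argument before dispatching.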
*/ size = IOCPARM_LEN(com); if ((size > IOCPARM_MAX) || ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) ((com & IOC_OUT) && size == 0) || #else ((com & (IOC_IN | IOC_OUT)) && size == 0) || #endif ((com & IOC_VOID) && size > 0 && size != sizeof(int))) return (ENOTTY); if (size > 0) { if (!(com & IOC_VOID)) data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); else { /* Integer argument. */ arg = (intptr_t)uap->data; data = (void *)&arg; size = 0; } } else data = (void *)&uap->data; if (com & IOC_IN) { error = copyin(uap->data, data, (u_int)size); if (error) { if (size > 0) free(data, M_IOCTLOPS); return (error); } } else if (com & IOC_OUT) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } error = kern_ioctl(td, uap->fd, com, data); if (error == 0 && (com & IOC_OUT)) error = copyout(data, uap->data, (u_int)size); if (size > 0) free(data, M_IOCTLOPS); return (error); } int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; int error; int tmp; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { fdrop(fp, td); return (EBADF); } fdp = td->td_proc->p_fd; switch (com) { case FIONCLEX: FILEDESC_LOCK_FAST(fdp); fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; FILEDESC_UNLOCK_FAST(fdp); goto out; case FIOCLEX: FILEDESC_LOCK_FAST(fdp); fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_UNLOCK_FAST(fdp); goto out; case FIONBIO: FILE_LOCK(fp); if ((tmp = *(int *)data)) fp->f_flag |= FNONBLOCK; else fp->f_flag &= ~FNONBLOCK; FILE_UNLOCK(fp); data = (void *)&tmp; break; case FIOASYNC: FILE_LOCK(fp); if ((tmp = *(int *)data)) fp->f_flag |= FASYNC; else fp->f_flag &= ~FASYNC; FILE_UNLOCK(fp); data = (void *)&tmp; break; } error = fo_ioctl(fp, com, data, td->td_ucred, td); out: fdrop(fp, td); return (error); } /* * sellock and selwait are initialized in selectinit() via SYSINIT. */ struct mtx sellock; struct cv selwait; u_int nselcoll; /* Select collisions since boot */ SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); /* * Select system call. */ #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif -/* - * MPSAFE - */ int select(td, uap) register struct thread *td; register struct select_args *uap; { struct timeval tv, *tvp; int error; if (uap->tv != NULL) { error = copyin(uap->tv, &tv, sizeof(tv)); if (error) return (error); tvp = &tv; } else tvp = NULL; return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp)); } int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, fd_set *fd_ex, struct timeval *tvp) { struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval atv, rtv, ttv; int error, timo; u_int ncoll, nbufbytes, ncpbytes, nfdbits; if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; FILEDESC_LOCK_FAST(fdp); if (nd > td->td_proc->p_fd->fd_nfiles) nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ FILEDESC_UNLOCK_FAST(fdp); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. 
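* For example (illustrative, assuming 32-bit fd_masks): nd = 100 rounds up to nfdbits = 128, giving ncpbytes = 16, so each non-NULL set costs 32 bytes (an input and an output half) and all three sets fit easily in the on-stack s_selbits buffer.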
*/ nfdbits = roundup(nd, NFDBITS); ncpbytes = nfdbits / NBBY; nbufbytes = 0; if (fd_in != NULL) nbufbytes += 2 * ncpbytes; if (fd_ou != NULL) nbufbytes += 2 * ncpbytes; if (fd_ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. */ sbp = selbits; #define getbits(name, x) \ do { \ if (name == NULL) \ ibits[x] = NULL; \ else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpbytes); \ if (error != 0) \ goto done_nosellock; \ } \ } while (0) getbits(fd_in, 0); getbits(fd_ou, 1); getbits(fd_ex, 2); #undef getbits if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); if (tvp != NULL) { atv = *tvp; if (itimerfix(&atv)) { error = EINVAL; goto done_nosellock; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; TAILQ_INIT(&td->td_selq); mtx_lock(&sellock); retry: ncoll = nselcoll; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); mtx_unlock(&sellock); error = selscan(td, ibits, obits, nd); mtx_lock(&sellock); if (error || td->td_retval[0]) goto done; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } /* * An event of interest may occur while we do not hold * sellock, so check TDF_SELECT and the number of * collisions and rescan the file descriptors if * necessary. */ mtx_lock_spin(&sched_lock); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { mtx_unlock_spin(&sched_lock); goto retry; } mtx_unlock_spin(&sched_lock); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); else error = cv_wait_sig(&selwait, &sellock); if (error == 0) goto retry; done: clear_selinfo_list(td); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); mtx_unlock(&sellock); done_nosellock: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; #define putbits(name, x) \ if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ error = error2; if (error == 0) { int error2; putbits(fd_in, 0); putbits(fd_ou, 1); putbits(fd_ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); return (error); } static int selscan(td, ibits, obits, nfd) struct thread *td; fd_mask **ibits, **obits; int nfd; { int msk, i, fd; fd_mask bits; struct file *fp; int n = 0; /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_LOCK(fdp); for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; for (i = 0; i < nfd; i += NFDBITS) { bits = ibits[msk][i/NFDBITS]; /* ffs(int mask) not portable, fd_mask is long */ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { if (!(bits & 1)) continue; if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_UNLOCK(fdp); return (EBADF); } if (fo_poll(fp, flag[msk], td->td_ucred, td)) { obits[msk][(fd)/NFDBITS] |= ((fd_mask)1 << ((fd) % NFDBITS)); n++; } } } } FILEDESC_UNLOCK(fdp); td->td_retval[0] = n; return (0); } /* * Poll system call. 
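* From userland this is poll(2), roughly: int poll(struct pollfd *fds, u_int nfds, int timeout). The timeout is in milliseconds; INFTIM (-1) means block indefinitely, as the INFTIM test below shows.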
*/ #ifndef _SYS_SYSPROTO_H_ struct poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif -/* - * MPSAFE - */ int poll(td, uap) struct thread *td; struct poll_args *uap; { struct pollfd *bits; struct pollfd smallbits[32]; struct timeval atv, rtv, ttv; int error = 0, timo; u_int ncoll, nfds; size_t ni; nfds = uap->nfds; /* * This is kinda bogus. We have fd limits, but that is not * really related to the size of the pollfd array. Make sure * we let the process use at least FD_SETSIZE entries and at * least enough for the current limits. We want to be reasonably * safe, but not overly restrictive. */ PROC_LOCK(td->td_proc); if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) && (nfds > FD_SETSIZE)) { PROC_UNLOCK(td->td_proc); error = EINVAL; goto done2; } PROC_UNLOCK(td->td_proc); ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallbits)) bits = malloc(ni, M_TEMP, M_WAITOK); else bits = smallbits; error = copyin(uap->fds, bits, ni); if (error) goto done_nosellock; if (uap->timeout != INFTIM) { atv.tv_sec = uap->timeout / 1000; atv.tv_usec = (uap->timeout % 1000) * 1000; if (itimerfix(&atv)) { error = EINVAL; goto done_nosellock; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; TAILQ_INIT(&td->td_selq); mtx_lock(&sellock); retry: ncoll = nselcoll; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); mtx_unlock(&sellock); error = pollscan(td, bits, nfds); mtx_lock(&sellock); if (error || td->td_retval[0]) goto done; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } /* * An event of interest may occur while we do not hold * sellock, so check TDF_SELECT and the number of collisions * and rescan the file descriptors if necessary. */ mtx_lock_spin(&sched_lock); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { mtx_unlock_spin(&sched_lock); goto retry; } mtx_unlock_spin(&sched_lock); if (timo > 0) error = cv_timedwait_sig(&selwait, &sellock, timo); else error = cv_wait_sig(&selwait, &sellock); if (error == 0) goto retry; done: clear_selinfo_list(td); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); mtx_unlock(&sellock); done_nosellock: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (error == 0) { error = copyout(bits, uap->fds, ni); if (error) goto out; } out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); done2: return (error); } static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { register struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; FILEDESC_LOCK(fdp); for (i = 0; i < nfd; i++, fds++) { if (fds->fd >= fdp->fd_nfiles) { fds->revents = POLLNVAL; n++; } else if (fds->fd < 0) { fds->revents = 0; } else { fp = fdp->fd_ofiles[fds->fd]; if (fp == NULL) { fds->revents = POLLNVAL; n++; } else { /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); if (fds->revents != 0) n++; } } } FILEDESC_UNLOCK(fdp); td->td_retval[0] = n; return (0); } /* * OpenBSD poll system call. + * * XXX this isn't quite a true representation.. OpenBSD uses select ops. 
*/ #ifndef _SYS_SYSPROTO_H_ struct openbsd_poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif -/* - * MPSAFE - */ int openbsd_poll(td, uap) register struct thread *td; register struct openbsd_poll_args *uap; { return (poll(td, (struct poll_args *)uap)); } /* - * Remove the references to the thread from all of the objects - * we were polling. + * Remove the references to the thread from all of the objects we were + * polling. * - * This code assumes that the underlying owner of the selinfo - * structure will hold sellock before it changes it, and that - * it will unlink itself from our list if it goes away. + * This code assumes that the underlying owner of the selinfo structure will + * hold sellock before it changes it, and that it will unlink itself from our + * list if it goes away. */ void clear_selinfo_list(td) struct thread *td; { struct selinfo *si; mtx_assert(&sellock, MA_OWNED); TAILQ_FOREACH(si, &td->td_selq, si_thrlist) si->si_thread = NULL; TAILQ_INIT(&td->td_selq); } /* * Record a select request. */ void selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { mtx_lock(&sellock); /* * If the selinfo's thread pointer is NULL then take ownership of it. * * If the thread pointer is not NULL and it points to another * thread, then we have a collision. * * If the thread pointer is not NULL and points back to us then leave * it alone, as we've already pointed it at us and added it to * our list. */ if (sip->si_thread == NULL) { sip->si_thread = selector; TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist); } else if (sip->si_thread != selector) { sip->si_flags |= SI_COLL; } mtx_unlock(&sellock); } /* Wake up a selecting thread. */ void selwakeup(sip) struct selinfo *sip; { doselwakeup(sip, -1); } /* Wake up a selecting thread, and set its priority. */ void selwakeuppri(sip, pri) struct selinfo *sip; int pri; { doselwakeup(sip, pri); } /* * Do a wakeup when a selectable event occurs. */ static void doselwakeup(sip, pri) struct selinfo *sip; int pri; { struct thread *td; mtx_lock(&sellock); td = sip->si_thread; if ((sip->si_flags & SI_COLL) != 0) { nselcoll++; sip->si_flags &= ~SI_COLL; cv_broadcastpri(&selwait, pri); } if (td == NULL) { mtx_unlock(&sellock); return; } TAILQ_REMOVE(&td->td_selq, sip, si_thrlist); sip->si_thread = NULL; mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); sleepq_remove(td, &selwait); mtx_unlock(&sellock); } static void selectinit(void *); SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) /* ARGSUSED*/ static void selectinit(dummy) void *dummy; { cv_init(&selwait, "select"); mtx_init(&sellock, "sellck", NULL, MTX_DEF); } Index: head/sys/kern/sys_process.c =================================================================== --- head/sys/kern/sys_process.c (revision 167210) +++ head/sys/kern/sys_process.c (revision 167211) @@ -1,1032 +1,1029 @@ /*- * Copyright (c) 1994, Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_IA32 #include #include #include extern struct sysentvec ia32_freebsd_sysvec; struct ptrace_io_desc32 { int piod_op; u_int32_t piod_offs; u_int32_t piod_addr; u_int32_t piod_len; }; #endif /* * Functions implemented using PROC_ACTION(): * * proc_read_regs(proc, regs) * Get the current user-visible register set from the process * and copy it into the regs structure (). * The process is stopped at the time read_regs is called. * * proc_write_regs(proc, regs) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * Depending on the architecture this may have fix-up work to do, * especially if the IAR or PCW are modified. * The process is stopped at the time write_regs is called. * * proc_read_fpregs, proc_write_fpregs * deal with the floating point register set, otherwise as above. * * proc_read_dbregs, proc_write_dbregs * deal with the processor debug register set, otherwise as above. * * proc_sstep(proc) * Arrange for the process to trap after executing a single instruction. */ #define PROC_ACTION(action) do { \ int error; \ \ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \ if ((td->td_proc->p_sflag & PS_INMEM) == 0) \ error = EIO; \ else \ error = (action); \ return (error); \ } while(0) int proc_read_regs(struct thread *td, struct reg *regs) { PROC_ACTION(fill_regs(td, regs)); } int proc_write_regs(struct thread *td, struct reg *regs) { PROC_ACTION(set_regs(td, regs)); } int proc_read_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(fill_dbregs(td, dbregs)); } int proc_write_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(set_dbregs(td, dbregs)); } /* * Ptrace doesn't support fpregs at all, and there are no security holes * or translations for fpregs, so we can just copy them. */ int proc_read_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(fill_fpregs(td, fpregs)); } int proc_write_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(set_fpregs(td, fpregs)); } #ifdef COMPAT_IA32 /* For 32 bit binaries, we need to expose the 32 bit regs layouts. 
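* (For example, a 32-bit debugger issuing PT_GETREGS must be handed a struct reg32 rather than the native struct reg; the wrap32 logic in kern_ptrace() selects these variants at runtime.)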
*/ int proc_read_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(fill_regs32(td, regs32)); } int proc_write_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(set_regs32(td, regs32)); } int proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(fill_dbregs32(td, dbregs32)); } int proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(set_dbregs32(td, dbregs32)); } int proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(fill_fpregs32(td, fpregs32)); } int proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(set_fpregs32(td, fpregs32)); } #endif int proc_sstep(struct thread *td) { PROC_ACTION(ptrace_single_step(td)); } int proc_rwmem(struct proc *p, struct uio *uio) { vm_map_t map; vm_object_t backing_object, object = NULL; vm_offset_t pageno = 0; /* page number */ vm_prot_t reqprot; int error, writing; /* * Assert that someone has locked this vmspace. (Should be * curthread but we can't assert that.) This keeps the process * from exiting out from under us until this operation completes. */ KASSERT(p->p_lock >= 1, ("%s: process %p (pid %d) not held", __func__, p, p->p_pid)); /* * The map we want... */ map = &p->p_vmspace->vm_map; writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) : VM_PROT_READ; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_map_t tmap; vm_offset_t uva; int page_offset; /* offset into page */ vm_map_entry_t out_entry; vm_prot_t out_prot; boolean_t wired; vm_pindex_t pindex; u_int len; vm_page_t m; object = NULL; uva = (vm_offset_t)uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * Fault the page on behalf of the process */ error = vm_fault(map, pageno, reqprot, VM_FAULT_NORMAL); if (error) { error = EFAULT; break; } /* * Now we need to get the page. out_entry, out_prot, wired, * and single_use aren't used. One would think the vm code * would be a *bit* nicer... We use tmap because * vm_map_lookup() can change the map argument. */ tmap = map; error = vm_map_lookup(&tmap, pageno, reqprot, &out_entry, &object, &pindex, &out_prot, &wired); if (error) { error = EFAULT; break; } VM_OBJECT_LOCK(object); while ((m = vm_page_lookup(object, pindex)) == NULL && !writing && (backing_object = object->backing_object) != NULL) { /* * Allow fallback to backing objects if we are reading. */ VM_OBJECT_LOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); VM_OBJECT_UNLOCK(object); object = backing_object; } VM_OBJECT_UNLOCK(object); if (m == NULL) { vm_map_lookup_done(tmap, out_entry); error = EFAULT; break; } /* * Hold the page in memory. */ vm_page_lock_queues(); vm_page_hold(m); vm_page_unlock_queues(); /* * We're done with tmap now. */ vm_map_lookup_done(tmap, out_entry); /* * Now do the i/o move. */ error = uiomove_fromphys(&m, page_offset, len, uio); /* * Release the page. */ vm_page_lock_queues(); vm_page_unhold(m); vm_page_unlock_queues(); } while (error == 0 && uio->uio_resid > 0); return (error); } /* * Process debugging system call. */ #ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; #endif #ifdef COMPAT_IA32 /* * This CPP subterfuge is to try and reduce the number of ifdefs in * the body of the code. 
* COPYIN(uap->addr, &r.reg, sizeof r.reg); * becomes either: * copyin(uap->addr, &r.reg, sizeof r.reg); * or * copyin(uap->addr, &r.reg32, sizeof r.reg32); * .. except this is done at runtime. */ #define COPYIN(u, k, s) wrap32 ? \ copyin(u, k ## 32, s ## 32) : \ copyin(u, k, s) #define COPYOUT(k, u, s) wrap32 ? \ copyout(k ## 32, u, s ## 32) : \ copyout(k, u, s) #else #define COPYIN(u, k, s) copyin(u, k, s) #define COPYOUT(k, u, s) copyout(k, u, s) #endif -/* - * MPSAFE - */ int ptrace(struct thread *td, struct ptrace_args *uap) { /* * XXX this obfuscation is to reduce stack usage, but the register * structs may be too large to put on the stack anyway. */ union { struct ptrace_io_desc piod; struct ptrace_lwpinfo pl; struct dbreg dbreg; struct fpreg fpreg; struct reg reg; #ifdef COMPAT_IA32 struct dbreg32 dbreg32; struct fpreg32 fpreg32; struct reg32 reg32; struct ptrace_io_desc32 piod32; #endif } r; void *addr; int error = 0; #ifdef COMPAT_IA32 int wrap32 = 0; if (td->td_proc->p_sysent == &ia32_freebsd_sysvec) wrap32 = 1; #endif AUDIT_ARG(pid, uap->pid); AUDIT_ARG(cmd, uap->req); AUDIT_ARG(addr, uap->addr); AUDIT_ARG(value, uap->data); addr = &r; switch (uap->req) { case PT_GETREGS: case PT_GETFPREGS: case PT_GETDBREGS: case PT_LWPINFO: break; case PT_SETREGS: error = COPYIN(uap->addr, &r.reg, sizeof r.reg); break; case PT_SETFPREGS: error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg); break; case PT_SETDBREGS: error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg); break; case PT_IO: error = COPYIN(uap->addr, &r.piod, sizeof r.piod); break; default: addr = uap->addr; break; } if (error) return (error); error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data); if (error) return (error); switch (uap->req) { case PT_IO: error = COPYOUT(&r.piod, uap->addr, sizeof r.piod); break; case PT_GETREGS: error = COPYOUT(&r.reg, uap->addr, sizeof r.reg); break; case PT_GETFPREGS: error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg); break; case PT_GETDBREGS: error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg); break; case PT_LWPINFO: error = copyout(&r.pl, uap->addr, uap->data); break; } return (error); } #undef COPYIN #undef COPYOUT #ifdef COMPAT_IA32 /* * PROC_READ(regs, td2, addr); * becomes either: * proc_read_regs(td2, addr); * or * proc_read_regs32(td2, addr); * .. except this is done at runtime. There is an additional * complication in that PROC_WRITE disallows 32 bit consumers * from writing to 64 bit address space targets. */ #define PROC_READ(w, t, a) wrap32 ? \ proc_read_ ## w ## 32(t, a) : \ proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) wrap32 ? \ (safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \ proc_write_ ## w (t, a) #else #define PROC_READ(w, t, a) proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) proc_write_ ## w (t, a) #endif int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) { struct iovec iov; struct uio uio; struct proc *curp, *p, *pp; struct thread *td2 = NULL; struct ptrace_io_desc *piod = NULL; struct ptrace_lwpinfo *pl; int error, write, tmp, num; int proctree_locked = 0; lwpid_t tid = 0, *buf; #ifdef COMPAT_IA32 int wrap32 = 0, safe = 0; struct ptrace_io_desc32 *piod32 = NULL; #endif curp = td->td_proc; /* Lock proctree before locking the process. 
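* (proctree_lock is taken before the process lock to preserve the system-wide lock order; only requests that may reparent or otherwise modify the process tree acquire it, as the switch below shows.)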
*/ switch (req) { case PT_TRACE_ME: case PT_ATTACH: case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_DETACH: sx_xlock(&proctree_lock); proctree_locked = 1; break; default: break; } write = 0; if (req == PT_TRACE_ME) { p = td->td_proc; PROC_LOCK(p); } else { if (pid <= PID_MAX) { if ((p = pfind(pid)) == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } } else { /* this is slow, should be optimized */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_tid == pid) break; } mtx_unlock_spin(&sched_lock); if (td2 != NULL) break; /* proc lock held */ PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); if (p == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } tid = pid; pid = p->p_pid; } } AUDIT_ARG(process, p); if ((p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto fail; } if ((error = p_cansee(td, p)) != 0) goto fail; if ((error = p_candebug(td, p)) != 0) goto fail; /* * System processes can't be debugged. */ if ((p->p_flag & P_SYSTEM) != 0) { error = EINVAL; goto fail; } if (tid == 0) { if ((p->p_flag & P_STOPPED_TRACE) != 0) { KASSERT(p->p_xthread != NULL, ("NULL p_xthread")); td2 = p->p_xthread; } else { td2 = FIRST_THREAD_IN_PROC(p); } tid = td2->td_tid; } #ifdef COMPAT_IA32 /* * Test if we're a 32 bit client and what the target is. * Set the wrap controls accordingly. */ if (td->td_proc->p_sysent == &ia32_freebsd_sysvec) { if (td2->td_proc->p_sysent == &ia32_freebsd_sysvec) safe = 1; wrap32 = 1; } #endif /* * Permissions check */ switch (req) { case PT_TRACE_ME: /* Always legal. */ break; case PT_ATTACH: /* Self */ if (p->p_pid == td->td_proc->p_pid) { error = EINVAL; goto fail; } /* Already traced */ if (p->p_flag & P_TRACED) { error = EBUSY; goto fail; } /* Can't trace an ancestor if you're being traced. */ if (curp->p_flag & P_TRACED) { for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) { if (pp == p) { error = EINVAL; goto fail; } } } /* OK */ break; case PT_CLEARSTEP: /* Allow thread to clear single step for itself */ if (td->td_tid == tid) break; /* FALLTHROUGH */ default: /* not being traced... */ if ((p->p_flag & P_TRACED) == 0) { error = EPERM; goto fail; } /* not being traced by YOU */ if (p->p_pptr != td->td_proc) { error = EBUSY; goto fail; } /* not currently stopped */ if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) == 0 || p->p_suspcount != p->p_numthreads || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } if ((p->p_flag & P_STOPPED_TRACE) == 0) { static int count = 0; if (count++ == 0) printf("P_STOPPED_TRACE not set.\n"); } /* OK */ break; } /* Keep this process around until we finish this request. 
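* (_PHOLD() keeps the process from exiting or being swapped out while the request runs; the matching _PRELE() is at the out: label below.)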
*/ _PHOLD(p); #ifdef FIX_SSTEP /* * Single step fixup ala procfs */ FIX_SSTEP(td2); #endif /* * Actually do the requests */ td->td_retval[0] = 0; switch (req) { case PT_TRACE_ME: /* set my trace flag and "owner" so it can read/write me */ p->p_flag |= P_TRACED; p->p_oppid = p->p_pptr->p_pid; break; case PT_ATTACH: /* security check done above */ p->p_flag |= P_TRACED; p->p_oppid = p->p_pptr->p_pid; if (p->p_pptr != td->td_proc) proc_reparent(p, td->td_proc); data = SIGSTOP; goto sendsig; /* in PT_CONTINUE below */ case PT_CLEARSTEP: error = ptrace_clear_single_step(td2); break; case PT_SETSTEP: error = ptrace_single_step(td2); break; case PT_SUSPEND: mtx_lock_spin(&sched_lock); td2->td_flags |= TDF_DBSUSPEND; mtx_unlock_spin(&sched_lock); break; case PT_RESUME: mtx_lock_spin(&sched_lock); td2->td_flags &= ~TDF_DBSUSPEND; mtx_unlock_spin(&sched_lock); break; case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_DETACH: /* Zero means do not send any signal */ if (data < 0 || data > _SIG_MAXSIG) { error = EINVAL; break; } switch (req) { case PT_STEP: error = ptrace_single_step(td2); if (error) goto out; break; case PT_TO_SCE: p->p_stops |= S_PT_SCE; break; case PT_TO_SCX: p->p_stops |= S_PT_SCX; break; case PT_SYSCALL: p->p_stops |= S_PT_SCE | S_PT_SCX; break; } if (addr != (void *)1) { error = ptrace_set_pc(td2, (u_long)(uintfptr_t)addr); if (error) break; } if (req == PT_DETACH) { /* reset process parent */ if (p->p_oppid != p->p_pptr->p_pid) { struct proc *pp; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); PROC_UNLOCK(p); pp = pfind(p->p_oppid); if (pp == NULL) pp = initproc; else PROC_UNLOCK(pp); PROC_LOCK(p); proc_reparent(p, pp); if (pp == initproc) p->p_sigparent = SIGCHLD; } p->p_flag &= ~(P_TRACED | P_WAITED); p->p_oppid = 0; /* should we send SIGCHLD? */ /* childproc_continued(p); */ } sendsig: if (proctree_locked) { sx_xunlock(&proctree_lock); proctree_locked = 0; } /* deliver or queue signal */ mtx_lock_spin(&sched_lock); td2->td_flags &= ~TDF_XSIG; mtx_unlock_spin(&sched_lock); td2->td_xsig = data; p->p_xstat = data; p->p_xthread = NULL; if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) { mtx_lock_spin(&sched_lock); if (req == PT_DETACH) { struct thread *td3; FOREACH_THREAD_IN_PROC(p, td3) td3->td_flags &= ~TDF_DBSUSPEND; } /* * Unsuspend all threads. To keep a thread from * running, suspend it with PT_SUSPEND before * continuing the process. */ mtx_unlock_spin(&sched_lock); #ifdef KSE thread_continued(p); #endif p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED); mtx_lock_spin(&sched_lock); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); } if (data) psignal(p, data); break; case PT_WRITE_I: case PT_WRITE_D: write = 1; /* FALLTHROUGH */ case PT_READ_I: case PT_READ_D: PROC_UNLOCK(p); tmp = 0; /* write = 0 set above */ iov.iov_base = write ? (caddr_t)&data : (caddr_t)&tmp; iov.iov_len = sizeof(int); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)(uintptr_t)addr; uio.uio_resid = sizeof(int); uio.uio_segflg = UIO_SYSSPACE; /* i.e.: the uap */ uio.uio_rw = write ? UIO_WRITE : UIO_READ; uio.uio_td = td; error = proc_rwmem(p, &uio); if (uio.uio_resid != 0) { /* * XXX proc_rwmem() doesn't currently return ENOSPC, * so I think write() can bogusly return 0. * XXX what happens for short writes? We don't want * to write partial data. * XXX proc_rwmem() returns EPERM for other invalid * addresses. Convert this to EINVAL. Does this * clobber returns of EPERM for other reasons?
*/ if (error == 0 || error == ENOSPC || error == EPERM) error = EINVAL; /* EOF */ } if (!write) td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_IO: #ifdef COMPAT_IA32 if (wrap32) { piod32 = addr; iov.iov_base = (void *)(uintptr_t)piod32->piod_addr; iov.iov_len = piod32->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs; uio.uio_resid = piod32->piod_len; } else #endif { piod = addr; iov.iov_base = piod->piod_addr; iov.iov_len = piod->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs; uio.uio_resid = piod->piod_len; } uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_USERSPACE; uio.uio_td = td; #ifdef COMPAT_IA32 tmp = wrap32 ? piod32->piod_op : piod->piod_op; #else tmp = piod->piod_op; #endif switch (tmp) { case PIOD_READ_D: case PIOD_READ_I: uio.uio_rw = UIO_READ; break; case PIOD_WRITE_D: case PIOD_WRITE_I: uio.uio_rw = UIO_WRITE; break; default: error = EINVAL; goto out; } PROC_UNLOCK(p); error = proc_rwmem(p, &uio); #ifdef COMPAT_IA32 if (wrap32) piod32->piod_len -= uio.uio_resid; else #endif piod->piod_len -= uio.uio_resid; PROC_LOCK(p); break; case PT_KILL: data = SIGKILL; goto sendsig; /* in PT_CONTINUE above */ case PT_SETREGS: error = PROC_WRITE(regs, td2, addr); break; case PT_GETREGS: error = PROC_READ(regs, td2, addr); break; case PT_SETFPREGS: error = PROC_WRITE(fpregs, td2, addr); break; case PT_GETFPREGS: error = PROC_READ(fpregs, td2, addr); break; case PT_SETDBREGS: error = PROC_WRITE(dbregs, td2, addr); break; case PT_GETDBREGS: error = PROC_READ(dbregs, td2, addr); break; case PT_LWPINFO: if (data <= 0 || data > sizeof(*pl)) { error = EINVAL; break; } pl = addr; pl->pl_lwpid = td2->td_tid; if (td2->td_flags & TDF_XSIG) pl->pl_event = PL_EVENT_SIGNAL; else pl->pl_event = 0; #ifdef KSE if (td2->td_pflags & TDP_SA) { pl->pl_flags = PL_FLAG_SA; if (td2->td_upcall && !TD_CAN_UNBIND(td2)) pl->pl_flags |= PL_FLAG_BOUND; } else { pl->pl_flags = 0; } #else pl->pl_flags = 0; #endif pl->pl_sigmask = td2->td_sigmask; pl->pl_siglist = td2->td_siglist; break; case PT_GETNUMLWPS: td->td_retval[0] = p->p_numthreads; break; case PT_GETLWPLIST: if (data <= 0) { error = EINVAL; break; } num = imin(p->p_numthreads, data); PROC_UNLOCK(p); buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); if (!error) td->td_retval[0] = tmp; PROC_LOCK(p); break; default: #ifdef __HAVE_PTRACE_MACHDEP if (req >= PT_FIRSTMACH) { PROC_UNLOCK(p); error = cpu_ptrace(td2, req, addr, data); PROC_LOCK(p); } else #endif /* Unknown request. */ error = EINVAL; break; } out: /* Drop our hold on this process now that the request has completed. */ _PRELE(p); fail: PROC_UNLOCK(p); if (proctree_locked) sx_xunlock(&proctree_lock); return (error); } #undef PROC_READ #undef PROC_WRITE /* * Stop a process because of a debugging event; * stay stopped until p->p_step is cleared * (cleared by PIOCCONT in procfs). */ void stopevent(struct proc *p, unsigned int event, unsigned int val) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_step = 1; do { p->p_xstat = val; p->p_xthread = NULL; p->p_stype = event; /* Which event caused the stop? 
*/ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0); } while (p->p_step); } Index: head/sys/kern/sysv_msg.c =================================================================== --- head/sys/kern/sysv_msg.c (revision 167210) +++ head/sys/kern/sysv_msg.c (revision 167211) @@ -1,1312 +1,1298 @@ /*- * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static void msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); #ifdef MSG_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static void msg_freehdr(struct msg *msghdr); /* XXX casting to (sy_call_t *) is bogus, as usual. 
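* (msgsys() below is believed to index this table by its 'which' argument, preserving the historical single-entry-point interface to the SysV message calls.)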
*/ static sy_call_t *msgcalls[] = { (sy_call_t *)msgctl, (sy_call_t *)msgget, (sy_call_t *)msgsnd, (sy_call_t *)msgrcv }; #ifndef MSGSSZ #define MSGSSZ 8 /* Each segment must be 2^N long */ #endif #ifndef MSGSEG #define MSGSEG 2048 /* must be less than 32767 */ #endif #define MSGMAX (MSGSSZ*MSGSEG) #ifndef MSGMNB #define MSGMNB 2048 /* max # of bytes in a queue */ #endif #ifndef MSGMNI #define MSGMNI 40 #endif #ifndef MSGTQL #define MSGTQL 40 #endif /* * Based on the configuration parameters described in an SVR2 (yes, two) * config(1m) man page. * * Each message is broken up and stored in segments that are msgssz bytes * long. For efficiency reasons, this should be a power of two. Also, * it doesn't make sense if it is less than 8 or greater than about 256. * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of * two between 8 and 1024 inclusive (and panic's if it isn't). */ struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; /* * macros to convert between msqid_ds's and msqid's. * (specific to this implementation) */ #define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) #define MSQID_IX(id) ((id) & 0xffff) #define MSQID_SEQ(id) (((id) >> 16) & 0xffff) /* * The rest of this file is specific to this particular implementation. */ struct msgmap { short next; /* next segment in buffer */ /* -1 -> available */ /* 0..(MSGSEG-1) -> index of next segment */ }; #define MSG_LOCKED 01000 /* Is this msqid_ds locked? */ static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct msg *free_msghdrs;/* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. */ static void msginit() { register int i; TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg); TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz); msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni); TUNABLE_INT_FETCH("kern.ipc.msgmnb", &msginfo.msgmnb); TUNABLE_INT_FETCH("kern.ipc.msgtql", &msginfo.msgtql); msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); if (msgpool == NULL) panic("msgpool is NULL"); msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); if (msgmaps == NULL) panic("msgmaps is NULL"); msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); if (msghdrs == NULL) panic("msghdrs is NULL"); msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG, M_WAITOK); if (msqids == NULL) panic("msqids is NULL"); /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... 
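* the code below walks the powers of two from 8 up to 1024 and panics
* if msgssz never matches one. msgseg is likewise bounded at 32767 so
* that a segment index always fits in the signed short of struct
* msgmap. With the defaults (msgssz = 8, msgseg = 2048) the message
* pool is 8 * 2048 = 16384 bytes.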
*/ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, msginfo.msgssz)); panic("msginfo.msgssz not a small power of 2"); } if (msginfo.msgseg > 32767) { DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg)); panic("msginfo.msgseg > 32767"); } if (msgmaps == NULL) panic("msgmaps is NULL"); for (i = 0; i < msginfo.msgseg; i++) { if (i > 0) msgmaps[i-1].next = i; msgmaps[i].next = -1; /* implies entry is available */ } free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; if (msghdrs == NULL) panic("msghdrs is NULL"); for (i = 0; i < msginfo.msgtql; i++) { msghdrs[i].msg_type = 0; if (i > 0) msghdrs[i-1].msg_next = &msghdrs[i]; msghdrs[i].msg_next = NULL; #ifdef MAC mac_init_sysv_msgmsg(&msghdrs[i]); #endif } free_msghdrs = &msghdrs[0]; if (msqids == NULL) panic("msqids is NULL"); for (i = 0; i < msginfo.msgmni; i++) { msqids[i].u.msg_qbytes = 0; /* implies entry is available */ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */ msqids[i].u.msg_perm.mode = 0; #ifdef MAC mac_init_sysv_msgqueue(&msqids[i]); #endif } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); } static int msgunload() { struct msqid_kernel *msqkptr; int msqid; #ifdef MAC int i; #endif for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. */ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 || (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) break; } if (msqid != msginfo.msgmni) return (EBUSY); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_destroy_sysv_msgmsg(&msghdrs[i]); for (msqid = 0; msqid < msginfo.msgmni; msqid++) mac_destroy_sysv_msgqueue(&msqids[msqid]); #endif free(msgpool, M_MSG); free(msgmaps, M_MSG); free(msghdrs, M_MSG); free(msqids, M_MSG); mtx_destroy(&msq_mtx); return (0); } static int sysvmsg_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: msginit(); break; case MOD_UNLOAD: error = msgunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvmsg_mod = { "sysvmsg", &sysvmsg_modload, NULL }; SYSCALL_MODULE_HELPER(msgsys); SYSCALL_MODULE_HELPER(msgctl); SYSCALL_MODULE_HELPER(msgget); SYSCALL_MODULE_HELPER(msgsnd); SYSCALL_MODULE_HELPER(msgrcv); DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST); MODULE_VERSION(sysvmsg, 1); /* - * Entry point for all MSG calls - * - * MPSAFE + * Entry point for all MSG calls. */ int msgsys(td, uap) struct thread *td; /* XXX actually varargs. 
*/
*/ struct msgsys_args /* { int which; int a2; int a3; int a4; int a5; int a6; } */ *uap; { int error; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) return (EINVAL); error = (*msgcalls[uap->which])(td, &uap->a2); return (error); } static void msg_freehdr(msghdr) struct msg *msghdr; { while (msghdr->msg_ts > 0) { short next; if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) panic("msghdr->msg_spot out of range"); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } if (msghdr->msg_spot != -1) panic("msghdr->msg_spot != -1"); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; #ifdef MAC mac_cleanup_sysv_msgmsg(msghdr); #endif } #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; int cmd; struct msqid_ds *buf; }; #endif -/* - * MPSAFE - */ int msgctl(td, uap) struct thread *td; register struct msgctl_args *uap; { int msqid = uap->msqid; int cmd = uap->cmd; struct msqid_ds msqbuf; int error; DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf)); if (cmd == IPC_SET && (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0) return (error); error = kern_msgctl(td, msqid, cmd, &msqbuf); if (cmd == IPC_STAT && error == 0) error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds)); return (error); } int kern_msgctl(td, msqid, cmd, msqbuf) struct thread *td; int msqid; int cmd; struct msqid_ds *msqbuf; { int rval, error, msqix; register struct msqid_kernel *msqkptr; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such msqid\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_check_sysv_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) goto done2; #endif error = 0; rval = 0; switch (cmd) { case IPC_RMID: { struct msg *msghdr; if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; #ifdef MAC /* * Check that the thread has MAC access permissions to * individual msghdrs. Note: We need to do this in a * separate loop because the actual loop alters the * msq/msghdr info as it progresses, and there is no going * back if half the way through we discover that the * thread cannot free a certain msghdr. The msq will get * into an inconsistent state. 
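* Both passes run with msq_mtx held, so the msghdr list cannot change
* between the access-check loop and the free loop.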
*/ for (msghdr = msqkptr->u.msg_first; msghdr != NULL; msghdr = msghdr->msg_next) { error = mac_check_sysv_msgrmid(td->td_ucred, msghdr); if (error != 0) goto done2; } #endif /* Free the message headers */ msghdr = msqkptr->u.msg_first; while (msghdr != NULL) { struct msg *msghdr_tmp; /* Free the segments of each message */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } if (msqkptr->u.msg_cbytes != 0) panic("msg_cbytes is screwed up"); if (msqkptr->u.msg_qnum != 0) panic("msg_qnum is screwed up"); msqkptr->u.msg_qbytes = 0; /* Mark it as free */ #ifdef MAC mac_cleanup_sysv_msgqueue(msqkptr); #endif wakeup(msqkptr); } break; case IPC_SET: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) { error = priv_check_cred(td->td_ucred, PRIV_IPC_MSGSIZE, SUSER_ALLOWJAIL); if (error) goto done2; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { DPRINTF(("can't increase msg_qbytes beyond %d" "(truncating)\n", msginfo.msgmnb)); msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ } if (msqbuf->msg_qbytes == 0) { DPRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* non-standard errno! */ goto done2; } msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqkptr->u.msg_qbytes = msqbuf->msg_qbytes; msqkptr->u.msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } *msqbuf = msqkptr->u; break; default: DPRINTF(("invalid command %d\n", cmd)); error = EINVAL; goto done2; } if (error == 0) td->td_retval[0] = rval; done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgget_args { key_t key; int msgflg; }; #endif -/* - * MPSAFE - */ int msgget(td, uap) struct thread *td; register struct msgget_args *uap; { int msqid, error = 0; int key = uap->key; int msgflg = uap->msgflg; struct ucred *cred = td->td_ucred; register struct msqid_kernel *msqkptr = NULL; DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&msq_mtx); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->u.msg_perm.key == key) break; } if (msqid < msginfo.msgmni) { DPRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, msgflg & 0700))) { DPRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto done2; } #ifdef MAC error = mac_check_sysv_msqget(cred, msqkptr); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. 
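* (MSG_LOCKED is set by kern_msgsnd() while it sleeps waiting for
* resources or drops the mutex to copy the message body in; see below.)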
*/ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes == 0 && (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { DPRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto done2; } DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; msqkptr->u.msg_perm.uid = cred->cr_uid; msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; msqkptr->u.msg_cbytes = 0; msqkptr->u.msg_qnum = 0; msqkptr->u.msg_qbytes = msginfo.msgmnb; msqkptr->u.msg_lspid = 0; msqkptr->u.msg_lrpid = 0; msqkptr->u.msg_stime = 0; msqkptr->u.msg_rtime = 0; msqkptr->u.msg_ctime = time_second; #ifdef MAC mac_create_sysv_msgqueue(cred, msqkptr); #endif } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: /* Construct the unique msqid */ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm); done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgsnd_args { int msqid; const void *msgp; size_t msgsz; int msgflg; }; #endif int kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) struct thread *td; int msqid; const void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; int msgflg; long mtype; { int msqix, segs_needed, error = 0; register struct msqid_kernel *msqkptr; register struct msg *msghdr; short next; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&msq_mtx); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); error = EINVAL; goto done2; } msqkptr = &msqids[msqix]; if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; } #ifdef MAC error = mac_check_sysv_msqsnd(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { DPRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { DPRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { DPRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto done2; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { DPRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ DPRINTF(("we own the 
msqid_ds\n")); msqkptr->u.msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; if (error == EWOULDBLOCK) { DPRINTF(("msgsnd: timed out\n")); continue; } if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } else { DPRINTF(("got all the resources that we need\n")); break; } } /* * We have the resources that we need. * Make sure! */ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) panic("msg_perm.mode & MSG_LOCKED"); if (segs_needed > nfree_msgmaps) panic("segs_needed > nfree_msgmaps"); if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) panic("msgsz + msg_cbytes > msg_qbytes"); if (free_msghdrs == NULL) panic("no more msghdrs"); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) panic("msqid_ds is already locked"); msqkptr->u.msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; msghdr->msg_type = mtype; #ifdef MAC /* * XXXMAC: Should the mac_check_sysv_msgmsq check follow here * immediately? Or, should it be checked just before the msg is * enqueued in the msgq (as it is done now)? */ mac_create_sysv_msgmsg(td->td_ucred, msqkptr, msghdr); #endif /* * Allocate space for the message */ while (segs_needed > 0) { if (nfree_msgmaps <= 0) panic("not enough msgmaps"); if (free_msgmaps == -1) panic("nil free_msgmaps"); next = free_msgmaps; if (next <= -1) panic("next too low #1"); if (next >= msginfo.msgseg) panic("next out of range #1"); DPRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto done2; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; if (next <= -1) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); mtx_unlock(&msq_mtx); if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz], tlen)) != 0) { mtx_lock(&msq_mtx); DPRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); goto done2; } mtx_lock(&msq_mtx); msgsz -= tlen; msgp = (const char *)msgp + tlen; next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* * We've got the message. Unlock the msqid_ds. */ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. */ if (msqkptr->u.msg_qbytes == 0) { msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; goto done2; } #ifdef MAC /* * Note: Since the task/thread allocates the msghdr and usually * primes it with its own MAC label, for a majority of policies, it * won't be necessary to check whether the msghdr has access * permissions to the msgq. 
The mac_check_sysv_msqsnd check would * suffice in that case. However, this hook may be required where * individual policies derive a non-identical label for the msghdr * from the current thread label and may want to check the msghdr * enqueue permissions, along with read/write permissions to the * msgq. */ error = mac_check_sysv_msgmsq(td->td_ucred, msghdr, msqkptr); if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } #endif /* * Put the message into the queue */ if (msqkptr->u.msg_first == NULL) { msqkptr->u.msg_first = msghdr; msqkptr->u.msg_last = msghdr; } else { msqkptr->u.msg_last->msg_next = msghdr; msqkptr->u.msg_last = msghdr; } msqkptr->u.msg_last->msg_next = NULL; msqkptr->u.msg_cbytes += msghdr->msg_ts; msqkptr->u.msg_qnum++; msqkptr->u.msg_lspid = td->td_proc->p_pid; msqkptr->u.msg_stime = time_second; wakeup(msqkptr); td->td_retval[0] = 0; done2: mtx_unlock(&msq_mtx); return (error); } -/* - * MPSAFE - */ int msgsnd(td, uap) struct thread *td; register struct msgsnd_args *uap; { int error; long mtype; DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgflg)); if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) { DPRINTF(("error %d copying the message type\n", error)); return (error); } return (kern_msgsnd(td, uap->msqid, (const char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgflg, mtype)); } #ifndef _SYS_SYSPROTO_H_ struct msgrcv_args { int msqid; void *msgp; size_t msgsz; long msgtyp; int msgflg; }; #endif int kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype) struct thread *td; int msqid; void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; long msgtyp; int msgflg; long *mtype; { size_t len; register struct msqid_kernel *msqkptr; register struct msg *msghdr; int msqix, error = 0; short next; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } #ifdef MAC error = mac_check_sysv_msqrcv(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqkptr->u.msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("first message on the queue " "is too big (want %zu, got %d)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_check_sysv_msgrcv(td->td_ucred, msghdr); if (error != 0) goto done2; #endif if (msqkptr->u.msg_first == msqkptr->u.msg_last) { msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { msqkptr->u.msg_first = msghdr->msg_next; if (msqkptr->u.msg_first == NULL) panic("msg_first/last screwed up #1"); } } } else { struct msg *previous; struct msg **prev; previous = NULL; prev = &(msqkptr->u.msg_first); while ((msghdr = *prev) != NULL) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? 
* Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! */ if (msgtyp == msghdr->msg_type || msghdr->msg_type <= -msgtyp) { DPRINTF(("found message type %ld, " "requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("requested message " "on the queue is too big " "(want %zu, got %hu)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_check_sysv_msgrcv( td->td_ucred, msghdr); if (error != 0) goto done2; #endif *prev = msghdr->msg_next; if (msghdr == msqkptr->u.msg_last) { if (previous == NULL) { if (prev != &msqkptr->u.msg_first) panic("msg_first/last screwed up #2"); msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { if (prev == &msqkptr->u.msg_first) panic("msg_first/last screwed up #3"); msqkptr->u.msg_last = previous; } } break; } previous = msghdr; prev = &(msghdr->msg_next); } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); /* The SVID says to return ENOMSG. */ error = ENOMSG; goto done2; } /* * Wait for something to happen */ DPRINTF(("msgrcv: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); if (error != 0) { DPRINTF(("msgrcv: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0 || msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; *mtype = msghdr->msg_type; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; if (next <= -1) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); mtx_unlock(&msq_mtx); error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen); mtx_lock(&msq_mtx); if (error != 0) { DPRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } msgp = (char *)msgp + tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. 
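* msgsz was clamped to msg_ts above, so an MSG_NOERROR receive of an
* oversized message reports the truncated length here.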
*/ msg_freehdr(msghdr); wakeup(msqkptr); td->td_retval[0] = msgsz; done2: mtx_unlock(&msq_mtx); return (error); } -/* - * MPSAFE - */ int msgrcv(td, uap) struct thread *td; register struct msgrcv_args *uap; { int error; long mtype; DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg)); if ((error = kern_msgrcv(td, uap->msqid, (char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0) DPRINTF(("error %d copying the message type\n", error)); return (error); } static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, msqids, sizeof(struct msqid_kernel) * msginfo.msgmni)); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "Maximum message size"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0, "Number of message queue identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0, "Maximum number of bytes in a queue"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0, "Maximum number of messages in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLFLAG_RD, NULL, 0, sysctl_msqids, "", "Message queue IDs"); Index: head/sys/kern/sysv_sem.c =================================================================== --- head/sys/kern/sysv_sem.c (revision 167210) +++ head/sys/kern/sysv_sem.c (revision 167211) @@ -1,1375 +1,1364 @@ /*- * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); #ifdef SEM_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static void seminit(void); static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); static int semvalid(int semid, struct semid_kernel *semakptr); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; int __semctl(struct thread *td, struct __semctl_args *uap); struct semget_args; int semget(struct thread *td, struct semget_args *uap); struct semop_args; int semop(struct thread *td, struct semop_args *uap); #endif static struct sem_undo *semu_alloc(struct thread *td); static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semnum, int adjval); static void semundo_clear(int semid, int semnum); /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { (sy_call_t *)__semctl, (sy_call_t *)semget, (sy_call_t *)semop }; static struct mtx sem_mtx; /* semaphore global lock */ static int semtot = 0; static struct semid_kernel *sema; /* semaphore id pool */ static struct mtx *sema_mtx; /* semaphore id pool mutexes*/ static struct sem *sem; /* semaphore pool */ SLIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; #define SEMUNDO_MTX sem_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); #define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX); #define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how)); struct sem { u_short semval; /* semaphore value */ pid_t sempid; /* pid of last operation */ u_short semncnt; /* # awaiting semval > cval */ u_short semzcnt; /* # awaiting semval = 0 */ }; /* * Undo structure (one per process) */ struct sem_undo { SLIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */ struct proc *un_proc; /* owner of this structure */ short un_cnt; /* # of active entries */ struct undo { short un_adjval; /* adjust on exit values */ short un_num; /* semaphore # */ int un_id; /* semid */ } un_ent[1]; /* undo entries */ }; /* * Configuration parameters */ #ifndef SEMMNI #define SEMMNI 10 /* # of semaphore identifiers */ #endif #ifndef SEMMNS #define SEMMNS 60 /* # of semaphores in system */ #endif #ifndef SEMUME #define SEMUME 10 /* max # of undo entries per process */ #endif #ifndef SEMMNU #define SEMMNU 30 /* # of undo structures in system */ #endif /* shouldn't need tuning */ #ifndef SEMMAP #define SEMMAP 30 /* # of entries in semaphore map */ #endif #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM #define SEMOPM 100 /* max # of operations per semop call */ #endif #define SEMVMX 32767 /* semaphore maximum value */ #define SEMAEM 16384 /* adjust on exit max value */ /* * Due to the way semaphore memory is allocated, we have to ensure that * SEMUSZ is properly aligned. 
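* SEM_ALIGN() rounds a byte count up to the next multiple of
* sizeof(long), e.g. SEM_ALIGN(10) is 12 where long is 4 bytes and 16
* where it is 8. SEMU() then strides through the undo pool in
* semusz-sized steps, keeping every sem_undo long-aligned.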
*/ #define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) /* actual size of an undo structure */ #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) /* * Macro to find a particular sem_undo vector */ #define SEMU(ix) \ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) /* * semaphore info struct */ struct seminfo seminfo = { SEMMAP, /* # of entries in semaphore map */ SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ SEMMSL, /* max # of semaphores per id */ SEMOPM, /* max # of operations per semop call */ SEMUME, /* max # of undo entries per process */ SEMUSZ, /* size in bytes of undo structure */ SEMVMX, /* semaphore maximum value */ SEMAEM /* adjust on exit max value */ }; SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0, "Number of entries in the semaphore map"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, "Maximum number of semaphores in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0, "Maximum number of undo structures in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0, "Max semaphores per id"); SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0, "Max operations per semop call"); SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0, "Max undo entries per process"); SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0, "Size in bytes of undo structure"); SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0, "Adjust on exit max value"); SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLFLAG_RD, NULL, 0, sysctl_sema, "", ""); static void seminit(void) { int i; TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap); TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni); TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns); TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu); TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl); TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm); TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume); TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz); TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx); TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem); sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM, M_WAITOK); sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); for (i = 0; i < seminfo.semmni; i++) { sema[i].u.sem_base = 0; sema[i].u.sem_perm.mode = 0; sema[i].u.sem_perm.seq = 0; #ifdef MAC mac_init_sysv_sem(&sema[i]); #endif } for (i = 0; i < seminfo.semmni; i++) mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF); for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(i); suptr->un_proc = NULL; } SLIST_INIT(&semu_list); mtx_init(&sem_mtx, "sem", NULL, MTX_DEF); semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); } static int semunload(void) { int i; if (semtot != 0) return (EBUSY); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); #ifdef MAC for (i = 0; i < seminfo.semmni; 
i++) mac_destroy_sysv_sem(&sema[i]); #endif free(sem, M_SEM); free(sema, M_SEM); free(semu, M_SEM); for (i = 0; i < seminfo.semmni; i++) mtx_destroy(&sema_mtx[i]); mtx_destroy(&sem_mtx); return (0); } static int sysvsem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: seminit(); break; case MOD_UNLOAD: error = semunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvsem_mod = { "sysvsem", &sysvsem_modload, NULL }; SYSCALL_MODULE_HELPER(semsys); SYSCALL_MODULE_HELPER(__semctl); SYSCALL_MODULE_HELPER(semget); SYSCALL_MODULE_HELPER(semop); DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sysvsem, 1); /* - * Entry point for all SEM calls - * - * MPSAFE + * Entry point for all SEM calls. */ int semsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct semsys_args /* { int which; int a2; int a3; int a4; int a5; } */ *uap; { int error; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) return (EINVAL); error = (*semcalls[uap->which])(td, &uap->a2); return (error); } /* * Allocate a new sem_undo structure for a process * (returns ptr to structure or NULL if no more room) */ static struct sem_undo * semu_alloc(td) struct thread *td; { int i; struct sem_undo *suptr; struct sem_undo **supptr; int attempt; SEMUNDO_LOCKASSERT(MA_OWNED); /* * Try twice to allocate something. * (we'll purge an empty structure after the first pass so * two passes are always enough) */ for (attempt = 0; attempt < 2; attempt++) { /* * Look for a free structure. * Fill it in and return it if we find one. */ for (i = 0; i < seminfo.semmnu; i++) { suptr = SEMU(i); if (suptr->un_proc == NULL) { SLIST_INSERT_HEAD(&semu_list, suptr, un_next); suptr->un_cnt = 0; suptr->un_proc = td->td_proc; return(suptr); } } /* * We didn't find a free one, if this is the first attempt * then try to free a structure. */ if (attempt == 0) { /* All the structures are in use - try to free one */ int did_something = 0; SLIST_FOREACH_PREVPTR(suptr, supptr, &semu_list, un_next) { if (suptr->un_cnt == 0) { suptr->un_proc = NULL; did_something = 1; *supptr = SLIST_NEXT(suptr, un_next); break; } } /* If we didn't free anything then just give-up */ if (!did_something) return(NULL); } else { /* * The second pass failed even though we freed * something after the first pass! * This is IMPOSSIBLE! */ panic("semu_alloc - second attempt failed"); } } return (NULL); } /* * Adjust a particular entry for a particular proc */ static int semundo_adjust(td, supptr, semid, semnum, adjval) struct thread *td; struct sem_undo **supptr; int semid, semnum; int adjval; { struct proc *p = td->td_proc; struct sem_undo *suptr; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); /* Look for and remember the sem_undo if the caller doesn't provide it */ suptr = *supptr; if (suptr == NULL) { SLIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) { *supptr = suptr; break; } } if (suptr == NULL) { if (adjval == 0) return(0); suptr = semu_alloc(td); if (suptr == NULL) return(ENOSPC); *supptr = suptr; } } /* * Look for the requested entry and adjust it (delete if adjval becomes * 0). 
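* (A deleted entry is compacted away by copying the last element of
* un_ent[] into the vacated slot, as the loop below does.)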
*/ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; if (adjval != 0) { adjval += sunptr->un_adjval; if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); } sunptr->un_adjval = adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; } return(0); } /* Didn't find the right entry - create it */ if (adjval == 0) return(0); if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); if (suptr->un_cnt != seminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; } else return(EINVAL); return(0); } static void semundo_clear(semid, semnum) int semid, semnum; { struct sem_undo *suptr; SEMUNDO_LOCKASSERT(MA_OWNED); SLIST_FOREACH(suptr, &semu_list, un_next) { struct undo *sunptr = &suptr->un_ent[0]; int i = 0; while (i < suptr->un_cnt) { if (sunptr->un_id == semid) { if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; if (i < suptr->un_cnt) { suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; continue; } } if (semnum != -1) break; } i++, sunptr++; } } } static int semvalid(semid, semakptr) int semid; struct semid_kernel *semakptr; { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0); } /* - * Note that the user-mode half of this passes a union, not a pointer + * Note that the user-mode half of this passes a union, not a pointer. */ #ifndef _SYS_SYSPROTO_H_ struct __semctl_args { int semid; int semnum; int cmd; union semun *arg; }; #endif -/* - * MPSAFE - */ int __semctl(td, uap) struct thread *td; struct __semctl_args *uap; { struct semid_ds dsbuf; union semun arg, semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsbuf, sizeof(dsbuf)); if (error) return (error); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: error = copyout(&dsbuf, arg.buf, sizeof(dsbuf)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } int kern_semctl(struct thread *td, int semid, int semnum, int cmd, union semun *arg, register_t *rval) { u_short *array; struct ucred *cred = td->td_ucred; int i, error; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; u_short usval, count; int semidx; DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); array = NULL; switch(cmd) { case SEM_STAT: /* * For this command we assume semid is an array index * rather than an IPC id. 
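* The IPC id of the slot is handed back through *rval via
* IXSEQ_TO_IPCID(), so a caller can walk indices 0..semmni-1 and
* recover the id of each allocated set.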
*/ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC error = mac_check_sysv_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); } semidx = IPCID_TO_IX(semid); if (semidx < 0 || semidx >= seminfo.semmni) return (EINVAL); semakptr = &sema[semidx]; sema_mtxp = &sema_mtx[semidx]; mtx_lock(sema_mtxp); #ifdef MAC error = mac_check_sysv_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif error = 0; *rval = 0; switch (cmd) { case IPC_RMID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; semakptr->u.sem_perm.cuid = cred->cr_uid; semakptr->u.sem_perm.uid = cred->cr_uid; semtot -= semakptr->u.sem_nsems; for (i = semakptr->u.sem_base - sem; i < semtot; i++) sem[i] = sem[i + semakptr->u.sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) sema[i].u.sem_base -= semakptr->u.sem_nsems; } semakptr->u.sem_perm.mode = 0; #ifdef MAC mac_cleanup_sysv_sem(semakptr); #endif SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case IPC_SET: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sbuf = arg->buf; semakptr->u.sem_perm.uid = sbuf->sem_perm.uid; semakptr->u.sem_perm.gid = sbuf->sem_perm.gid; semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode & ~0777) | (sbuf->sem_perm.mode & 0777); semakptr->u.sem_ctime = time_second; break; case IPC_STAT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); break; case GETNCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semncnt; break; case GETPID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].sempid; break; case GETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semval; break; case GETALL: /* * Unfortunately, callers of this function don't know * in advance how many semaphores are in this set. * While we could just allocate the maximum size array * and pass the actual size back to the caller, that * won't work for SETALL since we can't copyin() more * data than the user specified as we may return a * spurious EFAULT. * * Note that the number of semaphores in a set is * fixed for the life of that set. The only way that * the 'count' could change while are blocked in * malloc() is if this semaphore set were destroyed * and a new one created with the same index. 
* However, semvalid() will catch that due to the * sequence number unless exactly 0x8000 (or a * multiple thereof) semaphore sets for the same index * are created and destroyed while we are in malloc! * */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) array[i] = semakptr->u.sem_base[i].semval; mtx_unlock(sema_mtxp); error = copyout(array, arg->array, count * sizeof(*array)); mtx_lock(sema_mtxp); break; case GETZCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semzcnt; break; case SETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } if (arg->val < 0 || arg->val > seminfo.semvmx) { error = ERANGE; goto done2; } semakptr->u.sem_base[semnum].semval = arg->val; SEMUNDO_LOCK(); semundo_clear(semidx, semnum); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case SETALL: /* * See comment on GETALL for why 'count' shouldn't change * and why we require a userland buffer. */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); error = copyin(arg->array, array, count * sizeof(*array)); if (error) break; mtx_lock(sema_mtxp); if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) { usval = array[i]; if (usval > seminfo.semvmx) { error = ERANGE; break; } semakptr->u.sem_base[i].semval = usval; } SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; default: error = EINVAL; break; } done2: mtx_unlock(sema_mtxp); if (array != NULL) free(array, M_TEMP); return(error); } #ifndef _SYS_SYSPROTO_H_ struct semget_args { key_t key; int nsems; int semflg; }; #endif -/* - * MPSAFE - */ int semget(td, uap) struct thread *td; struct semget_args *uap; { int semid, error = 0; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && sema[semid].u.sem_perm.key == key) break; } if (semid < seminfo.semmni) { DPRINTF(("found public key\n")); if ((error = ipcperm(td, &sema[semid].u.sem_perm, semflg & 0700))) { goto done2; } if (nsems > 0 && sema[semid].u.sem_nsems < nsems) { DPRINTF(("too small\n")); error = EINVAL; goto done2; } if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } #ifdef MAC error = mac_check_sysv_semget(cred, &sema[semid]); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the semid_kernel\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > 
seminfo.semmsl) { DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto done2; } if (nsems > seminfo.semmns - semtot) { DPRINTF(( "not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto done2; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { DPRINTF(("no more semid_kernel's available\n")); error = ENOSPC; goto done2; } DPRINTF(("semid %d is available\n", semid)); sema[semid].u.sem_perm.key = key; sema[semid].u.sem_perm.cuid = cred->cr_uid; sema[semid].u.sem_perm.uid = cred->cr_uid; sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; sema[semid].u.sem_otime = 0; sema[semid].u.sem_ctime = time_second; sema[semid].u.sem_base = &sem[semtot]; semtot += nsems; bzero(sema[semid].u.sem_base, sizeof(sema[semid].u.sem_base[0])*nsems); #ifdef MAC mac_create_sysv_sem(cred, &sema[semid]); #endif DPRINTF(("sembase = %p, next = %p\n", sema[semid].u.sem_base, &sem[semtot])); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm); done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct semop_args { int semid; struct sembuf *sops; size_t nsops; }; #endif -/* - * MPSAFE - */ int semop(td, uap) struct thread *td; struct semop_args *uap; { #define SMALL_SOPS 8 struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; struct sembuf *sops; struct semid_kernel *semakptr; struct sembuf *sopptr = 0; struct sem *semptr = 0; struct sem_undo *suptr; struct mtx *sema_mtxp; size_t i, j, k; int error; int do_wakeup, do_undos; #ifdef SEM_DEBUG sops = NULL; #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; else if (nsops <= seminfo.semopm) sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); else { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, uap->sops, sops, nsops * sizeof(sops[0]))); if (sops != small_sops) free(sops, M_SEM); return (error); } semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if (semakptr->u.sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { error = EINVAL; goto done2; } /* * Initial pass thru sops to see what permissions are needed. * Also perform any checks that don't need repeating on each * attempt to satisfy the request vector. */ j = 0; /* permission needed */ do_undos = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; if (sopptr->sem_num >= semakptr->u.sem_nsems) { error = EFBIG; goto done2; } if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) do_undos = 1; j |= (sopptr->sem_op == 0) ? 
SEM_R : SEM_A; } if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) { DPRINTF(("error = %d from ipaccess\n", error)); goto done2; } #ifdef MAC error = mac_check_sysv_semop(td->td_ucred, semakptr, j); if (error != 0) goto done2; #endif /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). */ for (;;) { do_wakeup = 0; error = 0; /* error return if necessary */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; DPRINTF(( "semop: semakptr=%p, sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semakptr, semakptr->u.sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if (semptr->semval + sopptr->sem_op < 0) { DPRINTF(("semop: can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } } else if (sopptr->sem_op == 0) { if (semptr->semval != 0) { DPRINTF(("semop: not zero now\n")); break; } } else if (semptr->semval + sopptr->sem_op > seminfo.semvmx) { error = ERANGE; break; } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ DPRINTF(("semop: rollback 0 through %d\n", i-1)); for (j = 0; j < i; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; /* If we detected an error, return it */ if (error != 0) goto done2; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. */ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto done2; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; DPRINTF(("semop: good night!\n")); error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, "semwait", 0); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ /* * Make sure that the semaphore still exists */ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { error = EIDRM; goto done2; } /* * The semaphore is still alive. Readjust the count of * waiting processes. */ if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* * Is it really morning, or was our sleep interrupted? * (Delayed check of msleep() return code because we * need to decrement sem[nz]cnt either way.) */ if (error != 0) { error = EINTR; goto done2; } DPRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { SEMUNDO_LOCK(); suptr = NULL; for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(td, &suptr, semid, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. 
This guarantees that we won't run * out of space as we roll things back out. */ for (j = 0; j < i; j++) { k = i - j - 1; if ((sops[k].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[k].sem_op; if (adjval == 0) continue; if (semundo_adjust(td, &suptr, semid, sops[k].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (j = 0; j < nsops; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; DPRINTF(("error = %d from semundo_adjust\n", error)); SEMUNDO_UNLOCK(); goto done2; } /* loop through the sops */ SEMUNDO_UNLOCK(); } /* if (do_undos) */ /* We're definitely done - set the sempid's and time */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; semptr->sempid = td->td_proc->p_pid; } semakptr->u.sem_otime = time_second; /* * Do a wakeup if any semaphore was up'd whilst something was * sleeping on it. */ if (do_wakeup) { DPRINTF(("semop: doing wakeup\n")); wakeup(semakptr); DPRINTF(("semop: back from wakeup\n")); } DPRINTF(("semop: done\n")); td->td_retval[0] = 0; done2: mtx_unlock(sema_mtxp); if (sops != small_sops) free(sops, M_SEM); return (error); } /* * Go through the undo structures for this process and apply the adjustments to * semaphores. */ static void semexit_myhook(arg, p) void *arg; struct proc *p; { struct sem_undo *suptr; struct sem_undo **supptr; /* * Go through the chain of undo vectors looking for one * associated with this process. */ SEMUNDO_LOCK(); SLIST_FOREACH_PREVPTR(suptr, supptr, &semu_list, un_next) { if (suptr->un_proc == p) break; } SEMUNDO_UNLOCK(); if (suptr == NULL) return; DPRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { int ix; for (ix = 0; ix < suptr->un_cnt; ix++) { int semid = suptr->un_ent[ix].un_id; int semnum = suptr->un_ent[ix].un_num; int adjval = suptr->un_ent[ix].un_adjval; struct semid_kernel *semakptr; struct mtx *sema_mtxp; semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); SEMUNDO_LOCK(); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) panic("semexit - semid not allocated"); if (semnum >= semakptr->u.sem_nsems) panic("semexit - semnum out of range"); DPRINTF(( "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semakptr->u.sem_base[semnum].semval)); if (adjval < 0) { if (semakptr->u.sem_base[semnum].semval < -adjval) semakptr->u.sem_base[semnum].semval = 0; else semakptr->u.sem_base[semnum].semval += adjval; } else semakptr->u.sem_base[semnum].semval += adjval; wakeup(semakptr); DPRINTF(("semexit: back from wakeup\n")); mtx_unlock(sema_mtxp); SEMUNDO_UNLOCK(); } } /* * Deallocate the undo vector. */ DPRINTF(("removing vector\n")); suptr->un_proc = NULL; *supptr = SLIST_NEXT(suptr, un_next); } static int sysctl_sema(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, sema, sizeof(struct semid_kernel) * seminfo.semmni)); } Index: head/sys/kern/sysv_shm.c =================================================================== --- head/sys/kern/sysv_shm.c (revision 167210) +++ head/sys/kern/sysv_shm.c (revision 167211) @@ -1,1041 +1,1023 @@ /* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ /*- * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Adam Glass and Charles * Hannum. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmctl_args; static int oshmctl(struct thread *td, struct oshmctl_args *uap); #endif static int shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode); static int shmget_existing(struct thread *td, struct shmget_args *uap, int mode, int segnum); #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *shmcalls[] = { (sy_call_t *)shmat, (sy_call_t *)oshmctl, (sy_call_t *)shmdt, (sy_call_t *)shmget, (sy_call_t *)shmctl }; #endif #define SHMSEG_FREE 0x0200 #define SHMSEG_REMOVED 0x0400 #define SHMSEG_ALLOCATED 0x0800 #define SHMSEG_WANTED 0x1000 static int shm_last_free, shm_nused, shm_committed, shmalloced; static struct shmid_kernel *shmsegs; struct shmmap_state { vm_offset_t va; int shmid; }; static void shm_deallocate_segment(struct shmid_kernel *); static int shm_find_segment_by_key(key_t); static struct shmid_kernel *shm_find_segment_by_shmid(int); static struct shmid_kernel *shm_find_segment_by_shmidx(int); static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *); static void shmrealloc(void); static void shminit(void); static int sysvshm_modload(struct module *, int, void *); static int shmunload(void); static void shmexit_myhook(struct vmspace *vm); static void shmfork_myhook(struct proc *p1, struct proc *p2); static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); /* * Tuneable values. */ #ifndef SHMMAXPGS #define SHMMAXPGS 8192 /* Note: sysv shared memory is swap backed. 
*/ #endif #ifndef SHMMAX #define SHMMAX (SHMMAXPGS*PAGE_SIZE) #endif #ifndef SHMMIN #define SHMMIN 1 #endif #ifndef SHMMNI #define SHMMNI 192 #endif #ifndef SHMSEG #define SHMSEG 128 #endif #ifndef SHMALL #define SHMALL (SHMMAXPGS) #endif struct shminfo shminfo = { SHMMAX, SHMMIN, SHMMNI, SHMSEG, SHMALL }; static int shm_use_phys; static int shm_allow_removed; SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0, "Maximum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0, "Minimum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0, "Number of shared memory identifiers"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0, "Number of segments per process"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0, "Maximum number of pages available for shared memory"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW, &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RW, &shm_allow_removed, 0, "Enable/Disable attachment to attached segments marked for removal"); SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLFLAG_RD, NULL, 0, sysctl_shmsegs, "", "Current number of shared memory segments allocated"); static int shm_find_segment_by_key(key) key_t key; { int i; for (i = 0; i < shmalloced; i++) if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) && shmsegs[i].u.shm_perm.key == key) return (i); return (-1); } static struct shmid_kernel * shm_find_segment_by_shmid(int shmid) { int segnum; struct shmid_kernel *shmseg; segnum = IPCID_TO_IX(shmid); if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) || shmseg->u.shm_perm.seq != IPCID_TO_SEQ(shmid)) return (NULL); return (shmseg); } static struct shmid_kernel * shm_find_segment_by_shmidx(int segnum) { struct shmid_kernel *shmseg; if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0)) return (NULL); return (shmseg); } static void shm_deallocate_segment(shmseg) struct shmid_kernel *shmseg; { size_t size; GIANT_REQUIRED; vm_object_deallocate(shmseg->u.shm_internal); shmseg->u.shm_internal = NULL; size = round_page(shmseg->u.shm_segsz); shm_committed -= btoc(size); shm_nused--; shmseg->u.shm_perm.mode = SHMSEG_FREE; #ifdef MAC mac_cleanup_sysv_shm(shmseg); #endif } static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s) { struct shmid_kernel *shmseg; int segnum, result; size_t size; GIANT_REQUIRED; segnum = IPCID_TO_IX(shmmap_s->shmid); shmseg = &shmsegs[segnum]; size = round_page(shmseg->u.shm_segsz); result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size); if (result != KERN_SUCCESS) return (EINVAL); shmmap_s->shmid = -1; shmseg->u.shm_dtime = time_second; if ((--shmseg->u.shm_nattch <= 0) && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } return (0); } #ifndef _SYS_SYSPROTO_H_ struct shmdt_args { const void *shmaddr; }; #endif -/* - * MPSAFE - */ int shmdt(td, uap) struct thread *td; struct shmdt_args *uap; { struct proc *p = td->td_proc; struct shmmap_state *shmmap_s; #ifdef MAC struct 
shmid_kernel *shmsegptr; #endif int i; int error = 0; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { error = EINVAL; goto done2; } for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1 && shmmap_s->va == (vm_offset_t)uap->shmaddr) { break; } } if (i == shminfo.shmseg) { error = EINVAL; goto done2; } #ifdef MAC shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)]; error = mac_check_sysv_shmdt(td->td_ucred, shmsegptr); if (error != 0) goto done2; #endif error = shm_delete_mapping(p->p_vmspace, shmmap_s); done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmat_args { int shmid; const void *shmaddr; int shmflg; }; #endif -/* - * MPSAFE - */ int kern_shmat(td, shmid, shmaddr, shmflg) struct thread *td; int shmid; const void *shmaddr; int shmflg; { struct proc *p = td->td_proc; int i, flags; struct shmid_kernel *shmseg; struct shmmap_state *shmmap_s = NULL; vm_offset_t attach_va; vm_prot_t prot; vm_size_t size; int rv; int error = 0; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); for (i = 0; i < shminfo.shmseg; i++) shmmap_s[i].shmid = -1; p->p_vmspace->vm_shm = shmmap_s; } shmseg = shm_find_segment_by_shmid(shmid); if (shmseg == NULL) { error = EINVAL; goto done2; } error = ipcperm(td, &shmseg->u.shm_perm, (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); if (error) goto done2; #ifdef MAC error = mac_check_sysv_shmat(td->td_ucred, shmseg, shmflg); if (error != 0) goto done2; #endif for (i = 0; i < shminfo.shmseg; i++) { if (shmmap_s->shmid == -1) break; shmmap_s++; } if (i >= shminfo.shmseg) { error = EMFILE; goto done2; } size = round_page(shmseg->u.shm_segsz); #ifdef VM_PROT_READ_IS_EXEC prot = VM_PROT_READ | VM_PROT_EXECUTE; #else prot = VM_PROT_READ; #endif if ((shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; flags = MAP_ANON | MAP_SHARED; if (shmaddr) { flags |= MAP_FIXED; if (shmflg & SHM_RND) { attach_va = (vm_offset_t)shmaddr & ~(SHMLBA-1); } else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) { attach_va = (vm_offset_t)shmaddr; } else { error = EINVAL; goto done2; } } else { /* * This is just a hint to vm_map_find() about where to * put it. */ PROC_LOCK(p); attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(p, RLIMIT_DATA)); PROC_UNLOCK(p); } vm_object_reference(shmseg->u.shm_internal); rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->u.shm_internal, 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); if (rv != KERN_SUCCESS) { vm_object_deallocate(shmseg->u.shm_internal); error = ENOMEM; goto done2; } vm_map_inherit(&p->p_vmspace->vm_map, attach_va, attach_va + size, VM_INHERIT_SHARE); shmmap_s->va = attach_va; shmmap_s->shmid = shmid; shmseg->u.shm_lpid = p->p_pid; shmseg->u.shm_atime = time_second; shmseg->u.shm_nattch++; td->td_retval[0] = attach_va; done2: mtx_unlock(&Giant); return (error); } int shmat(td, uap) struct thread *td; struct shmat_args *uap; { return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg); } #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmid_ds { struct ipc_perm shm_perm; /* operation perms */ int shm_segsz; /* size of segment (bytes) */ u_short shm_cpid; /* pid, creator */ u_short shm_lpid; /* pid, last operation */ short shm_nattch; /* no. 
of current attaches */ time_t shm_atime; /* last attach time */ time_t shm_dtime; /* last detach time */ time_t shm_ctime; /* last change time */ void *shm_handle; /* internal handle for shm segment */ }; struct oshmctl_args { int shmid; int cmd; struct oshmid_ds *ubuf; }; -/* - * MPSAFE - */ static int oshmctl(td, uap) struct thread *td; struct oshmctl_args *uap; { #ifdef COMPAT_43 int error = 0; struct shmid_kernel *shmseg; struct oshmid_ds outbuf; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); shmseg = shm_find_segment_by_shmid(uap->shmid); if (shmseg == NULL) { error = EINVAL; goto done2; } switch (uap->cmd) { case IPC_STAT: error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error) goto done2; #ifdef MAC error = mac_check_sysv_shmctl(td->td_ucred, shmseg, uap->cmd); if (error != 0) goto done2; #endif outbuf.shm_perm = shmseg->u.shm_perm; outbuf.shm_segsz = shmseg->u.shm_segsz; outbuf.shm_cpid = shmseg->u.shm_cpid; outbuf.shm_lpid = shmseg->u.shm_lpid; outbuf.shm_nattch = shmseg->u.shm_nattch; outbuf.shm_atime = shmseg->u.shm_atime; outbuf.shm_dtime = shmseg->u.shm_dtime; outbuf.shm_ctime = shmseg->u.shm_ctime; outbuf.shm_handle = shmseg->u.shm_internal; error = copyout(&outbuf, uap->ubuf, sizeof(outbuf)); if (error) goto done2; break; default: error = shmctl(td, (struct shmctl_args *)uap); break; } done2: mtx_unlock(&Giant); return (error); #else return (EINVAL); #endif } #endif #ifndef _SYS_SYSPROTO_H_ struct shmctl_args { int shmid; int cmd; struct shmid_ds *buf; }; #endif -/* - * MPSAFE - */ int kern_shmctl(td, shmid, cmd, buf, bufsz) struct thread *td; int shmid; int cmd; void *buf; size_t *bufsz; { int error = 0; struct shmid_kernel *shmseg; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); switch (cmd) { case IPC_INFO: memcpy(buf, &shminfo, sizeof(shminfo)); if (bufsz) *bufsz = sizeof(shminfo); td->td_retval[0] = shmalloced; goto done2; case SHM_INFO: { struct shm_info shm_info; shm_info.used_ids = shm_nused; shm_info.shm_rss = 0; /*XXX where to get from ? */ shm_info.shm_tot = 0; /*XXX where to get from ? */ shm_info.shm_swp = 0; /*XXX where to get from ? */ shm_info.swap_attempts = 0; /*XXX where to get from ? */ shm_info.swap_successes = 0; /*XXX where to get from ? 
*/ memcpy(buf, &shm_info, sizeof(shm_info)); if (bufsz) *bufsz = sizeof(shm_info); td->td_retval[0] = shmalloced; goto done2; } } if (cmd == SHM_STAT) shmseg = shm_find_segment_by_shmidx(shmid); else shmseg = shm_find_segment_by_shmid(shmid); if (shmseg == NULL) { error = EINVAL; goto done2; } #ifdef MAC error = mac_check_sysv_shmctl(td->td_ucred, shmseg, cmd); if (error != 0) goto done2; #endif switch (cmd) { case SHM_STAT: case IPC_STAT: error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error) goto done2; memcpy(buf, &shmseg->u, sizeof(struct shmid_ds)); if (bufsz) *bufsz = sizeof(struct shmid_ds); if (cmd == SHM_STAT) td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm); break; case IPC_SET: { struct shmid_ds *shmid; shmid = (struct shmid_ds *)buf; error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error) goto done2; shmseg->u.shm_perm.uid = shmid->shm_perm.uid; shmseg->u.shm_perm.gid = shmid->shm_perm.gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & ~ACCESSPERMS) | (shmid->shm_perm.mode & ACCESSPERMS); shmseg->u.shm_ctime = time_second; break; } case IPC_RMID: error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error) goto done2; shmseg->u.shm_perm.key = IPC_PRIVATE; shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->u.shm_nattch <= 0) { shm_deallocate_segment(shmseg); shm_last_free = IPCID_TO_IX(shmid); } break; #if 0 case SHM_LOCK: case SHM_UNLOCK: #endif default: error = EINVAL; break; } done2: mtx_unlock(&Giant); return (error); } int shmctl(td, uap) struct thread *td; struct shmctl_args *uap; { int error = 0; struct shmid_ds buf; size_t bufsz; /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds)))) goto done; } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: case SHM_INFO: case SHM_STAT: case IPC_STAT: error = copyout(&buf, uap->buf, bufsz); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmget_args { key_t key; size_t size; int shmflg; }; #endif static int shmget_existing(td, uap, mode, segnum) struct thread *td; struct shmget_args *uap; int mode; int segnum; { struct shmid_kernel *shmseg; int error; shmseg = &shmsegs[segnum]; if (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) { /* * This segment is in the process of being allocated. Wait * until it's done, and look the key up again (in case the * allocation failed or it was freed). */ shmseg->u.shm_perm.mode |= SHMSEG_WANTED; error = tsleep(shmseg, PLOCK | PCATCH, "shmget", 0); if (error) return (error); return (EAGAIN); } if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return (EEXIST); #ifdef MAC error = mac_check_sysv_shmget(td->td_ucred, shmseg, uap->shmflg); if (error != 0) return (error); #endif if (uap->size && uap->size > shmseg->u.shm_segsz) return (EINVAL); td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } static int shmget_allocate_segment(td, uap, mode) struct thread *td; struct shmget_args *uap; int mode; { int i, segnum, shmid, size; struct ucred *cred = td->td_ucred; struct shmid_kernel *shmseg; vm_object_t shm_object; GIANT_REQUIRED; if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) return (EINVAL); if (shm_nused >= shminfo.shmmni) /* Any shmids left? 
*/ return (ENOSPC); size = round_page(uap->size); if (shm_committed + btoc(size) > shminfo.shmall) return (ENOMEM); if (shm_last_free < 0) { shmrealloc(); /* Maybe expand the shmsegs[] array. */ for (i = 0; i < shmalloced; i++) if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE) break; if (i == shmalloced) return (ENOSPC); segnum = i; } else { segnum = shm_last_free; shm_last_free = -1; } shmseg = &shmsegs[segnum]; /* * In case we sleep in malloc(), mark the segment present but deleted * so that noone else tries to create the same key. */ shmseg->u.shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; shmseg->u.shm_perm.key = uap->key; shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff; shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); /* * We make sure that we have allocated a pager before we need * to. */ if (shm_use_phys) { shm_object = vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0); } else { shm_object = vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0); } VM_OBJECT_LOCK(shm_object); vm_object_clear_flag(shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shm_object, OBJ_NOSPLIT); VM_OBJECT_UNLOCK(shm_object); shmseg->u.shm_internal = shm_object; shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid; shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) | (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_segsz = uap->size; shmseg->u.shm_cpid = td->td_proc->p_pid; shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0; shmseg->u.shm_atime = shmseg->u.shm_dtime = 0; #ifdef MAC mac_create_sysv_shm(cred, shmseg); #endif shmseg->u.shm_ctime = time_second; shm_committed += btoc(size); shm_nused++; if (shmseg->u.shm_perm.mode & SHMSEG_WANTED) { /* * Somebody else wanted this key while we were asleep. Wake * them up now. */ shmseg->u.shm_perm.mode &= ~SHMSEG_WANTED; wakeup(shmseg); } td->td_retval[0] = shmid; return (0); } -/* - * MPSAFE - */ int shmget(td, uap) struct thread *td; struct shmget_args *uap; { int segnum, mode; int error; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); mtx_lock(&Giant); mode = uap->shmflg & ACCESSPERMS; if (uap->key != IPC_PRIVATE) { again: segnum = shm_find_segment_by_key(uap->key); if (segnum >= 0) { error = shmget_existing(td, uap, mode, segnum); if (error == EAGAIN) goto again; goto done2; } if ((uap->shmflg & IPC_CREAT) == 0) { error = ENOENT; goto done2; } } error = shmget_allocate_segment(td, uap, mode); done2: mtx_unlock(&Giant); return (error); } -/* - * MPSAFE - */ int shmsys(td, uap) struct thread *td; /* XXX actually varargs. 
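A minimal userland sketch of the lifecycle these functions implement (illustrative only, not part of this change): shmget() with IPC_PRIVATE takes the shmget_allocate_segment() path above, shmat() with a NULL address lets kern_shmat() pick attach_va, and IPC_RMID sets SHMSEG_REMOVED so the segment is reclaimed once its attach count drops to zero:

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct shmid_ds ds;
	int shmid;
	char *p;

	/* New private segment: the shmget_allocate_segment() path. */
	shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
	if (shmid == -1)
		err(1, "shmget");

	/* NULL address: the kernel chooses where to map it. */
	p = shmat(shmid, NULL, 0);
	if (p == (char *)-1)
		err(1, "shmat");
	strcpy(p, "hello");

	if (shmctl(shmid, IPC_STAT, &ds) == -1)
		err(1, "shmctl");
	printf("attached %d time(s)\n", (int)ds.shm_nattch);

	/* Mark for removal; freed once shm_nattch reaches zero. */
	if (shmctl(shmid, IPC_RMID, NULL) == -1)
		err(1, "shmctl");
	shmdt(p);
	return (0);
}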
*/ struct shmsys_args /* { int which; int a2; int a3; int a4; } */ *uap; { #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) int error; if (!jail_sysvipc_allowed && jailed(td->td_ucred)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) return (EINVAL); mtx_lock(&Giant); error = (*shmcalls[uap->which])(td, &uap->a2); mtx_unlock(&Giant); return (error); #else return (nosys(td, NULL)); #endif } static void shmfork_myhook(p1, p2) struct proc *p1, *p2; { struct shmmap_state *shmmap_s; size_t size; int i; mtx_lock(&Giant); size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); bcopy(p1->p_vmspace->vm_shm, shmmap_s, size); p2->p_vmspace->vm_shm = shmmap_s; for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) if (shmmap_s->shmid != -1) shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; mtx_unlock(&Giant); } static void shmexit_myhook(struct vmspace *vm) { struct shmmap_state *base, *shm; int i; if ((base = vm->vm_shm) != NULL) { vm->vm_shm = NULL; mtx_lock(&Giant); for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) { if (shm->shmid != -1) shm_delete_mapping(vm, shm); } mtx_unlock(&Giant); free(base, M_SHM); } } static void shmrealloc(void) { int i; struct shmid_kernel *newsegs; if (shmalloced >= shminfo.shmmni) return; newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK); if (newsegs == NULL) return; for (i = 0; i < shmalloced; i++) bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); for (; i < shminfo.shmmni; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_init_sysv_shm(&shmsegs[i]); #endif } free(shmsegs, M_SHM); shmsegs = newsegs; shmalloced = shminfo.shmmni; } static void shminit() { int i; TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall); for (i = PAGE_SIZE; i > 0; i--) { shminfo.shmmax = shminfo.shmall * i; if (shminfo.shmmax >= shminfo.shmall) break; } TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin); TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni); TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg); TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys); shmalloced = shminfo.shmmni; shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK); if (shmsegs == NULL) panic("cannot allocate initial memory for sysvshm"); for (i = 0; i < shmalloced; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_init_sysv_shm(&shmsegs[i]); #endif } shm_last_free = 0; shm_nused = 0; shm_committed = 0; shmexit_hook = &shmexit_myhook; shmfork_hook = &shmfork_myhook; } static int shmunload() { #ifdef MAC int i; #endif if (shm_nused > 0) return (EBUSY); #ifdef MAC for (i = 0; i < shmalloced; i++) mac_destroy_sysv_shm(&shmsegs[i]); #endif free(shmsegs, M_SHM); shmexit_hook = NULL; shmfork_hook = NULL; return (0); } static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0]))); } static int sysvshm_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: shminit(); break; case MOD_UNLOAD: error = shmunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvshm_mod = { "sysvshm", &sysvshm_modload, NULL }; SYSCALL_MODULE_HELPER(shmsys); SYSCALL_MODULE_HELPER(shmat); SYSCALL_MODULE_HELPER(shmctl); SYSCALL_MODULE_HELPER(shmdt); SYSCALL_MODULE_HELPER(shmget); DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, 
SI_ORDER_FIRST); MODULE_VERSION(sysvshm, 1); Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c (revision 167210) +++ head/sys/kern/uipc_syscalls.c (revision 167211) @@ -1,2764 +1,2689 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * sendfile(2) and related extensions: * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_sctp.h" #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif /* SCTP */ static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, struct accept_args *uap, int compat); static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); /* * NSFBUFS-related variables and associated sysctls */ int nsfbufs; int nsfbufspeak; int nsfbufsused; SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, "Maximum number of sendfile(2) sf_bufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, "Number of sendfile(2) sf_bufs at peak usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, "Number of sendfile(2) sf_bufs in use"); /* * Convert a user file descriptor to a kernel file entry. A reference on the * file entry is held upon returning. 
This is lighter weight than * fgetsock(), which bumps the socket reference count and drops the file * reference count instead, as this approach avoids several additional mutex operations * associated with the additional reference count. If requested, return the * open file flags. */ static int getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp) { struct file *fp; int error; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_LOCK_FAST(fdp); fp = fget_locked(fdp, fd); if (fp == NULL) error = EBADF; else if (fp->f_type != DTYPE_SOCKET) { fp = NULL; error = ENOTSOCK; } else { fhold(fp); if (fflagp != NULL) *fflagp = fp->f_flag; error = 0; } FILEDESC_UNLOCK_FAST(fdp); } *fpp = fp; return (error); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif -/* - * MPSAFE - */ int socket(td, uap) struct thread *td; register struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; #ifdef MAC error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type, uap->protocol); if (error) return (error); #endif fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ NET_LOCK_GIANT(); error = socreate(uap->domain, &so, uap->type, uap->protocol, td->td_ucred, td); NET_UNLOCK_GIANT(); if (error) { fdclose(fdp, fp, fd, td); } else { FILEDESC_LOCK_FAST(fdp); fp->f_data = so; /* already has ref count */ fp->f_flag = FREAD|FWRITE; fp->f_ops = &socketops; fp->f_type = DTYPE_SOCKET; FILEDESC_UNLOCK_FAST(fdp); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } -/* - * MPSAFE - */ /* ARGSUSED */ int bind(td, uap) struct thread *td; register struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) return (error); error = kern_bind(td, uap->s, sa); free(sa, M_SONAME); return (error); } int kern_bind(td, fd, sa) struct thread *td; int fd; struct sockaddr *sa; { struct socket *so; struct file *fp; int error; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) goto done2; so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_bind(td->td_ucred, so, sa); SOCK_UNLOCK(so); if (error) goto done1; #endif error = sobind(so, sa, td); #ifdef MAC done1: #endif fdrop(fp, td); done2: NET_UNLOCK_GIANT(); return (error); } -/* - * MPSAFE - */ /* ARGSUSED */ int listen(td, uap) struct thread *td; register struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; struct file *fp; int error; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_listen(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto done; #endif error = solisten(so, uap->backlog, td); #ifdef MAC done: #endif fdrop(fp, td); } NET_UNLOCK_GIANT(); return(error); } /* * accept1() - * MPSAFE */ static int accept1(td, uap, compat) struct thread *td; register struct accept_args /* { int s; struct sockaddr * __restrict name; socklen_t * __restrict anamelen; } */ *uap; int compat; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uap->name == NULL) return (kern_accept(td, uap->s, NULL, NULL, NULL)); error = copyin(uap->anamelen, &namelen, sizeof (namelen)); if (error) return (error); error = kern_accept(td, uap->s, &name,
&namelen, &fp); /* * return a namelen of zero for older code which might * ignore the return value from accept. */ if (error) { (void) copyout(&namelen, uap->anamelen, sizeof(*uap->anamelen)); return (error); } if (error == 0 && name != NULL) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uap->name, namelen); } if (error == 0) error = copyout(&namelen, uap->anamelen, sizeof(namelen)); if (error) fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { struct filedesc *fdp; struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; int error; struct socket *head, *so; int fd; u_int fflag; pid_t pgid; int tmp; if (name) { *name = NULL; if (*namelen < 0) return (EINVAL); } fdp = td->td_proc->p_fd; NET_LOCK_GIANT(); error = getsock(fdp, s, &headfp, &fflag); if (error) goto done2; head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC SOCK_LOCK(head); error = mac_check_socket_accept(td->td_ucred, head); SOCK_UNLOCK(head); if (error != 0) goto done; #endif error = falloc(td, &nfp, &fd); if (error) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); FILE_LOCK(nfp); nfp->f_data = so; /* nfp has ref count from falloc */ nfp->f_flag = fflag; nfp->f_ops = &socketops; nfp->f_type = DTYPE_SOCKET; FILE_UNLOCK(nfp); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; error = soaccept(so, &sa); if (error) { /* * return a namelen of zero for older code which might * ignore the return value from accept. 
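The SS_NBIO branch above is what a non-blocking userland accept loop sees as EWOULDBLOCK. A hedged sketch (illustrative only; try_accept() is a hypothetical helper, not part of this change):

#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>
#include <errno.h>
#include <stdio.h>

/*
 * Try one non-blocking accept on a listening socket.  Returns the new
 * descriptor, or -1 if no completed connection was queued (the
 * EWOULDBLOCK case in kern_accept() above).
 */
static int
try_accept(int lsock)
{
	struct sockaddr_storage ss;
	socklen_t len = sizeof(ss);
	int fd;

	fd = accept(lsock, (struct sockaddr *)&ss, &len);
	if (fd == -1) {
		if (errno == EWOULDBLOCK)
			return (-1);
		err(1, "accept");
	}
	printf("accepted fd %d\n", fd);
	return (fd);
}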
*/ if (name) *namelen = 0; goto noconnection; } if (sa == NULL) { if (name) *namelen = 0; goto done; } if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; *name = sa; sa = NULL; } noconnection: if (sa) FREE(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); done2: NET_UNLOCK_GIANT(); return (error); } -/* - * MPSAFE (accept1() is MPSAFE) - */ int accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK -/* - * MPSAFE (accept1() is MPSAFE) - */ int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ -/* - * MPSAFE - */ /* ARGSUSED */ int connect(td, uap) struct thread *td; register struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error) return (error); error = kern_connect(td, uap->s, sa); free(sa, M_SONAME); return (error); } int kern_connect(td, fd, sa) struct thread *td; int fd; struct sockaddr *sa; { struct socket *so; struct file *fp; int error; int interrupted = 0; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) goto done2; so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_connect(td->td_ucred, so, sa); SOCK_UNLOCK(so); if (error) goto bad; #endif error = soconnect(so, sa, td); if (error) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); if (error) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); done2: NET_UNLOCK_GIANT(); return (error); } -/* - * MPSAFE - */ int socketpair(td, uap) struct thread *td; register struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type, uap->protocol); if (error) return (error); #endif NET_LOCK_GIANT(); error = socreate(uap->domain, &so1, uap->type, uap->protocol, td->td_ucred, td); if (error) goto done2; error = socreate(uap->domain, &so2, uap->type, uap->protocol, td->td_ucred, td); if (error) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. 
*/ error = falloc(td, &fp1, &fd); if (error) goto free2; sv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd); if (error) goto free3; fp2->f_data = so2; /* so2 already has ref count */ sv[1] = fd; error = soconnect2(so1, so2); if (error) goto free4; if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error) goto free4; } FILE_LOCK(fp1); fp1->f_flag = FREAD|FWRITE; fp1->f_ops = &socketops; fp1->f_type = DTYPE_SOCKET; FILE_UNLOCK(fp1); FILE_LOCK(fp2); fp2->f_flag = FREAD|FWRITE; fp2->f_ops = &socketops; fp2->f_type = DTYPE_SOCKET; FILE_UNLOCK(fp2); error = copyout(sv, uap->rsv, 2 * sizeof (int)); fdrop(fp1, td); fdrop(fp2, td); goto done2; free4: fdclose(fdp, fp2, sv[1], td); fdrop(fp2, td); free3: fdclose(fdp, fp1, sv[0], td); fdrop(fp1, td); free2: (void)soclose(so2); free1: (void)soclose(so1); done2: NET_UNLOCK_GIANT(); return (error); } static int sendit(td, s, mp, flags) register struct thread *td; int s; register struct msghdr *mp; int flags; { struct mbuf *control; struct sockaddr *to; int error; if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { register struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_TRYWAIT); if (control == 0) { error = ENOBUFS; goto bad; } else { cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: if (to) FREE(to, M_SONAME); return (error); } int kern_sendit(td, s, mp, flags, control, segflg) struct thread *td; int s; struct msghdr *mp; int flags; struct mbuf *control; enum uio_seg segflg; { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; int i; int len, error; #ifdef KTRACE struct uio *ktruio = NULL; #endif NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error) goto bad2; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto bad; #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif 
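Since kern_sendit() above simply totals msg_iov into uio_resid and hands the result to sosend(), a userland gather send maps onto it directly. An illustrative sketch (not part of this change), using a PF_LOCAL stream pair as created by socketpair() above:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct msghdr msg;
	struct iovec iov[2];
	char buf[32];
	ssize_t n;
	int sv[2];

	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, sv) == -1)
		err(1, "socketpair");

	/* Two buffers gathered into a single send. */
	iov[0].iov_base = "hello ";
	iov[0].iov_len = 6;
	iov[1].iov_base = "world";
	iov[1].iov_len = 5;
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = iov;
	msg.msg_iovlen = 2;
	if (sendmsg(sv[0], &msg, 0) == -1)
		err(1, "sendmsg");

	if ((n = read(sv[1], buf, sizeof(buf) - 1)) == -1)
		err(1, "read");
	buf[n] = '\0';
	printf("got \"%s\"\n", buf);

	close(sv[0]);
	close(sv[1]);
	return (0);
}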
bad: fdrop(fp, td); bad2: NET_UNLOCK_GIANT(); return (error); } -/* - * MPSAFE - */ int sendto(td, uap) struct thread *td; register struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; error = sendit(td, uap->s, &msg, uap->flags); return (error); } #ifdef COMPAT_OLDSOCK -/* - * MPSAFE - */ int osend(td, uap) struct thread *td; register struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; error = sendit(td, uap->s, &msg, uap->flags); return (error); } -/* - * MPSAFE - */ int osendmsg(td, uap) struct thread *td; struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif -/* - * MPSAFE - */ int sendmsg(td, uap) struct thread *td; struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(td, s, mp, fromseg, controlp) struct thread *td; int s; struct msghdr *mp; enum uio_seg fromseg; struct mbuf **controlp; { struct uio auio; struct iovec *iov; int i; socklen_t len; int error; struct mbuf *m, *control = 0; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = 0; #ifdef KTRACE struct uio *ktruio = NULL; #endif if(controlp != NULL) *controlp = 0; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error) { NET_UNLOCK_GIANT(); return (error); } so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_receive(td->td_ucred, so); SOCK_UNLOCK(so); if (error) { fdrop(fp, td); NET_UNLOCK_GIANT(); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); NET_UNLOCK_GIANT(); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, (struct mbuf **)0, (mp->msg_control || controlp) ? 
&control : (struct mbuf **)0, &mp->msg_flags); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = (int)len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error) goto out; td->td_retval[0] = (int)len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); NET_UNLOCK_GIANT(); if (fromsa) FREE(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(td, s, mp, namelenp) struct thread *td; int s; struct msghdr *mp; void *namelenp; { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error) return (error); if (namelenp) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } -/* - * MPSAFE - */ int recvfrom(td, uap) struct thread *td; register struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; struct sockaddr * __restrict from; socklen_t * __restrict fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return(error); } #ifdef COMPAT_OLDSOCK -/* - * MPSAFE - */ int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK -/* - * MPSAFE - */ int orecv(td, uap) struct thread *td; register struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov 
= &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, NULL); return (error); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. - * - * MPSAFE */ int orecvmsg(td, uap) struct thread *td; struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif -/* - * MPSAFE - */ int recvmsg(td, uap) struct thread *td; struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } -/* - * MPSAFE - */ /* ARGSUSED */ int shutdown(td, uap) struct thread *td; register struct shutdown_args /* { int s; int how; } */ *uap; { struct socket *so; struct file *fp; int error; NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); fdrop(fp, td); } NET_UNLOCK_GIANT(); return (error); } -/* - * MPSAFE - */ /* ARGSUSED */ int setsockopt(td, uap) struct thread *td; register struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ *uap; { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t valsize; { int error; struct socket *so; struct file *fp; struct sockopt sopt; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } NET_UNLOCK_GIANT(); return(error); } -/* - * MPSAFE - */ /* ARGSUSED */ int getsockopt(td, uap) struct thread *td; register struct getsockopt_args /* { int s; int level; int name; void * __restrict val; socklen_t * __restrict avalsize; } */ *uap; { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, 
UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or kernel address. optlen is always a kernel pointer. */ int kern_getsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t *valsize; { int error; struct socket *so; struct file *fp; struct sockopt sopt; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } NET_UNLOCK_GIANT(); return (error); } /* * getsockname1() - Get socket name. - * - * MPSAFE */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; register struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; if (*alen < 0) return (EINVAL); NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) goto done; so = fp->f_data; *sa = NULL; error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); if (error) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; bad: fdrop(fp, td); if (error && *sa) { free(*sa, M_SONAME); *sa = NULL; } done: NET_UNLOCK_GIANT(); return (error); } -/* - * MPSAFE - */ int getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK -/* - * MPSAFE - */ int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket.
- * - * MPSAFE */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; register struct getpeername_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; if (*alen < 0) return (EINVAL); NET_LOCK_GIANT(); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) goto done2; so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done1; } *sa = NULL; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); if (error) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; bad: if (error && *sa) { free(*sa, M_SONAME); *sa = NULL; } done1: fdrop(fp, td); done2: NET_UNLOCK_GIANT(); return (error); } -/* - * MPSAFE - */ int getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK -/* - * MPSAFE - */ int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { register struct sockaddr *sa; register struct mbuf *m; int error; if ((u_int)buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && (u_int)buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if ((u_int)buflen > MCLBYTES) return (EINVAL); } m = m_get(M_TRYWAIT, type); if (m == NULL) return (ENOBUFS); if ((u_int)buflen > MLEN) { MCLGET(m, M_TRYWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (ENOBUFS); } } m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error) { FREE(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } /* * Detach mapped page and release resources back to the system. */ void sf_buf_mext(void *addr, void *args) { vm_page_t m; m = sf_buf_page(args); sf_buf_free(args); vm_page_lock_queues(); vm_page_unwire(m, 0); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. 
* If so, we're responsible for freeing the page. */ if (m->wire_count == 0 && m->object == NULL) vm_page_free(m); vm_page_unlock_queues(); } /* * sendfile(2) * - * MPSAFE - * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket - * specified by 's'. Send only 'nbytes' of the file or until EOF if - * nbytes == 0. Optionally add a header and/or trailer to the socket - * output. If specified, write the total number of bytes sent into *sbytes. + * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == + * 0. Optionally add a header and/or trailer to the socket output. If + * specified, write the total number of bytes sent into *sbytes. * */ int sendfile(struct thread *td, struct sendfile_args *uap) { return (do_sendfile(td, uap, 0)); } static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; int error; hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error) goto out; } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error) goto out; } } error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat); out: if (hdr_uio) free(hdr_uio, M_IOV); if (trl_uio) free(trl_uio, M_IOV); return (error); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (do_sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ int kern_sendfile(struct thread *td, struct sendfile_args *uap, struct uio *hdr_uio, struct uio *trl_uio, int compat) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj = NULL; struct socket *so = NULL; struct mbuf *m = NULL; struct sf_buf *sf; struct vm_page *pg; off_t off, xfsize, sbytes = 0, rem = 0; int error, mnw = 0; int vfslocked; NET_LOCK_GIANT(); /* * The file descriptor must be a regular file and have a * backing VM object. * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) goto out; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); obj = vp->v_object; if (obj != NULL) { /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ VM_OBJECT_LOCK(obj); if ((obj->flags & OBJ_DEAD) == 0) { vm_object_reference_locked(obj); VM_OBJECT_UNLOCK(obj); } else { VM_OBJECT_UNLOCK(obj); obj = NULL; } } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); if (obj == NULL) { error = EINVAL; goto out; } if (uap->offset < 0) { error = EINVAL; goto out; } /* * The socket must be a stream socket and connected. * Remember if it is a blocking or non-blocking socket. */ if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp, NULL)) != 0) goto out; so = sock_fp->f_data; if (so->so_type != SOCK_STREAM) { error = EINVAL; goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto out; } /* * Do not wait on memory allocations but return EAGAIN for the * caller to retry later. * XXX: Experimental.
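 * A hypothetical caller-side sketch (illustrative only, not from this
 * file) of retrying when SF_MNOWAIT makes an allocation fail soft:
 *
 *	while (sendfile(fd, s, off, len, NULL, &sb, SF_MNOWAIT) == -1 &&
 *	    errno == EAGAIN)
 *		usleep(1000);	(back off until mbufs drain)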
*/ if (uap->flags & SF_MNOWAIT) mnw = 1; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto out; #endif /* If headers are specified copy them into mbufs. */ if (hdr_uio != NULL) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; if (hdr_uio->uio_resid > 0) { /* * In FBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (compat) { if (uap->nbytes > hdr_uio->uio_resid) uap->nbytes -= hdr_uio->uio_resid; else uap->nbytes = 0; } m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 0, 0, 0); if (m == NULL) { error = mnw ? EAGAIN : ENOBUFS; goto out; } } } /* Protect against multiple writers to the socket. */ SOCKBUF_LOCK(&so->so_snd); (void) sblock(&so->so_snd, M_WAITOK); SOCKBUF_UNLOCK(&so->so_snd); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = uap->offset; ; ) { int loopbytes = 0; int space = 0; int done = 0; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ while(space > loopbytes) { vm_pindex_t pindex; vm_offset_t pgoff; struct mbuf *m0; VM_OBJECT_LOCK(obj); /* * Calculate the amount to transfer. * Not to exceed a page, the EOF, * or the passed in nbytes. 
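 * Worked example (hypothetical numbers): with PAGE_SIZE 4096 and
 * off = 6144, pgoff is 2048, so at most PAGE_SIZE - pgoff = 2048 bytes
 * can come from this page; xfsize is then clipped further by the bytes
 * remaining to EOF and, when nbytes is nonzero, by what is left of it.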
*/ pgoff = (vm_offset_t)(off & PAGE_MASK); xfsize = omin(PAGE_SIZE - pgoff, obj->un_pager.vnp.vnp_size - off - sbytes - loopbytes); if (uap->nbytes) rem = (uap->nbytes - sbytes - loopbytes); else rem = obj->un_pager.vnp.vnp_size - off - sbytes - loopbytes; xfsize = omin(rem, xfsize); if (xfsize <= 0) { VM_OBJECT_UNLOCK(obj); done = 1; /* all data sent */ break; } /* * Don't overflow the send buffer. * Stop here and send out what we've * already got. */ if (space < loopbytes + xfsize) { VM_OBJECT_UNLOCK(obj); break; } retry_lookup: /* * Attempt to look up the page. * Allocate if not found or * wait and loop if busy. */ pindex = OFF_TO_IDX(off); pg = vm_page_lookup(obj, pindex); if (pg == NULL) { pg = vm_page_alloc(obj, pindex, VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); if (pg == NULL) { VM_OBJECT_UNLOCK(obj); VM_WAIT; VM_OBJECT_LOCK(obj); goto retry_lookup; } } else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy")) goto retry_lookup; else { /* * Wire the page so it does not get * ripped out from under us. */ vm_page_lock_queues(); vm_page_wire(pg); vm_page_unlock_queues(); } /* * Check if page is valid for what we need, * otherwise initiate I/O. * If we already turned some pages into mbufs, * send them off before we come here again and * block. */ if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) VM_OBJECT_UNLOCK(obj); else if (m != NULL) error = EAGAIN; /* send what we already got */ else if (uap->flags & SF_NODISKIO) error = EBUSY; else { int bsize, resid; /* * Ensure that our page is still around * when the I/O completes. */ vm_page_io_start(pg); VM_OBJECT_UNLOCK(obj); /* * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_SHARED | LK_RETRY, td); /* * XXXMAC: Because we don't have fp->f_cred * here, we pass in NOCRED. This is probably * wrong, but is consistent with our original * implementation. */ error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE, trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); VM_OBJECT_LOCK(obj); vm_page_io_finish(pg); if (!error) VM_OBJECT_UNLOCK(obj); mbstat.sf_iocnt++; } if (error) { vm_page_lock_queues(); vm_page_unwire(pg, 0); /* * See if anyone else might know about * this page. If not and it is not valid, * then free it. */ if (pg->wire_count == 0 && pg->valid == 0 && pg->busy == 0 && !(pg->oflags & VPO_BUSY) && pg->hold_count == 0) { vm_page_free(pg); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (error == EAGAIN) error = 0; /* not a real error */ break; } /* * Get a sendfile buf. We usually wait as long * as necessary, but this wait can be interrupted. */ if ((sf = sf_buf_alloc(pg, (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) { mbstat.sf_allocfail++; vm_page_lock_queues(); vm_page_unwire(pg, 0); /* * XXX: Not same check as above!? */ if (pg->wire_count == 0 && pg->object == NULL) vm_page_free(pg); vm_page_unlock_queues(); error = (mnw ? EAGAIN : EINTR); break; } /* * Get an mbuf and set it up as having * external storage. */ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); if (m0 == NULL) { error = (mnw ? EAGAIN : ENOBUFS); sf_buf_mext((void *)sf_buf_kva(sf), sf); break; } MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY, EXT_SFBUF); m0->m_data = (char *)sf_buf_kva(sf) + pgoff; m0->m_len = xfsize; /* Append to mbuf chain. */ if (m != NULL) m_cat(m, m0); else m = m0; /* Keep track of bits processed. 
*/ loopbytes += xfsize; off += xfsize; } /* Add the buffer chain to the socket buffer. */ if (m != NULL) { int mlen; mlen = m_length(m, NULL); SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } SOCKBUF_UNLOCK(&so->so_snd); error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); if (!error) sbytes += mlen; m = NULL; /* pru_send always consumes */ } /* Quit outer loop on error or when we're done. */ if (error || done) goto done; } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { error = kern_writev(td, uap->s, trl_uio); if (error) goto done; sbytes += td->td_retval[0]; } done: SOCKBUF_LOCK(&so->so_snd); sbunlock(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); } if (obj != NULL) vm_object_deallocate(obj); if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } if (so) fdrop(sock_fp, td); if (m) m_freem(m); NET_UNLOCK_GIANT(); if (error == ERESTART) error = EINTR; return (error); } /* * SCTP syscalls. * Functionality only compiled in if SCTP is defined in the kernel Makefile, * otherwise all return EOPNOTSUPP. * XXX: We should make this loadable one day. */ int sctp_peeloff(td, uap) struct thread *td; struct sctp_peeloff_args /* { int sd; caddr_t name; } */ *uap; { #ifdef SCTP struct filedesc *fdp; struct file *nfp = NULL; int error; struct socket *head, *so; int fd; u_int fflag; fdp = td->td_proc->p_fd; error = fgetsock(td, uap->sd, &head, &fflag); if (error) goto done2; error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error) goto done2; /* * At this point we know we do have an assoc to pull; * we proceed to get the fd set up. This may block * but that is ok. */ error = falloc(td, &nfp, &fd); if (error) goto done; td->td_retval[0] = fd; so = sonewconn(head, SS_ISCONNECTED); if (so == NULL) goto noconnection; /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); soref(so); /* file descriptor reference */ SOCK_UNLOCK(so); ACCEPT_LOCK(); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_state &= ~SS_NOFDREF; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; ACCEPT_UNLOCK(); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); FILE_LOCK(nfp); nfp->f_data = so; nfp->f_flag = fflag; nfp->f_ops = &socketops; nfp->f_type = DTYPE_SOCKET; FILE_UNLOCK(nfp); noconnection: /* * Close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning.
*/ done: if (nfp != NULL) fdrop(nfp, td); fputsock(head); done2: return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_sendmsg (td, uap) struct thread *td; struct sctp_generic_sendmsg_args /* { int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #ifdef SCTP struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp; int use_rcvinfo = 1; int error = 0, len; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error) return (error); u_sinfo = &sinfo; } if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } } error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) goto sctp_bad; iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; len = auio.uio_resid = uap->mlen; error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, use_rcvinfo, u_sinfo, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: fdrop(fp, td); sctp_bad2: if (to) free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_sendmsg_iov(td, uap) struct thread *td; struct sctp_generic_sendmsg_iov_args /* { int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #ifdef SCTP struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp; int use_rcvinfo = 1; int error=0, len, i; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec *iov, *tiov; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error) return (error); u_sinfo = &sinfo; } if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } } error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) goto sctp_bad1; error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error) goto sctp_bad1; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i < uap->iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto sctp_bad; } } len = auio.uio_resid; error = sctp_lower_sosend(so, to, &auio,
(struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, use_rcvinfo, u_sinfo, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: free(iov, M_IOV); sctp_bad1: fdrop(fp, td); sctp_bad2: if (to) free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_recvmsg(td, uap) struct thread *td; struct sctp_generic_recvmsg_args /* { int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags } */ *uap; { #ifdef SCTP u_int8_t sockbufstore[256]; struct uio auio; struct iovec *iov, *tiov; struct sctp_sndrcvinfo sinfo; struct socket *so; struct file *fp; struct sockaddr *fromsa; int fromlen; int len, i, msg_flags; int error = 0; #ifdef KTRACE struct uio *ktruio = NULL; #endif error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) { return (error); } error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error) { goto out1; } so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_check_socket_receive(td->td_ucred, so); SOCK_UNLOCK(so); if (error) { goto out; } #endif /* MAC */ if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); if (error) { goto out; } } else { fromlen = 0; } if(uap->msg_flags) { error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); if (error) { goto out; } } else { msg_flags = 0; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i < uap->iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto out; } } len = auio.uio_resid; fromsa = (struct sockaddr *)sockbufstore; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, fromsa, fromlen, &msg_flags, (struct sctp_sndrcvinfo *)&sinfo, 1); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } else { if (uap->sinfo) error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = (int)len - auio.uio_resid; ktrgenio(uap->sd, UIO_READ, ktruio, error); } #endif /* KTRACE */ if (error) goto out; td->td_retval[0] = (int)len - auio.uio_resid; if (fromlen && uap->from) { len = fromlen; if (len <= 0 || fromsa == 0) len = 0; else { len = MIN(len, fromsa->sa_len); error = copyout(fromsa, uap->from, (unsigned)len); if (error) goto out; } error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t)); if (error) { goto out; } } if (uap->msg_flags) { error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); if (error) { goto out; } } out: free(iov, M_IOV); out1: fdrop(fp, td); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } Index: head/sys/kern/vfs_acl.c =================================================================== --- head/sys/kern/vfs_acl.c (revision
167210) +++ head/sys/kern/vfs_acl.c (revision 167211) @@ -1,455 +1,431 @@ /*- * Copyright (c) 1999-2006 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL system calls and other functions common across different ACL types. * Type-specific routines go into subr_acl_.c. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uma_zone_t acl_zone; static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * These calls wrap the real vnode operations, and are called by the syscall * code once the syscall has converted the path or file descriptor to a vnode * (unlocked). The aclp pointer is assumed still to point to userland, so * this should not be consumed within the kernel except by syscall code. * Other code should directly invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. */ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; struct mount *mp; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_setacl(td->td_ucred, vp, type, &inkernacl); if (error != 0) goto out; #endif error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return(error); } /* * Given a vnode, get its ACL. 
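 * (Call-chain sketch, from the syscalls below: __acl_get_file() or
 * __acl_get_fd() resolves a path or descriptor to a vnode, calls
 * vacl_get_acl(), which locks the vnode, invokes VOP_GETACL(), and
 * copies the result out to the userland aclp.)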
*/ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_getacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_vnode_deleteacl(td->td_ucred, vp, type); if (error) goto out; #endif error = VOP_SETACL(vp, type, 0, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't * need to lock, as the vacl_ code will get/release any locks required. */ /* * Given a file path, get an ACL for it - * - * MPSAFE */ int __acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, get an ACL for it; don't follow links. - * - * MPSAFE */ int __acl_get_link(struct thread *td, struct __acl_get_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* - * Given a file path, set an ACL for it - * - * MPSAFE + * Given a file path, set an ACL for it. */ int __acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, set an ACL for it; don't follow links. 
- * - * MPSAFE */ int __acl_set_link(struct thread *td, struct __acl_set_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file descriptor, get an ACL for it - * - * MPSAFE */ int __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file descriptor, set an ACL for it - * - * MPSAFE */ int __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* * Given a file path, delete an ACL from it. - * - * MPSAFE */ int __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, delete an ACL from it; don't follow links. - * - * MPSAFE */ int __acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, uap->type); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, delete an ACL from it. - * - * MPSAFE */ int __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_delete(td, fp->f_vnode, uap->type); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* - * Given a file path, check an ACL for it - * - * MPSAFE + * Given a file path, check an ACL for it. */ int __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Given a file path, check an ACL for it; don't follow links. 
- * - * MPSAFE */ int __acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) { struct nameidata nd; int vfslocked, error; NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp); NDFREE(&nd, 0); } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* - * Given a file descriptor, check an ACL for it - * - * MPSAFE + * Given a file descriptor, check an ACL for it. */ int __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int vfslocked, error; error = getvnode(td->td_proc->p_fd, uap->filedes, &fp); if (error == 0) { vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); } return (error); } /* ARGUSED */ static void aclinit(void *dummy __unused) { acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL) Index: head/sys/nfsserver/nfs_syscalls.c =================================================================== --- head/sys/nfsserver/nfs_syscalls.c (revision 167210) +++ head/sys/nfsserver/nfs_syscalls.c (revision 167211) @@ -1,760 +1,757 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)nfs_syscalls.c 8.5 (Berkeley) 3/30/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NFSSVC, "nfsserver_srvsock", "Nfs server structure"); MALLOC_DEFINE(M_NFSRVDESC, "nfsserver_srvdesc", "NFS server socket descriptor"); MALLOC_DEFINE(M_NFSD, "nfsserver_daemon", "Nfs server daemon structure"); #define TRUE 1 #define FALSE 0 SYSCTL_DECL(_vfs_nfsrv); int nfsd_waiting = 0; int nfsrv_numnfsd = 0; static int notstarted = 1; static int nfs_privport = 0; SYSCTL_INT(_vfs_nfsrv, NFS_NFSPRIVPORT, nfs_privport, CTLFLAG_RW, &nfs_privport, 0, ""); SYSCTL_INT(_vfs_nfsrv, OID_AUTO, gatherdelay, CTLFLAG_RW, &nfsrvw_procrastinate, 0, ""); SYSCTL_INT(_vfs_nfsrv, OID_AUTO, gatherdelay_v3, CTLFLAG_RW, &nfsrvw_procrastinate_v3, 0, ""); static int nfssvc_addsock(struct file *, struct sockaddr *, struct thread *); static void nfsrv_zapsock(struct nfssvc_sock *slp); static int nfssvc_nfsd(struct thread *); /* * NFS server system calls */ /* * Nfs server pseudo system call for the nfsd's * Based on the flag value it either: * - adds a socket to the selection list * - remains in the kernel as an nfsd * - remains in the kernel as an nfsiod * For INET6 we suppose that nfsd provides only IN6P_IPV6_V6ONLY sockets * and that mountd provides * - sockaddr with no IPv4-mapped addresses * - mask for both INET and INET6 families if there is IPv4-mapped overlap */ #ifndef _SYS_SYSPROTO_H_ struct nfssvc_args { int flag; caddr_t argp; }; #endif -/* - * MPSAFE - */ int nfssvc(struct thread *td, struct nfssvc_args *uap) { struct file *fp; struct sockaddr *nam; struct nfsd_args nfsdarg; int error; KASSERT(!mtx_owned(&Giant), ("nfssvc(): called with Giant")); #ifdef MAC error = mac_check_system_nfsd(td->td_ucred); if (error) return (error); #endif error = priv_check(td, PRIV_NFSD); if (error) return (error); NET_LOCK_GIANT(); NFSD_LOCK(); while (nfssvc_sockhead_flag & SLP_INIT) { nfssvc_sockhead_flag |= SLP_WANTINIT; (void) msleep(&nfssvc_sockhead, &nfsd_mtx, PSOCK, "nfsd init", 0); } NFSD_UNLOCK(); if (uap->flag & NFSSVC_ADDSOCK) { error = copyin(uap->argp, (caddr_t)&nfsdarg, sizeof(nfsdarg)); if (error) goto done2; if ((error = fget(td, nfsdarg.sock, &fp)) != 0) goto done2; if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); goto done2; } /* * Get the client address for connected sockets. */ if (nfsdarg.name == NULL || nfsdarg.namelen == 0) nam = NULL; else { error = getsockaddr(&nam, nfsdarg.name, nfsdarg.namelen); if (error) { fdrop(fp, td); goto done2; } } error = nfssvc_addsock(fp, nam, td); fdrop(fp, td); } else if (uap->flag & NFSSVC_NFSD) { error = nfssvc_nfsd(td); } else { error = ENXIO; } if (error == EINTR || error == ERESTART) error = 0; done2: NET_UNLOCK_GIANT(); return (error); } /* * Adds a socket to the list for servicing by nfsds. */ static int nfssvc_addsock(struct file *fp, struct sockaddr *mynam, struct thread *td) { int siz; struct nfssvc_sock *slp; struct socket *so; int error, s; NET_ASSERT_GIANT(); so = fp->f_data; #if 0 /* * XXXRW: If this code is ever enabled, there's a race when running * MPSAFE. */ tslp = NULL; /* * Add it to the list, as required.
*/ if (so->so_proto->pr_protocol == IPPROTO_UDP) { tslp = nfs_udpsock; if (tslp->ns_flag & SLP_VALID) { if (mynam != NULL) FREE(mynam, M_SONAME); return (EPERM); } } #endif if (so->so_type == SOCK_STREAM) siz = NFS_MAXPACKET + sizeof (u_long); else siz = NFS_MAXPACKET; error = soreserve(so, siz, siz); if (error) { if (mynam != NULL) FREE(mynam, M_SONAME); return (error); } /* * Set protocol specific options { for now TCP only } and * reserve some space. For datagram sockets, this can get called * repeatedly for the same socket, but that isn't harmful. */ if (so->so_type == SOCK_STREAM) { struct sockopt sopt; int val; bzero(&sopt, sizeof sopt); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = SOL_SOCKET; sopt.sopt_name = SO_KEEPALIVE; sopt.sopt_val = &val; sopt.sopt_valsize = sizeof val; val = 1; sosetopt(so, &sopt); } if (so->so_proto->pr_protocol == IPPROTO_TCP) { struct sockopt sopt; int val; bzero(&sopt, sizeof sopt); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = &val; sopt.sopt_valsize = sizeof val; val = 1; sosetopt(so, &sopt); } SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_NOINTR; so->so_rcv.sb_timeo = 0; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_NOINTR; so->so_snd.sb_timeo = 0; SOCKBUF_UNLOCK(&so->so_snd); slp = (struct nfssvc_sock *) malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK | M_ZERO); STAILQ_INIT(&slp->ns_rec); NFSD_LOCK(); TAILQ_INSERT_TAIL(&nfssvc_sockhead, slp, ns_chain); slp->ns_so = so; slp->ns_nam = mynam; fhold(fp); slp->ns_fp = fp; /* * XXXRW: Socket locking here? */ s = splnet(); so->so_upcallarg = (caddr_t)slp; so->so_upcall = nfsrv_rcv; SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_UPCALL; SOCKBUF_UNLOCK(&so->so_rcv); slp->ns_flag = (SLP_VALID | SLP_NEEDQ); nfsrv_wakenfsd(slp); splx(s); NFSD_UNLOCK(); return (0); } /* * Called by nfssvc() for nfsds. Just loops around servicing rpc requests * until it is killed by a signal. */ static int nfssvc_nfsd(struct thread *td) { int siz; struct nfssvc_sock *slp; struct nfsd *nfsd; struct nfsrv_descript *nd = NULL; struct mbuf *m, *mreq; int error = 0, cacherep, s, sotype, writes_todo; int procrastinate; u_quad_t cur_usec; NET_ASSERT_GIANT(); #ifndef nolint cacherep = RC_DOIT; writes_todo = 0; #endif nfsd = (struct nfsd *) malloc(sizeof (struct nfsd), M_NFSD, M_WAITOK | M_ZERO); s = splnet(); NFSD_LOCK(); nfsd->nfsd_td = td; TAILQ_INSERT_TAIL(&nfsd_head, nfsd, nfsd_chain); nfsrv_numnfsd++; /* * Loop getting rpc requests until SIGKILL. 
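 * (Shape of the loop below, for orientation: sleep until a socket
 * needs service; claim a ready nfssvc_sock and pull a request with
 * nfsrv_dorec(); dispatch it through nfsrv3_procs[] or
 * nfsrv_writegather(); send the reply, then drain any write RPCs
 * whose gather timers have expired.)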
*/ for (;;) { if ((nfsd->nfsd_flag & NFSD_REQINPROG) == 0) { while (nfsd->nfsd_slp == NULL && (nfsd_head_flag & NFSD_CHECKSLP) == 0) { nfsd->nfsd_flag |= NFSD_WAITING; nfsd_waiting++; error = msleep(nfsd, &nfsd_mtx, PSOCK | PCATCH, "-", 0); nfsd_waiting--; if (error) goto done; } if (nfsd->nfsd_slp == NULL && (nfsd_head_flag & NFSD_CHECKSLP) != 0) { TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) { if ((slp->ns_flag & (SLP_VALID | SLP_DOREC)) == (SLP_VALID | SLP_DOREC)) { slp->ns_flag &= ~SLP_DOREC; slp->ns_sref++; nfsd->nfsd_slp = slp; break; } } if (slp == NULL) nfsd_head_flag &= ~NFSD_CHECKSLP; } if ((slp = nfsd->nfsd_slp) == NULL) continue; if (slp->ns_flag & SLP_VALID) { if (slp->ns_flag & SLP_DISCONN) nfsrv_zapsock(slp); else if (slp->ns_flag & SLP_NEEDQ) { slp->ns_flag &= ~SLP_NEEDQ; (void) nfs_slplock(slp, 1); NFSD_UNLOCK(); nfsrv_rcv(slp->ns_so, (caddr_t)slp, M_TRYWAIT); NFSD_LOCK(); nfs_slpunlock(slp); } error = nfsrv_dorec(slp, nfsd, &nd); cur_usec = nfs_curusec(); if (error && LIST_FIRST(&slp->ns_tq) && LIST_FIRST(&slp->ns_tq)->nd_time <= cur_usec) { error = 0; cacherep = RC_DOIT; writes_todo = 1; } else writes_todo = 0; nfsd->nfsd_flag |= NFSD_REQINPROG; } } else { error = 0; slp = nfsd->nfsd_slp; } if (error || (slp->ns_flag & SLP_VALID) == 0) { if (nd) { if (nd->nd_cr != NULL) crfree(nd->nd_cr); free((caddr_t)nd, M_NFSRVDESC); nd = NULL; } nfsd->nfsd_slp = NULL; nfsd->nfsd_flag &= ~NFSD_REQINPROG; nfsrv_slpderef(slp); continue; } splx(s); sotype = slp->ns_so->so_type; if (nd) { getmicrotime(&nd->nd_starttime); if (nd->nd_nam2) nd->nd_nam = nd->nd_nam2; else nd->nd_nam = slp->ns_nam; /* * Check to see if authorization is needed. */ cacherep = nfsrv_getcache(nd, &mreq); if (nfs_privport) { /* Check if source port is privileged */ u_short port; struct sockaddr *nam = nd->nd_nam; struct sockaddr_in *sin; sin = (struct sockaddr_in *)nam; /* * INET/INET6 - same code: * sin_port and sin6_port are at same offset */ port = ntohs(sin->sin_port); if (port >= IPPORT_RESERVED && nd->nd_procnum != NFSPROC_NULL) { #ifdef INET6 char b6[INET6_ADDRSTRLEN]; #if defined(KLD_MODULE) /* Do not use ip6_sprintf: the nfs module should work without INET6. */ #define ip6_sprintf(buf, a) \ (sprintf((buf), "%x:%x:%x:%x:%x:%x:%x:%x", \ (a)->s6_addr16[0], (a)->s6_addr16[1], \ (a)->s6_addr16[2], (a)->s6_addr16[3], \ (a)->s6_addr16[4], (a)->s6_addr16[5], \ (a)->s6_addr16[6], (a)->s6_addr16[7]), \ (buf)) #endif #endif nd->nd_procnum = NFSPROC_NOOP; nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); cacherep = RC_DOIT; printf("NFS request from unprivileged port (%s:%d)\n", #ifdef INET6 sin->sin_family == AF_INET6 ? ip6_sprintf(b6, &satosin6(sin)->sin6_addr) : #if defined(KLD_MODULE) #undef ip6_sprintf #endif #endif inet_ntoa(sin->sin_addr), port); } } } /* * Loop to get all the write rpc relies that have been * gathered together. 
*/ do { switch (cacherep) { case RC_DOIT: if (nd && (nd->nd_flag & ND_NFSV3)) procrastinate = nfsrvw_procrastinate_v3; else procrastinate = nfsrvw_procrastinate; if (writes_todo || (!(nd->nd_flag & ND_NFSV3) && nd->nd_procnum == NFSPROC_WRITE && procrastinate > 0 && !notstarted)) error = nfsrv_writegather(&nd, slp, nfsd->nfsd_td, &mreq); else error = (*(nfsrv3_procs[nd->nd_procnum]))(nd, slp, nfsd->nfsd_td, &mreq); if (mreq == NULL) break; if (error != 0 && error != NFSERR_RETVOID) { nfsrvstats.srv_errs++; nfsrv_updatecache(nd, FALSE, mreq); if (nd->nd_nam2) FREE(nd->nd_nam2, M_SONAME); break; } nfsrvstats.srvrpccnt[nd->nd_procnum]++; nfsrv_updatecache(nd, TRUE, mreq); nd->nd_mrep = NULL; /* FALLTHROUGH */ case RC_REPLY: NFSD_UNLOCK(); siz = m_length(mreq, NULL); if (siz <= 0 || siz > NFS_MAXPACKET) { printf("mbuf siz=%d\n",siz); panic("Bad nfs svc reply"); } m = mreq; m->m_pkthdr.len = siz; m->m_pkthdr.rcvif = NULL; /* * For stream protocols, prepend a Sun RPC * Record Mark. */ if (sotype == SOCK_STREAM) { M_PREPEND(m, NFSX_UNSIGNED, M_TRYWAIT); *mtod(m, u_int32_t *) = htonl(0x80000000 | siz); } NFSD_LOCK(); if (slp->ns_so->so_proto->pr_flags & PR_CONNREQUIRED) (void) nfs_slplock(slp, 1); if (slp->ns_flag & SLP_VALID) { NFSD_UNLOCK(); error = nfsrv_send(slp->ns_so, nd->nd_nam2, m); NFSD_LOCK(); } else { error = EPIPE; m_freem(m); } if (nd->nd_nam2) FREE(nd->nd_nam2, M_SONAME); if (nd->nd_mrep) m_freem(nd->nd_mrep); if (error == EPIPE) nfsrv_zapsock(slp); if (slp->ns_so->so_proto->pr_flags & PR_CONNREQUIRED) nfs_slpunlock(slp); if (error == EINTR || error == ERESTART) { if (nd->nd_cr != NULL) crfree(nd->nd_cr); free((caddr_t)nd, M_NFSRVDESC); nfsrv_slpderef(slp); s = splnet(); goto done; } break; case RC_DROPIT: m_freem(nd->nd_mrep); if (nd->nd_nam2) FREE(nd->nd_nam2, M_SONAME); break; }; if (nd) { if (nd->nd_cr != NULL) crfree(nd->nd_cr); FREE((caddr_t)nd, M_NFSRVDESC); nd = NULL; } /* * Check to see if there are outstanding writes that * need to be serviced. */ cur_usec = nfs_curusec(); s = splsoftclock(); if (LIST_FIRST(&slp->ns_tq) && LIST_FIRST(&slp->ns_tq)->nd_time <= cur_usec) { cacherep = RC_DOIT; writes_todo = 1; } else writes_todo = 0; splx(s); } while (writes_todo); s = splnet(); if (nfsrv_dorec(slp, nfsd, &nd)) { nfsd->nfsd_flag &= ~NFSD_REQINPROG; nfsd->nfsd_slp = NULL; nfsrv_slpderef(slp); } KASSERT(!(debug_mpsafenet == 0 && !mtx_owned(&Giant)), ("nfssvc_nfsd(): debug.mpsafenet=0 && !Giant")); KASSERT(!(debug_mpsafenet == 1 && mtx_owned(&Giant)), ("nfssvc_nfsd(): debug.mpsafenet=1 && Giant")); } done: KASSERT(!(debug_mpsafenet == 0 && !mtx_owned(&Giant)), ("nfssvc_nfsd(): debug.mpsafenet=0 && !Giant")); KASSERT(!(debug_mpsafenet == 1 && mtx_owned(&Giant)), ("nfssvc_nfsd(): debug.mpsafenet=1 && Giant")); TAILQ_REMOVE(&nfsd_head, nfsd, nfsd_chain); splx(s); free((caddr_t)nfsd, M_NFSD); if (--nfsrv_numnfsd == 0) nfsrv_init(TRUE); /* Reinitialize everything */ NFSD_UNLOCK(); return (error); } /* * Shut down a socket associated with an nfssvc_sock structure. * Should be called with the send lock set, if required. * The trick here is to increment the sref at the start, so that the nfsds * will stop using it and clear ns_flag at the end so that it will not be * reassigned during cleanup. 
*/ static void nfsrv_zapsock(struct nfssvc_sock *slp) { struct nfsrv_descript *nwp, *nnwp; struct socket *so; struct file *fp; struct nfsrv_rec *rec; int s; NET_ASSERT_GIANT(); NFSD_LOCK_ASSERT(); /* * XXXRW: By clearing all flags, other threads/etc should ignore * this slp and we can safely release nfsd_mtx so we can clean * up the slp safely. */ slp->ns_flag &= ~SLP_ALLFLAGS; fp = slp->ns_fp; if (fp) { NFSD_UNLOCK(); slp->ns_fp = NULL; so = slp->ns_so; SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_UPCALL; SOCKBUF_UNLOCK(&so->so_rcv); so->so_upcall = NULL; so->so_upcallarg = NULL; soshutdown(so, SHUT_RDWR); closef(fp, NULL); NFSD_LOCK(); if (slp->ns_nam) FREE(slp->ns_nam, M_SONAME); m_freem(slp->ns_raw); while ((rec = STAILQ_FIRST(&slp->ns_rec)) != NULL) { STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link); if (rec->nr_address) FREE(rec->nr_address, M_SONAME); m_freem(rec->nr_packet); free(rec, M_NFSRVDESC); } s = splsoftclock(); for (nwp = LIST_FIRST(&slp->ns_tq); nwp; nwp = nnwp) { nnwp = LIST_NEXT(nwp, nd_tq); LIST_REMOVE(nwp, nd_tq); if (nwp->nd_cr != NULL) crfree(nwp->nd_cr); free((caddr_t)nwp, M_NFSRVDESC); } LIST_INIT(&slp->ns_tq); splx(s); } } /* * Dereference a server socket structure. If it has no more references and * is no longer valid, you can throw it away. */ void nfsrv_slpderef(struct nfssvc_sock *slp) { NFSD_LOCK_ASSERT(); if (--(slp->ns_sref) == 0 && (slp->ns_flag & SLP_VALID) == 0) { TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); free((caddr_t)slp, M_NFSSVC); } } /* * Lock a socket against others. * * XXXRW: Wait argument is always 1 in the caller. Replace with a real * sleep lock? */ int nfs_slplock(struct nfssvc_sock *slp, int wait) { int *statep = &slp->ns_solock; NFSD_LOCK_ASSERT(); if (!wait && (*statep & NFSRV_SNDLOCK)) return(0); /* already locked, fail */ while (*statep & NFSRV_SNDLOCK) { *statep |= NFSRV_WANTSND; (void) msleep(statep, &nfsd_mtx, PZERO - 1, "nfsslplck", 0); } *statep |= NFSRV_SNDLOCK; return (1); } /* * Unlock the stream socket for others. */ void nfs_slpunlock(struct nfssvc_sock *slp) { int *statep = &slp->ns_solock; NFSD_LOCK_ASSERT(); if ((*statep & NFSRV_SNDLOCK) == 0) panic("nfs slpunlock"); *statep &= ~NFSRV_SNDLOCK; if (*statep & NFSRV_WANTSND) { *statep &= ~NFSRV_WANTSND; wakeup(statep); } } /* * Initialize the data structures for the server. * Handshake with any new nfsds starting up to avoid any chance of * corruption.
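 * As used in nfssvc_nfsd() above, nfsrv_init(TRUE) reinitializes
 * everything (zapping and freeing any remaining sockets and clearing
 * the server cache) once the last nfsd exits.
 */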
*/ void nfsrv_init(int terminating) { struct nfssvc_sock *slp, *nslp; NET_ASSERT_GIANT(); NFSD_LOCK_ASSERT(); if (nfssvc_sockhead_flag & SLP_INIT) panic("nfsd init"); nfssvc_sockhead_flag |= SLP_INIT; if (terminating) { TAILQ_FOREACH_SAFE(slp, &nfssvc_sockhead, ns_chain, nslp) { if (slp->ns_flag & SLP_VALID) nfsrv_zapsock(slp); TAILQ_REMOVE(&nfssvc_sockhead, slp, ns_chain); free((caddr_t)slp, M_NFSSVC); } nfsrv_cleancache(); /* And clear out server cache */ } else nfs_pub.np_valid = 0; TAILQ_INIT(&nfssvc_sockhead); nfssvc_sockhead_flag &= ~SLP_INIT; if (nfssvc_sockhead_flag & SLP_WANTINIT) { nfssvc_sockhead_flag &= ~SLP_WANTINIT; wakeup(&nfssvc_sockhead); } TAILQ_INIT(&nfsd_head); nfsd_head_flag &= ~NFSD_CHECKSLP; #if 0 nfs_udpsock = (struct nfssvc_sock *) malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK | M_ZERO); STAILQ_INIT(&nfs_udpsock->ns_rec); TAILQ_INSERT_HEAD(&nfssvc_sockhead, nfs_udpsock, ns_chain); nfs_cltpsock = (struct nfssvc_sock *) malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK | M_ZERO); STAILQ_INIT(&nfs_cltpsock->ns_rec); TAILQ_INSERT_TAIL(&nfssvc_sockhead, nfs_cltpsock, ns_chain); #endif } Index: head/sys/security/audit/audit.c =================================================================== --- head/sys/security/audit/audit.c (revision 167210) +++ head/sys/security/audit/audit.c (revision 167211) @@ -1,641 +1,636 @@ /* * Copyright (c) 1999-2005 Apple Computer, Inc. * Copyright (c) 2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t audit_record_zone; static MALLOC_DEFINE(M_AUDITPROC, "audit_proc", "Audit process storage"); MALLOC_DEFINE(M_AUDITDATA, "audit_data", "Audit data storage"); MALLOC_DEFINE(M_AUDITPATH, "audit_path", "Audit path storage"); MALLOC_DEFINE(M_AUDITTEXT, "audit_text", "Audit text storage"); /* * Audit control settings that are set/read by system calls and are * hence non-static. */ /* * Define the audit control flags. */ int audit_enabled; int audit_suspended; /* * Flags controlling behavior in low storage situations. Should we panic if * a write fails? Should we fail stop if we're out of disk space? */ int audit_panic_on_write_fail; int audit_fail_stop; int audit_argv; int audit_arge; /* * Are we currently "failing stop" due to out of disk space? */ int audit_in_failure; /* * Global audit statistics. */ struct audit_fstat audit_fstat; /* * Preselection mask for non-attributable events. */ struct au_mask audit_nae_mask; /* * Mutex to protect global variables shared between various threads and * processes. */ struct mtx audit_mtx; /* * Queue of audit records ready for delivery to disk. We insert new * records at the tail, and remove records from the head. Also, * a count of the number of records used for checking queue depth. * In addition, a counter of records that we have allocated but are * not yet in the queue, which is needed to estimate the total * size of the combined set of records outstanding in the system. */ struct kaudit_queue audit_q; int audit_q_len; int audit_pre_q_len; /* * Audit queue control settings (minimum free, low/high water marks, etc.) */ struct au_qctrl audit_qctrl; /* * Condition variable to signal to the worker that it has work to do: * either new records are in the queue, or a log replacement is taking * place. */ struct cv audit_worker_cv; /* * Condition variable to flag when crossing the low watermark, meaning that * threads blocked due to hitting the high watermark can wake up and continue * to commit records. */ struct cv audit_watermark_cv; /* * Condition variable that auditing threads wait on when in fail-stop mode. * Threads wait on this CV forever (and ever), never seeing the light of * day again. */ static struct cv audit_fail_cv; /* * Construct an audit record for the passed thread. */ static int audit_record_ctor(void *mem, int size, void *arg, int flags) { struct kaudit_record *ar; struct thread *td; KASSERT(sizeof(*ar) == size, ("audit_record_ctor: wrong size")); td = arg; ar = mem; bzero(ar, sizeof(*ar)); ar->k_ar.ar_magic = AUDIT_RECORD_MAGIC; nanotime(&ar->k_ar.ar_starttime); /* * Export the subject credential.
*/ cru2x(td->td_ucred, &ar->k_ar.ar_subj_cred); ar->k_ar.ar_subj_ruid = td->td_ucred->cr_ruid; ar->k_ar.ar_subj_rgid = td->td_ucred->cr_rgid; ar->k_ar.ar_subj_egid = td->td_ucred->cr_groups[0]; PROC_LOCK(td->td_proc); ar->k_ar.ar_subj_auid = td->td_proc->p_au->ai_auid; ar->k_ar.ar_subj_asid = td->td_proc->p_au->ai_asid; ar->k_ar.ar_subj_pid = td->td_proc->p_pid; ar->k_ar.ar_subj_amask = td->td_proc->p_au->ai_mask; ar->k_ar.ar_subj_term = td->td_proc->p_au->ai_termid; bcopy(td->td_proc->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN); PROC_UNLOCK(td->td_proc); return (0); } static void audit_record_dtor(void *mem, int size, void *arg) { struct kaudit_record *ar; KASSERT(sizeof(*ar) == size, ("audit_record_dtor: wrong size")); ar = mem; if (ar->k_ar.ar_arg_upath1 != NULL) free(ar->k_ar.ar_arg_upath1, M_AUDITPATH); if (ar->k_ar.ar_arg_upath2 != NULL) free(ar->k_ar.ar_arg_upath2, M_AUDITPATH); if (ar->k_ar.ar_arg_text != NULL) free(ar->k_ar.ar_arg_text, M_AUDITTEXT); if (ar->k_udata != NULL) free(ar->k_udata, M_AUDITDATA); if (ar->k_ar.ar_arg_argv != NULL) free(ar->k_ar.ar_arg_argv, M_AUDITTEXT); if (ar->k_ar.ar_arg_envv != NULL) free(ar->k_ar.ar_arg_envv, M_AUDITTEXT); } /* * Initialize the Audit subsystem: configuration state, work queue, * synchronization primitives, worker thread, and trigger device node. Also * call into the BSM assembly code to initialize it. */ static void audit_init(void) { printf("Security auditing service present\n"); audit_enabled = 0; audit_suspended = 0; audit_panic_on_write_fail = 0; audit_fail_stop = 0; audit_in_failure = 0; audit_argv = 0; audit_arge = 0; audit_fstat.af_filesz = 0; /* '0' means unset, unbounded */ audit_fstat.af_currsz = 0; audit_nae_mask.am_success = AU_NULL; audit_nae_mask.am_failure = AU_NULL; TAILQ_INIT(&audit_q); audit_q_len = 0; audit_pre_q_len = 0; audit_qctrl.aq_hiwater = AQ_HIWATER; audit_qctrl.aq_lowater = AQ_LOWATER; audit_qctrl.aq_bufsz = AQ_BUFSZ; audit_qctrl.aq_minfree = AU_FS_MINFREE; mtx_init(&audit_mtx, "audit_mtx", NULL, MTX_DEF); cv_init(&audit_worker_cv, "audit_worker_cv"); cv_init(&audit_watermark_cv, "audit_watermark_cv"); cv_init(&audit_fail_cv, "audit_fail_cv"); audit_record_zone = uma_zcreate("audit_record", sizeof(struct kaudit_record), audit_record_ctor, audit_record_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); /* Initialize the BSM audit subsystem. */ kau_init(); audit_trigger_init(); /* Register shutdown handler. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, audit_shutdown, NULL, SHUTDOWN_PRI_FIRST); /* Start audit worker thread. */ audit_worker_init(); } SYSINIT(audit_init, SI_SUB_AUDIT, SI_ORDER_FIRST, audit_init, NULL) /* * Drain the audit queue and close the log at shutdown. Note that this can * be called both from the system shutdown path and also from audit * configuration syscalls, so 'arg' and 'howto' are ignored. */ void audit_shutdown(void *arg, int howto) { audit_rotate_vnode(NULL, NULL); } /* * Return the current thread's audit record, if any. */ __inline__ struct kaudit_record * currecord(void) { return (curthread->td_ar); } /* - * MPSAFE - * * XXXAUDIT: There are a number of races present in the code below due to * release and re-grab of the mutex. The code should be revised to become * slightly less racy. * * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available * pre_q space, suspending the system call until there is room? 
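 * A sketch of that idea (hypothetical; the CV name below does not
 * exist in this file): audit_new() would loop on
 * cv_wait(&audit_pre_q_cv, &audit_mtx) while audit_pre_q_len sat above
 * some limit, and the paths that decrement audit_pre_q_len would
 * cv_signal() it as slots drain.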
*/ struct kaudit_record * audit_new(int event, struct thread *td) { struct kaudit_record *ar; int no_record; mtx_lock(&audit_mtx); no_record = (audit_suspended || !audit_enabled); mtx_unlock(&audit_mtx); if (no_record) return (NULL); /* * Note: the number of outstanding uncommitted audit records is * limited to the number of concurrent threads servicing system calls * in the kernel. */ ar = uma_zalloc_arg(audit_record_zone, td, M_WAITOK); ar->k_ar.ar_event = event; mtx_lock(&audit_mtx); audit_pre_q_len++; mtx_unlock(&audit_mtx); return (ar); } void audit_free(struct kaudit_record *ar) { uma_zfree(audit_record_zone, ar); } -/* - * MPSAFE - */ void audit_commit(struct kaudit_record *ar, int error, int retval) { au_event_t event; au_class_t class; au_id_t auid; int sorf; struct au_mask *aumask; if (ar == NULL) return; /* * Decide whether to commit the audit record by checking the * error value from the system call and using the appropriate * audit mask. * * XXXAUDIT: Synchronize access to audit_nae_mask? */ if (ar->k_ar.ar_subj_auid == AU_DEFAUDITID) aumask = &audit_nae_mask; else aumask = &ar->k_ar.ar_subj_amask; if (error) sorf = AU_PRS_FAILURE; else sorf = AU_PRS_SUCCESS; switch(ar->k_ar.ar_event) { case AUE_OPEN_RWTC: /* The open syscall always writes a AUE_OPEN_RWTC event; change * it to the proper type of event based on the flags and the * error value. */ ar->k_ar.ar_event = flags_and_error_to_openevent( ar->k_ar.ar_arg_fflags, error); break; case AUE_SYSCTL: ar->k_ar.ar_event = ctlname_to_sysctlevent( ar->k_ar.ar_arg_ctlname, ar->k_ar.ar_valid_arg); break; case AUE_AUDITON: /* Convert the auditon() command to an event */ ar->k_ar.ar_event = auditon_command_event(ar->k_ar.ar_arg_cmd); break; } auid = ar->k_ar.ar_subj_auid; event = ar->k_ar.ar_event; class = au_event_class(event); ar->k_ar_commit |= AR_COMMIT_KERNEL; if (au_preselect(event, class, aumask, sorf) != 0) ar->k_ar_commit |= AR_PRESELECT_TRAIL; if (audit_pipe_preselect(auid, event, class, sorf, ar->k_ar_commit & AR_PRESELECT_TRAIL) != 0) ar->k_ar_commit |= AR_PRESELECT_PIPE; if ((ar->k_ar_commit & (AR_PRESELECT_TRAIL | AR_PRESELECT_PIPE | AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE)) == 0) { mtx_lock(&audit_mtx); audit_pre_q_len--; mtx_unlock(&audit_mtx); audit_free(ar); return; } ar->k_ar.ar_errno = error; ar->k_ar.ar_retval = retval; /* * We might want to do some system-wide post-filtering * here at some point. */ /* * Timestamp system call end. */ nanotime(&ar->k_ar.ar_endtime); mtx_lock(&audit_mtx); /* * Note: it could be that some records initiated while audit was * enabled should still be committed? */ if (audit_suspended || !audit_enabled) { audit_pre_q_len--; mtx_unlock(&audit_mtx); audit_free(ar); return; } /* * Constrain the number of committed audit records based on * the configurable parameter. */ while (audit_q_len >= audit_qctrl.aq_hiwater) { AUDIT_PRINTF(("audit_commit: sleeping to wait for " "audit queue to drain below high water mark\n")); cv_wait(&audit_watermark_cv, &audit_mtx); AUDIT_PRINTF(("audit_commit: woke up waiting for " "audit queue draining\n")); } TAILQ_INSERT_TAIL(&audit_q, ar, k_q); audit_q_len++; audit_pre_q_len--; cv_signal(&audit_worker_cv); mtx_unlock(&audit_mtx); } /* * audit_syscall_enter() is called on entry to each system call. It is * responsible for deciding whether or not to audit the call (preselection), * and if so, allocating a per-thread audit record. audit_new() will fill in * basic thread/credential properties. 
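 * Usage sketch (how the syscall path pairs these calls; callp and args
 * are illustrative names, not from this file):
 *
 *	audit_syscall_enter(code, td);
 *	error = (*callp->sy_call)(td, args);
 *	audit_syscall_exit(error, td);
 */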
*/ void audit_syscall_enter(unsigned short code, struct thread *td) { struct au_mask *aumask; au_class_t class; au_event_t event; au_id_t auid; KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL")); /* * In FreeBSD, each ABI has its own system call table, and hence * mapping of system call codes to audit events. Convert the code to * an audit event identifier using the process system call table * reference. In Darwin, there's only one, so we use the global * symbol for the system call table. No audit record is generated * for bad system calls, as no operation has been performed. */ if (code >= td->td_proc->p_sysent->sv_size) return; event = td->td_proc->p_sysent->sv_table[code].sy_auevent; if (event == AUE_NULL) return; /* * Check which audit mask to use; either the kernel non-attributable * event mask or the process audit mask. */ auid = td->td_proc->p_au->ai_auid; if (auid == AU_DEFAUDITID) aumask = &audit_nae_mask; else aumask = &td->td_proc->p_au->ai_mask; /* * Allocate an audit record, if preselection allows it, and store * in the thread for later use. */ class = au_event_class(event); if (au_preselect(event, class, aumask, AU_PRS_BOTH)) { /* * If we're out of space and need to suspend unprivileged * processes, do that here rather than trying to allocate * another audit record. * * Note: we might wish to be able to continue here in the * future, if the system recovers. That should be possible * by means of checking the condition in a loop around * cv_wait(). It might be desirable to reevaluate whether an * audit record is still required for this event by * re-calling au_preselect(). */ if (audit_in_failure && priv_check(td, PRIV_AUDIT_FAILSTOP) != 0) { cv_wait(&audit_fail_cv, &audit_mtx); panic("audit_failing_stop: thread continued"); } td->td_ar = audit_new(event, td); } else if (audit_pipe_preselect(auid, event, class, AU_PRS_BOTH, 0)) td->td_ar = audit_new(event, td); else td->td_ar = NULL; } /* * audit_syscall_exit() is called from the return of every system call, or in * the event of exit1(), during the execution of exit1(). It is responsible * for committing the audit record, if any, along with return condition. */ void audit_syscall_exit(int error, struct thread *td) { int retval; /* * Commit the audit record as desired; once we pass the record * into audit_commit(), the memory is owned by the audit * subsystem. * The return value from the system call is stored on the user * thread. If there was an error, the return value is set to -1, * imitating the behavior of the cerror routine. */ if (error) retval = -1; else retval = td->td_retval[0]; audit_commit(td->td_ar, error, retval); if (td->td_ar != NULL) AUDIT_PRINTF(("audit record committed by pid %d\n", td->td_proc->p_pid)); td->td_ar = NULL; } /* * Allocate storage for a new process (init, or otherwise). */ void audit_proc_alloc(struct proc *p) { KASSERT(p->p_au == NULL, ("audit_proc_alloc: p->p_au != NULL (%d)", p->p_pid)); p->p_au = malloc(sizeof(*(p->p_au)), M_AUDITPROC, M_WAITOK); } /* * Allocate storage for a new thread. */ void audit_thread_alloc(struct thread *td) { td->td_ar = NULL; } /* * Thread destruction. */ void audit_thread_free(struct thread *td) { KASSERT(td->td_ar == NULL, ("audit_thread_free: td_ar != NULL")); } /* * Initialize audit information for the first kernel process (proc 0) and for * the first user process (init). * * XXX It is not clear what the initial values should be for audit ID, * session ID, etc. 
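 *
 * Typically these defaults are overwritten later at login time, e.g. via
 * setaudit(2); an illustrative (hypothetical) userspace sequence, where
 * 'pwd' is the user's struct passwd:
 *
 *	struct auditinfo ai;
 *
 *	bzero(&ai, sizeof(ai));
 *	ai.ai_auid = pwd->pw_uid;
 *	ai.ai_asid = getsid(0);
 *	if (setaudit(&ai) != 0)
 *		err(1, "setaudit");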
*/ void audit_proc_kproc0(struct proc *p) { KASSERT(p->p_au != NULL, ("audit_proc_kproc0: p->p_au == NULL (%d)", p->p_pid)); bzero(p->p_au, sizeof(*(p)->p_au)); } void audit_proc_init(struct proc *p) { KASSERT(p->p_au != NULL, ("audit_proc_init: p->p_au == NULL (%d)", p->p_pid)); bzero(p->p_au, sizeof(*(p)->p_au)); p->p_au->ai_auid = AU_DEFAUDITID; } /* * Copy the audit info from the parent process to the child process when * a fork takes place. */ void audit_proc_fork(struct proc *parent, struct proc *child) { PROC_LOCK_ASSERT(parent, MA_OWNED); PROC_LOCK_ASSERT(child, MA_OWNED); KASSERT(parent->p_au != NULL, ("audit_proc_fork: parent->p_au == NULL (%d)", parent->p_pid)); KASSERT(child->p_au != NULL, ("audit_proc_fork: child->p_au == NULL (%d)", child->p_pid)); bcopy(parent->p_au, child->p_au, sizeof(*child->p_au)); } /* * Free the auditing structure for the process. */ void audit_proc_free(struct proc *p) { KASSERT(p->p_au != NULL, ("p->p_au == NULL (%d)", p->p_pid)); free(p->p_au, M_AUDITPROC); p->p_au = NULL; } Index: head/sys/security/audit/audit_syscalls.c =================================================================== --- head/sys/security/audit/audit_syscalls.c (revision 167210) +++ head/sys/security/audit/audit_syscalls.c (revision 167211) @@ -1,692 +1,680 @@ /* * Copyright (c) 1999-2005 Apple Computer, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef AUDIT /* - * MPSAFE - * * System call to allow a user space application to submit a BSM audit record * to the kernel for inclusion in the audit log. This function does little * verification on the audit record that is submitted. * * XXXAUDIT: Audit preselection for user records does not currently work, * since we pre-select only based on the AUE_audit event type, not the event * type submitted as part of the user audit data. 
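 *
 * For illustration, userspace normally reaches this system call through
 * libbsm(3) rather than directly; a hypothetical sketch, where 'ev' is
 * the BSM event number chosen by the application and error handling is
 * omitted:
 *
 *	int rd;
 *
 *	rd = au_open();
 *	(void)au_write(rd, au_to_text("example user event"));
 *	(void)au_close(rd, AU_TO_WRITE, ev);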
*/ /* ARGSUSED */ int audit(struct thread *td, struct audit_args *uap) { int error; void * rec; struct kaudit_record *ar; if (jailed(td->td_ucred)) return (ENOSYS); error = priv_check(td, PRIV_AUDIT_SUBMIT); if (error) return (error); if ((uap->length <= 0) || (uap->length > audit_qctrl.aq_bufsz)) return (EINVAL); ar = currecord(); /* * If there's no current audit record (audit() itself not audited) * commit the user audit record. */ if (ar == NULL) { /* * This is not very efficient; we're required to allocate a * complete kernel audit record just so the user record can * tag along. * * XXXAUDIT: Maybe AUE_AUDIT in the system call context and * special pre-select handling? */ td->td_ar = audit_new(AUE_NULL, td); if (td->td_ar == NULL) return (ENOTSUP); ar = td->td_ar; } if (uap->length > MAX_AUDIT_RECORD_SIZE) return (EINVAL); rec = malloc(uap->length, M_AUDITDATA, M_WAITOK); error = copyin(uap->record, rec, uap->length); if (error) goto free_out; /* Verify the record. */ if (bsm_rec_verify(rec) == 0) { error = EINVAL; goto free_out; } /* * Attach the user audit record to the kernel audit record. Because * this system call is an auditable event, we will write the user * record along with the record for this audit event. * * XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen, * k_ar_commit & AR_COMMIT_USER? */ ar->k_udata = rec; ar->k_ulen = uap->length; ar->k_ar_commit |= AR_COMMIT_USER; /* * Currently we assume that all preselection has been performed in * userspace. We unconditionally set these masks so that the records * get committed both to the trail and pipe. In the future we will * want to setup kernel based preselection. */ ar->k_ar_commit |= (AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE); return (0); free_out: /* * audit_syscall_exit() will free the audit record on the thread even * if we allocated it above. */ free(rec, M_AUDITDATA); return (error); } /* - * MPSAFE - * * System call to manipulate auditing. */ /* ARGSUSED */ int auditon(struct thread *td, struct auditon_args *uap) { int error; union auditon_udata udata; struct proc *tp; if (jailed(td->td_ucred)) return (ENOSYS); AUDIT_ARG(cmd, uap->cmd); error = priv_check(td, PRIV_AUDIT_CONTROL); if (error) return (error); if ((uap->length <= 0) || (uap->length > sizeof(union auditon_udata))) return (EINVAL); memset((void *)&udata, 0, sizeof(udata)); /* * Some of the GET commands use the arguments too. */ switch (uap->cmd) { case A_SETPOLICY: case A_SETKMASK: case A_SETQCTRL: case A_SETSTAT: case A_SETUMASK: case A_SETSMASK: case A_SETCOND: case A_SETCLASS: case A_SETPMASK: case A_SETFSIZE: case A_SETKAUDIT: case A_GETCLASS: case A_GETPINFO: case A_GETPINFO_ADDR: case A_SENDTRIGGER: error = copyin(uap->data, (void *)&udata, uap->length); if (error) return (error); AUDIT_ARG(auditon, &udata); break; } /* * XXXAUDIT: Locking? */ switch (uap->cmd) { case A_GETPOLICY: if (!audit_fail_stop) udata.au_policy |= AUDIT_CNT; if (audit_panic_on_write_fail) udata.au_policy |= AUDIT_AHLT; if (audit_argv) udata.au_policy |= AUDIT_ARGV; if (audit_arge) udata.au_policy |= AUDIT_ARGE; break; case A_SETPOLICY: if (udata.au_policy & ~(AUDIT_CNT|AUDIT_AHLT|AUDIT_ARGV| AUDIT_ARGE)) return (EINVAL); /* * XXX - Need to wake up waiters if the policy relaxes? 
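		 *
		 * One conceivable answer, sketched purely for illustration
		 * (not implemented below): kick any threads throttled on
		 * the queue watermark when AUDIT_CNT is being set:
		 *
		 *	if ((udata.au_policy & AUDIT_CNT) != 0)
		 *		cv_broadcast(&audit_watermark_cv);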
		 */
		audit_fail_stop = ((udata.au_policy & AUDIT_CNT) == 0);
		audit_panic_on_write_fail = (udata.au_policy & AUDIT_AHLT);
		audit_argv = (udata.au_policy & AUDIT_ARGV);
		audit_arge = (udata.au_policy & AUDIT_ARGE);
		break;

	case A_GETKMASK:
		udata.au_mask = audit_nae_mask;
		break;

	case A_SETKMASK:
		audit_nae_mask = udata.au_mask;
		break;

	case A_GETQCTRL:
		udata.au_qctrl = audit_qctrl;
		break;

	case A_SETQCTRL:
		if ((udata.au_qctrl.aq_hiwater > AQ_MAXHIGH) ||
		    (udata.au_qctrl.aq_lowater >= udata.au_qctrl.aq_hiwater) ||
		    (udata.au_qctrl.aq_bufsz > AQ_MAXBUFSZ) ||
		    (udata.au_qctrl.aq_minfree < 0) ||
		    (udata.au_qctrl.aq_minfree > 100))
			return (EINVAL);
		audit_qctrl = udata.au_qctrl;
		/* XXX The queue delay value isn't used with the kernel. */
		audit_qctrl.aq_delay = -1;
		break;

	case A_GETCWD:
		return (ENOSYS);
		break;

	case A_GETCAR:
		return (ENOSYS);
		break;

	case A_GETSTAT:
		return (ENOSYS);
		break;

	case A_SETSTAT:
		return (ENOSYS);
		break;

	case A_SETUMASK:
		return (ENOSYS);
		break;

	case A_SETSMASK:
		return (ENOSYS);
		break;

	case A_GETCOND:
		if (audit_enabled && !audit_suspended)
			udata.au_cond = AUC_AUDITING;
		else
			udata.au_cond = AUC_NOAUDIT;
		break;

	case A_SETCOND:
		if (udata.au_cond == AUC_NOAUDIT)
			audit_suspended = 1;
		if (udata.au_cond == AUC_AUDITING)
			audit_suspended = 0;
		if (udata.au_cond == AUC_DISABLED) {
			audit_suspended = 1;
			audit_shutdown(NULL, 0);
		}
		break;

	case A_GETCLASS:
		udata.au_evclass.ec_class = au_event_class(
		    udata.au_evclass.ec_number);
		break;

	case A_SETCLASS:
		au_evclassmap_insert(udata.au_evclass.ec_number,
		    udata.au_evclass.ec_class);
		break;

	case A_GETPINFO:
		if (udata.au_aupinfo.ap_pid < 1)
			return (EINVAL);
		if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL)
			return (EINVAL);
		if (p_cansee(td, tp) != 0) {
			PROC_UNLOCK(tp);
			return (EINVAL);
		}
		udata.au_aupinfo.ap_auid = tp->p_au->ai_auid;
		udata.au_aupinfo.ap_mask.am_success =
		    tp->p_au->ai_mask.am_success;
		udata.au_aupinfo.ap_mask.am_failure =
		    tp->p_au->ai_mask.am_failure;
		udata.au_aupinfo.ap_termid.machine =
		    tp->p_au->ai_termid.machine;
		udata.au_aupinfo.ap_termid.port = tp->p_au->ai_termid.port;
		udata.au_aupinfo.ap_asid = tp->p_au->ai_asid;
		PROC_UNLOCK(tp);
		break;

	case A_SETPMASK:
		if (udata.au_aupinfo.ap_pid < 1)
			return (EINVAL);
		if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL)
			return (EINVAL);
		if (p_cansee(td, tp) != 0) {
			PROC_UNLOCK(tp);
			return (EINVAL);
		}
		tp->p_au->ai_mask.am_success =
		    udata.au_aupinfo.ap_mask.am_success;
		tp->p_au->ai_mask.am_failure =
		    udata.au_aupinfo.ap_mask.am_failure;
		PROC_UNLOCK(tp);
		break;

	case A_SETFSIZE:
		if ((udata.au_fstat.af_filesz != 0) &&
		    (udata.au_fstat.af_filesz < MIN_AUDIT_FILE_SIZE))
			return (EINVAL);
		audit_fstat.af_filesz = udata.au_fstat.af_filesz;
		break;

	case A_GETFSIZE:
		udata.au_fstat.af_filesz = audit_fstat.af_filesz;
		udata.au_fstat.af_currsz = audit_fstat.af_currsz;
		break;

	case A_GETPINFO_ADDR:
		return (ENOSYS);
		break;

	case A_GETKAUDIT:
		return (ENOSYS);
		break;

	case A_SETKAUDIT:
		return (ENOSYS);
		break;

	case A_SENDTRIGGER:
		if ((udata.au_trigger < AUDIT_TRIGGER_MIN) ||
		    (udata.au_trigger > AUDIT_TRIGGER_MAX))
			return (EINVAL);
		return (send_trigger(udata.au_trigger));
	}

	/*
	 * Copy data back to userspace for the GET commands.
	 */
	switch (uap->cmd) {
	case A_GETPOLICY:
	case A_GETKMASK:
	case A_GETQCTRL:
	case A_GETCWD:
	case A_GETCAR:
	case A_GETSTAT:
	case A_GETCOND:
	case A_GETCLASS:
	case A_GETPINFO:
	case A_GETFSIZE:
	case A_GETPINFO_ADDR:
	case A_GETKAUDIT:
		error = copyout((void *)&udata, uap->data, uap->length);
		if (error)
			return (error);
		break;
	}

	return (0);
}

/*
- * MPSAFE
- *
 * System calls to manage the user audit information.
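 *
 * Illustrative (hypothetical) userspace use of the calls below:
 *
 *	au_id_t auid;
 *
 *	if (getauid(&auid) == 0 && auid == AU_DEFAUDITID)
 *		warnx("no audit ID has been set for this process");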
 */
/* ARGSUSED */
int
getauid(struct thread *td, struct getauid_args *uap)
{
	int error;
	au_id_t id;

	if (jailed(td->td_ucred))
		return (ENOSYS);
	error = priv_check(td, PRIV_AUDIT_GETAUDIT);
	if (error)
		return (error);

	/*
	 * XXX: Integer read on static pointer dereference: doesn't need
	 * locking?
	 */
	PROC_LOCK(td->td_proc);
	id = td->td_proc->p_au->ai_auid;
	PROC_UNLOCK(td->td_proc);
	return (copyout(&id, uap->auid, sizeof(id)));
}

-/* MPSAFE */
/* ARGSUSED */
int
setauid(struct thread *td, struct setauid_args *uap)
{
	int error;
	au_id_t id;

	if (jailed(td->td_ucred))
		return (ENOSYS);
	error = priv_check(td, PRIV_AUDIT_SETAUDIT);
	if (error)
		return (error);
	error = copyin(uap->auid, &id, sizeof(id));
	if (error)
		return (error);
	audit_arg_auid(id);

	/*
	 * XXX: Integer write on static pointer dereference: doesn't need
	 * locking?
	 *
	 * XXXAUDIT: Might need locking to serialize audit events in the same
	 * order as change events?  Or maybe that's an unsolvable problem.
	 *
	 * XXXRW: Test privilege while holding the proc lock?
	 */
	PROC_LOCK(td->td_proc);
	td->td_proc->p_au->ai_auid = id;
	PROC_UNLOCK(td->td_proc);
	return (0);
}

/*
- * MPSAFE
 * System calls to get and set process audit information.
 */
/* ARGSUSED */
int
getaudit(struct thread *td, struct getaudit_args *uap)
{
	struct auditinfo ai;
	int error;

	if (jailed(td->td_ucred))
		return (ENOSYS);
	error = priv_check(td, PRIV_AUDIT_GETAUDIT);
	if (error)
		return (error);
	PROC_LOCK(td->td_proc);
	ai = *td->td_proc->p_au;
	PROC_UNLOCK(td->td_proc);
	return (copyout(&ai, uap->auditinfo, sizeof(ai)));
}

-/* MPSAFE */
/* ARGSUSED */
int
setaudit(struct thread *td, struct setaudit_args *uap)
{
	struct auditinfo ai;
	int error;

	if (jailed(td->td_ucred))
		return (ENOSYS);
	error = priv_check(td, PRIV_AUDIT_SETAUDIT);
	if (error)
		return (error);
	error = copyin(uap->auditinfo, &ai, sizeof(ai));
	if (error)
		return (error);
	audit_arg_auditinfo(&ai);

	/*
	 * XXXRW: Test privilege while holding the proc lock?
	 */
	PROC_LOCK(td->td_proc);
	*td->td_proc->p_au = ai;
	PROC_UNLOCK(td->td_proc);
	return (0);
}

-/* MPSAFE */
/* ARGSUSED */
int
getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
{
	int error;

	if (jailed(td->td_ucred))
		return (ENOSYS);
	error = priv_check(td, PRIV_AUDIT_GETAUDIT);
	if (error)
		return (error);
	return (ENOSYS);
}

-/* MPSAFE */
/* ARGSUSED */
int
setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
{
	int error;

	if (jailed(td->td_ucred))
		return (ENOSYS);
	error = priv_check(td, PRIV_AUDIT_SETAUDIT);
	if (error)
		return (error);
	return (ENOSYS);
}

/*
- * MPSAFE
 * Syscall to manage audit files.
 */
/* ARGSUSED */
int
auditctl(struct thread *td, struct auditctl_args *uap)
{
	struct nameidata nd;
	struct ucred *cred;
	struct vnode *vp;
	int error = 0;
	int flags, vfslocked;

	if (jailed(td->td_ucred))
		return (ENOSYS);
	error = priv_check(td, PRIV_AUDIT_CONTROL);
	if (error)
		return (error);

	vp = NULL;
	cred = NULL;

	/*
	 * If a path is specified, open the replacement vnode, perform
	 * validity checks, and grab another reference to the current
	 * credential.
	 *
	 * On Darwin, a NULL path argument is also used to disable audit.
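 *
 * Illustrative (hypothetical) use by an audit daemon, where 'newtrail'
 * is the path of a freshly created trail file:
 *
 *	if (auditctl(newtrail) != 0)
 *		warn("cannot switch audit trail to %s", newtrail);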
*/ if (uap->path == NULL) return (EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); flags = AUDIT_OPEN_FLAGS; error = vn_open(&nd, &flags, 0, -1); if (error) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; VOP_UNLOCK(vp, 0, td); NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VREG) { vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td); VFS_UNLOCK_GIANT(vfslocked); return (EINVAL); } VFS_UNLOCK_GIANT(vfslocked); cred = td->td_ucred; crhold(cred); /* * XXXAUDIT: Should audit_suspended actually be cleared by * audit_worker? */ audit_suspended = 0; audit_rotate_vnode(cred, vp); return (error); } #else /* !AUDIT */ int audit(struct thread *td, struct audit_args *uap) { return (ENOSYS); } int auditon(struct thread *td, struct auditon_args *uap) { return (ENOSYS); } int getauid(struct thread *td, struct getauid_args *uap) { return (ENOSYS); } int setauid(struct thread *td, struct setauid_args *uap) { return (ENOSYS); } int getaudit(struct thread *td, struct getaudit_args *uap) { return (ENOSYS); } int setaudit(struct thread *td, struct setaudit_args *uap) { return (ENOSYS); } int getaudit_addr(struct thread *td, struct getaudit_addr_args *uap) { return (ENOSYS); } int setaudit_addr(struct thread *td, struct setaudit_addr_args *uap) { return (ENOSYS); } int auditctl(struct thread *td, struct auditctl_args *uap) { return (ENOSYS); } void audit_proc_init(struct proc *p) { } void audit_proc_fork(struct proc *parent, struct proc *child) { } void audit_proc_free(struct proc *p) { } #endif /* AUDIT */ Index: head/sys/security/mac/mac_syscalls.c =================================================================== --- head/sys/security/mac/mac_syscalls.c (revision 167210) +++ head/sys/security/mac/mac_syscalls.c (revision 167211) @@ -1,736 +1,706 @@ /*- * Copyright (c) 1999-2002, 2006 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2005 Networks Associates Technology, Inc. * Copyright (c) 2005-2006 SPARTA, Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MAC -/* - * MPSAFE - */ int __mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { char *elements, *buffer; struct mac mac; struct proc *tproc; struct ucred *tcred; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); tproc = pfind(uap->pid); if (tproc == NULL) return (ESRCH); tcred = NULL; /* Satisfy gcc. */ error = p_cansee(td, tproc); if (error == 0) tcred = crhold(tproc->p_ucred); PROC_UNLOCK(tproc); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); crfree(tcred); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_externalize_cred_label(tcred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); crfree(tcred); return (error); } -/* - * MPSAFE - */ int __mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { char *elements, *buffer; struct mac mac; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_externalize_cred_label(td->td_ucred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } -/* - * MPSAFE - */ int __mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { struct ucred *newcred, *oldcred; struct label *intlabel; struct proc *p; struct mac mac; char *buffer; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_cred_label_alloc(); error = mac_internalize_cred_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; error = mac_check_cred_relabel(oldcred, intlabel); if (error) { PROC_UNLOCK(p); crfree(newcred); goto out; } setsugid(p); crcopy(newcred, oldcred); mac_relabel_cred(newcred, intlabel); p->p_ucred = newcred; /* * Grab additional reference for use while 
revoking mmaps, prior to * releasing the proc lock and sharing the cred. */ crhold(newcred); PROC_UNLOCK(p); mac_cred_mmapped_drop_perms(td, newcred); crfree(newcred); /* Free revocation reference. */ crfree(oldcred); out: mac_cred_label_free(intlabel); return (error); } -/* - * MPSAFE - */ int __mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { char *elements, *buffer; struct label *intlabel; struct file *fp; struct mac mac; struct vnode *vp; struct pipe *pipe; struct socket *so; short label_type; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = fget(td, uap->fd, &fp); if (error) goto out; label_type = fp->f_type; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: vp = fp->f_vnode; intlabel = mac_vnode_label_alloc(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); mac_copy_vnode_label(vp->v_label, intlabel); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); error = mac_externalize_vnode_label(intlabel, elements, buffer, mac.m_buflen); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: pipe = fp->f_data; intlabel = mac_pipe_label_alloc(); PIPE_LOCK(pipe); mac_copy_pipe_label(pipe->pipe_pair->pp_label, intlabel); PIPE_UNLOCK(pipe); error = mac_externalize_pipe_label(intlabel, elements, buffer, mac.m_buflen); mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: so = fp->f_data; intlabel = mac_socket_label_alloc(M_WAITOK); NET_LOCK_GIANT(); SOCK_LOCK(so); mac_copy_socket_label(so->so_label, intlabel); SOCK_UNLOCK(so); NET_UNLOCK_GIANT(); error = mac_externalize_socket_label(intlabel, elements, buffer, mac.m_buflen); mac_socket_label_free(intlabel); break; default: error = EINVAL; } fdrop(fp, td); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } -/* - * MPSAFE - */ int __mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { char *elements, *buffer; struct nameidata nd; struct label *intlabel; struct mac mac; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); if (error) goto out; intlabel = mac_vnode_label_alloc(); vfslocked = NDHASGIANT(&nd); mac_copy_vnode_label(nd.ni_vp->v_label, intlabel); error = mac_externalize_vnode_label(intlabel, elements, buffer, mac.m_buflen); NDFREE(&nd, 0); VFS_UNLOCK_GIANT(vfslocked); mac_vnode_label_free(intlabel); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } -/* - * MPSAFE - */ int __mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { char *elements, *buffer; struct nameidata nd; struct label *intlabel; struct mac mac; int vfslocked, error; error = copyin(uap->mac_p, 
&mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); if (error) goto out; intlabel = mac_vnode_label_alloc(); vfslocked = NDHASGIANT(&nd); mac_copy_vnode_label(nd.ni_vp->v_label, intlabel); error = mac_externalize_vnode_label(intlabel, elements, buffer, mac.m_buflen); NDFREE(&nd, 0); VFS_UNLOCK_GIANT(vfslocked); mac_vnode_label_free(intlabel); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } -/* - * MPSAFE - */ int __mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { struct label *intlabel; struct pipe *pipe; struct socket *so; struct file *fp; struct mount *mp; struct vnode *vp; struct mac mac; char *buffer; int error, vfslocked; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } error = fget(td, uap->fd, &fp); if (error) goto out; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: intlabel = mac_vnode_label_alloc(); error = mac_internalize_vnode_label(intlabel, buffer); if (error) { mac_vnode_label_free(intlabel); break; } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) { VFS_UNLOCK_GIANT(vfslocked); mac_vnode_label_free(intlabel); break; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = vn_setlabel(vp, intlabel, td->td_ucred); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: intlabel = mac_pipe_label_alloc(); error = mac_internalize_pipe_label(intlabel, buffer); if (error == 0) { pipe = fp->f_data; PIPE_LOCK(pipe); error = mac_pipe_label_set(td->td_ucred, pipe->pipe_pair, intlabel); PIPE_UNLOCK(pipe); } mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: intlabel = mac_socket_label_alloc(M_WAITOK); error = mac_internalize_socket_label(intlabel, buffer); if (error == 0) { so = fp->f_data; NET_LOCK_GIANT(); error = mac_socket_label_set(td->td_ucred, so, intlabel); NET_UNLOCK_GIANT(); } mac_socket_label_free(intlabel); break; default: error = EINVAL; } fdrop(fp, td); out: free(buffer, M_MACTEMP); return (error); } -/* - * MPSAFE - */ int __mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { struct label *intlabel; struct nameidata nd; struct mount *mp; struct mac mac; char *buffer; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_vnode_label_alloc(); error = mac_internalize_vnode_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE, uap->path_p, td); error = 
namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); if (error == 0) { error = vn_setlabel(nd.ni_vp, intlabel, td->td_ucred); vn_finished_write(mp); } } NDFREE(&nd, 0); VFS_UNLOCK_GIANT(vfslocked); out: mac_vnode_label_free(intlabel); return (error); } -/* - * MPSAFE - */ int __mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { struct label *intlabel; struct nameidata nd; struct mount *mp; struct mac mac; char *buffer; int vfslocked, error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_vnode_label_alloc(); error = mac_internalize_vnode_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE, uap->path_p, td); error = namei(&nd); vfslocked = NDHASGIANT(&nd); if (error == 0) { error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); if (error == 0) { error = vn_setlabel(nd.ni_vp, intlabel, td->td_ucred); vn_finished_write(mp); } } NDFREE(&nd, 0); VFS_UNLOCK_GIANT(vfslocked); out: mac_vnode_label_free(intlabel); return (error); } -/* - * MPSAFE - */ int mac_syscall(struct thread *td, struct mac_syscall_args *uap) { struct mac_policy_conf *mpc; char target[MAC_MAX_POLICY_NAME]; int entrycount, error; error = copyinstr(uap->policy, target, sizeof(target), NULL); if (error) return (error); error = ENOSYS; LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); goto out; } } if ((entrycount = mac_policy_list_conditional_busy()) != 0) { LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); break; } } mac_policy_list_unbusy(); } out: return (error); } #else /* !MAC */ int __mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { return (ENOSYS); } int __mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { return (ENOSYS); } int __mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { return (ENOSYS); } int __mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { return (ENOSYS); } int __mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { return (ENOSYS); } int __mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { return (ENOSYS); } int __mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { return (ENOSYS); } int __mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { return (ENOSYS); } int __mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { return (ENOSYS); } int mac_syscall(struct thread *td, struct mac_syscall_args *uap) { return (ENOSYS); } #endif /* !MAC */
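
/*
 * For illustration: userspace normally reaches the system calls above
 * through libmac(3) rather than invoking them directly; a hypothetical
 * sketch of fetching and printing the current process label:
 *
 *	mac_t label;
 *	char *text;
 *
 *	if (mac_prepare_process_label(&label) == 0 &&
 *	    mac_get_proc(label) == 0 &&
 *	    mac_to_text(label, &text) == 0)
 *		printf("%s\n", text);
 *	mac_free(label);
 */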