Index: head/sys/kern/kern_acl.c
===================================================================
--- head/sys/kern/kern_acl.c	(revision 167231)
+++ head/sys/kern/kern_acl.c	(revision 167232)
@@ -1,431 +1,431 @@
 /*-
  * Copyright (c) 1999-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed by Robert Watson for the TrustedBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*
  * Developed by the TrustedBSD Project.
  *
  * ACL system calls and other functions common across different ACL types.
  * Type-specific routines go into subr_acl_<type>.c.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
 #include <sys/acl.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 uma_zone_t	acl_zone;
 static int	vacl_set_acl(struct thread *td, struct vnode *vp,
 		    acl_type_t type, struct acl *aclp);
 static int	vacl_get_acl(struct thread *td, struct vnode *vp,
 		    acl_type_t type, struct acl *aclp);
 static int	vacl_aclcheck(struct thread *td, struct vnode *vp,
 		    acl_type_t type, struct acl *aclp);
 
 /*
  * These calls wrap the real vnode operations, and are called by the syscall
  * code once the syscall has converted the path or file descriptor to a vnode
  * (unlocked).  The aclp pointer is assumed still to point to userland, so
  * this should not be consumed within the kernel except by syscall code.
  * Other code should directly invoke VOP_{SET,GET}ACL.
  */
 
 /*
  * Given a vnode, set its ACL.
  */
 static int
 vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
     struct acl *aclp)
 {
 	struct acl inkernacl;
 	struct mount *mp;
 	int error;
 
 	error = copyin(aclp, &inkernacl, sizeof(struct acl));
 	if (error)
 		return(error);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 #ifdef MAC
 	error = mac_check_vnode_setacl(td->td_ucred, vp, type, &inkernacl);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
 #ifdef MAC
 out:
 #endif
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return(error);
 }
 
 /*
  * Given a vnode, get its ACL.
  */
 static int
 vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
     struct acl *aclp)
 {
 	struct acl inkernelacl;
 	int error;
 
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 #ifdef MAC
 	error = mac_check_vnode_getacl(td->td_ucred, vp, type);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
 #ifdef MAC
 out:
 #endif
 	VOP_UNLOCK(vp, 0, td);
 	if (error == 0)
 		error = copyout(&inkernelacl, aclp, sizeof(struct acl));
 	return (error);
 }
 
 /*
  * Given a vnode, delete its ACL.
  */
 static int
 vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
 {
 	struct mount *mp;
 	int error;
 
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 #ifdef MAC
 	error = mac_check_vnode_deleteacl(td->td_ucred, vp, type);
 	if (error)
 		goto out;
 #endif
 	error = VOP_SETACL(vp, type, 0, td->td_ucred, td);
 #ifdef MAC
 out:
 #endif
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Given a vnode, check whether an ACL is appropriate for it
  */
 static int
 vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
     struct acl *aclp)
 {
 	struct acl inkernelacl;
 	int error;
 
 	error = copyin(aclp, &inkernelacl, sizeof(struct acl));
 	if (error)
 		return(error);
 	error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
 	return (error);
 }
 
 /*
  * syscalls -- convert the path/fd to a vnode, and call vacl_whatever.  Don't
  * need to lock, as the vacl_ code will get/release any locks required.
  */
 
 /*
  * Given a file path, get an ACL for it
  */
 int
 __acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, get an ACL for it; don't follow links.
  */
 int
 __acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, set an ACL for it.
  */
 int
 __acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, set an ACL for it; don't follow links.
  */
 int
 __acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
- * Given a file descriptor, get an ACL for it
+ * Given a file descriptor, get an ACL for it.
  */
 int
 __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
 	if (error == 0) {
 		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 		error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
 		fdrop(fp, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (error);
 }
 
 /*
- * Given a file descriptor, set an ACL for it
+ * Given a file descriptor, set an ACL for it.
  */
 int
 __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
 	if (error == 0) {
 		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 		error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
 		fdrop(fp, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (error);
 }
 
 /*
  * Given a file path, delete an ACL from it.
  */
 int
 __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_delete(td, nd.ni_vp, uap->type);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, delete an ACL from it; don't follow links.
  */
 int
 __acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
 {
 	struct nameidata nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_delete(td, nd.ni_vp, uap->type);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, delete an ACL from it.
  */
 int
 __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
 	if (error == 0) {
 		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 		error = vacl_delete(td, fp->f_vnode, uap->type);
 		fdrop(fp, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (error);
 }
 
 /*
  * Given a file path, check an ACL for it.
  */
 int
 __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
 {
 	struct nameidata	nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file path, check an ACL for it; don't follow links.
  */
 int
 __acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
 {
 	struct nameidata	nd;
 	int vfslocked, error;
 
 	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	vfslocked = NDHASGIANT(&nd);
 	if (error == 0) {
 		error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
 		NDFREE(&nd, 0);
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Given a file descriptor, check an ACL for it.
  */
 int
 __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
 {
 	struct file *fp;
 	int vfslocked, error;
 
 	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
 	if (error == 0) {
 		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 		error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
 		fdrop(fp, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (error);
 }
 
 /* ARGUSED */
 
 static void
 aclinit(void *dummy __unused)
 {
 
 	acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL)
Index: head/sys/kern/kern_context.c
===================================================================
--- head/sys/kern/kern_context.c	(revision 167231)
+++ head/sys/kern/kern_context.c	(revision 167232)
@@ -1,130 +1,130 @@
 /*-
  * Copyright (c) 2002 Daniel M. Eischen <deischen@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/ucontext.h>
 
 /*
- * The first two fields of a ucontext_t are the signal mask and
- * the machine context.  The next field is uc_link; we want to
- * avoid destroying the link when copying out contexts.
+ * The first two fields of a ucontext_t are the signal mask and the machine
+ * context.  The next field is uc_link; we want to avoid destroying the link
+ * when copying out contexts.
  */
 #define	UC_COPY_SIZE	offsetof(ucontext_t, uc_link)
 
 #ifndef _SYS_SYSPROTO_H_
 struct getcontext_args {
 	struct __ucontext *ucp;
 }
 struct setcontext_args {
 	const struct __ucontext_t *ucp;
 }
 struct swapcontext_args {
 	struct __ucontext *oucp;
 	const struct __ucontext_t *ucp;
 }
 #endif
 
 int
 getcontext(struct thread *td, struct getcontext_args *uap)
 {
 	ucontext_t uc;
 	int ret;
 
 	if (uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
 		PROC_LOCK(td->td_proc);
 		uc.uc_sigmask = td->td_sigmask;
 		PROC_UNLOCK(td->td_proc);
 		ret = copyout(&uc, uap->ucp, UC_COPY_SIZE);
 	}
 	return (ret);
 }
 
 int
 setcontext(struct thread *td, struct setcontext_args *uap)
 {
 	ucontext_t uc;
 	int ret;	
 
 	if (uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
 		if (ret == 0) {
 			ret = set_mcontext(td, &uc.uc_mcontext);
 			if (ret == 0) {
 				SIG_CANTMASK(uc.uc_sigmask);
 				PROC_LOCK(td->td_proc);
 				td->td_sigmask = uc.uc_sigmask;
 				PROC_UNLOCK(td->td_proc);
 			}
 		}
 	}
 	return (ret == 0 ? EJUSTRETURN : ret);
 }
 
 int
 swapcontext(struct thread *td, struct swapcontext_args *uap)
 {
 	ucontext_t uc;
 	int ret;	
 
 	if (uap->oucp == NULL || uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
 		PROC_LOCK(td->td_proc);
 		uc.uc_sigmask = td->td_sigmask;
 		PROC_UNLOCK(td->td_proc);
 		ret = copyout(&uc, uap->oucp, UC_COPY_SIZE);
 		if (ret == 0) {
 			ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
 			if (ret == 0) {
 				ret = set_mcontext(td, &uc.uc_mcontext);
 				if (ret == 0) {
 					SIG_CANTMASK(uc.uc_sigmask);
 					PROC_LOCK(td->td_proc);
 					td->td_sigmask = uc.uc_sigmask;
 					PROC_UNLOCK(td->td_proc);
 				}
 			}
 		}
 	}
 	return (ret == 0 ? EJUSTRETURN : ret);
 }
Index: head/sys/kern/kern_descrip.c
===================================================================
--- head/sys/kern/kern_descrip.c	(revision 167231)
+++ head/sys/kern/kern_descrip.c	(revision 167232)
@@ -1,2722 +1,2722 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
 		     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 
 static uma_zone_t file_zone;
 
 
 /* How to treat 'new' parameter when allocating a fd for do_dup(). */
 enum dup_type { DUP_VARIABLE, DUP_FIXED };
 
 static int do_dup(struct thread *td, enum dup_type type, int old, int new,
     register_t *retval);
 static int	fd_first_free(struct filedesc *, int, int);
 static int	fd_last_used(struct filedesc *, int, int);
 static void	fdgrowtable(struct filedesc *, int);
 static int	fdrop_locked(struct file *fp, struct thread *td);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 
 /*
  * A process is initially started out with NDFILE descriptors stored within
  * this structure, selected to be enough for typical applications based on
  * the historical limit of 20 open files (and the usage of descriptors by
  * shells).  If these descriptors are exhausted, a larger descriptor table
  * may be allocated, up to a process' resource limit; the internal arrays
  * are then unused.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 #define NDSLOT(x)	((x) / NDENTRIES)
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * Storage required per open file descriptor.
  */
 #define OFILESIZE (sizeof(struct file *) + sizeof(char))
 
 /*
  * Basic allocation of descriptors:
  * one of the above, plus arrays for NDFILE descriptors.
  */
 struct filedesc0 {
 	struct	filedesc fd_fd;
 	/*
 	 * These arrays are used when the number of open files is
 	 * <= NDFILE, and are then pointed to by the pointers above.
 	 */
 	struct	file *fd_dfiles[NDFILE];
 	char	fd_dfileflags[NDFILE];
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
 /*
  * Descriptor management.
  */
 struct filelist filehead;	/* head of list of open files */
 int openfiles;			/* actual number of open files */
 struct sx filelist_lock;	/* sx to protect filelist */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /* A mutex to protect the association between a proc and filedesc. */
 static struct mtx	fdesc_mtx;
 
 /*
  * Find the first zero bit in the given bitmap, starting at low and not
  * exceeding size - 1.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, maxoff;
 
 	if (low >= size)
 		return (low);
 
 	off = NDSLOT(low);
 	if (low % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
 		if ((mask &= ~map[off]) != 0UL)
 			return (off * NDENTRIES + ffsl(mask) - 1);
 		++off;
 	}
 	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
 		if (map[off] != ~0UL)
 			return (off * NDENTRIES + ffsl(~map[off]) - 1);
 	return (size);
 }
 
 /*
  * Find the highest non-zero bit in the given bitmap, starting at low and
  * not exceeding size - 1.
  */
 static int
 fd_last_used(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
 	if (low >= size)
 		return (-1);
 
 	off = NDSLOT(size);
 	if (size % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
 		if ((mask &= map[off]) != 0)
 			return (off * NDENTRIES + flsl(mask) - 1);
 		--off;
 	}
 	for (minoff = NDSLOT(low); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
 	return (low - 1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
         KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
             ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd already used"));
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 	if (fd > fdp->fd_lastfile)
 		fdp->fd_lastfile = fd;
 	if (fd == fdp->fd_freefile)
 		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 	KASSERT(fdisused(fdp, fd),
 	    ("fd is already unused"));
 	KASSERT(fdp->fd_ofiles[fd] == NULL,
 	    ("fd is still in use"));
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
 		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK(p);
 	td->td_retval[0] =
 	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * Duplicate a file descriptor to a particular value.
  *
- * note: keep in mind that a potential race condition exists when closing
+ * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /* ARGSUSED */
 int
 dup2(struct thread *td, struct dup2_args *uap)
 {
 
 	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
 		    td->td_retval));
 }
 
 /*
  * Duplicate a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /* ARGSUSED */
 int
 dup(struct thread *td, struct dup_args *uap)
 {
 
 	return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval));
 }
 
 /*
  * The file control system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /* ARGSUSED */
 int
 fcntl(struct thread *td, struct fcntl_args *uap)
 {
 	struct flock fl;
 	intptr_t arg;
 	int error;
 
 	error = 0;
 	switch (uap->cmd) {
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
 		arg = (intptr_t)&fl;
 		break;
 	default:
 		arg = uap->arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
 	if (error)
 		return (error);
 	if (uap->cmd == F_GETLK)
 		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
 	return (error);
 }
 
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp;
 	struct proc *p;
 	char *pop;
 	struct vnode *vp;
 	u_int newmin;
 	int error, flg, tmp;
 	int giant_locked;
 
 	/*
 	 * XXXRW: Some fcntl() calls require Giant -- others don't.  Try to
 	 * avoid grabbing Giant for calls we know don't need it.
 	 */
 	switch (cmd) {
 	case F_DUPFD:
 	case F_GETFD:
 	case F_SETFD:
 	case F_GETFL:
 		giant_locked = 0;
 		break;
 
 	default:
 		giant_locked = 1;
 		mtx_lock(&Giant);
 	}
 
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 	FILEDESC_LOCK(fdp);
 	if ((unsigned)fd >= fdp->fd_nfiles ||
 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
 		FILEDESC_UNLOCK(fdp);
 		error = EBADF;
 		goto done2;
 	}
 	pop = &fdp->fd_ofileflags[fd];
 
 	switch (cmd) {
 	case F_DUPFD:
 		/* mtx_assert(&Giant, MA_NOTOWNED); */
 		FILEDESC_UNLOCK(fdp);
 		newmin = arg;
 		PROC_LOCK(p);
 		if (newmin >= lim_cur(p, RLIMIT_NOFILE) ||
 		    newmin >= maxfilesperproc) {
 			PROC_UNLOCK(p);
 			error = EINVAL;
 			break;
 		}
 		PROC_UNLOCK(p);
 		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
 		break;
 
 	case F_GETFD:
 		/* mtx_assert(&Giant, MA_NOTOWNED); */
 		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 		FILEDESC_UNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		/* mtx_assert(&Giant, MA_NOTOWNED); */
 		*pop = (*pop &~ UF_EXCLOSE) |
 		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 		FILEDESC_UNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		/* mtx_assert(&Giant, MA_NOTOWNED); */
 		FILE_LOCK(fp);
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		FILE_UNLOCK(fp);
 		FILEDESC_UNLOCK(fdp);
 		break;
 
 	case F_SETFL:
 		mtx_assert(&Giant, MA_OWNED);
 		FILE_LOCK(fp);
 		fhold_locked(fp);
 		fp->f_flag &= ~FCNTLFLAGS;
 		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		FILE_UNLOCK(fp);
 		FILEDESC_UNLOCK(fdp);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error) {
 			fdrop(fp, td);
 			break;
 		}
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		FILE_LOCK(fp);
 		fp->f_flag &= ~FNONBLOCK;
 		FILE_UNLOCK(fp);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		mtx_assert(&Giant, MA_OWNED);
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		mtx_assert(&Giant, MA_OWNED);
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLKW:
 		mtx_assert(&Giant, MA_OWNED);
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 		mtx_assert(&Giant, MA_OWNED);
 		if (fp->f_type != DTYPE_VNODE) {
 			FILEDESC_UNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
 			if (fp->f_offset < 0 ||
 			    (flp->l_start > 0 &&
 			     fp->f_offset > OFF_MAX - flp->l_start)) {
 				FILEDESC_UNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
 			flp->l_start += fp->f_offset;
 		}
 
 		/*
 		 * VOP_ADVLOCK() may block.
 		 */
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 		vp = fp->f_vnode;
 
 		switch (flp->l_type) {
 		case F_RDLCK:
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, F_POSIX);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		/* Check for race with close */
 		FILEDESC_LOCK_FAST(fdp);
 		if ((unsigned) fd >= fdp->fd_nfiles ||
 		    fp != fdp->fd_ofiles[fd]) {
 			FILEDESC_UNLOCK_FAST(fdp);
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 					   F_UNLCK, flp, F_POSIX);
 		} else
 			FILEDESC_UNLOCK_FAST(fdp);
 		fdrop(fp, td);
 		break;
 
 	case F_GETLK:
 		mtx_assert(&Giant, MA_OWNED);
 		if (fp->f_type != DTYPE_VNODE) {
 			FILEDESC_UNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			FILEDESC_UNLOCK(fdp);
 			error = EINVAL;
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
 			if ((flp->l_start > 0 &&
 			    fp->f_offset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			     fp->f_offset < OFF_MIN - flp->l_start)) {
 				FILEDESC_UNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
 			flp->l_start += fp->f_offset;
 		}
 		/*
 		 * VOP_ADVLOCK() may block.
 		 */
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 		vp = fp->f_vnode;
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		fdrop(fp, td);
 		break;
 	default:
 		FILEDESC_UNLOCK(fdp);
 		error = EINVAL;
 		break;
 	}
 done2:
 	if (giant_locked)
 		mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Common code for dup, dup2, and fcntl(F_DUPFD).
  */
 static int
 do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	struct file *fp;
 	struct file *delfp;
 	int error, holdleaders, maxfd;
 
 	KASSERT((type == DUP_VARIABLE || type == DUP_FIXED),
 	    ("invalid dup type %d", type));
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to.
 	 */
 	if (old < 0 || new < 0)
 		return (EBADF);
 	PROC_LOCK(p);
 	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 	if (new >= maxfd)
 		return (EMFILE);
 
 	FILEDESC_LOCK(fdp);
 	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 	if (type == DUP_FIXED && old == new) {
 		*retval = new;
 		FILEDESC_UNLOCK(fdp);
 		return (0);
 	}
 	fp = fdp->fd_ofiles[old];
 	fhold(fp);
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.  Since the filedesc
 	 * lock may be temporarily dropped in the process, we have to look
 	 * out for a race.
 	 */
 	if (type == DUP_FIXED) {
 		if (new >= fdp->fd_nfiles)
 			fdgrowtable(fdp, new + 1);
 		if (fdp->fd_ofiles[new] == NULL)
 			fdused(fdp, new);
 	} else {
 		if ((error = fdalloc(td, new, &new)) != 0) {
 			FILEDESC_UNLOCK(fdp);
 			fdrop(fp, td);
 			return (error);
 		}
 	}
 
 	/*
 	 * If the old file changed out from under us then treat it as a
 	 * bad file descriptor.  Userland should do its own locking to
 	 * avoid this case.
 	 */
 	if (fdp->fd_ofiles[old] != fp) {
 		/* we've allocated a descriptor which we won't use */
 		if (fdp->fd_ofiles[new] == NULL)
 			fdunused(fdp, new);
 		FILEDESC_UNLOCK(fdp);
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	KASSERT(old != new,
 	    ("new fd is same as old"));
 
 	/*
 	 * Save info on the descriptor being overwritten.  We cannot close
 	 * it without introducing an ownership race for the slot, since we
 	 * need to drop the filedesc lock to call closef().
 	 *
 	 * XXX this duplicates parts of close().
 	 */
 	delfp = fdp->fd_ofiles[new];
 	holdleaders = 0;
 	if (delfp != NULL) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 			holdleaders = 1;
 		}
 	}
 
 	/*
 	 * Duplicate the source descriptor
 	 */
 	fdp->fd_ofiles[new] = fp;
 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
 	if (new > fdp->fd_lastfile)
 		fdp->fd_lastfile = new;
 	*retval = new;
 
 	/*
 	 * If we dup'd over a valid file, we now own the reference to it
 	 * and must dispose of it using closef() semantics (as if a
 	 * close() were performed on it).
 	 *
 	 * XXX this duplicates parts of close().
 	 */
 	if (delfp != NULL) {
 		knote_fdclose(td, new);
 		if (delfp->f_type == DTYPE_MQUEUE)
 			mq_fdclose(td, new, delfp);
 		FILEDESC_UNLOCK(fdp);
 		(void) closef(delfp, td);
 		if (holdleaders) {
 			FILEDESC_LOCK_FAST(fdp);
 			fdp->fd_holdleaderscount--;
 			if (fdp->fd_holdleaderscount == 0 &&
 			    fdp->fd_holdleaderswakeup != 0) {
 				fdp->fd_holdleaderswakeup = 0;
 				wakeup(&fdp->fd_holdleaderscount);
 			}
 			FILEDESC_UNLOCK_FAST(fdp);
 		}
 	} else {
 		FILEDESC_UNLOCK(fdp);
 	}
 	return (0);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	*(sigio->sio_myref) = NULL;
 	if ((sigio)->sio_pgid < 0) {
 		struct pgrp *pg = (sigio)->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
 			     sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		struct proc *p = (sigio)->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
 			     sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	SIGIO_UNLOCK();
 	crfree(sigio->sio_ucred);
 	FREE(sigio, M_SIGIO);
 }
 
 /*
  * Free a list of sigio structures.
  * We only need to lock the SIGIO_LOCK because we have made ourselves
  * inaccessible to callers of fsetown and therefore do not need to lock
  * the proc or pgrp struct for the list manipulation.
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio;
 
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 	p = NULL;
 	pg = NULL;
 
 	/*
 	 * Every entry of the list should belong
 	 * to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	}
 
 	SIGIO_LOCK();
 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
 		*(sigio->sio_myref) = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 			PGRP_LOCK(pg);
 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PGRP_UNLOCK(pg);
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 			PROC_LOCK(p);
 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PROC_UNLOCK(p);
 		}
 		SIGIO_UNLOCK();
 		crfree(sigio->sio_ucred);
 		FREE(sigio, M_SIGIO);
 		SIGIO_LOCK();
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	ret = 0;
 
 	/* Allocate and fill in the new sigio out of locks. */
 	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	sx_slock(&proctree_lock);
 	if (pgid > 0) {
 		proc = pfind(pgid);
 		if (proc == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		PROC_UNLOCK(proc);
 		if (proc->p_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		pgrp = NULL;
 	} else /* if (pgid < 0) */ {
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 		PGRP_UNLOCK(pgrp);
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (pgrp->pg_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		proc = NULL;
 	}
 	funsetown(sigiop);
 	if (pgid > 0) {
 		PROC_LOCK(proc);
 		/*
 		 * Since funsetownlst() is called without the proctree
 		 * locked, we need to check for P_WEXIT.
 		 * XXX: is ESRCH correct?
 		 */
 		if ((proc->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(proc);
 			ret = ESRCH;
 			goto fail;
 		}
 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_proc = proc;
 		PROC_UNLOCK(proc);
 	} else {
 		PGRP_LOCK(pgrp);
 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_pgrp = pgrp;
 		PGRP_UNLOCK(pgrp);
 	}
 	sx_sunlock(&proctree_lock);
 	SIGIO_LOCK();
 	*sigiop = sigio;
 	SIGIO_UNLOCK();
 	return (0);
 
 fail:
 	sx_sunlock(&proctree_lock);
 	crfree(sigio->sio_ucred);
 	FREE(sigio, M_SIGIO);
 	return (ret);
 }
 
 /*
  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
  */
 pid_t
 fgetown(sigiop)
 	struct sigio **sigiop;
 {
 	pid_t pgid;
 
 	SIGIO_LOCK();
 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 	SIGIO_UNLOCK();
 	return (pgid);
 }
 
 /*
  * Close a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /* ARGSUSED */
 int
 close(td, uap)
 	struct thread *td;
 	struct close_args *uap;
 {
 
 	return (kern_close(td, uap->fd));
 }
 
 int
 kern_close(td, fd)
 	struct thread *td;
 	int fd;
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int error;
 	int holdleaders;
 
 	error = 0;
 	holdleaders = 0;
 	fdp = td->td_proc->p_fd;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_LOCK(fdp);
 	if ((unsigned)fd >= fdp->fd_nfiles ||
 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 	fdp->fd_ofiles[fd] = NULL;
 	fdp->fd_ofileflags[fd] = 0;
 	fdunused(fdp, fd);
 	if (td->td_proc->p_fdtol != NULL) {
 		/*
 		 * Ask fdfree() to sleep to ensure that all relevant
 		 * process leaders can be traversed in closef().
 		 */
 		fdp->fd_holdleaderscount++;
 		holdleaders = 1;
 	}
 
 	/*
 	 * We now hold the fp reference that used to be owned by the descriptor
 	 * array.
 	 * We have to unlock the FILEDESC *AFTER* knote_fdclose to prevent a
 	 * race of the fd getting opened, a knote added, and deleteing a knote
 	 * for the new fd.
 	 */
 	knote_fdclose(td, fd);
 	if (fp->f_type == DTYPE_MQUEUE)
 		mq_fdclose(td, fd, fp);
 	FILEDESC_UNLOCK(fdp);
 
 	error = closef(fp, td);
 	if (holdleaders) {
 		FILEDESC_LOCK_FAST(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_UNLOCK_FAST(fdp);
 	}
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct ostat oub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtstat(&ub, &oub);
 		error = copyout(&oub, uap->sb, sizeof(oub));
 	}
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0)
 		error = copyout(&ub, uap->sb, sizeof(ub));
 	return (error);
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG(fd, fd);
 
 	if ((error = fget(td, fd, &fp)) != 0)
 		return (error);
 
 	AUDIT_ARG(file, td->td_proc, fp);
 
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 nfstat(struct thread *td, struct nfstat_args *uap)
 {
 	struct nstat nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtnstat(&ub, &nub);
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	}
 	return (error);
 }
 
 /*
  * Return pathconf information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /* ARGSUSED */
 int
 fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	int error;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 
 	/* If asynchronous I/O is available, it works for all descriptors. */
 	if (uap->name == _PC_ASYNC_IO) {
 		td->td_retval[0] = async_io_version;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		int vfslocked;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (uap->name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
 		error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Grow the file table to accomodate (at least) nfd descriptors.  This may
  * block and drop the filedesc lock, but it will reacquire it before
  * returning.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct file **ntable;
 	char *nfileflags;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap;
 
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 
 	KASSERT(fdp->fd_nfiles > 0,
 	    ("zero-length file table"));
 
 	/* compute the size of the new table */
 	onfiles = fdp->fd_nfiles;
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/* allocate a new table and (if required) new bitmaps */
 	FILEDESC_UNLOCK(fdp);
 	MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	nfileflags = (char *)&ntable[nnfiles];
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
 		MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE,
 		    M_FILEDESC, M_ZERO | M_WAITOK);
 	else
 		nmap = NULL;
 	FILEDESC_LOCK(fdp);
 
 	/*
 	 * We now have new tables ready to go.  Since we dropped the
 	 * filedesc lock to call malloc(), watch out for a race.
 	 */
 	onfiles = fdp->fd_nfiles;
 	if (onfiles >= nnfiles) {
 		/* we lost the race, but that's OK */
 		free(ntable, M_FILEDESC);
 		if (nmap != NULL)
 			free(nmap, M_FILEDESC);
 		return;
 	}
 	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
 	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
 	if (onfiles > NDFILE)
 		free(fdp->fd_ofiles, M_FILEDESC);
 	fdp->fd_ofiles = ntable;
 	fdp->fd_ofileflags = nfileflags;
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
 		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 			free(fdp->fd_map, M_FILEDESC);
 		fdp->fd_map = nmap;
 	}
 	fdp->fd_nfiles = nnfiles;
 }
 
 /*
  * Allocate a file descriptor for the process.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int fd = -1, maxfd;
 
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;	   
 
 	PROC_LOCK(p);
 	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Search the bitmap for a free descriptor.  If none is found, try
 	 * to grow the file table.  Keep at it until we either get a file
 	 * descriptor or run into process or system limits; fdgrowtable()
 	 * may drop the filedesc lock, so we're in a race.
 	 */
 	for (;;) {
 		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 		if (fd >= maxfd)
 			return (EMFILE);
 		if (fd < fdp->fd_nfiles)
 			break;
 		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd] == NULL,
 	    ("free descriptor isn't"));
 	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Check to see whether n user file descriptors
  * are available to the process p.
  */
 int
 fdavail(struct thread *td, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file **fpp;
 	int i, lim, last;
 
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 
 	PROC_LOCK(p);
 	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
 		return (1);
 	last = min(fdp->fd_nfiles, lim);
 	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
 	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
 		if (*fpp == NULL && --n <= 0)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Create a new open file structure and allocate
  * a file decriptor for the process that refers to it.
  * We add one reference to the file for the descriptor table
  * and one reference for resultfp. This is to prevent us being
  * preempted and the entry in the descriptor table closed after
  * we release the FILEDESC lock.
  */
 int
 falloc(struct thread *td, struct file **resultfp, int *resultfd)
 {
 	struct proc *p = td->td_proc;
 	struct file *fp, *fq;
 	int error, i;
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	static struct timeval lastfail;
 	static int curfail;
 
 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 	sx_xlock(&filelist_lock);
 
 	if ((openfiles >= maxuserfiles &&
 	    priv_check_cred(td->td_ucred, PRIV_MAXFILES, SUSER_RUID) != 0) ||
 	    openfiles >= maxfiles) {
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
 				td->td_ucred->cr_ruid);
 		}
 		sx_xunlock(&filelist_lock);
 		uma_zfree(file_zone, fp);
 		return (ENFILE);
 	}
 	openfiles++;
 
 	/*
 	 * If the process has file descriptor zero open, add the new file
 	 * descriptor to the list of open files at that point, otherwise
 	 * put it at the front of the list of open files.
 	 */
 	fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep);
 	fp->f_count = 1;
 	if (resultfp)
 		fp->f_count++;
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fp->f_vnode = NULL;
 	FILEDESC_LOCK(p->p_fd);
 	if ((fq = p->p_fd->fd_ofiles[0])) {
 		LIST_INSERT_AFTER(fq, fp, f_list);
 	} else {
 		LIST_INSERT_HEAD(&filehead, fp, f_list);
 	}
 	sx_xunlock(&filelist_lock);
 	if ((error = fdalloc(td, 0, &i))) {
 		FILEDESC_UNLOCK(p->p_fd);
 		fdrop(fp, td);
 		if (resultfp)
 			fdrop(fp, td);
 		return (error);
 	}
 	p->p_fd->fd_ofiles[i] = fp;
 	FILEDESC_UNLOCK(p->p_fd);
 	if (resultfp)
 		*resultfp = fp;
 	if (resultfd)
 		*resultfd = i;
 	return (0);
 }
 
 /*
  * Build a new filedesc structure from another.
  * Copy the current, root, and jail root vnode references.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp)
 {
 	struct filedesc0 *newfdp;
 
 	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
 	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
 	if (fdp != NULL) {
 		FILEDESC_LOCK(fdp);
 		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
 		if (newfdp->fd_fd.fd_cdir)
 			VREF(newfdp->fd_fd.fd_cdir);
 		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
 		if (newfdp->fd_fd.fd_rdir)
 			VREF(newfdp->fd_fd.fd_rdir);
 		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
 		if (newfdp->fd_fd.fd_jdir)
 			VREF(newfdp->fd_fd.fd_jdir);
 		FILEDESC_UNLOCK(fdp);
 	}
 
 	/* Create the file descriptor table. */
 	newfdp->fd_fd.fd_refcnt = 1;
 	newfdp->fd_fd.fd_holdcnt = 1;
 	newfdp->fd_fd.fd_cmask = CMASK;
 	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
 	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
 	newfdp->fd_fd.fd_nfiles = NDFILE;
 	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
 	newfdp->fd_fd.fd_lastfile = -1;
 	return (&newfdp->fd_fd);
 }
 
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *fdp;
 
 	mtx_lock(&fdesc_mtx);
 	fdp = p->p_fd;
 	if (fdp != NULL)
 		fdp->fd_holdcnt++;
 	mtx_unlock(&fdesc_mtx);
 	return (fdp);
 }
 
 static void
 fddrop(struct filedesc *fdp)
 {
 	int i;
 
 	mtx_lock(&fdesc_mtx);
 	i = --fdp->fd_holdcnt;
 	mtx_unlock(&fdesc_mtx);
 	if (i > 0)
 		return;
 
 	mtx_destroy(&fdp->fd_mtx);
 	FREE(fdp, M_FILEDESC);
 }
 
 /*
  * Share a filedesc structure.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 	FILEDESC_LOCK_FAST(fdp);
 	fdp->fd_refcnt++;
 	FILEDESC_UNLOCK_FAST(fdp);
 	return (fdp);
 }
 
 /*
  * Unshare a filedesc structure, if necessary by making a copy
  */
 void
 fdunshare(struct proc *p, struct thread *td)
 {
 
 	FILEDESC_LOCK_FAST(p->p_fd);
 	if (p->p_fd->fd_refcnt > 1) {
 		struct filedesc *tmp;
 
 		FILEDESC_UNLOCK_FAST(p->p_fd);
 		tmp = fdcopy(p->p_fd);
 		fdfree(td);
 		p->p_fd = tmp;
 	} else
 		FILEDESC_UNLOCK_FAST(p->p_fd);
 }
 
 /*
  * Copy a filedesc structure.
  * A NULL pointer in returns a NULL reference, this is to ease callers,
  * not catch errors.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
 	if (fdp == NULL)
 		return (NULL);
 
 	newfdp = fdinit(fdp);
 	FILEDESC_LOCK_FAST(fdp);
 	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
 		FILEDESC_UNLOCK_FAST(fdp);
 		FILEDESC_LOCK(newfdp);
 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 		FILEDESC_UNLOCK(newfdp);
 		FILEDESC_LOCK_FAST(fdp);
 	}
 	/* copy everything except kqueue descriptors */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
 		if (fdisused(fdp, i) &&
 		    fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) {
 			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
 			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
 			fhold(newfdp->fd_ofiles[i]);
 			newfdp->fd_lastfile = i;
 		} else {
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 		}
 	}
 	FILEDESC_UNLOCK_FAST(fdp);
 	FILEDESC_LOCK(newfdp);
 	for (i = 0; i <= newfdp->fd_lastfile; ++i)
 		if (newfdp->fd_ofiles[i] != NULL)
 			fdused(newfdp, i);
 	FILEDESC_UNLOCK(newfdp);
 	FILEDESC_LOCK_FAST(fdp);
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_UNLOCK_FAST(fdp);
 	return (newfdp);
 }
 
 /*
  * Release a filedesc structure.
  */
 void
 fdfree(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file **fpp;
 	int i, locked;
 	struct filedesc_to_leader *fdtol;
 	struct file *fp;
 	struct vnode *cdir, *jdir, *rdir, *vp;
 	struct flock lf;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	/* Check for special need to clear POSIX style locks */
 	fdtol = td->td_proc->p_fdtol;
 	if (fdtol != NULL) {
 		FILEDESC_LOCK(fdp);
 		KASSERT(fdtol->fdl_refcount > 0,
 			("filedesc_to_refcount botch: fdl_refcount=%d",
 			 fdtol->fdl_refcount));
 		if (fdtol->fdl_refcount == 1 &&
 		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			for (i = 0, fpp = fdp->fd_ofiles;
 			     i <= fdp->fd_lastfile;
 			     i++, fpp++) {
 				if (*fpp == NULL ||
 				    (*fpp)->f_type != DTYPE_VNODE)
 					continue;
 				fp = *fpp;
 				fhold(fp);
 				FILEDESC_UNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				locked = VFS_LOCK_GIANT(vp->v_mount);
 				(void) VOP_ADVLOCK(vp,
 						   (caddr_t)td->td_proc->
 						   p_leader,
 						   F_UNLCK,
 						   &lf,
 						   F_POSIX);
 				VFS_UNLOCK_GIANT(locked);
 				FILEDESC_LOCK(fdp);
 				fdrop(fp, td);
 				fpp = fdp->fd_ofiles + i;
 			}
 		}
 	retry:
 		if (fdtol->fdl_refcount == 1) {
 			if (fdp->fd_holdleaderscount > 0 &&
 			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 				/*
 				 * close() or do_dup() has cleared a reference
 				 * in a shared file descriptor table.
 				 */
 				fdp->fd_holdleaderswakeup = 1;
 				msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx,
 				       PLOCK, "fdlhold", 0);
 				goto retry;
 			}
 			if (fdtol->fdl_holdcount > 0) {
 				/*
 				 * Ensure that fdtol->fdl_leader
 				 * remains valid in closef().
 				 */
 				fdtol->fdl_wakeup = 1;
 				msleep(fdtol, &fdp->fd_mtx,
 				       PLOCK, "fdlhold", 0);
 				goto retry;
 			}
 		}
 		fdtol->fdl_refcount--;
 		if (fdtol->fdl_refcount == 0 &&
 		    fdtol->fdl_holdcount == 0) {
 			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 		} else
 			fdtol = NULL;
 		td->td_proc->p_fdtol = NULL;
 		FILEDESC_UNLOCK(fdp);
 		if (fdtol != NULL)
 			FREE(fdtol, M_FILEDESC_TO_LEADER);
 	}
 	FILEDESC_LOCK_FAST(fdp);
 	i = --fdp->fd_refcnt;
 	FILEDESC_UNLOCK_FAST(fdp);
 	if (i > 0)
 		return;
 	/*
 	 * We are the last reference to the structure, so we can
 	 * safely assume it will not change out from under us.
 	 */
 	fpp = fdp->fd_ofiles;
 	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
 		if (*fpp)
 			(void) closef(*fpp, td);
 	}
 	FILEDESC_LOCK(fdp);
 
 	/* XXX This should happen earlier. */
 	mtx_lock(&fdesc_mtx);
 	td->td_proc->p_fd = NULL;
 	mtx_unlock(&fdesc_mtx);
 
 	if (fdp->fd_nfiles > NDFILE)
 		FREE(fdp->fd_ofiles, M_FILEDESC);
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		FREE(fdp->fd_map, M_FILEDESC);
 
 	fdp->fd_nfiles = 0;
 
 	cdir = fdp->fd_cdir;
 	fdp->fd_cdir = NULL;
 	rdir = fdp->fd_rdir;
 	fdp->fd_rdir = NULL;
 	jdir = fdp->fd_jdir;
 	fdp->fd_jdir = NULL;
 	FILEDESC_UNLOCK(fdp);
 
 	if (cdir) {
 		locked = VFS_LOCK_GIANT(cdir->v_mount);
 		vrele(cdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 	if (rdir) {
 		locked = VFS_LOCK_GIANT(rdir->v_mount);
 		vrele(rdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 	if (jdir) {
 		locked = VFS_LOCK_GIANT(jdir->v_mount);
 		vrele(jdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 
 	fddrop(fdp);
 }
 
 /*
  * For setugid programs, we don't want to people to use that setugidness
  * to generate error messages which write to a file which otherwise would
  * otherwise be off-limits to the process.  We check for filesystems where
  * the vnode can change out from under us after execve (like [lin]procfs).
  *
  * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  */
 static int
 is_unsafe(struct file *fp)
 {
 	if (fp->f_type == DTYPE_VNODE) {
 		struct vnode *vp = fp->f_vnode;
 
 		if ((vp->v_vflag & VV_PROCDEP) != 0)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Make this setguid thing safe, if at all possible.
  */
 void
 setugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	/*
 	 * Note: fdp->fd_ofiles may be reallocated out from under us while
 	 * we are blocked in a close.  Be careful!
 	 */
 	FILEDESC_LOCK(fdp);
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		if (i > 2)
 			break;
 		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
 			struct file *fp;
 
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
 			fp = fdp->fd_ofiles[i];
 			fdp->fd_ofiles[i] = NULL;
 			fdp->fd_ofileflags[i] = 0;
 			fdunused(fdp, i);
 			FILEDESC_UNLOCK(fdp);
 			(void) closef(fp, td);
 			FILEDESC_LOCK(fdp);
 		}
 	}
 	FILEDESC_UNLOCK(fdp);
 }
 
 /*
  * If a specific file object occupies a specific file descriptor,
  * close the file descriptor entry and drop a reference on the file
  * object.  This is a convenience function to handle a subsequent
  * error in a function that calls falloc() that handles the race that
  * another thread might have closed the file descriptor out from under
  * the thread creating the file object.
  */
 void
 fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
 {
 
 	FILEDESC_LOCK(fdp);
 	if (fdp->fd_ofiles[idx] == fp) {
 		fdp->fd_ofiles[idx] = NULL;
 		fdunused(fdp, idx);
 		FILEDESC_UNLOCK(fdp);
 		fdrop(fp, td);
 	} else {
 		FILEDESC_UNLOCK(fdp);
 	}
 }
 
 /*
  * Close any files on exec?
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	FILEDESC_LOCK(fdp);
 
 	/*
 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
 	 * may block and rip them out from under us.
 	 */
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		if (fdp->fd_ofiles[i] != NULL &&
 		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
 		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
 			struct file *fp;
 
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
 			fp = fdp->fd_ofiles[i];
 			fdp->fd_ofiles[i] = NULL;
 			fdp->fd_ofileflags[i] = 0;
 			fdunused(fdp, i);
 			if (fp->f_type == DTYPE_MQUEUE)
 				mq_fdclose(td, i, fp);
 			FILEDESC_UNLOCK(fdp);
 			(void) closef(fp, td);
 			FILEDESC_LOCK(fdp);
 		}
 	}
 	FILEDESC_UNLOCK(fdp);
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  */
 int
 fdcheckstd(struct thread *td)
 {
 	struct nameidata nd;
 	struct filedesc *fdp;
 	struct file *fp;
 	register_t retval;
 	int fd, i, error, flags, devnull;
 
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return (0);
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	devnull = -1;
 	error = 0;
 	for (i = 0; i < 3; i++) {
 		if (fdp->fd_ofiles[i] != NULL)
 			continue;
 		if (devnull < 0) {
 			int vfslocked;
 			error = falloc(td, &fp, &fd);
 			if (error != 0)
 				break;
 			/* Note extra ref on `fp' held for us by falloc(). */
 			KASSERT(fd == i, ("oof, we didn't get our fd"));
 			NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE,
 			    "/dev/null", td);
 			flags = FREAD | FWRITE;
 			error = vn_open(&nd, &flags, 0, fd);
 			if (error != 0) {
 				/*
 				 * Someone may have closed the entry in the
 				 * file descriptor table, so check it hasn't
 				 * changed before dropping the reference count.
 				 */
 				FILEDESC_LOCK(fdp);
 				KASSERT(fdp->fd_ofiles[fd] == fp,
 				    ("table not shared, how did it change?"));
 				fdp->fd_ofiles[fd] = NULL;
 				fdunused(fdp, fd);
 				FILEDESC_UNLOCK(fdp);
 				fdrop(fp, td);
 				fdrop(fp, td);
 				break;
 			}
 			vfslocked = NDHASGIANT(&nd);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			fp->f_flag = flags;
 			fp->f_vnode = nd.ni_vp;
 			if (fp->f_data == NULL)
 				fp->f_data = nd.ni_vp;
 			if (fp->f_ops == &badfileops)
 				fp->f_ops = &vnops;
 			fp->f_type = DTYPE_VNODE;
 			VOP_UNLOCK(nd.ni_vp, 0, td);
 			VFS_UNLOCK_GIANT(vfslocked);
 			devnull = fd;
 			fdrop(fp, td);
 		} else {
 			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
 			if (error != 0)
 				break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Internal form of close.
  * Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  *
  * XXXRW: Giant is not required for the caller, but often will be held; this
  * makes it moderately likely the Giant will be recursed in the VFS case.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE && td != NULL) {
 		int vfslocked;
 
 		vp = fp->f_vnode;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 					   F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table
 			 * is shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_LOCK(fdp);
 			for (fdtol = fdtol->fdl_next;
 			     fdtol != td->td_proc->p_fdtol;
 			     fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				     P_ADVLOCK) == 0)
 					continue;
 				fdtol->fdl_holdcount++;
 				FILEDESC_UNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 						   (caddr_t)fdtol->fdl_leader,
 						   F_UNLCK, &lf, F_POSIX);
 				FILEDESC_LOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_UNLOCK(fdp);
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (fdrop(fp, td));
 }
 
 /*
  * Extract the file pointer associated with the specified descriptor for
  * the current user process.
  *
  * If the descriptor doesn't exist, EBADF is returned.
  *
  * If the descriptor exists but doesn't match 'flags' then
  * return EBADF for read attempts and EINVAL for write attempts.
  *
  * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
  * It should be dropped with fdrop().
  * If it is not set, then the refcount will not be bumped however the
  * thread's filedesc struct will be returned locked (for fgetsock).
  *
  * If an error occured the non-zero error is returned and *fpp is set to NULL.
  * Otherwise *fpp is set and zero is returned.
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
 		return (EBADF);
 	FILEDESC_LOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 *
 	 * Only one flag, or 0, may be specified.
 	 */
 	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 	if (hold) {
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 	}
 	*fpp = fp;
 	return (0);
 }
 
 int
 fget(struct thread *td, int fd, struct file **fpp)
 {
 
 	return(_fget(td, fd, fpp, 0, 1));
 }
 
 int
 fget_read(struct thread *td, int fd, struct file **fpp)
 {
 
 	return(_fget(td, fd, fpp, FREAD, 1));
 }
 
 int
 fget_write(struct thread *td, int fd, struct file **fpp)
 {
 
 	return(_fget(td, fd, fpp, FWRITE, 1));
 }
 
 /*
  * Like fget() but loads the underlying vnode, or returns an error if
  * the descriptor does not represent a vnode.  Note that pipes use vnodes
  * but never have VM objects.  The returned vnode will be vref()d.
  *
  * XXX: what about the unused flags ?
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
 {
 	struct file *fp;
 	int error;
 
 	*vpp = NULL;
 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 	} else {
 		*vpp = fp->f_vnode;
 		vref(*vpp);
 	}
 	FILEDESC_UNLOCK(td->td_proc->p_fd);
 	return (error);
 }
 
 int
 fgetvp(struct thread *td, int fd, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, vpp, 0));
 }
 
 int
 fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, vpp, FREAD));
 }
 
 #ifdef notyet
 int
 fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, vpp, FWRITE));
 }
 #endif
 
 /*
  * Like fget() but loads the underlying socket, or returns an error if
  * the descriptor does not represent a socket.
  *
  * We bump the ref count on the returned socket.  XXX Also obtain the SX
  * lock in the future.
  *
  * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely
  * on their file descriptor reference to prevent the socket from being
  * freed during use.
  */
 int
 fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
 {
 	struct file *fp;
 	int error;
 
 	NET_ASSERT_GIANT();
 
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		error = ENOTSOCK;
 	} else {
 		*spp = fp->f_data;
 		if (fflagp)
 			*fflagp = fp->f_flag;
 		SOCK_LOCK(*spp);
 		soref(*spp);
 		SOCK_UNLOCK(*spp);
 	}
 	FILEDESC_UNLOCK(td->td_proc->p_fd);
 	return (error);
 }
 
 /*
  * Drop the reference count on the socket and XXX release the SX lock in the
  * future.  The last reference closes the socket.
  *
  * XXXRW: fputsock() is deprecated, see comment for fgetsock().
  */
 void
 fputsock(struct socket *so)
 {
 
 	NET_ASSERT_GIANT();
 	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	sorele(so);
 }
 
 int
 fdrop(struct file *fp, struct thread *td)
 {
 
 	FILE_LOCK(fp);
 	return (fdrop_locked(fp, td));
 }
 
 /*
  * Drop reference on struct file passed in, may call closef if the
  * reference hits zero.
  * Expects struct file locked, and will unlock it.
  */
 static int
 fdrop_locked(struct file *fp, struct thread *td)
 {
 	int error;
 
 	FILE_LOCK_ASSERT(fp, MA_OWNED);
 
 	if (--fp->f_count > 0) {
 		FILE_UNLOCK(fp);
 		return (0);
 	}
 
 	/*
 	 * We might have just dropped the last reference to a file
 	 * object that is for a UNIX domain socket whose message
 	 * buffers are being examined in unp_gc().  If that is the
 	 * case, FWAIT will be set in f_gcflag and we need to wait for
 	 * unp_gc() to finish its scan.
 	 */
 	while (fp->f_gcflag & FWAIT)
 		msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0);
 
 	/* We have the last ref so we can proceed without the file lock. */
 	FILE_UNLOCK(fp);
 	if (fp->f_count < 0)
 		panic("fdrop: count < 0");
 	if (fp->f_ops != &badfileops)
 		error = fo_close(fp, td);
 	else
 		error = 0;
 
 	sx_xlock(&filelist_lock);
 	LIST_REMOVE(fp, f_list);
 	openfiles--;
 	sx_xunlock(&filelist_lock);
 	crfree(fp->f_cred);
 	uma_zfree(file_zone, fp);
 
 	return (error);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
- * Just attempt to get a record lock of the requested type on
- * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ * Just attempt to get a record lock of the requested type on the entire file
+ * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;
 	int	how;
 };
 #endif
 /* ARGSUSED */
 int
 flock(struct thread *td, struct flock_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
 
 	mtx_lock(&Giant);
 	vp = fp->f_vnode;
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	if (uap->how & LOCK_UN) {
 		lf.l_type = F_UNLCK;
 		FILE_LOCK(fp);
 		fp->f_flag &= ~FHASLOCK;
 		FILE_UNLOCK(fp);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 		goto done2;
 	}
 	if (uap->how & LOCK_EX)
 		lf.l_type = F_WRLCK;
 	else if (uap->how & LOCK_SH)
 		lf.l_type = F_RDLCK;
 	else {
 		error = EBADF;
 		goto done2;
 	}
 	FILE_LOCK(fp);
 	fp->f_flag |= FHASLOCK;
 	FILE_UNLOCK(fp);
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	fdrop(fp, td);
 	mtx_unlock(&Giant);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
 {
 	struct file *wfp;
 	struct file *fp;
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_LOCK(fdp);
 	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
 	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor
 	 * (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and
 	 * store it in (indx).  (dfd) is effectively closed by
 	 * this operation.
 	 *
 	 * Any other error code is just returned.
 	 */
 	switch (error) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		FILE_LOCK(wfp);
 		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
 			FILE_UNLOCK(wfp);
 			FILEDESC_UNLOCK(fdp);
 			return (EACCES);
 		}
 		fp = fdp->fd_ofiles[indx];
 		fdp->fd_ofiles[indx] = wfp;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
 		if (fp == NULL)
 			fdused(fdp, indx);
 		fhold_locked(wfp);
 		FILE_UNLOCK(wfp);
 		FILEDESC_UNLOCK(fdp);
 		if (fp != NULL) {
 			/*
 			 * We now own the reference to fp that the ofiles[]
 			 * array used to own.  Release it.
 			 */
 			FILE_LOCK(fp);
 			fdrop_locked(fp, td);
 		}
 		return (0);
 
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
 		fp = fdp->fd_ofiles[indx];
 		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
 		fdp->fd_ofiles[dfd] = NULL;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
 		fdp->fd_ofileflags[dfd] = 0;
 		fdunused(fdp, dfd);
 		if (fp == NULL)
 			fdused(fdp, indx);
 		if (fp != NULL)
 			FILE_LOCK(fp);
 
 		/*
 		 * We now own the reference to fp that the ofiles[] array
 		 * used to own.  Release it.
 		 */
 		if (fp != NULL)
 			fdrop_locked(fp, td);
 
 		FILEDESC_UNLOCK(fdp);
 
 		return (0);
 
 	default:
 		FILEDESC_UNLOCK(fdp);
 		return (error);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Scan all active processes to see if any of them have a current
  * or root directory of `olddp'. If so, replace them with the new
  * mount point.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int nrele;
 
 	if (vrefcnt(olddp) == 1)
 		return;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
 		nrele = 0;
 		FILEDESC_LOCK_FAST(fdp);
 		if (fdp->fd_cdir == olddp) {
 			vref(newdp);
 			fdp->fd_cdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_rdir == olddp) {
 			vref(newdp);
 			fdp->fd_rdir = newdp;
 			nrele++;
 		}
 		FILEDESC_UNLOCK_FAST(fdp);
 		fddrop(fdp);
 		while (nrele--)
 			vrele(olddp);
 	}
 	sx_sunlock(&allproc_lock);
 	if (rootvnode == olddp) {
 		vrele(rootvnode);
 		vref(newdp);
 		rootvnode = newdp;
 	}
 }
 
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 {
 	struct filedesc_to_leader *fdtol;
 
 	MALLOC(fdtol, struct filedesc_to_leader *,
 	       sizeof(struct filedesc_to_leader),
 	       M_FILEDESC_TO_LEADER,
 	       M_WAITOK);
 	fdtol->fdl_refcount = 1;
 	fdtol->fdl_holdcount = 0;
 	fdtol->fdl_wakeup = 0;
 	fdtol->fdl_leader = leader;
 	if (old != NULL) {
 		FILEDESC_LOCK(fdp);
 		fdtol->fdl_next = old->fdl_next;
 		fdtol->fdl_prev = old;
 		old->fdl_next = fdtol;
 		fdtol->fdl_next->fdl_prev = fdtol;
 		FILEDESC_UNLOCK(fdp);
 	} else {
 		fdtol->fdl_next = fdtol;
 		fdtol->fdl_prev = fdtol;
 	}
 	return (fdtol);
 }
 
 /*
  * Get file structures.
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n;
 
 	/*
 	 * Note: because the number of file descriptors is calculated
 	 * in different ways for sizing vs returning the data,
 	 * there is information leakage from the first loop.  However,
 	 * it is of a similar order of magnitude to the leakage from
 	 * global system statistics such as kern.openfiles.
 	 */
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	if (req->oldptr == NULL) {
 		n = 16;		/* A slight overestimate. */
 		sx_slock(&filelist_lock);
 		LIST_FOREACH(fp, &filehead, f_list) {
 			/*
 			 * We should grab the lock, but this is an
 			 * estimate, so does it really matter?
 			 */
 			/* mtx_lock(fp->f_mtxp); */
 			n += fp->f_count;
 			/* mtx_unlock(f->f_mtxp); */
 		}
 		sx_sunlock(&filelist_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	error = 0;
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		PROC_LOCK(p);
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		PROC_UNLOCK(p);
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_LOCK_FAST(fdp);
 		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
 			if ((fp = fdp->fd_ofiles[n]) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
 			xf.xf_data = fp->f_data;
 			xf.xf_vnode = fp->f_vnode;
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = fp->f_msgcount;
 			xf.xf_offset = fp->f_offset;
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_UNLOCK_FAST(fdp);
 		fddrop(fdp);
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 #ifdef DDB
 /*
  * For the purposes of debugging, generate a human-readable string for the
  * file type.
  */
 static const char *
 file_type_to_name(short type)
 {
 
 	switch (type) {
 	case 0:
 		return ("zero");
 	case DTYPE_VNODE:
 		return ("vnod");
 	case DTYPE_SOCKET:
 		return ("sock");
 	case DTYPE_PIPE:
 		return ("pipe");
 	case DTYPE_FIFO:
 		return ("fifo");
 	case DTYPE_KQUEUE:
 		return ("kque");
 	case DTYPE_CRYPTO:
 		return ("crpt");
 	case DTYPE_MQUEUE:
 		return ("mque");
 	default:
 		return ("unkn");
 	}
 }
 
 /*
  * For the purposes of debugging, identify a process (if any, perhaps one of
  * many) that references the passed file in its file descriptor array. Return
  * NULL if none.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int n;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		fdp = p->p_fd;
 		if (fdp == NULL)
 			continue;
 		for (n = 0; n < fdp->fd_nfiles; n++) {
 			if (fp == fdp->fd_ofiles[n])
 				return (p);
 		}
 	}
 	return (NULL);
 }
 
 static void
 db_print_file(struct file *fp, int header)
 {
 	struct proc *p;
 
 	if (header)
 		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
 		    "File", "Type", "Data", "Flag", "GCFl", "Count",
 		    "MCount", "Vnode", "FPID", "FCmd");
 	p = file_to_first_proc(fp);
 	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
 	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
 	    fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode,
 	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
 }
 
 DB_SHOW_COMMAND(file, db_show_file)
 {
 	struct file *fp;
 
 	if (!have_addr) {
 		db_printf("usage: show file <addr>\n");
 		return;
 	}
 	fp = (struct file *)addr;
 	db_print_file(fp, 1);
 }
 
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct file *fp;
 	int header;
 
 	header = 1;
 	LIST_FOREACH(fp, &filehead, f_list) {
 		db_print_file(fp, header);
 		header = 0;
 	}
 }
 #endif
 
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     &openfiles, 0, "System-wide number of open files");
 
 /* ARGSUSED*/
 static void
 filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	sx_init(&filelist_lock, "filelist lock");
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
 
 /*-------------------------------------------------------------------*/
 
 static int
 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 };
 
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 
 	/*
 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
 	 * the file descriptor being sought for duplication. The error
 	 * return ensures that the vnode for this device will be released
 	 * by vn_open. Open will detect this special error and take the
 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 	 * will simply report the error.
 	 */
 	td->td_dupfd = dev2unit(dev);
 	return (ENODEV);
 }
 
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_NEEDGIANT,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 static void
 fildesc_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
 	make_dev_alias(dev, "stdin");
 	dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
 	make_dev_alias(dev, "stdout");
 	dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
 	make_dev_alias(dev, "stderr");
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL)
Index: head/sys/kern/kern_environment.c
===================================================================
--- head/sys/kern/kern_environment.c	(revision 167231)
+++ head/sys/kern/kern_environment.c	(revision 167232)
@@ -1,558 +1,557 @@
 /*-
  * Copyright (c) 1998 Michael Smith
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The unified bootloader passes us a pointer to a preserved copy of
  * bootstrap/kernel environment variables.  We convert them to a
  * dynamic array of strings later when the VM subsystem is up.
  *
  * We make these available through the kenv(2) syscall for userland
  * and through getenv()/freeenv() setenv() unsetenv() testenv() for
  * the kernel.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/libkern.h>
 #include <sys/kenv.h>
 
 #include <security/mac/mac_framework.h>
 
 static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");
 
 #define KENV_SIZE	512	/* Maximum number of environment strings */
 
 /* pointer to the static environment */
 char		*kern_envp;
 static char	*kernenv_next(char *);
 
 /* dynamic environment variables */
 char		**kenvp;
 struct mtx	kenv_lock;
 
 /*
- * No need to protect this with a mutex
- * since SYSINITS are single threaded.
+ * No need to protect this with a mutex since SYSINITS are single threaded.
  */
 int	dynamic_kenv = 0;
 
 #define KENV_CHECK	if (!dynamic_kenv) \
 			    panic("%s: called before SI_SUB_KMEM", __func__)
 
 int
 kenv(td, uap)
 	struct thread *td;
 	struct kenv_args /* {
 		int what;
 		const char *name;
 		char *value;
 		int len;
 	} */ *uap;
 {
 	char *name, *value, *buffer = NULL;
 	size_t len, done, needed;
 	int error, i;
 
 	KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0"));
 
 	error = 0;
 	if (uap->what == KENV_DUMP) {
 #ifdef MAC
 		error = mac_check_kenv_dump(td->td_ucred);
 		if (error)
 			return (error);
 #endif
 		done = needed = 0;
 		if (uap->len > 0 && uap->value != NULL)
 			buffer = malloc(uap->len, M_TEMP, M_WAITOK|M_ZERO);
 		mtx_lock(&kenv_lock);
 		for (i = 0; kenvp[i] != NULL; i++) {
 			len = strlen(kenvp[i]) + 1;
 			needed += len;
 			len = min(len, uap->len - done);
 			/*
 			 * If called with a NULL or insufficiently large
 			 * buffer, just keep computing the required size.
 			 */
 			if (uap->value != NULL && buffer != NULL && len > 0) {
 				bcopy(kenvp[i], buffer + done, len);
 				done += len;
 			}
 		}
 		mtx_unlock(&kenv_lock);
 		if (buffer != NULL) {
 			error = copyout(buffer, uap->value, done);
 			free(buffer, M_TEMP);
 		}
 		td->td_retval[0] = ((done == needed) ? 0 : needed);
 		return (error);
 	}
 
 	switch (uap->what) {
 	case KENV_SET:
 		error = priv_check(td, PRIV_KENV_SET);
 		if (error)
 			return (error);
 		break;
 
 	case KENV_UNSET:
 		error = priv_check(td, PRIV_KENV_UNSET);
 		if (error)
 			return (error);
 		break;
 	}
 
 	name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK);
 
 	error = copyinstr(uap->name, name, KENV_MNAMELEN, NULL);
 	if (error)
 		goto done;
 
 	switch (uap->what) {
 	case KENV_GET:
 #ifdef MAC
 		error = mac_check_kenv_get(td->td_ucred, name);
 		if (error)
 			goto done;
 #endif
 		value = getenv(name);
 		if (value == NULL) {
 			error = ENOENT;
 			goto done;
 		}
 		len = strlen(value) + 1;
 		if (len > uap->len)
 			len = uap->len;
 		error = copyout(value, uap->value, len);
 		freeenv(value);
 		if (error)
 			goto done;
 		td->td_retval[0] = len;
 		break;
 	case KENV_SET:
 		len = uap->len;
 		if (len < 1) {
 			error = EINVAL;
 			goto done;
 		}
 		if (len > KENV_MVALLEN)
 			len = KENV_MVALLEN;
 		value = malloc(len, M_TEMP, M_WAITOK);
 		error = copyinstr(uap->value, value, len, NULL);
 		if (error) {
 			free(value, M_TEMP);
 			goto done;
 		}
 #ifdef MAC
 		error = mac_check_kenv_set(td->td_ucred, name, value);
 		if (error == 0)
 #endif
 			setenv(name, value);
 		free(value, M_TEMP);
 		break;
 	case KENV_UNSET:
 #ifdef MAC
 		error = mac_check_kenv_unset(td->td_ucred, name);
 		if (error)
 			goto done;
 #endif
 		error = unsetenv(name);
 		if (error)
 			error = ENOENT;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 done:
 	free(name, M_TEMP);
 	return (error);
 }
 
 /*
  * Setup the dynamic kernel environment.
  */
 static void
 init_dynamic_kenv(void *data __unused)
 {
 	char *cp;
 	int len, i;
 
 	kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV,
 		M_WAITOK | M_ZERO);
 	i = 0;
 	for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
 		len = strlen(cp) + 1;
 		if (i < KENV_SIZE) {
 			kenvp[i] = malloc(len, M_KENV, M_WAITOK);
 			strcpy(kenvp[i++], cp);
 		} else
 			printf(
 			    "WARNING: too many kenv strings, ignoring %s\n",
 			    cp);
 	}
 	kenvp[i] = NULL;
 
 	mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF);
 	dynamic_kenv = 1;
 }
 SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL);
 
 void
 freeenv(char *env)
 {
 
 	if (dynamic_kenv)
 		free(env, M_KENV);
 }
 
 /*
  * Internal functions for string lookup.
  */
 static char *
 _getenv_dynamic(const char *name, int *idx)
 {
 	char *cp;
 	int len, i;
 
 	mtx_assert(&kenv_lock, MA_OWNED);
 	len = strlen(name);
 	for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) {
 		if ((strncmp(cp, name, len) == 0) &&
 		    (cp[len] == '=')) {
 			if (idx != NULL)
 				*idx = i;
 			return (cp + len + 1);
 		}
 	}
 	return (NULL);
 }
 
 static char *
 _getenv_static(const char *name)
 {
 	char *cp, *ep;
 	int len;
 
 	for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
 		for (ep = cp; (*ep != '=') && (*ep != 0); ep++)
 			;
 		if (*ep != '=')
 			continue;
 		len = ep - cp;
 		ep++;
 		if (!strncmp(name, cp, len) && name[len] == 0)
 			return (ep);
 	}
 	return (NULL);
 }
 
 /*
  * Look up an environment variable by name.
  * Return a pointer to the string if found.
  * The pointer has to be freed with freeenv()
  * after use.
  */
 char *
 getenv(const char *name)
 {
 	char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
 	char *ret, *cp;
 	int len;
 
 	if (dynamic_kenv) {
 		mtx_lock(&kenv_lock);
 		cp = _getenv_dynamic(name, NULL);
 		if (cp != NULL) {
 			strcpy(buf, cp);
 			mtx_unlock(&kenv_lock);
 			len = strlen(buf) + 1;
 			ret = malloc(len, M_KENV, M_WAITOK);
 			strcpy(ret, buf);
 		} else {
 			mtx_unlock(&kenv_lock);
 			ret = NULL;
 		}
 	} else
 		ret = _getenv_static(name);
 	return (ret);
 }
 
 /*
  * Test if an environment variable is defined.
  */
 int
 testenv(const char *name)
 {
 	char *cp;
 
 	if (dynamic_kenv) {
 		mtx_lock(&kenv_lock);
 		cp = _getenv_dynamic(name, NULL);
 		mtx_unlock(&kenv_lock);
 	} else
 		cp = _getenv_static(name);
 	if (cp != NULL)
 		return (1);
 	return (0);
 }
 
 /*
  * Set an environment variable by name.
  */
 int
 setenv(const char *name, const char *value)
 {
 	char *buf, *cp, *oldenv;
 	int namelen, vallen, i;
 
 	KENV_CHECK;
 
 	namelen = strlen(name) + 1;
 	if (namelen > KENV_MNAMELEN)
 		return (-1);
 	vallen = strlen(value) + 1;
 	if (vallen > KENV_MVALLEN)
 		return (-1);
 	buf = malloc(namelen + vallen, M_KENV, M_WAITOK);
 	sprintf(buf, "%s=%s", name, value);
 
 	mtx_lock(&kenv_lock);
 	cp = _getenv_dynamic(name, &i);
 	if (cp != NULL) {
 		oldenv = kenvp[i];
 		kenvp[i] = buf;
 		mtx_unlock(&kenv_lock);
 		free(oldenv, M_KENV);
 	} else {
 		/* We add the option if it wasn't found */
 		for (i = 0; (cp = kenvp[i]) != NULL; i++)
 			;
 
 		/* Bounds checking */
 		if (i < 0 || i >= KENV_SIZE) {
 			free(buf, M_KENV);
 			mtx_unlock(&kenv_lock);
 			return (-1);
 		}
 
 		kenvp[i] = buf;
 		kenvp[i + 1] = NULL;
 		mtx_unlock(&kenv_lock);
 	}
 	return (0);
 }
 
 /*
  * Unset an environment variable string.
  */
 int
 unsetenv(const char *name)
 {
 	char *cp, *oldenv;
 	int i, j;
 
 	KENV_CHECK;
 
 	mtx_lock(&kenv_lock);
 	cp = _getenv_dynamic(name, &i);
 	if (cp != NULL) {
 		oldenv = kenvp[i];
 		for (j = i + 1; kenvp[j] != NULL; j++)
 			kenvp[i++] = kenvp[j];
 		kenvp[i] = NULL;
 		mtx_unlock(&kenv_lock);
 		free(oldenv, M_KENV);
 		return (0);
 	}
 	mtx_unlock(&kenv_lock);
 	return (-1);
 }
 
 /*
  * Return a string value from an environment variable.
  */
 int
 getenv_string(const char *name, char *data, int size)
 {
 	char *tmp;
 
 	tmp = getenv(name);
 	if (tmp != NULL) {
 		strlcpy(data, tmp, size);
 		freeenv(tmp);
 		return (1);
 	} else
 		return (0);
 }
 
 /*
  * Return an integer value from an environment variable.
  */
 int
 getenv_int(const char *name, int *data)
 {
 	quad_t tmp;
 	int rval;
 
 	rval = getenv_quad(name, &tmp);
 	if (rval)
 		*data = (int) tmp;
 	return (rval);
 }
 
 /*
  * Return a long value from an environment variable.
  */
 long
 getenv_long(const char *name, long *data)
 {
 	quad_t tmp;
 	long rval;
 
 	rval = getenv_quad(name, &tmp);
 	if (rval)
 		*data = (long) tmp;
 	return (rval);
 }
 
 /*
  * Return an unsigned long value from an environment variable.
  */
 unsigned long
 getenv_ulong(const char *name, unsigned long *data)
 {
 	quad_t tmp;
 	long rval;
 
 	rval = getenv_quad(name, &tmp);
 	if (rval)
 		*data = (unsigned long) tmp;
 	return (rval);
 }
 
 /*
  * Return a quad_t value from an environment variable.
  */
 int
 getenv_quad(const char *name, quad_t *data)
 {
 	char	*value;
 	char	*vtp;
 	quad_t	iv;
 
 	value = getenv(name);
 	if (value == NULL)
 		return (0);
 	iv = strtoq(value, &vtp, 0);
 	if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) {
 		freeenv(value);
 		return (0);
 	}
 	switch (vtp[0]) {
 	case 't': case 'T':
 		iv *= 1024;
 	case 'g': case 'G':
 		iv *= 1024;
 	case 'm': case 'M':
 		iv *= 1024;
 	case 'k': case 'K':
 		iv *= 1024;
 	case '\0':
 		break;
 	default:
 		freeenv(value);
 		return (0);
 	}
 	*data = iv;
 	freeenv(value);
 	return (1);
 }
 
 /*
  * Find the next entry after the one which (cp) falls within, return a
  * pointer to its start or NULL if there are no more.
  */
 static char *
 kernenv_next(char *cp)
 {
 
 	if (cp != NULL) {
 		while (*cp != 0)
 			cp++;
 		cp++;
 		if (*cp == 0)
 			cp = NULL;
 	}
 	return (cp);
 }
 
 void
 tunable_int_init(void *data)
 {
 	struct tunable_int *d = (struct tunable_int *)data;
 
 	TUNABLE_INT_FETCH(d->path, d->var);
 }
 
 void
 tunable_long_init(void *data)
 {
 	struct tunable_long *d = (struct tunable_long *)data;
 
 	TUNABLE_LONG_FETCH(d->path, d->var);
 }
 
 void
 tunable_ulong_init(void *data)
 {
 	struct tunable_ulong *d = (struct tunable_ulong *)data;
 
 	TUNABLE_ULONG_FETCH(d->path, d->var);
 }
 
 void
 tunable_str_init(void *data)
 {
 	struct tunable_str *d = (struct tunable_str *)data;
 
 	TUNABLE_STR_FETCH(d->path, d->var, d->size);
 }
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c	(revision 167231)
+++ head/sys/kern/kern_exec.c	(revision 167232)
@@ -1,1299 +1,1299 @@
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
 #include <sys/acct.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/wait.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/namei.h>
 #include <sys/resourcevar.h>
 #include <sys/sf_buf.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p);
 static void exec_free_args(struct image_args *);
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_ps_strings, "LU", "");
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_usrstack, "LU", "");
 
 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
     NULL, 0, sysctl_kern_stackprot, "I", "");
 
 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0, "");
 
 static int
 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_psstrings;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
 		   sizeof(p->p_sysent->sv_psstrings));
 	return error;
 }
 
 static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_usrstack;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
 		    sizeof(p->p_sysent->sv_usrstack));
 	return error;
 }
 
 static int
 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 
 	p = curproc;
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
 	    sizeof(p->p_sysent->sv_stackprot)));
 }
 
 /*
  * Each of the items is a pointer to a `const struct execsw', hence the
  * double pointer here.
  */
 static const struct execsw **execsw;
 
 #ifndef _SYS_SYSPROTO_H_
 struct execve_args {
 	char    *fname; 
 	char    **argv;
 	char    **envv; 
 };
 #endif
 
 int
 execve(td, uap)
 	struct thread *td;
 	struct execve_args /* {
 		char *fname;
 		char **argv;
 		char **envv;
 	} */ *uap;
 {
 	int error;
 	struct image_args args;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, NULL);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __mac_execve_args {
 	char	*fname;
 	char	**argv;
 	char	**envv;
 	struct mac	*mac_p;
 };
 #endif
 
 int
 __mac_execve(td, uap)
 	struct thread *td;
 	struct __mac_execve_args /* {
 		char *fname;
 		char **argv;
 		char **envv;
 		struct mac *mac_p;
 	} */ *uap;
 {
 #ifdef MAC
 	int error;
 	struct image_args args;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, uap->mac_p);
 	return (error);
 #else
 	return (ENOSYS);
 #endif
 }
 
 /*
- * XXX: kern_execve has the astonishing property of not always
- * returning to the caller.  If sufficiently bad things happen during
- * the call to do_execve(), it can end up calling exit1(); as a result,
- * callers must avoid doing anything which they might need to undo
- * (e.g., allocating memory).
+ * XXX: kern_execve has the astonishing property of not always returning to
+ * the caller.  If sufficiently bad things happen during the call to
+ * do_execve(), it can end up calling exit1(); as a result, callers must
+ * avoid doing anything which they might need to undo (e.g., allocating
+ * memory).
  */
 int
 kern_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	int error;
 
 	AUDIT_ARG(argv, args->begin_argv, args->argc,
 	    args->begin_envv - args->begin_argv);
 	AUDIT_ARG(envv, args->begin_envv, args->envc,
 	    args->endp - args->begin_envv);
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		if (thread_single(SINGLE_BOUNDARY)) {
 			PROC_UNLOCK(p);
 	       		exec_free_args(args);
 			return (ERESTART);	/* Try again later. */
 		}
 		PROC_UNLOCK(p);
 	}
 
 	error = do_execve(td, args, mac_p);
 
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		/*
 		 * If success, we upgrade to SINGLE_EXIT state to
 		 * force other threads to suicide.
 		 */
 		if (error == 0)
 			thread_single(SINGLE_EXIT);
 		else
 			thread_single_end();
 		PROC_UNLOCK(p);
 	}
 
 	return (error);
 }
 
 /*
  * In-kernel implementation of execve().  All arguments are assumed to be
  * userspace pointers from the passed thread.
  */
 static int
 do_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd, *ndp;
 	struct ucred *newcred = NULL, *oldcred;
 	struct uidinfo *euip;
 	register_t *stack_base;
 	int error, len, i;
 	struct image_params image_params, *imgp;
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
 	struct sigacts *oldsigacts, *newsigacts;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
 	struct vnode *textvp = NULL;
 	int credential_changing;
 	int vfslocked;
 	int textset;
 #ifdef MAC
 	struct label *interplabel = NULL;
 	int will_transition;
 #endif
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
 
 	vfslocked = 0;
 	imgp = &image_params;
 
 	/*
 	 * Lock the process and set the P_INEXEC flag to indicate that
 	 * it should be left alone until we're done here.  This is
 	 * necessary to avoid race conditions - e.g. in ptrace() -
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->execlabel = NULL;
 	imgp->attr = &attr;
 	imgp->entry_addr = 0;
 	imgp->vmspace_destroyed = 0;
 	imgp->interpreted = 0;
 	imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX;
 	imgp->auxargs = NULL;
 	imgp->vp = NULL;
 	imgp->object = NULL;
 	imgp->firstpage = NULL;
 	imgp->ps_strings = 0;
 	imgp->auxarg_size = 0;
 	imgp->args = args;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
 	if (error)
 		goto exec_fail;
 #endif
 
 	imgp->image_header = NULL;
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp amoung other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
 	 */
 	ndp = &nd;
 	NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE |
 	    AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 
 interpret:
 	error = namei(ndp);
 	if (error)
 		goto exec_fail;
 
 	vfslocked = NDHASGIANT(ndp);
 	imgp->vp = ndp->ni_vp;
 
 	/*
 	 * Check file permissions (also 'opens' file)
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->object = imgp->vp->v_object;
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
 	/*
 	 * Set VV_TEXT now so no one can write to the executable while we're
 	 * activating it.
 	 *
 	 * Remember if this was set before and unset it in case this is not
 	 * actually an executable image.
 	 */
 	textset = imgp->vp->v_vflag & VV_TEXT;
 	imgp->vp->v_vflag |= VV_TEXT;
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
 	 */
 	error = -1;
 	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
 		error = img_first(imgp);
 
 	/*
 	 *	Loop through the list of image activators, calling each one.
 	 *	An activator returns -1 if there is no match, 0 on success,
 	 *	and an error otherwise.
 	 */
 	for (i = 0; error == -1 && execsw[i]; ++i) {
 		if (execsw[i]->ex_imgact == NULL ||
 		    execsw[i]->ex_imgact == img_first) {
 			continue;
 		}
 		error = (*execsw[i]->ex_imgact)(imgp);
 	}
 
 	if (error) {
 		if (error == -1) {
 			if (textset == 0)
 				imgp->vp->v_vflag &= ~VV_TEXT;
 			error = ENOEXEC;
 		}
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Special interpreter operation, cleanup and loop up to try to
 	 * activate the interpreter.
 	 */
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
 		 * VV_TEXT needs to be unset for scripts.  There is a short
 		 * period before we determine that something is a script where
 		 * VV_TEXT will be set. The vnode lock is held over this
 		 * entire period so nothing should illegitimately be blocked.
 		 */
 		imgp->vp->v_vflag &= ~VV_TEXT;
 		/* free name buffer and old vnode */
 		NDFREE(ndp, NDF_ONLY_PNBUF);
 #ifdef MAC
 		interplabel = mac_vnode_label_alloc();
 		mac_copy_vnode_label(ndp->ni_vp->v_label, interplabel);
 #endif
 		vput(ndp->ni_vp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
 		/* set new name to that of the interpreter */
 		NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
 		    UIO_SYSSPACE, imgp->interpreter_name, td);
 		goto interpret;
 	}
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base
 	 */
 	if (p->p_sysent->sv_copyout_strings)
 		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
 	else
 		stack_base = exec_copyout_strings(imgp);
 
 	/*
 	 * If custom stack fixup routine present for this process
 	 * let it do the stack setup.
 	 * Else stuff argument count as first item on stack
 	 */
 	if (p->p_sysent->sv_fixup != NULL)
 		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
 	else
 		suword(--stack_base, imgp->args->argc);
 
 	/*
 	 * For security and other reasons, the file descriptor table cannot
 	 * be shared after an exec.
 	 */
 	fdunshare(p, td);
 
 	/*
 	 * Malloc things before we need locks.
 	 */
 	newcred = crget();
 	euip = uifind(attr.va_uid);
 	i = imgp->args->begin_envv - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
 		newargs = pargs_alloc(i);
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
 	/* close files on exec */
 	VOP_UNLOCK(imgp->vp, 0, td);
 	fdcloseexec(td);
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	/* Get a reference to the vnode prior to locking the proc */
 	VREF(ndp->ni_vp);
 
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
 	PROC_LOCK(p);
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
 		PROC_UNLOCK(p);
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
 		PROC_LOCK(p);
 		p->p_sigacts = newsigacts;
 	} else
 		oldsigacts = NULL;
 
 	/* Stop profiling */
 	stopprofclock(p);
 
 	/* reset caught signals */
 	execsigs(p);
 
 	/* name this process - nameiexec(p, ndp) */
 	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
 	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
 	p->p_comm[len] = 0;
 
 	/*
 	 * mark as execed, wakeup the process that vforked (if any) and tell
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
 	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
 		p->p_flag &= ~P_PPWAIT;
 		wakeup(p->p_pptr);
 	}
 
 	/*
 	 * Implement image setuid/setgid.
 	 *
 	 * Don't honor setuid/setgid if the filesystem prohibits it or if
 	 * the process is being traced.
 	 *
 	 * XXXMAC: For the time being, use NOSUID to also prohibit
 	 * transitions on the file system.
 	 */
 	oldcred = p->p_ucred;
 	credential_changing = 0;
 	credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
 	    attr.va_uid;
 	credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
 	    attr.va_gid;
 #ifdef MAC
 	will_transition = mac_execve_will_transition(oldcred, imgp->vp,
 	    interplabel, imgp);
 	credential_changing |= will_transition;
 #endif
 
 	if (credential_changing &&
 	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
 	    (p->p_flag & P_TRACED) == 0) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
 
 #ifdef KTRACE
 		if (p->p_tracevp != NULL &&
 		    priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED,
 		    SUSER_ALLOWJAIL)) {
 			mtx_lock(&ktrace_mtx);
 			p->p_traceflag = 0;
 			tracevp = p->p_tracevp;
 			p->p_tracevp = NULL;
 			tracecred = p->p_tracecred;
 			p->p_tracecred = NULL;
 			mtx_unlock(&ktrace_mtx);
 		}
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
 		 * then make sure file descriptors 0..2 are in use.
 		 *
 		 * setugidsafety() may call closef() and then pfind()
 		 * which may grab the process lock.
 		 * fdcheckstd() may call falloc() which may block to
 		 * allocate memory, so temporarily drop the process lock.
 		 */
 		PROC_UNLOCK(p);
 		setugidsafety(td);
 		VOP_UNLOCK(imgp->vp, 0, td);
 		error = fdcheckstd(td);
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 		if (error != 0)
 			goto done1;
 		PROC_LOCK(p);
 		/*
 		 * Set the new credentials.
 		 */
 		crcopy(newcred, oldcred);
 		if (attr.va_mode & VSUID)
 			change_euid(newcred, euip);
 		if (attr.va_mode & VSGID)
 			change_egid(newcred, attr.va_gid);
 #ifdef MAC
 		if (will_transition) {
 			mac_execve_transition(oldcred, newcred, imgp->vp,
 			    interplabel, imgp);
 		}
 #endif
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXXMAC: Note that the current logic will save the
 		 * uid and gid if a MAC domain transition occurs, even
 		 * though maybe it shouldn't.
 		 */
 		change_svuid(newcred, newcred->cr_uid);
 		change_svgid(newcred, newcred->cr_gid);
 		p->p_ucred = newcred;
 		newcred = NULL;
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXX: It's not clear that the existing behavior is
 		 * POSIX-compliant.  A number of sources indicate that the
 		 * saved uid/gid should only be updated if the new ruid is
 		 * not equal to the old ruid, or the new euid is not equal
 		 * to the old euid and the new euid is not equal to the old
 		 * ruid.  The FreeBSD code always updates the saved uid/gid.
 		 * Also, this code uses the new (replaced) euid and egid as
 		 * the source, which may or may not be the right ones to use.
 		 */
 		if (oldcred->cr_svuid != oldcred->cr_uid ||
 		    oldcred->cr_svgid != oldcred->cr_gid) {
 			crcopy(newcred, oldcred);
 			change_svuid(newcred, newcred->cr_uid);
 			change_svgid(newcred, newcred->cr_gid);
 			p->p_ucred = newcred;
 			newcred = NULL;
 		}
 	}
 
 	/*
 	 * Store the vp for use in procfs.  This vnode was referenced prior
 	 * to locking the proc lock.
 	 */
 	textvp = p->p_textvp;
 	p->p_textvp = ndp->ni_vp;
 
 	/*
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
 	KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/*
 	 * If tracing the process, trap to debugger so breakpoints
 	 * can be set before the program executes.
 	 * Use tdsignal to deliver signal to current thread, use
 	 * psignal may cause the signal to be delivered to wrong thread
 	 * because that thread will exit, remember we are going to enter
 	 * single thread mode.
 	 */
 	if (p->p_flag & P_TRACED)
 		tdsignal(p, td, SIGTRAP, NULL);
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
 
 	/*
 	 * Free any previous argument cache and replace it with
 	 * the new argument cache, if any.
 	 */
 	oldargs = p->p_args;
 	p->p_args = newargs;
 	newargs = NULL;
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * Check if system-wide sampling is in effect or if the
 	 * current process is using PMCs.  If so, do exec() time
 	 * processing.  This processing needs to happen AFTER the
 	 * P_INEXEC flag is cleared.
 	 *
 	 * The proc lock needs to be released before taking the PMC
 	 * SX.
 	 */
 	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
 		PROC_UNLOCK(p);
 		pe.pm_credentialschanged = credential_changing;
 		pe.pm_entryaddr = imgp->entry_addr;
 
 		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
 	} else
 		PROC_UNLOCK(p);
 #else  /* !HWPMC_HOOKS */
 	PROC_UNLOCK(p);
 #endif
 
 	/* Set values passed into the program in registers. */
 	if (p->p_sysent->sv_setregs)
 		(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
 		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
 	else
 		exec_setregs(td, imgp->entry_addr,
 		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
 
 	vfs_mark_atime(imgp->vp, td);
 
 done1:
 	/*
 	 * Free any resources malloc'd earlier that we didn't use.
 	 */
 	uifree(euip);
 	if (newcred == NULL)
 		crfree(oldcred);
 	else
 		crfree(newcred);
 	VOP_UNLOCK(imgp->vp, 0, td);
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
 	if (textvp != NULL) {
 		int tvfslocked;
 
 		tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
 		vrele(textvp);
 		VFS_UNLOCK_GIANT(tvfslocked);
 	}
 	if (ndp->ni_vp && error != 0)
 		vrele(ndp->ni_vp);
 #ifdef KTRACE
 	if (tracevp != NULL)
 		vrele(tracevp);
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (oldargs != NULL)
 		pargs_drop(oldargs);
 	if (newargs != NULL)
 		pargs_drop(newargs);
 	if (oldsigacts != NULL)
 		sigacts_free(oldsigacts);
 
 exec_fail_dealloc:
 
 	/*
 	 * free various allocated resources
 	 */
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	if (imgp->vp != NULL) {
 		NDFREE(ndp, NDF_ONLY_PNBUF);
 		vput(imgp->vp);
 	}
 
 	if (imgp->object != NULL)
 		vm_object_deallocate(imgp->object);
 
 	if (error == 0) {
 		/*
 		 * Stop the process here if its stop event mask has
 		 * the S_EXEC bit set.
 		 */
 		STOPEVENT(p, S_EXEC, 0);
 		goto done2;
 	}
 
 exec_fail:
 	/* we're done here, clear P_INEXEC */
 	PROC_LOCK(p);
 	p->p_flag &= ~P_INEXEC;
 	PROC_UNLOCK(p);
 
 done2:
 #ifdef MAC
 	mac_execve_exit(imgp);
 	if (interplabel != NULL)
 		mac_vnode_label_free(interplabel);
 #endif
 	VFS_UNLOCK_GIANT(vfslocked);
 	exec_free_args(args);
 
 	if (error && imgp->vmspace_destroyed) {
 		/* sorry, no more process anymore. exit gracefully */
 		exit1(td, W_EXITCODE(0, SIGABRT));
 		/* NOT REACHED */
 	}
 	return (error);
 }
 
 int
 exec_map_first_page(imgp)
 	struct image_params *imgp;
 {
 	int rv, i;
 	int initial_pagein;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	vm_object_t object;
 
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
 	VM_OBJECT_LOCK(object);
 	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 	if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
 		initial_pagein = VM_INITIAL_PAGEIN;
 		if (initial_pagein > object->size)
 			initial_pagein = object->size;
 		for (i = 1; i < initial_pagein; i++) {
 			if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
 				if (ma[i]->valid)
 					break;
 				if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
 					break;
 				vm_page_busy(ma[i]);
 			} else {
 				ma[i] = vm_page_alloc(object, i,
 				    VM_ALLOC_NORMAL);
 				if (ma[i] == NULL)
 					break;
 			}
 		}
 		initial_pagein = i;
 		rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
 		ma[0] = vm_page_lookup(object, 0);
 		if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
 		    (ma[0]->valid == 0)) {
 			if (ma[0]) {
 				vm_page_lock_queues();
 				vm_page_free(ma[0]);
 				vm_page_unlock_queues();
 			}
 			VM_OBJECT_UNLOCK(object);
 			return (EIO);
 		}
 	}
 	vm_page_lock_queues();
 	vm_page_hold(ma[0]);
 	vm_page_unlock_queues();
 	vm_page_wakeup(ma[0]);
 	VM_OBJECT_UNLOCK(object);
 
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
 
 	return (0);
 }
 
 void
 exec_unmap_first_page(imgp)
 	struct image_params *imgp;
 {
 	vm_page_t m;
 
 	if (imgp->firstpage != NULL) {
 		m = sf_buf_page(imgp->firstpage);
 		sf_buf_free(imgp->firstpage);
 		imgp->firstpage = NULL;
 		vm_page_lock_queues();
 		vm_page_unhold(m);
 		vm_page_unlock_queues();
 	}
 }
 
 /*
  * Destroy old address space, and allocate a new stack
  *	The new stack is only SGROWSIZ large because it is grown
  *	automatically in trap.c.
  */
 int
 exec_new_vmspace(imgp, sv)
 	struct image_params *imgp;
 	struct sysentvec *sv;
 {
 	int error;
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_offset_t stack_addr;
 	vm_map_t map;
 
 	imgp->vmspace_destroyed = 1;
 	imgp->sysent = sv;
 
 	/* Called with Giant held, do not depend on it! */
 	EVENTHANDLER_INVOKE(process_exec, p, imgp);
 
 	/*
 	 * Here is as good a place as any to do any resource limit cleanups.
 	 * This is needed if a 64 bit binary exec's a 32 bit binary - the
 	 * data size limit may need to be changed to a value that makes
 	 * sense for the 32 bit binary.
 	 */
 	if (sv->sv_fixlimits != NULL)
 		sv->sv_fixlimits(p);
 
 	/*
 	 * Blow away entire process VM, if address space not shared,
 	 * otherwise, create a new VM space so that other threads are
 	 * not disrupted
 	 */
 	map = &vmspace->vm_map;
 	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser) {
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 	} else {
 		vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 
 	/* Allocate a new stack */
 	stack_addr = sv->sv_usrstack - maxssiz;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error)
 		return (error);
 
 #ifdef __ia64__
 	/* Allocate a new register stack */
 	stack_addr = IA64_BACKINGSTORE;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
 	if (error)
 		return (error);
 #endif
 
 	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
 	 * VM_STACK case, but they are still used to monitor the size of the
 	 * process stack so we can check the stack rlimit.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;
 
 	return (0);
 }
 
 /*
- * Copy out argument and environment strings from the old process
- *	address space into the temporary string buffer.
+ * Copy out argument and environment strings from the old process address
+ * space into the temporary string buffer.
  */
 int
 exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
 	char *argp, *envp;
 	int error;
 	size_t length;
 
 	error = 0;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 	/*
 	 * Allocate temporary demand zeroed space for argument and
 	 *	environment strings:
 	 *
 	 * o ARG_MAX for argument and environment;
 	 * o MAXSHELLCMDLEN for the name of interpreters.
 	 */
 	args->buf = (char *) kmem_alloc_wait(exec_map,
 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
 	if (args->buf == NULL)
 		return (ENOMEM);
 	args->begin_argv = args->buf;
 	args->endp = args->begin_argv;
 	args->stringspace = ARG_MAX;
 
 	args->fname = args->buf + ARG_MAX;
 
 	/*
 	 * Copy the file name.
 	 */
 	error = (segflg == UIO_SYSSPACE) ?
 	    copystr(fname, args->fname, PATH_MAX, &length) :
 	    copyinstr(fname, args->fname, PATH_MAX, &length);
 	if (error != 0)
 		goto err_exit;
 
 	/*
 	 * extract arguments first
 	 */
 	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
 		if (argp == (caddr_t) -1) {
 			error = EFAULT;
 			goto err_exit;
 		}
 		if ((error = copyinstr(argp, args->endp,
 		    args->stringspace, &length))) {
 			if (error == ENAMETOOLONG) 
 				error = E2BIG;
 			goto err_exit;
 		}
 		args->stringspace -= length;
 		args->endp += length;
 		args->argc++;
 	}
 
 	args->begin_envv = args->endp;
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
 			if (envp == (caddr_t)-1) {
 				error = EFAULT;
 				goto err_exit;
 			}
 			if ((error = copyinstr(envp, args->endp,
 			    args->stringspace, &length))) {
 				if (error == ENAMETOOLONG)
 					error = E2BIG;
 				goto err_exit;
 			}
 			args->stringspace -= length;
 			args->endp += length;
 			args->envc++;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 static void
 exec_free_args(struct image_args *args)
 {
 
 	if (args->buf) {
 		kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
 		    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
 		args->buf = NULL;
 	}
 }
 
 /*
- * Copy strings out to the new process address space, constructing
- *	new arg and env vector tables. Return a pointer to the base
- *	so that it can be used as the initial stack pointer.
+ * Copy strings out to the new process address space, constructing new arg
+ * and env vector tables. Return a pointer to the base so that it can be used
+ * as the initial stack pointer.
  */
 register_t *
 exec_copyout_strings(imgp)
 	struct image_params *imgp;
 {
 	int argc, envc;
 	char **vectp;
 	char *stringp, *destp;
 	register_t *stack_base;
 	struct ps_strings *arginfo;
 	struct proc *p;
 	int szsigcode;
 
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	p = imgp->proc;
 	szsigcode = 0;
 	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 	if (p->p_sysent->sv_szsigcode != NULL)
 		szsigcode = *(p->p_sysent->sv_szsigcode);
 	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
 	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode)
 		copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
 		    szsigcode), szsigcode);
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
 		    (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets,and imgp->auxarg_size is room
 		 * for argument of Runtime loader.
 		 */
 		vectp = (char **)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size) *
 		    sizeof(char *));
 
 	} else {
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
 		    sizeof(char *));
 	}
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = (register_t *)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
 	suword(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 */
 	for (; argc > 0; --argc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword(vectp++, 0);
 
 	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
 	suword(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword(vectp, 0);
 
 	return (stack_base);
 }
 
 /*
  * Check permissions of file to execute.
  *	Called with imgp->vp locked.
  *	Return 0 for success or error code on failure.
  */
 int
 exec_check_permissions(imgp)
 	struct image_params *imgp;
 {
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	struct thread *td;
 	int error;
 
 	td = curthread;			/* XXXKSE */
 
 	/* Get file attributes */
 	error = VOP_GETATTR(vp, attr, td->td_ucred, td);
 	if (error)
 		return (error);
 
 #ifdef MAC
 	error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp);
 	if (error)
 		return (error);
 #endif
 	
 	/*
 	 * 1) Check if file execution is disabled for the filesystem that this
 	 *	file resides on.
 	 * 2) Insure that at least one execute bit is on - otherwise root
 	 *	will always succeed, and we don't want to happen unless the
 	 *	file really is executable.
 	 * 3) Insure that the file is a regular file.
 	 */
 	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 	    ((attr->va_mode & 0111) == 0) ||
 	    (attr->va_type != VREG))
 		return (EACCES);
 
 	/*
 	 * Zero length files can't be exec'd
 	 */
 	if (attr->va_size == 0)
 		return (ENOEXEC);
 
 	/*
 	 *  Check for execute permission to file based on current credentials.
 	 */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error)
 		return (error);
 
 	/*
 	 * Check number of open-for-writes on the file and deny execution
 	 * if there are any.
 	 */
 	if (vp->v_writecount)
 		return (ETXTBSY);
 
 	/*
 	 * Call filesystem specific open routine (which does nothing in the
 	 * general case).
 	 */
 	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1);
 	return (error);
 }
 
 /*
  * Exec handler registration
  */
 int
 exec_register(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **es, **xs, **newexecsw;
 	int count = 2;	/* New slot and trailing NULL */
 
 	if (execsw)
 		for (es = execsw; *es; es++)
 			count++;
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 	if (newexecsw == NULL)
 		return (ENOMEM);
 	xs = newexecsw;
 	if (execsw)
 		for (es = execsw; *es; es++)
 			*xs++ = *es;
 	*xs++ = execsw_arg;
 	*xs = NULL;
 	if (execsw)
 		free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
 
 int
 exec_unregister(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **es, **xs, **newexecsw;
 	int count = 1;
 
 	if (execsw == NULL)
 		panic("unregister with no handlers left?\n");
 
 	for (es = execsw; *es; es++) {
 		if (*es == execsw_arg)
 			break;
 	}
 	if (*es == NULL)
 		return (ENOENT);
 	for (es = execsw; *es; es++)
 		if (*es != execsw_arg)
 			count++;
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 	if (newexecsw == NULL)
 		return (ENOMEM);
 	xs = newexecsw;
 	for (es = execsw; *es; es++)
 		if (*es != execsw_arg)
 			*xs++ = *es;
 	*xs = NULL;
 	if (execsw)
 		free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c	(revision 167231)
+++ head/sys/kern/kern_exit.c	(revision 167232)
@@ -1,899 +1,898 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_exit.c	8.7 (Berkeley) 2/12/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/tty.h>
 #include <sys/wait.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/syslog.h>
 #include <sys/ptrace.h>
 #include <sys/acct.h>		/* for acct_process() function prototype */
 #include <sys/filedesc.h>
 #include <sys/shm.h>
 #include <sys/sem.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 /* Required to be non-static for SysVR4 emulator */
 MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status");
 
 /* Hook for NFS teardown procedure. */
 void (*nlminfo_release_p)(struct proc *p);
 
 /*
- * exit --
- *	Death of process.
+ * exit -- death of process.
  */
 void
 sys_exit(struct thread *td, struct sys_exit_args *uap)
 {
 
 	exit1(td, W_EXITCODE(uap->rval, 0));
 	/* NOTREACHED */
 }
 
 /*
- * Exit: deallocate address space and other resources, change proc state
- * to zombie, and unlink proc from allproc and parent's lists.  Save exit
- * status and rusage for wait().  Check for child processes and orphan them.
+ * Exit: deallocate address space and other resources, change proc state to
+ * zombie, and unlink proc from allproc and parent's lists.  Save exit status
+ * and rusage for wait().  Check for child processes and orphan them.
  */
 void
 exit1(struct thread *td, int rv)
 {
 	struct proc *p, *nq, *q;
 	struct tty *tp;
 	struct vnode *ttyvp;
 	struct vnode *vtmp;
 #ifdef KTRACE
 	struct vnode *tracevp;
 	struct ucred *tracecred;
 #endif
 	struct plimit *plim;
 	int locked;
 
 	/*
 	 * Drop Giant if caller has it.  Eventually we should warn about
 	 * being called with Giant held.
 	 */ 
 	while (mtx_owned(&Giant))
 		mtx_unlock(&Giant);
 
 	p = td->td_proc;
 	if (p == initproc) {
 		printf("init died (signal %d, exit %d)\n",
 		    WTERMSIG(rv), WEXITSTATUS(rv));
 		panic("Going nowhere without my init!");
 	}
 
 	/*
 	 * MUST abort all other threads before proceeding past here.
 	 */
 	PROC_LOCK(p);
 	if (p->p_flag & P_HADTHREADS) {
 retry:
 		/*
 		 * First check if some other thread got here before us..
 		 * if so, act apropriatly, (exit or suspend);
 		 */
 		thread_suspend_check(0);
 
 		/*
 		 * Kill off the other threads. This requires
 		 * some co-operation from other parts of the kernel
 		 * so it may not be instantaneous.  With this state set
 		 * any thread entering the kernel from userspace will
 		 * thread_exit() in trap().  Any thread attempting to
 		 * sleep will return immediately with EINTR or EWOULDBLOCK
 		 * which will hopefully force them to back out to userland
 		 * freeing resources as they go.  Any thread attempting
 		 * to return to userland will thread_exit() from userret().
 		 * thread_exit() will unsuspend us when the last of the
 		 * other threads exits.
 		 * If there is already a thread singler after resumption,
 		 * calling thread_single will fail; in that case, we just
 		 * re-check all suspension request, the thread should
 		 * either be suspended there or exit.
 		 */
 		if (thread_single(SINGLE_EXIT))
 			goto retry;
 
 		/*
 		 * All other activity in this process is now stopped.
 		 * Threading support has been turned off.
 		 */
 	}
 
 	/*
 	 * Wakeup anyone in procfs' PIOCWAIT.  They should have a hold
 	 * on our vmspace, so we should block below until they have
 	 * released their reference to us.  Note that if they have
 	 * requested S_EXIT stops we will block here until they ack
 	 * via PIOCCONT.
 	 */
 	_STOPEVENT(p, S_EXIT, rv);
 
 	/*
 	 * Note that we are exiting and do another wakeup of anyone in
 	 * PIOCWAIT in case they aren't listening for S_EXIT stops or
 	 * decided to wait again after we told them we are exiting.
 	 */
 	p->p_flag |= P_WEXIT;
 	wakeup(&p->p_stype);
 
 	/*
 	 * Wait for any processes that have a hold on our vmspace to
 	 * release their reference.
 	 */
 	while (p->p_lock > 0)
 		msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
 
 	PROC_UNLOCK(p);
 
 #ifdef AUDIT
 	/*
 	 * The Sun BSM exit token contains two components: an exit status as
 	 * passed to exit(), and a return value to indicate what sort of exit
 	 * it was.  The exit status is WEXITSTATUS(rv), but it's not clear
 	 * what the return value is.
 	 */
 	AUDIT_ARG(exit, WEXITSTATUS(rv), 0);
 	AUDIT_SYSCALL_EXIT(0, td);
 #endif
 
 	/* Are we a task leader? */
 	if (p == p->p_leader) {
 		mtx_lock(&ppeers_lock);
 		q = p->p_peers;
 		while (q != NULL) {
 			PROC_LOCK(q);
 			psignal(q, SIGKILL);
 			PROC_UNLOCK(q);
 			q = q->p_peers;
 		}
 		while (p->p_peers != NULL)
 			msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
 		mtx_unlock(&ppeers_lock);
 	}
 
 	/*
 	 * Check if any loadable modules need anything done at process exit.
 	 * E.g. SYSV IPC stuff
 	 * XXX what if one of these generates an error?
 	 */
 	EVENTHANDLER_INVOKE(process_exit, p);
 
 	MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
 		M_ZOMBIE, M_WAITOK);
 	/*
 	 * If parent is waiting for us to exit or exec,
 	 * P_PPWAIT is set; we will wakeup the parent below.
 	 */
 	PROC_LOCK(p);
 	stopprofclock(p);
 	p->p_flag &= ~(P_TRACED | P_PPWAIT);
 
 	/*
 	 * Stop the real interval timer.  If the handler is currently
 	 * executing, prevent it from rearming itself and let it finish.
 	 */
 	if (timevalisset(&p->p_realtimer.it_value) &&
 	    callout_stop(&p->p_itcallout) == 0) {
 		timevalclear(&p->p_realtimer.it_interval);
 		msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0);
 		KASSERT(!timevalisset(&p->p_realtimer.it_value),
 		    ("realtime timer is still armed"));
 	}
 	PROC_UNLOCK(p);
 
 	/*
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pid.
 	 */
 	funsetownlst(&p->p_sigiolst);
 
 	/*
 	 * If this process has an nlminfo data area (for lockd), release it
 	 */
 	if (nlminfo_release_p != NULL && p->p_nlminfo != NULL)
 		(*nlminfo_release_p)(p);
 
 	/*
 	 * Close open files and release open-file table.
 	 * This may block!
 	 */
 	fdfree(td);
 
 	/*
 	 * If this thread tickled GEOM, we need to wait for the giggling to
 	 * stop before we return to userland
 	 */
 	if (td->td_pflags & TDP_GEOM)
 		g_waitidle();
 
 	/*
 	 * Remove ourself from our leader's peer list and wake our leader.
 	 */
 	mtx_lock(&ppeers_lock);
 	if (p->p_leader->p_peers) {
 		q = p->p_leader;
 		while (q->p_peers != p)
 			q = q->p_peers;
 		q->p_peers = p->p_peers;
 		wakeup(p->p_leader);
 	}
 	mtx_unlock(&ppeers_lock);
 
 	vmspace_exit(td);
 
 	mtx_lock(&Giant);	/* XXX TTY */
 	sx_xlock(&proctree_lock);
 	if (SESS_LEADER(p)) {
 		struct session *sp;
 
 		sp = p->p_session;
 		if (sp->s_ttyvp) {
 			/*
 			 * Controlling process.
 			 * Signal foreground pgrp,
 			 * drain controlling terminal
 			 * and revoke access to controlling terminal.
 			 */
 			if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) {
 				tp = sp->s_ttyp;
 				if (sp->s_ttyp->t_pgrp) {
 					PGRP_LOCK(sp->s_ttyp->t_pgrp);
 					pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
 					PGRP_UNLOCK(sp->s_ttyp->t_pgrp);
 				}
 				/* XXX tp should be locked. */
 				sx_xunlock(&proctree_lock);
 				(void) ttywait(tp);
 				sx_xlock(&proctree_lock);
 				/*
 				 * The tty could have been revoked
 				 * if we blocked.
 				 */
 				if (sp->s_ttyvp) {
 					ttyvp = sp->s_ttyvp;
 					SESS_LOCK(p->p_session);
 					sp->s_ttyvp = NULL;
 					SESS_UNLOCK(p->p_session);
 					sx_xunlock(&proctree_lock);
 					VOP_LOCK(ttyvp, LK_EXCLUSIVE, td);
 					VOP_REVOKE(ttyvp, REVOKEALL);
 					vput(ttyvp);
 					sx_xlock(&proctree_lock);
 				}
 			}
 			if (sp->s_ttyvp) {
 				ttyvp = sp->s_ttyvp;
 				SESS_LOCK(p->p_session);
 				sp->s_ttyvp = NULL;
 				SESS_UNLOCK(p->p_session);
 				vrele(ttyvp);
 			}
 			/*
 			 * s_ttyp is not zero'd; we use this to indicate
 			 * that the session once had a controlling terminal.
 			 * (for logging and informational purposes)
 			 */
 		}
 		SESS_LOCK(p->p_session);
 		sp->s_leader = NULL;
 		SESS_UNLOCK(p->p_session);
 	}
 	fixjobc(p, p->p_pgrp, 0);
 	sx_xunlock(&proctree_lock);
 	(void)acct_process(td);
 	mtx_unlock(&Giant);	
 #ifdef KTRACE
 	/*
 	 * Drain any pending records on the thread and release the trace
 	 * file.  It might be better if drain-and-clear were atomic.
 	 */
 	ktrprocexit(td);
 	PROC_LOCK(p);
 	mtx_lock(&ktrace_mtx);
 	p->p_traceflag = 0;	/* don't trace the vrele() */
 	tracevp = p->p_tracevp;
 	p->p_tracevp = NULL;
 	tracecred = p->p_tracecred;
 	p->p_tracecred = NULL;
 	mtx_unlock(&ktrace_mtx);
 	PROC_UNLOCK(p);
 	if (tracevp != NULL) {
 		locked = VFS_LOCK_GIANT(tracevp->v_mount);
 		vrele(tracevp);
 		VFS_UNLOCK_GIANT(locked);
 	}
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
 	/*
 	 * Release reference to text vnode
 	 */
 	if ((vtmp = p->p_textvp) != NULL) {
 		p->p_textvp = NULL;
 		locked = VFS_LOCK_GIANT(vtmp->v_mount);
 		vrele(vtmp);
 		VFS_UNLOCK_GIANT(locked);
 	}
 
 	/*
 	 * Release our limits structure.
 	 */
 	PROC_LOCK(p);
 	plim = p->p_limit;
 	p->p_limit = NULL;
 	PROC_UNLOCK(p);
 	lim_free(plim);
 
 	/*
 	 * Remove proc from allproc queue and pidhash chain.
 	 * Place onto zombproc.  Unlink from parent's child list.
 	 */
 	sx_xlock(&allproc_lock);
 	LIST_REMOVE(p, p_list);
 	LIST_INSERT_HEAD(&zombproc, p, p_list);
 	LIST_REMOVE(p, p_hash);
 	sx_xunlock(&allproc_lock);
 
 	/*
 	 * Reparent all of our children to init.
 	 */
 	sx_xlock(&proctree_lock);
 	q = LIST_FIRST(&p->p_children);
 	if (q != NULL)		/* only need this if any child is S_ZOMB */
 		wakeup(initproc);
 	for (; q != NULL; q = nq) {
 		nq = LIST_NEXT(q, p_sibling);
 		PROC_LOCK(q);
 		proc_reparent(q, initproc);
 		q->p_sigparent = SIGCHLD;
 		/*
 		 * Traced processes are killed
 		 * since their existence means someone is screwing up.
 		 */
 		if (q->p_flag & P_TRACED) {
 			q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
 			psignal(q, SIGKILL);
 		}
 		PROC_UNLOCK(q);
 	}
 
 	/*
 	 * Save exit status and finalize rusage info except for times,
 	 * adding in child rusage info later when our time is locked.
 	 */
 	PROC_LOCK(p);
 	p->p_xstat = rv;
 	p->p_xthread = td;
 	p->p_stats->p_ru.ru_nvcsw++;
 	*p->p_ru = p->p_stats->p_ru;
 
 	/*
 	 * Notify interested parties of our demise.
 	 */
 	KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
 
 	/*
 	 * Just delete all entries in the p_klist. At this point we won't
 	 * report any more events, and there are nasty race conditions that
 	 * can beat us if we don't.
 	 */
 	knlist_clear(&p->p_klist, 1);
 
 	/*
 	 * Notify parent that we're gone.  If parent has the PS_NOCLDWAIT
 	 * flag set, or if the handler is set to SIG_IGN, notify process
 	 * 1 instead (and hope it will handle this situation).
 	 */
 	PROC_LOCK(p->p_pptr);
 	mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
 	if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
 		struct proc *pp;
 
 		mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
 		pp = p->p_pptr;
 		PROC_UNLOCK(pp);
 		proc_reparent(p, initproc);
 		p->p_sigparent = SIGCHLD;
 		PROC_LOCK(p->p_pptr);
 		/*
 		 * If this was the last child of our parent, notify
 		 * parent, so in case he was wait(2)ing, he will
 		 * continue.
 		 */
 		if (LIST_EMPTY(&pp->p_children))
 			wakeup(pp);
 	} else
 		mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
 
 	if (p->p_pptr == initproc)
 		psignal(p->p_pptr, SIGCHLD);
 	else if (p->p_sigparent != 0) {
 		if (p->p_sigparent == SIGCHLD)
 			childproc_exited(p);
 		else	/* LINUX thread */
 			psignal(p->p_pptr, p->p_sigparent);
 	}
 	PROC_UNLOCK(p->p_pptr);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Finally, call machine-dependent code to release the remaining
 	 * resources including address space.
 	 * The address space is released by "vmspace_exitfree(p)" in
 	 * vm_waitproc().
 	 */
 	cpu_exit(td);
 
 	WITNESS_WARN(WARN_PANIC, &proctree_lock.sx_object,
 	    "process (pid %d) exiting", p->p_pid);
 
 	PROC_LOCK(p);
 	PROC_LOCK(p->p_pptr);
 	sx_xunlock(&proctree_lock);
 
 	/*
 	 * The state PRS_ZOMBIE prevents other proesses from sending
 	 * signal to the process, to avoid memory leak, we free memory
 	 * for signal queue at the time when the state is set.
 	 */
 	sigqueue_flush(&p->p_sigqueue);
 	sigqueue_flush(&td->td_sigqueue);
 
 	/*
 	 * We have to wait until after acquiring all locks before
 	 * changing p_state.  We need to avoid all possible context
 	 * switches (including ones from blocking on a mutex) while
 	 * marked as a zombie.  We also have to set the zombie state
 	 * before we release the parent process' proc lock to avoid
 	 * a lost wakeup.  So, we first call wakeup, then we grab the
 	 * sched lock, update the state, and release the parent process'
 	 * proc lock.
 	 */
 	wakeup(p->p_pptr);
 	mtx_lock_spin(&sched_lock);
 	p->p_state = PRS_ZOMBIE;
 	PROC_UNLOCK(p->p_pptr);
 
 	sched_exit(p->p_pptr, td);
 
 	/*
 	 * Hopefully no one will try to deliver a signal to the process this
 	 * late in the game.
 	 */
 	knlist_destroy(&p->p_klist);
 
 	/*
 	 * Make sure the scheduler takes this thread out of its tables etc.
 	 * This will also release this thread's reference to the ucred.
 	 * Other thread parts to release include pcb bits and such.
 	 */
 	thread_exit();
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct abort2_args {
 	char *why;
 	int nargs;
 	void **args;
 };
 #endif
 
 int
 abort2(struct thread *td, struct abort2_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct sbuf *sb;
 	void *uargs[16];
 	int error, i, sig;
 
 	error = 0;	/* satisfy compiler */
 
 	/*
 	 * Do it right now so we can log either proper call of abort2(), or
 	 * note, that invalid argument was passed. 512 is big enough to
 	 * handle 16 arguments' descriptions with additional comments.
 	 */
 	sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
 	sbuf_clear(sb);
 	sbuf_printf(sb, "%s(pid %d uid %d) aborted: ",
 	    p->p_comm, p->p_pid, td->td_ucred->cr_uid);
 	/* 
 	 * Since we can't return from abort2(), send SIGKILL in cases, where
 	 * abort2() was called improperly
 	 */
 	sig = SIGKILL;
 	/* Prevent from DoSes from user-space. */
 	if (uap->nargs < 0 || uap->nargs > 16)
 		goto out;
 	if (uap->args == NULL)
 		goto out;
 	error = copyin(uap->args, uargs, uap->nargs * sizeof(void *));
 	if (error != 0)
 		goto out;
 	/*
 	 * Limit size of 'reason' string to 128. Will fit even when
 	 * maximal number of arguments was chosen to be logged.
 	 */
 	if (uap->why != NULL) {
 		error = sbuf_copyin(sb, uap->why, 128);
 		if (error < 0)
 			goto out;
 	} else {
 		sbuf_printf(sb, "(null)");
 	}
 	if (uap->nargs) {
 		sbuf_printf(sb, "(");
 		for (i = 0;i < uap->nargs; i++)
 			sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
 		sbuf_printf(sb, ")");
 	}
 	/*
 	 * Final stage: arguments were proper, string has been
 	 * successfully copied from userspace, and copying pointers
 	 * from user-space succeed.
 	 */
 	sig = SIGABRT;
 out:
 	if (sig == SIGKILL) {
 		sbuf_trim(sb);
 		sbuf_printf(sb, " (Reason text inaccessible)");
 	}
 	sbuf_cat(sb, "\n");
 	sbuf_finish(sb);
 	log(LOG_INFO, "%s", sbuf_data(sb));
 	sbuf_delete(sb);
 	exit1(td, W_EXITCODE(0, sig));
 	return (0);
 }
 
 
 #ifdef COMPAT_43
 /*
  * The dirty work is handled by kern_wait().
  */
 int
 owait(struct thread *td, struct owait_args *uap __unused)
 {
 	int error, status;
 
 	error = kern_wait(td, WAIT_ANY, &status, 0, NULL);
 	if (error == 0)
 		td->td_retval[1] = status;
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * The dirty work is handled by kern_wait().
  */
 int
 wait4(struct thread *td, struct wait_args *uap)
 {
 	struct rusage ru, *rup;
 	int error, status;
 
 	if (uap->rusage != NULL)
 		rup = &ru;
 	else
 		rup = NULL;
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
 	if (uap->status != NULL && error == 0)
 		error = copyout(&status, uap->status, sizeof(status));
 	if (uap->rusage != NULL && error == 0)
 		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 	return (error);
 }
 
 int
 kern_wait(struct thread *td, pid_t pid, int *status, int options,
     struct rusage *rusage)
 {
 	struct proc *p, *q, *t;
 	int error, nfound;
 
 	AUDIT_ARG(pid, pid);
 
 	q = td->td_proc;
 	if (pid == 0) {
 		PROC_LOCK(q);
 		pid = -q->p_pgid;
 		PROC_UNLOCK(q);
 	}
 	if (options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE))
 		return (EINVAL);
 loop:
 	if (q->p_flag & P_STATCHILD) {
 		PROC_LOCK(q);
 		q->p_flag &= ~P_STATCHILD;
 		PROC_UNLOCK(q);
 	}
 	nfound = 0;
 	sx_xlock(&proctree_lock);
 	LIST_FOREACH(p, &q->p_children, p_sibling) {
 		PROC_LOCK(p);
 		if (pid != WAIT_ANY &&
 		    p->p_pid != pid && p->p_pgid != -pid) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		if (p_canwait(td, p)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 
 		/*
 		 * This special case handles a kthread spawned by linux_clone
 		 * (see linux_misc.c).  The linux_wait4 and linux_waitpid
 		 * functions need to be able to distinguish between waiting
 		 * on a process and waiting on a thread.  It is a thread if
 		 * p_sigparent is not SIGCHLD, and the WLINUXCLONE option
 		 * signifies we want to wait for threads and not processes.
 		 */
 		if ((p->p_sigparent != SIGCHLD) ^
 		    ((options & WLINUXCLONE) != 0)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 
 		nfound++;
 		if (p->p_state == PRS_ZOMBIE) {
 
 			/*
 			 * It is possible that the last thread of this
 			 * process is still running on another CPU
 			 * in thread_exit() after having dropped the process
 			 * lock via PROC_UNLOCK() but before it has completed
 			 * cpu_throw().  In that case, the other thread must
 			 * still hold sched_lock, so simply by acquiring
 			 * sched_lock once we will wait long enough for the
 			 * thread to exit in that case.
 			 */
 			mtx_lock_spin(&sched_lock);
 			mtx_unlock_spin(&sched_lock);
 			
 			td->td_retval[0] = p->p_pid;
 			if (status)
 				*status = p->p_xstat;	/* convert to int */
 			if (rusage) {
 				*rusage = *p->p_ru;
 				calcru(p, &rusage->ru_utime, &rusage->ru_stime);
 			}
 
 			PROC_LOCK(q);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(q);
 
 			/*
 			 * If we got the child via a ptrace 'attach',
 			 * we need to give it back to the old parent.
 			 */
 			PROC_UNLOCK(p);
 			if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
 				PROC_LOCK(p);
 				p->p_oppid = 0;
 				proc_reparent(p, t);
 				PROC_UNLOCK(p);
 				tdsignal(t, NULL, SIGCHLD, p->p_ksi);
 				wakeup(t);
 				PROC_UNLOCK(t);
 				sx_xunlock(&proctree_lock);
 				return (0);
 			}
 
 			/*
 			 * Remove other references to this process to ensure
 			 * we have an exclusive reference.
 			 */
 			sx_xlock(&allproc_lock);
 			LIST_REMOVE(p, p_list);	/* off zombproc */
 			sx_xunlock(&allproc_lock);
 			LIST_REMOVE(p, p_sibling);
 			leavepgrp(p);
 			sx_xunlock(&proctree_lock);
 
 			/*
 			 * As a side effect of this lock, we know that
 			 * all other writes to this proc are visible now, so
 			 * no more locking is needed for p.
 			 */
 			PROC_LOCK(p);
 			p->p_xstat = 0;		/* XXX: why? */
 			PROC_UNLOCK(p);
 			PROC_LOCK(q);
 			ruadd(&q->p_stats->p_cru, &q->p_crux, p->p_ru,
 			    &p->p_rux);
 			PROC_UNLOCK(q);
 			FREE(p->p_ru, M_ZOMBIE);
 			p->p_ru = NULL;
 
 			/*
 			 * Decrement the count of procs running with this uid.
 			 */
 			(void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
 
 			/*
 			 * Free credentials, arguments, and sigacts.
 			 */
 			crfree(p->p_ucred);
 			p->p_ucred = NULL;
 			pargs_drop(p->p_args);
 			p->p_args = NULL;
 			sigacts_free(p->p_sigacts);
 			p->p_sigacts = NULL;
 
 			/*
 			 * Do any thread-system specific cleanups.
 			 */
 			thread_wait(p);
 
 			/*
 			 * Give vm and machine-dependent layer a chance
 			 * to free anything that cpu_exit couldn't
 			 * release while still running in process context.
 			 */
 			vm_waitproc(p);
 #ifdef MAC
 			mac_destroy_proc(p);
 #endif
 #ifdef AUDIT
 			audit_proc_free(p);
 #endif
 			KASSERT(FIRST_THREAD_IN_PROC(p),
 			    ("kern_wait: no residual thread!"));
 			uma_zfree(proc_zone, p);
 			sx_xlock(&allproc_lock);
 			nprocs--;
 			sx_xunlock(&allproc_lock);
 			return (0);
 		}
 		mtx_lock_spin(&sched_lock);
 		if ((p->p_flag & P_STOPPED_SIG) &&
 		    (p->p_suspcount == p->p_numthreads) &&
 		    (p->p_flag & P_WAITED) == 0 &&
 		    (p->p_flag & P_TRACED || options & WUNTRACED)) {
 			mtx_unlock_spin(&sched_lock);
 			p->p_flag |= P_WAITED;
 			sx_xunlock(&proctree_lock);
 			td->td_retval[0] = p->p_pid;
 			if (status)
 				*status = W_STOPCODE(p->p_xstat);
 
 			PROC_LOCK(q);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(q);
 			PROC_UNLOCK(p);
 
 			return (0);
 		}
 		mtx_unlock_spin(&sched_lock);
 		if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) {
 			sx_xunlock(&proctree_lock);
 			td->td_retval[0] = p->p_pid;
 			p->p_flag &= ~P_CONTINUED;
 
 			PROC_LOCK(q);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(q);
 			PROC_UNLOCK(p);
 
 			if (status)
 				*status = SIGCONT;
 			return (0);
 		}
 		PROC_UNLOCK(p);
 	}
 	if (nfound == 0) {
 		sx_xunlock(&proctree_lock);
 		return (ECHILD);
 	}
 	if (options & WNOHANG) {
 		sx_xunlock(&proctree_lock);
 		td->td_retval[0] = 0;
 		return (0);
 	}
 	PROC_LOCK(q);
 	sx_xunlock(&proctree_lock);
 	if (q->p_flag & P_STATCHILD) {
 		q->p_flag &= ~P_STATCHILD;
 		error = 0;
 	} else
 		error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
 	PROC_UNLOCK(q);
 	if (error)
 		return (error);	
 	goto loop;
 }
 
 /*
  * Make process 'parent' the new parent of process 'child'.
  * Must be called with an exclusive hold of proctree lock.
  */
 void
 proc_reparent(struct proc *child, struct proc *parent)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(child, MA_OWNED);
 	if (child->p_pptr == parent)
 		return;
 
 	PROC_LOCK(child->p_pptr);
 	sigqueue_take(child->p_ksi);
 	PROC_UNLOCK(child->p_pptr);
 	LIST_REMOVE(child, p_sibling);
 	LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
 	child->p_pptr = parent;
 }
Index: head/sys/kern/kern_ktrace.c
===================================================================
--- head/sys/kern/kern_ktrace.c	(revision 167231)
+++ head/sys/kern/kern_ktrace.c	(revision 167232)
@@ -1,1018 +1,1012 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_ktrace.c	8.2 (Berkeley) 9/23/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/ktrace.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * The ktrace facility allows the tracing of certain key events in user space
  * processes, such as system calls, signal delivery, context switches, and
  * user generated events using utrace(2).  It works by streaming event
  * records and data to a vnode associated with the process using the
  * ktrace(2) system call.  In general, records can be written directly from
  * the context that generates the event.  One important exception to this is
  * during a context switch, where sleeping is not permitted.  To handle this
  * case, trace events are generated using in-kernel ktr_request records, and
  * then delivered to disk at a convenient moment -- either immediately, the
  * next traceable event, at system call return, or at process exit.
  *
  * When dealing with multiple threads or processes writing to the same event
  * log, ordering guarantees are weak: specifically, if an event has multiple
  * records (i.e., system call enter and return), they may be interlaced with
  * records from another event.  Process and thread ID information is provided
  * in the record, and user applications can de-interlace events if required.
  */
 
 static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
 
 #ifdef KTRACE
 
 #ifndef KTRACE_REQUEST_POOL
 #define	KTRACE_REQUEST_POOL	100
 #endif
 
 struct ktr_request {
 	struct	ktr_header ktr_header;
 	void	*ktr_buffer;
 	union {
 		struct	ktr_syscall ktr_syscall;
 		struct	ktr_sysret ktr_sysret;
 		struct	ktr_genio ktr_genio;
 		struct	ktr_psig ktr_psig;
 		struct	ktr_csw ktr_csw;
 	} ktr_data;
 	STAILQ_ENTRY(ktr_request) ktr_list;
 };
 
 static int data_lengths[] = {
 	0,					/* none */
 	offsetof(struct ktr_syscall, ktr_args),	/* KTR_SYSCALL */
 	sizeof(struct ktr_sysret),		/* KTR_SYSRET */
 	0,					/* KTR_NAMEI */
 	sizeof(struct ktr_genio),		/* KTR_GENIO */
 	sizeof(struct ktr_psig),		/* KTR_PSIG */
 	sizeof(struct ktr_csw),			/* KTR_CSW */
 	0					/* KTR_USER */
 };
 
 static STAILQ_HEAD(, ktr_request) ktr_free;
 
 static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
 
 static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
 TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
 
 static u_int ktr_geniosize = PAGE_SIZE;
 TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
 SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
     0, "Maximum size of genio event payload");
 
 static int print_message = 1;
 struct mtx ktrace_mtx;
 static struct sx ktrace_sx;
 
 static void ktrace_init(void *dummy);
 static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
 static u_int ktrace_resize_pool(u_int newsize);
 static struct ktr_request *ktr_getrequest(int type);
 static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
 static void ktr_freerequest(struct ktr_request *req);
 static void ktr_writerequest(struct thread *td, struct ktr_request *req);
 static int ktrcanset(struct thread *,struct proc *);
 static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
 static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
 
 /*
  * ktrace itself generates events, such as context switches, which we do not
  * wish to trace.  Maintain a flag, TDP_INKTRACE, on each thread to determine
  * whether or not it is in a region where tracing of events should be
  * suppressed.
  */
 static void
 ktrace_enter(struct thread *td)
 {
 
 	KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
 	td->td_pflags |= TDP_INKTRACE;
 }
 
 static void
 ktrace_exit(struct thread *td)
 {
 
 	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
 	td->td_pflags &= ~TDP_INKTRACE;
 }
 
 static void
 ktrace_assert(struct thread *td)
 {
 
 	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
 }
 
 static void
 ktrace_init(void *dummy)
 {
 	struct ktr_request *req;
 	int i;
 
 	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
 	sx_init(&ktrace_sx, "ktrace_sx");
 	STAILQ_INIT(&ktr_free);
 	for (i = 0; i < ktr_requestpool; i++) {
 		req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
 		STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 	}
 }
 SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
 
 static int
 sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
 {
 	struct thread *td;
 	u_int newsize, oldsize, wantsize;
 	int error;
 
 	/* Handle easy read-only case first to avoid warnings from GCC. */
 	if (!req->newptr) {
 		mtx_lock(&ktrace_mtx);
 		oldsize = ktr_requestpool;
 		mtx_unlock(&ktrace_mtx);
 		return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
 	}
 
 	error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
 	if (error)
 		return (error);
 	td = curthread;
 	ktrace_enter(td);
 	mtx_lock(&ktrace_mtx);
 	oldsize = ktr_requestpool;
 	newsize = ktrace_resize_pool(wantsize);
 	mtx_unlock(&ktrace_mtx);
 	ktrace_exit(td);
 	error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
 	if (error)
 		return (error);
 	if (wantsize > oldsize && newsize < wantsize)
 		return (ENOSPC);
 	return (0);
 }
 SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
     &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU", "");
 
 static u_int
 ktrace_resize_pool(u_int newsize)
 {
 	struct ktr_request *req;
 	int bound;
 
 	mtx_assert(&ktrace_mtx, MA_OWNED);
 	print_message = 1;
 	bound = newsize - ktr_requestpool;
 	if (bound == 0)
 		return (ktr_requestpool);
 	if (bound < 0)
 		/* Shrink pool down to newsize if possible. */
 		while (bound++ < 0) {
 			req = STAILQ_FIRST(&ktr_free);
 			if (req == NULL)
 				return (ktr_requestpool);
 			STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
 			ktr_requestpool--;
 			mtx_unlock(&ktrace_mtx);
 			free(req, M_KTRACE);
 			mtx_lock(&ktrace_mtx);
 		}
 	else
 		/* Grow pool up to newsize. */
 		while (bound-- > 0) {
 			mtx_unlock(&ktrace_mtx);
 			req = malloc(sizeof(struct ktr_request), M_KTRACE,
 			    M_WAITOK);
 			mtx_lock(&ktrace_mtx);
 			STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 			ktr_requestpool++;
 		}
 	return (ktr_requestpool);
 }
 
 static struct ktr_request *
 ktr_getrequest(int type)
 {
 	struct ktr_request *req;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	int pm;
 
 	ktrace_enter(td);	/* XXX: In caller instead? */
 	mtx_lock(&ktrace_mtx);
 	if (!KTRCHECK(td, type)) {
 		mtx_unlock(&ktrace_mtx);
 		ktrace_exit(td);
 		return (NULL);
 	}
 	req = STAILQ_FIRST(&ktr_free);
 	if (req != NULL) {
 		STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
 		req->ktr_header.ktr_type = type;
 		if (p->p_traceflag & KTRFAC_DROP) {
 			req->ktr_header.ktr_type |= KTR_DROP;
 			p->p_traceflag &= ~KTRFAC_DROP;
 		}
 		mtx_unlock(&ktrace_mtx);
 		microtime(&req->ktr_header.ktr_time);
 		req->ktr_header.ktr_pid = p->p_pid;
 		req->ktr_header.ktr_tid = td->td_tid;
 		bcopy(p->p_comm, req->ktr_header.ktr_comm, MAXCOMLEN + 1);
 		req->ktr_buffer = NULL;
 		req->ktr_header.ktr_len = 0;
 	} else {
 		p->p_traceflag |= KTRFAC_DROP;
 		pm = print_message;
 		print_message = 0;
 		mtx_unlock(&ktrace_mtx);
 		if (pm)
 			printf("Out of ktrace request objects.\n");
 		ktrace_exit(td);
 	}
 	return (req);
 }
 
 /*
  * Some trace generation environments don't permit direct access to VFS,
  * such as during a context switch where sleeping is not allowed.  Under these
  * circumstances, queue a request to the thread to be written asynchronously
  * later.
  */
 static void
 ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
 {
 
 	mtx_lock(&ktrace_mtx);
 	STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
 	mtx_unlock(&ktrace_mtx);
 	ktrace_exit(td);
 }
 
 /*
  * Drain any pending ktrace records from the per-thread queue to disk.  This
  * is used both internally before committing other records, and also on
  * system call return.  We drain all the ones we can find at the time when
  * drain is requested, but don't keep draining after that as those events
  * may me approximately "after" the current event.
  */
 static void
 ktr_drain(struct thread *td)
 {
 	struct ktr_request *queued_req;
 	STAILQ_HEAD(, ktr_request) local_queue;
 
 	ktrace_assert(td);
 	sx_assert(&ktrace_sx, SX_XLOCKED);
 
 	STAILQ_INIT(&local_queue);	/* XXXRW: needed? */
 
 	if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
 		mtx_lock(&ktrace_mtx);
 		STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
 		mtx_unlock(&ktrace_mtx);
 
 		while ((queued_req = STAILQ_FIRST(&local_queue))) {
 			STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
 			ktr_writerequest(td, queued_req);
 			ktr_freerequest(queued_req);
 		}
 	}
 }
 
 /*
  * Submit a trace record for immediate commit to disk -- to be used only
  * where entering VFS is OK.  First drain any pending records that may have
  * been cached in the thread.
  */
 static void
 ktr_submitrequest(struct thread *td, struct ktr_request *req)
 {
 
 	ktrace_assert(td);
 
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	ktr_writerequest(td, req);
 	ktr_freerequest(req);
 	sx_xunlock(&ktrace_sx);
 
 	ktrace_exit(td);
 }
 
 static void
 ktr_freerequest(struct ktr_request *req)
 {
 
 	if (req->ktr_buffer != NULL)
 		free(req->ktr_buffer, M_KTRACE);
 	mtx_lock(&ktrace_mtx);
 	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 	mtx_unlock(&ktrace_mtx);
 }
 
 void
 ktrsyscall(code, narg, args)
 	int code, narg;
 	register_t args[];
 {
 	struct ktr_request *req;
 	struct ktr_syscall *ktp;
 	size_t buflen;
 	char *buf = NULL;
 
 	buflen = sizeof(register_t) * narg;
 	if (buflen > 0) {
 		buf = malloc(buflen, M_KTRACE, M_WAITOK);
 		bcopy(args, buf, buflen);
 	}
 	req = ktr_getrequest(KTR_SYSCALL);
 	if (req == NULL) {
 		if (buf != NULL)
 			free(buf, M_KTRACE);
 		return;
 	}
 	ktp = &req->ktr_data.ktr_syscall;
 	ktp->ktr_code = code;
 	ktp->ktr_narg = narg;
 	if (buflen > 0) {
 		req->ktr_header.ktr_len = buflen;
 		req->ktr_buffer = buf;
 	}
 	ktr_submitrequest(curthread, req);
 }
 
 void
 ktrsysret(code, error, retval)
 	int code, error;
 	register_t retval;
 {
 	struct ktr_request *req;
 	struct ktr_sysret *ktp;
 
 	req = ktr_getrequest(KTR_SYSRET);
 	if (req == NULL)
 		return;
 	ktp = &req->ktr_data.ktr_sysret;
 	ktp->ktr_code = code;
 	ktp->ktr_error = error;
 	ktp->ktr_retval = retval;		/* what about val2 ? */
 	ktr_submitrequest(curthread, req);
 }
 
 /*
  * When a process exits, drain per-process asynchronous trace records.
  */
 void
 ktrprocexit(struct thread *td)
 {
 
 	ktrace_enter(td);
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	sx_xunlock(&ktrace_sx);
 	ktrace_exit(td);
 }
 
 /*
  * When a thread returns, drain any asynchronous records generated by the
  * system call.
  */
 void
 ktruserret(struct thread *td)
 {
 
 	ktrace_enter(td);
 	sx_xlock(&ktrace_sx);
 	ktr_drain(td);
 	sx_xunlock(&ktrace_sx);
 	ktrace_exit(td);
 }
 
 void
 ktrnamei(path)
 	char *path;
 {
 	struct ktr_request *req;
 	int namelen;
 	char *buf = NULL;
 
 	namelen = strlen(path);
 	if (namelen > 0) {
 		buf = malloc(namelen, M_KTRACE, M_WAITOK);
 		bcopy(path, buf, namelen);
 	}
 	req = ktr_getrequest(KTR_NAMEI);
 	if (req == NULL) {
 		if (buf != NULL)
 			free(buf, M_KTRACE);
 		return;
 	}
 	if (namelen > 0) {
 		req->ktr_header.ktr_len = namelen;
 		req->ktr_buffer = buf;
 	}
 	ktr_submitrequest(curthread, req);
 }
 
 void
 ktrgenio(fd, rw, uio, error)
 	int fd;
 	enum uio_rw rw;
 	struct uio *uio;
 	int error;
 {
 	struct ktr_request *req;
 	struct ktr_genio *ktg;
 	int datalen;
 	char *buf;
 
 	if (error) {
 		free(uio, M_IOV);
 		return;
 	}
 	uio->uio_offset = 0;
 	uio->uio_rw = UIO_WRITE;
 	datalen = imin(uio->uio_resid, ktr_geniosize);
 	buf = malloc(datalen, M_KTRACE, M_WAITOK);
 	error = uiomove(buf, datalen, uio);
 	free(uio, M_IOV);
 	if (error) {
 		free(buf, M_KTRACE);
 		return;
 	}
 	req = ktr_getrequest(KTR_GENIO);
 	if (req == NULL) {
 		free(buf, M_KTRACE);
 		return;
 	}
 	ktg = &req->ktr_data.ktr_genio;
 	ktg->ktr_fd = fd;
 	ktg->ktr_rw = rw;
 	req->ktr_header.ktr_len = datalen;
 	req->ktr_buffer = buf;
 	ktr_submitrequest(curthread, req);
 }
 
 void
 ktrpsig(sig, action, mask, code)
 	int sig;
 	sig_t action;
 	sigset_t *mask;
 	int code;
 {
 	struct ktr_request *req;
 	struct ktr_psig	*kp;
 
 	req = ktr_getrequest(KTR_PSIG);
 	if (req == NULL)
 		return;
 	kp = &req->ktr_data.ktr_psig;
 	kp->signo = (char)sig;
 	kp->action = action;
 	kp->mask = *mask;
 	kp->code = code;
 	ktr_enqueuerequest(curthread, req);
 }
 
 void
 ktrcsw(out, user)
 	int out, user;
 {
 	struct ktr_request *req;
 	struct ktr_csw *kc;
 
 	req = ktr_getrequest(KTR_CSW);
 	if (req == NULL)
 		return;
 	kc = &req->ktr_data.ktr_csw;
 	kc->out = out;
 	kc->user = user;
 	ktr_enqueuerequest(curthread, req);
 }
 #endif /* KTRACE */
 
 /* Interface and common routines */
 
-/*
- * ktrace system call
- */
 #ifndef _SYS_SYSPROTO_H_
 struct ktrace_args {
 	char	*fname;
 	int	ops;
 	int	facs;
 	int	pid;
 };
 #endif
 /* ARGSUSED */
 int
 ktrace(td, uap)
 	struct thread *td;
 	register struct ktrace_args *uap;
 {
 #ifdef KTRACE
 	register struct vnode *vp = NULL;
 	register struct proc *p;
 	struct pgrp *pg;
 	int facs = uap->facs & ~KTRFAC_ROOT;
 	int ops = KTROP(uap->ops);
 	int descend = uap->ops & KTRFLAG_DESCEND;
 	int nfound, ret = 0;
 	int flags, error = 0, vfslocked;
 	struct nameidata nd;
 	struct ucred *cred;
 
 	/*
 	 * Need something to (un)trace.
 	 */
 	if (ops != KTROP_CLEARFILE && facs == 0)
 		return (EINVAL);
 
 	ktrace_enter(td);
 	if (ops != KTROP_CLEAR) {
 		/*
 		 * an operation which requires a file argument.
 		 */
 		NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
 		    uap->fname, td);
 		flags = FREAD | FWRITE | O_NOFOLLOW;
 		error = vn_open(&nd, &flags, 0, -1);
 		if (error) {
 			ktrace_exit(td);
 			return (error);
 		}
 		vfslocked = NDHASGIANT(&nd);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vp = nd.ni_vp;
 		VOP_UNLOCK(vp, 0, td);
 		if (vp->v_type != VREG) {
 			(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
 			VFS_UNLOCK_GIANT(vfslocked);
 			ktrace_exit(td);
 			return (EACCES);
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	/*
 	 * Clear all uses of the tracefile.
 	 */
 	if (ops == KTROP_CLEARFILE) {
 		int vrele_count;
 
 		vrele_count = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_tracevp == vp) {
 				if (ktrcanset(td, p)) {
 					mtx_lock(&ktrace_mtx);
 					cred = p->p_tracecred;
 					p->p_tracecred = NULL;
 					p->p_tracevp = NULL;
 					p->p_traceflag = 0;
 					mtx_unlock(&ktrace_mtx);
 					vrele_count++;
 					crfree(cred);
 				} else
 					error = EPERM;
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		if (vrele_count > 0) {
 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			while (vrele_count-- > 0)
 				vrele(vp);
 			VFS_UNLOCK_GIANT(vfslocked);
 		}
 		goto done;
 	}
 	/*
 	 * do it
 	 */
 	sx_slock(&proctree_lock);
 	if (uap->pid < 0) {
 		/*
 		 * by process group
 		 */
 		pg = pgfind(-uap->pid);
 		if (pg == NULL) {
 			sx_sunlock(&proctree_lock);
 			error = ESRCH;
 			goto done;
 		}
 		/*
 		 * ktrops() may call vrele(). Lock pg_members
 		 * by the proctree_lock rather than pg_mtx.
 		 */
 		PGRP_UNLOCK(pg);
 		nfound = 0;
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p_cansee(td, p) != 0) {
 				PROC_UNLOCK(p); 
 				continue;
 			}
 			PROC_UNLOCK(p); 
 			nfound++;
 			if (descend)
 				ret |= ktrsetchildren(td, p, ops, facs, vp);
 			else
 				ret |= ktrops(td, p, ops, facs, vp);
 		}
 		if (nfound == 0) {
 			sx_sunlock(&proctree_lock);
 			error = ESRCH;
 			goto done;
 		}
 	} else {
 		/*
 		 * by pid
 		 */
 		p = pfind(uap->pid);
 		if (p == NULL) {
 			sx_sunlock(&proctree_lock);
 			error = ESRCH;
 			goto done;
 		}
 		error = p_cansee(td, p);
 		/*
 		 * The slock of the proctree lock will keep this process
 		 * from going away, so unlocking the proc here is ok.
 		 */
 		PROC_UNLOCK(p);
 		if (error) {
 			sx_sunlock(&proctree_lock);
 			goto done;
 		}
 		if (descend)
 			ret |= ktrsetchildren(td, p, ops, facs, vp);
 		else
 			ret |= ktrops(td, p, ops, facs, vp);
 	}
 	sx_sunlock(&proctree_lock);
 	if (!ret)
 		error = EPERM;
 done:
 	if (vp != NULL) {
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		(void) vn_close(vp, FWRITE, td->td_ucred, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	ktrace_exit(td);
 	return (error);
 #else /* !KTRACE */
 	return (ENOSYS);
 #endif /* KTRACE */
 }
 
-/*
- * utrace system call
- */
 /* ARGSUSED */
 int
 utrace(td, uap)
 	struct thread *td;
 	register struct utrace_args *uap;
 {
 
 #ifdef KTRACE
 	struct ktr_request *req;
 	void *cp;
 	int error;
 
 	if (!KTRPOINT(td, KTR_USER))
 		return (0);
 	if (uap->len > KTR_USER_MAXLEN)
 		return (EINVAL);
 	cp = malloc(uap->len, M_KTRACE, M_WAITOK);
 	error = copyin(uap->addr, cp, uap->len);
 	if (error) {
 		free(cp, M_KTRACE);
 		return (error);
 	}
 	req = ktr_getrequest(KTR_USER);
 	if (req == NULL) {
 		free(cp, M_KTRACE);
 		return (ENOMEM);
 	}
 	req->ktr_buffer = cp;
 	req->ktr_header.ktr_len = uap->len;
 	ktr_submitrequest(td, req);
 	return (0);
 #else /* !KTRACE */
 	return (ENOSYS);
 #endif /* KTRACE */
 }
 
 #ifdef KTRACE
 static int
 ktrops(td, p, ops, facs, vp)
 	struct thread *td;
 	struct proc *p;
 	int ops, facs;
 	struct vnode *vp;
 {
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 
 	PROC_LOCK(p);
 	if (!ktrcanset(td, p)) {
 		PROC_UNLOCK(p);
 		return (0);
 	}
 	mtx_lock(&ktrace_mtx);
 	if (ops == KTROP_SET) {
 		if (p->p_tracevp != vp) {
 			/*
 			 * if trace file already in use, relinquish below
 			 */
 			tracevp = p->p_tracevp;
 			VREF(vp);
 			p->p_tracevp = vp;
 		}
 		if (p->p_tracecred != td->td_ucred) {
 			tracecred = p->p_tracecred;
 			p->p_tracecred = crhold(td->td_ucred);
 		}
 		p->p_traceflag |= facs;
 		if (priv_check_cred(td->td_ucred, PRIV_KTRACE,
 		    SUSER_ALLOWJAIL) == 0)
 			p->p_traceflag |= KTRFAC_ROOT;
 	} else {
 		/* KTROP_CLEAR */
 		if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
 			/* no more tracing */
 			p->p_traceflag = 0;
 			tracevp = p->p_tracevp;
 			p->p_tracevp = NULL;
 			tracecred = p->p_tracecred;
 			p->p_tracecred = NULL;
 		}
 	}
 	mtx_unlock(&ktrace_mtx);
 	PROC_UNLOCK(p);
 	if (tracevp != NULL) {
 		int vfslocked;
 
 		vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
 		vrele(tracevp);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (tracecred != NULL)
 		crfree(tracecred);
 
 	return (1);
 }
 
 static int
 ktrsetchildren(td, top, ops, facs, vp)
 	struct thread *td;
 	struct proc *top;
 	int ops, facs;
 	struct vnode *vp;
 {
 	register struct proc *p;
 	register int ret = 0;
 
 	p = top;
 	sx_assert(&proctree_lock, SX_LOCKED);
 	for (;;) {
 		ret |= ktrops(td, p, ops, facs, vp);
 		/*
 		 * If this process has children, descend to them next,
 		 * otherwise do any siblings, and if done with this level,
 		 * follow back up the tree (but not past top).
 		 */
 		if (!LIST_EMPTY(&p->p_children))
 			p = LIST_FIRST(&p->p_children);
 		else for (;;) {
 			if (p == top)
 				return (ret);
 			if (LIST_NEXT(p, p_sibling)) {
 				p = LIST_NEXT(p, p_sibling);
 				break;
 			}
 			p = p->p_pptr;
 		}
 	}
 	/*NOTREACHED*/
 }
 
 static void
 ktr_writerequest(struct thread *td, struct ktr_request *req)
 {
 	struct ktr_header *kth;
 	struct vnode *vp;
 	struct proc *p;
 	struct ucred *cred;
 	struct uio auio;
 	struct iovec aiov[3];
 	struct mount *mp;
 	int datalen, buflen, vrele_count;
 	int error, vfslocked;
 
 	/*
 	 * We hold the vnode and credential for use in I/O in case ktrace is
 	 * disabled on the process as we write out the request.
 	 *
 	 * XXXRW: This is not ideal: we could end up performing a write after
 	 * the vnode has been closed.
 	 */
 	mtx_lock(&ktrace_mtx);
 	vp = td->td_proc->p_tracevp;
 	if (vp != NULL)
 		VREF(vp);
 	cred = td->td_proc->p_tracecred;
 	if (cred != NULL)
 		crhold(cred);
 	mtx_unlock(&ktrace_mtx);
 
 	/*
 	 * If vp is NULL, the vp has been cleared out from under this
 	 * request, so just drop it.  Make sure the credential and vnode are
 	 * in sync: we should have both or neither.
 	 */
 	if (vp == NULL) {
 		KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
 		return;
 	}
 	KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
 
 	kth = &req->ktr_header;
 	datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
 	buflen = kth->ktr_len;
 	auio.uio_iov = &aiov[0];
 	auio.uio_offset = 0;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	aiov[0].iov_base = (caddr_t)kth;
 	aiov[0].iov_len = sizeof(struct ktr_header);
 	auio.uio_resid = sizeof(struct ktr_header);
 	auio.uio_iovcnt = 1;
 	auio.uio_td = td;
 	if (datalen != 0) {
 		aiov[1].iov_base = (caddr_t)&req->ktr_data;
 		aiov[1].iov_len = datalen;
 		auio.uio_resid += datalen;
 		auio.uio_iovcnt++;
 		kth->ktr_len += datalen;
 	}
 	if (buflen != 0) {
 		KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
 		aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
 		aiov[auio.uio_iovcnt].iov_len = buflen;
 		auio.uio_resid += buflen;
 		auio.uio_iovcnt++;
 	}
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	(void)VOP_LEASE(vp, td, cred, LEASE_WRITE);
 #ifdef MAC
 	error = mac_check_vnode_write(cred, NOCRED, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (!error)
 		return;
 	/*
 	 * If error encountered, give up tracing on this vnode.  We defer
 	 * all the vrele()'s on the vnode until after we are finished walking
 	 * the various lists to avoid needlessly holding locks.
 	 */
 	log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
 	    error);
 	vrele_count = 0;
 	/*
 	 * First, clear this vnode from being used by any processes in the
 	 * system.
 	 * XXX - If one process gets an EPERM writing to the vnode, should
 	 * we really do this?  Other processes might have suitable
 	 * credentials for the operation.
 	 */
 	cred = NULL;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_tracevp == vp) {
 			mtx_lock(&ktrace_mtx);
 			p->p_tracevp = NULL;
 			p->p_traceflag = 0;
 			cred = p->p_tracecred;
 			p->p_tracecred = NULL;
 			mtx_unlock(&ktrace_mtx);
 			vrele_count++;
 		}
 		PROC_UNLOCK(p);
 		if (cred != NULL) {
 			crfree(cred);
 			cred = NULL;
 		}
 	}
 	sx_sunlock(&allproc_lock);
 
 	/*
 	 * We can't clear any pending requests in threads that have cached
 	 * them but not yet committed them, as those are per-thread.  The
 	 * thread will have to clear it itself on system call return.
 	 */
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	while (vrele_count-- > 0)
 		vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 }
 
 /*
  * Return true if caller has permission to set the ktracing state
  * of target.  Essentially, the target can't possess any
  * more permissions than the caller.  KTRFAC_ROOT signifies that
  * root previously set the tracing status on the target process, and
  * so, only root may further change it.
  */
 static int
 ktrcanset(td, targetp)
 	struct thread *td;
 	struct proc *targetp;
 {
 
 	PROC_LOCK_ASSERT(targetp, MA_OWNED);
 	if (targetp->p_traceflag & KTRFAC_ROOT &&
 	    priv_check_cred(td->td_ucred, PRIV_KTRACE, SUSER_ALLOWJAIL))
 		return (0);
 
 	if (p_candebug(td, targetp) != 0)
 		return (0);
 
 	return (1);
 }
 
 #endif /* KTRACE */
Index: head/sys/kern/kern_ntptime.c
===================================================================
--- head/sys/kern/kern_ntptime.c	(revision 167231)
+++ head/sys/kern/kern_ntptime.c	(revision 167232)
@@ -1,972 +1,972 @@
 /*-
  ***********************************************************************
  *								       *
  * Copyright (c) David L. Mills 1993-2001			       *
  *								       *
  * Permission to use, copy, modify, and distribute this software and   *
  * its documentation for any purpose and without fee is hereby	       *
  * granted, provided that the above copyright notice appears in all    *
  * copies and that both the copyright notice and this permission       *
  * notice appear in supporting documentation, and that the name	       *
  * University of Delaware not be used in advertising or publicity      *
  * pertaining to distribution of the software without specific,	       *
  * written prior permission. The University of Delaware makes no       *
  * representations about the suitability this software for any	       *
  * purpose. It is provided "as is" without express or implied	       *
  * warranty.							       *
  *								       *
  **********************************************************************/
 
 /*
  * Adapted from the original sources for FreeBSD and timecounters by:
  * Poul-Henning Kamp <phk@FreeBSD.org>.
  *
  * The 32bit version of the "LP" macros seems a bit past its "sell by" 
  * date so I have retained only the 64bit version and included it directly
  * in this file.
  *
  * Only minor changes done to interface with the timecounters over in
  * sys/kern/kern_clock.c.   Some of the comments below may be (even more)
  * confusing and/or plain wrong in that context.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ntp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/time.h>
 #include <sys/timex.h>
 #include <sys/timetc.h>
 #include <sys/timepps.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
 /*
  * Single-precision macros for 64-bit machines
  */
 typedef int64_t l_fp;
 #define L_ADD(v, u)	((v) += (u))
 #define L_SUB(v, u)	((v) -= (u))
 #define L_ADDHI(v, a)	((v) += (int64_t)(a) << 32)
 #define L_NEG(v)	((v) = -(v))
 #define L_RSHIFT(v, n) \
 	do { \
 		if ((v) < 0) \
 			(v) = -(-(v) >> (n)); \
 		else \
 			(v) = (v) >> (n); \
 	} while (0)
 #define L_MPY(v, a)	((v) *= (a))
 #define L_CLR(v)	((v) = 0)
 #define L_ISNEG(v)	((v) < 0)
 #define L_LINT(v, a)	((v) = (int64_t)(a) << 32)
 #define L_GINT(v)	((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
 
 /*
  * Generic NTP kernel interface
  *
  * These routines constitute the Network Time Protocol (NTP) interfaces
  * for user and daemon application programs. The ntp_gettime() routine
  * provides the time, maximum error (synch distance) and estimated error
  * (dispersion) to client user application programs. The ntp_adjtime()
  * routine is used by the NTP daemon to adjust the system clock to an
  * externally derived time. The time offset and related variables set by
  * this routine are used by other routines in this module to adjust the
  * phase and frequency of the clock discipline loop which controls the
  * system clock.
  *
  * When the kernel time is reckoned directly in nanoseconds (NTP_NANO
  * defined), the time at each tick interrupt is derived directly from
  * the kernel time variable. When the kernel time is reckoned in
  * microseconds, (NTP_NANO undefined), the time is derived from the
  * kernel time variable together with a variable representing the
  * leftover nanoseconds at the last tick interrupt. In either case, the
  * current nanosecond time is reckoned from these values plus an
  * interpolated value derived by the clock routines in another
  * architecture-specific module. The interpolation can use either a
  * dedicated counter or a processor cycle counter (PCC) implemented in
  * some architectures.
  *
  * Note that all routines must run at priority splclock or higher.
  */
 /*
  * Phase/frequency-lock loop (PLL/FLL) definitions
  *
  * The nanosecond clock discipline uses two variable types, time
  * variables and frequency variables. Both types are represented as 64-
  * bit fixed-point quantities with the decimal point between two 32-bit
  * halves. On a 32-bit machine, each half is represented as a single
  * word and mathematical operations are done using multiple-precision
  * arithmetic. On a 64-bit machine, ordinary computer arithmetic is
  * used.
  *
  * A time variable is a signed 64-bit fixed-point number in ns and
  * fraction. It represents the remaining time offset to be amortized
  * over succeeding tick interrupts. The maximum time offset is about
  * 0.5 s and the resolution is about 2.3e-10 ns.
  *
  *			1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
  *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |s s s|			 ns				   |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |			    fraction				   |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *
  * A frequency variable is a signed 64-bit fixed-point number in ns/s
  * and fraction. It represents the ns and fraction to be added to the
  * kernel time variable at each second. The maximum frequency offset is
  * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
  *
  *			1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
  *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |s s s s s s s s s s s s s|	          ns/s			   |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |			    fraction				   |
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  */
 /*
  * The following variables establish the state of the PLL/FLL and the
  * residual time and frequency offset of the local clock.
  */
 #define SHIFT_PLL	4		/* PLL loop gain (shift) */
 #define SHIFT_FLL	2		/* FLL loop gain (shift) */
 
 static int time_state = TIME_OK;	/* clock state */
 static int time_status = STA_UNSYNC;	/* clock status bits */
 static long time_tai;			/* TAI offset (s) */
 static long time_monitor;		/* last time offset scaled (ns) */
 static long time_constant;		/* poll interval (shift) (s) */
 static long time_precision = 1;		/* clock precision (ns) */
 static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
 static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
 static long time_reftime;		/* time at last adjustment (s) */
 static l_fp time_offset;		/* time offset (ns) */
 static l_fp time_freq;			/* frequency offset (ns/s) */
 static l_fp time_adj;			/* tick adjust (ns/s) */
 
 static int64_t time_adjtime;		/* correction from adjtime(2) (usec) */
 
 #ifdef PPS_SYNC
 /*
  * The following variables are used when a pulse-per-second (PPS) signal
  * is available and connected via a modem control lead. They establish
  * the engineering parameters of the clock discipline loop when
  * controlled by the PPS signal.
  */
 #define PPS_FAVG	2		/* min freq avg interval (s) (shift) */
 #define PPS_FAVGDEF	8		/* default freq avg int (s) (shift) */
 #define PPS_FAVGMAX	15		/* max freq avg interval (s) (shift) */
 #define PPS_PAVG	4		/* phase avg interval (s) (shift) */
 #define PPS_VALID	120		/* PPS signal watchdog max (s) */
 #define PPS_MAXWANDER	100000		/* max PPS wander (ns/s) */
 #define PPS_POPCORN	2		/* popcorn spike threshold (shift) */
 
 static struct timespec pps_tf[3];	/* phase median filter */
 static l_fp pps_freq;			/* scaled frequency offset (ns/s) */
 static long pps_fcount;			/* frequency accumulator */
 static long pps_jitter;			/* nominal jitter (ns) */
 static long pps_stabil;			/* nominal stability (scaled ns/s) */
 static long pps_lastsec;		/* time at last calibration (s) */
 static int pps_valid;			/* signal watchdog counter */
 static int pps_shift = PPS_FAVG;	/* interval duration (s) (shift) */
 static int pps_shiftmax = PPS_FAVGDEF;	/* max interval duration (s) (shift) */
 static int pps_intcnt;			/* wander counter */
 
 /*
  * PPS signal quality monitors
  */
 static long pps_calcnt;			/* calibration intervals */
 static long pps_jitcnt;			/* jitter limit exceeded */
 static long pps_stbcnt;			/* stability limit exceeded */
 static long pps_errcnt;			/* calibration errors */
 #endif /* PPS_SYNC */
 /*
  * End of phase/frequency-lock loop (PLL/FLL) definitions
  */
 
 static void ntp_init(void);
 static void hardupdate(long offset);
 static void ntp_gettime1(struct ntptimeval *ntvp);
 
 static void
 ntp_gettime1(struct ntptimeval *ntvp)
 {
 	struct timespec atv;	/* nanosecond time */
 
 	GIANT_REQUIRED;
 
 	nanotime(&atv);
 	ntvp->time.tv_sec = atv.tv_sec;
 	ntvp->time.tv_nsec = atv.tv_nsec;
 	ntvp->maxerror = time_maxerror;
 	ntvp->esterror = time_esterror;
 	ntvp->tai = time_tai;
 	ntvp->time_state = time_state;
 
 	/*
 	 * Status word error decode. If any of these conditions occur,
 	 * an error is returned, instead of the status word. Most
 	 * applications will care only about the fact the system clock
 	 * may not be trusted, not about the details.
 	 *
 	 * Hardware or software error
 	 */
 	if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
 
 	/*
 	 * PPS signal lost when either time or frequency synchronization
 	 * requested
 	 */
 	    (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
 	    !(time_status & STA_PPSSIGNAL)) ||
 
 	/*
 	 * PPS jitter exceeded when time synchronization requested
 	 */
 	    (time_status & STA_PPSTIME &&
 	    time_status & STA_PPSJITTER) ||
 
 	/*
 	 * PPS wander exceeded or calibration error when frequency
 	 * synchronization requested
 	 */
 	    (time_status & STA_PPSFREQ &&
 	    time_status & (STA_PPSWANDER | STA_PPSERROR)))
 		ntvp->time_state = TIME_ERROR;
 }
 
 /*
  * ntp_gettime() - NTP user application interface
  *
- * See the timex.h header file for synopsis and API description. Note
- * that the TAI offset is returned in the ntvtimeval.tai structure
- * member.
+ * See the timex.h header file for synopsis and API description.  Note that
+ * the TAI offset is returned in the ntvtimeval.tai structure member.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ntp_gettime_args {
 	struct ntptimeval *ntvp;
 };
 #endif
 /* ARGSUSED */
 int
 ntp_gettime(struct thread *td, struct ntp_gettime_args *uap)
 {	
 	struct ntptimeval ntv;
 
 	mtx_lock(&Giant);
 	ntp_gettime1(&ntv);
 	mtx_unlock(&Giant);
 
 	td->td_retval[0] = ntv.time_state;
 	return (copyout(&ntv, uap->ntvp, sizeof(ntv)));
 }
 
 static int
 ntp_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct ntptimeval ntv;	/* temporary structure */
 
 	ntp_gettime1(&ntv);
 
 	return (sysctl_handle_opaque(oidp, &ntv, sizeof(ntv), req));
 }
 
 SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, "");
 SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD,
 	0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", "");
 
 #ifdef PPS_SYNC
 SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, "");
 SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, "");
 SYSCTL_INT(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD, &time_monitor, 0, "");
 
 SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", "");
 SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", "");
 #endif
+
 /*
  * ntp_adjtime() - NTP daemon application interface
  *
- * See the timex.h header file for synopsis and API description. Note
- * that the timex.constant structure member has a dual purpose to set
- * the time constant and to set the TAI offset.
+ * See the timex.h header file for synopsis and API description.  Note that
+ * the timex.constant structure member has a dual purpose to set the time
+ * constant and to set the TAI offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ntp_adjtime_args {
 	struct timex *tp;
 };
 #endif
 
 int
 ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
 {
 	struct timex ntv;	/* temporary structure */
 	long freq;		/* frequency ns/s) */
 	int modes;		/* mode bits from structure */
 	int s;			/* caller priority */
 	int error;
 
 	error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
 	if (error)
 		return(error);
 
 	/*
 	 * Update selected clock variables - only the superuser can
 	 * change anything. Note that there is no error checking here on
 	 * the assumption the superuser should know what it is doing.
 	 * Note that either the time constant or TAI offset are loaded
 	 * from the ntv.constant member, depending on the mode bits. If
 	 * the STA_PLL bit in the status word is cleared, the state and
 	 * status words are reset to the initial values at boot.
 	 */
 	mtx_lock(&Giant);
 	modes = ntv.modes;
 	if (modes)
 		error = priv_check(td, PRIV_NTP_ADJTIME);
 	if (error)
 		goto done2;
 	s = splclock();
 	if (modes & MOD_MAXERROR)
 		time_maxerror = ntv.maxerror;
 	if (modes & MOD_ESTERROR)
 		time_esterror = ntv.esterror;
 	if (modes & MOD_STATUS) {
 		if (time_status & STA_PLL && !(ntv.status & STA_PLL)) {
 			time_state = TIME_OK;
 			time_status = STA_UNSYNC;
 #ifdef PPS_SYNC
 			pps_shift = PPS_FAVG;
 #endif /* PPS_SYNC */
 		}
 		time_status &= STA_RONLY;
 		time_status |= ntv.status & ~STA_RONLY;
 	}
 	if (modes & MOD_TIMECONST) {
 		if (ntv.constant < 0)
 			time_constant = 0;
 		else if (ntv.constant > MAXTC)
 			time_constant = MAXTC;
 		else
 			time_constant = ntv.constant;
 	}
 	if (modes & MOD_TAI) {
 		if (ntv.constant > 0) /* XXX zero & negative numbers ? */
 			time_tai = ntv.constant;
 	}
 #ifdef PPS_SYNC
 	if (modes & MOD_PPSMAX) {
 		if (ntv.shift < PPS_FAVG)
 			pps_shiftmax = PPS_FAVG;
 		else if (ntv.shift > PPS_FAVGMAX)
 			pps_shiftmax = PPS_FAVGMAX;
 		else
 			pps_shiftmax = ntv.shift;
 	}
 #endif /* PPS_SYNC */
 	if (modes & MOD_NANO)
 		time_status |= STA_NANO;
 	if (modes & MOD_MICRO)
 		time_status &= ~STA_NANO;
 	if (modes & MOD_CLKB)
 		time_status |= STA_CLK;
 	if (modes & MOD_CLKA)
 		time_status &= ~STA_CLK;
 	if (modes & MOD_FREQUENCY) {
 		freq = (ntv.freq * 1000LL) >> 16;
 		if (freq > MAXFREQ)
 			L_LINT(time_freq, MAXFREQ);
 		else if (freq < -MAXFREQ)
 			L_LINT(time_freq, -MAXFREQ);
 		else {
 			/*
 			 * ntv.freq is [PPM * 2^16] = [us/s * 2^16]
 			 * time_freq is [ns/s * 2^32]
 			 */
 			time_freq = ntv.freq * 1000LL * 65536LL;
 		}
 #ifdef PPS_SYNC
 		pps_freq = time_freq;
 #endif /* PPS_SYNC */
 	}
 	if (modes & MOD_OFFSET) {
 		if (time_status & STA_NANO)
 			hardupdate(ntv.offset);
 		else
 			hardupdate(ntv.offset * 1000);
 	}
 
 	/*
 	 * Retrieve all clock variables. Note that the TAI offset is
 	 * returned only by ntp_gettime();
 	 */
 	if (time_status & STA_NANO)
 		ntv.offset = L_GINT(time_offset);
 	else
 		ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
 	ntv.freq = L_GINT((time_freq / 1000LL) << 16);
 	ntv.maxerror = time_maxerror;
 	ntv.esterror = time_esterror;
 	ntv.status = time_status;
 	ntv.constant = time_constant;
 	if (time_status & STA_NANO)
 		ntv.precision = time_precision;
 	else
 		ntv.precision = time_precision / 1000;
 	ntv.tolerance = MAXFREQ * SCALE_PPM;
 #ifdef PPS_SYNC
 	ntv.shift = pps_shift;
 	ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
 	if (time_status & STA_NANO)
 		ntv.jitter = pps_jitter;
 	else
 		ntv.jitter = pps_jitter / 1000;
 	ntv.stabil = pps_stabil;
 	ntv.calcnt = pps_calcnt;
 	ntv.errcnt = pps_errcnt;
 	ntv.jitcnt = pps_jitcnt;
 	ntv.stbcnt = pps_stbcnt;
 #endif /* PPS_SYNC */
 	splx(s);
 
 	error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv));
 	if (error)
 		goto done2;
 
 	/*
 	 * Status word error decode. See comments in
 	 * ntp_gettime() routine.
 	 */
 	if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
 	    (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
 	    !(time_status & STA_PPSSIGNAL)) ||
 	    (time_status & STA_PPSTIME &&
 	    time_status & STA_PPSJITTER) ||
 	    (time_status & STA_PPSFREQ &&
 	    time_status & (STA_PPSWANDER | STA_PPSERROR))) {
 		td->td_retval[0] = TIME_ERROR;
 	} else {
 		td->td_retval[0] = time_state;
 	}
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * second_overflow() - called after ntp_tick_adjust()
  *
  * This routine is ordinarily called immediately following the above
  * routine ntp_tick_adjust(). While these two routines are normally
  * combined, they are separated here only for the purposes of
  * simulation.
  */
 void
 ntp_update_second(int64_t *adjustment, time_t *newsec)
 {
 	int tickrate;
 	l_fp ftemp;		/* 32/64-bit temporary */
 
 	/*
 	 * On rollover of the second both the nanosecond and microsecond
 	 * clocks are updated and the state machine cranked as
 	 * necessary. The phase adjustment to be used for the next
 	 * second is calculated and the maximum error is increased by
 	 * the tolerance.
 	 */
 	time_maxerror += MAXFREQ / 1000;
 
 	/*
 	 * Leap second processing. If in leap-insert state at
 	 * the end of the day, the system clock is set back one
 	 * second; if in leap-delete state, the system clock is
 	 * set ahead one second. The nano_time() routine or
 	 * external clock driver will insure that reported time
 	 * is always monotonic.
 	 */
 	switch (time_state) {
 
 		/*
 		 * No warning.
 		 */
 		case TIME_OK:
 		if (time_status & STA_INS)
 			time_state = TIME_INS;
 		else if (time_status & STA_DEL)
 			time_state = TIME_DEL;
 		break;
 
 		/*
 		 * Insert second 23:59:60 following second
 		 * 23:59:59.
 		 */
 		case TIME_INS:
 		if (!(time_status & STA_INS))
 			time_state = TIME_OK;
 		else if ((*newsec) % 86400 == 0) {
 			(*newsec)--;
 			time_state = TIME_OOP;
 			time_tai++;
 		}
 		break;
 
 		/*
 		 * Delete second 23:59:59.
 		 */
 		case TIME_DEL:
 		if (!(time_status & STA_DEL))
 			time_state = TIME_OK;
 		else if (((*newsec) + 1) % 86400 == 0) {
 			(*newsec)++;
 			time_tai--;
 			time_state = TIME_WAIT;
 		}
 		break;
 
 		/*
 		 * Insert second in progress.
 		 */
 		case TIME_OOP:
 			time_state = TIME_WAIT;
 		break;
 
 		/*
 		 * Wait for status bits to clear.
 		 */
 		case TIME_WAIT:
 		if (!(time_status & (STA_INS | STA_DEL)))
 			time_state = TIME_OK;
 	}
 
 	/*
 	 * Compute the total time adjustment for the next second
 	 * in ns. The offset is reduced by a factor depending on
 	 * whether the PPS signal is operating. Note that the
 	 * value is in effect scaled by the clock frequency,
 	 * since the adjustment is added at each tick interrupt.
 	 */
 	ftemp = time_offset;
 #ifdef PPS_SYNC
 	/* XXX even if PPS signal dies we should finish adjustment ? */
 	if (time_status & STA_PPSTIME && time_status &
 	    STA_PPSSIGNAL)
 		L_RSHIFT(ftemp, pps_shift);
 	else
 		L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
 #else
 		L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
 #endif /* PPS_SYNC */
 	time_adj = ftemp;
 	L_SUB(time_offset, ftemp);
 	L_ADD(time_adj, time_freq);
 	
 	/*
 	 * Apply any correction from adjtime(2).  If more than one second
 	 * off we slew at a rate of 5ms/s (5000 PPM) else 500us/s (500PPM)
 	 * until the last second is slewed the final < 500 usecs.
 	 */
 	if (time_adjtime != 0) {
 		if (time_adjtime > 1000000)
 			tickrate = 5000;
 		else if (time_adjtime < -1000000)
 			tickrate = -5000;
 		else if (time_adjtime > 500)
 			tickrate = 500;
 		else if (time_adjtime < -500)
 			tickrate = -500;
 		else
 			tickrate = time_adjtime;
 		time_adjtime -= tickrate;
 		L_LINT(ftemp, tickrate * 1000);
 		L_ADD(time_adj, ftemp);
 	}
 	*adjustment = time_adj;
 		
 #ifdef PPS_SYNC
 	if (pps_valid > 0)
 		pps_valid--;
 	else
 		time_status &= ~STA_PPSSIGNAL;
 #endif /* PPS_SYNC */
 }
 
 /*
  * ntp_init() - initialize variables and structures
  *
  * This routine must be called after the kernel variables hz and tick
  * are set or changed and before the next tick interrupt. In this
  * particular implementation, these values are assumed set elsewhere in
  * the kernel. The design allows the clock frequency and tick interval
  * to be changed while the system is running. So, this routine should
  * probably be integrated with the code that does that.
  */
 static void
 ntp_init()
 {
 
 	/*
 	 * The following variables are initialized only at startup. Only
 	 * those structures not cleared by the compiler need to be
 	 * initialized, and these only in the simulator. In the actual
 	 * kernel, any nonzero values here will quickly evaporate.
 	 */
 	L_CLR(time_offset);
 	L_CLR(time_freq);
 #ifdef PPS_SYNC
 	pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
 	pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
 	pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
 	pps_fcount = 0;
 	L_CLR(pps_freq);
 #endif /* PPS_SYNC */	   
 }
 
 SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL)
 
 /*
  * hardupdate() - local clock update
  *
  * This routine is called by ntp_adjtime() to update the local clock
  * phase and frequency. The implementation is of an adaptive-parameter,
  * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
  * time and frequency offset estimates for each call. If the kernel PPS
  * discipline code is configured (PPS_SYNC), the PPS signal itself
  * determines the new time offset, instead of the calling argument.
  * Presumably, calls to ntp_adjtime() occur only when the caller
  * believes the local clock is valid within some bound (+-128 ms with
  * NTP). If the caller's time is far different than the PPS time, an
  * argument will ensue, and it's not clear who will lose.
  *
  * For uncompensated quartz crystal oscillators and nominal update
  * intervals less than 256 s, operation should be in phase-lock mode,
  * where the loop is disciplined to phase. For update intervals greater
  * than 1024 s, operation should be in frequency-lock mode, where the
  * loop is disciplined to frequency. Between 256 s and 1024 s, the mode
  * is selected by the STA_MODE status bit.
  */
 static void
 hardupdate(offset)
 	long offset;		/* clock offset (ns) */
 {
 	long mtemp;
 	l_fp ftemp;
 
 	/*
 	 * Select how the phase is to be controlled and from which
 	 * source. If the PPS signal is present and enabled to
 	 * discipline the time, the PPS offset is used; otherwise, the
 	 * argument offset is used.
 	 */
 	if (!(time_status & STA_PLL))
 		return;
 	if (!(time_status & STA_PPSTIME && time_status &
 	    STA_PPSSIGNAL)) {
 		if (offset > MAXPHASE)
 			time_monitor = MAXPHASE;
 		else if (offset < -MAXPHASE)
 			time_monitor = -MAXPHASE;
 		else
 			time_monitor = offset;
 		L_LINT(time_offset, time_monitor);
 	}
 
 	/*
 	 * Select how the frequency is to be controlled and in which
 	 * mode (PLL or FLL). If the PPS signal is present and enabled
 	 * to discipline the frequency, the PPS frequency is used;
 	 * otherwise, the argument offset is used to compute it.
 	 */
 	if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
 		time_reftime = time_second;
 		return;
 	}
 	if (time_status & STA_FREQHOLD || time_reftime == 0)
 		time_reftime = time_second;
 	mtemp = time_second - time_reftime;
 	L_LINT(ftemp, time_monitor);
 	L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
 	L_MPY(ftemp, mtemp);
 	L_ADD(time_freq, ftemp);
 	time_status &= ~STA_MODE;
 	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
 	    MAXSEC)) {
 		L_LINT(ftemp, (time_monitor << 4) / mtemp);
 		L_RSHIFT(ftemp, SHIFT_FLL + 4);
 		L_ADD(time_freq, ftemp);
 		time_status |= STA_MODE;
 	}
 	time_reftime = time_second;
 	if (L_GINT(time_freq) > MAXFREQ)
 		L_LINT(time_freq, MAXFREQ);
 	else if (L_GINT(time_freq) < -MAXFREQ)
 		L_LINT(time_freq, -MAXFREQ);
 }
 
 #ifdef PPS_SYNC
 /*
  * hardpps() - discipline CPU clock oscillator to external PPS signal
  *
  * This routine is called at each PPS interrupt in order to discipline
  * the CPU clock oscillator to the PPS signal. There are two independent
  * first-order feedback loops, one for the phase, the other for the
  * frequency. The phase loop measures and grooms the PPS phase offset
  * and leaves it in a handy spot for the seconds overflow routine. The
  * frequency loop averages successive PPS phase differences and
  * calculates the PPS frequency offset, which is also processed by the
  * seconds overflow routine. The code requires the caller to capture the
  * time and architecture-dependent hardware counter values in
  * nanoseconds at the on-time PPS signal transition.
  *
  * Note that, on some Unix systems this routine runs at an interrupt
  * priority level higher than the timer interrupt routine hardclock().
  * Therefore, the variables used are distinct from the hardclock()
  * variables, except for the actual time and frequency variables, which
  * are determined by this routine and updated atomically.
  */
 void
 hardpps(tsp, nsec)
 	struct timespec *tsp;	/* time at PPS */
 	long nsec;		/* hardware counter at PPS */
 {
 	long u_sec, u_nsec, v_nsec; /* temps */
 	l_fp ftemp;
 
 	/*
 	 * The signal is first processed by a range gate and frequency
 	 * discriminator. The range gate rejects noise spikes outside
 	 * the range +-500 us. The frequency discriminator rejects input
 	 * signals with apparent frequency outside the range 1 +-500
 	 * PPM. If two hits occur in the same second, we ignore the
 	 * later hit; if not and a hit occurs outside the range gate,
 	 * keep the later hit for later comparison, but do not process
 	 * it.
 	 */
 	time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
 	time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
 	pps_valid = PPS_VALID;
 	u_sec = tsp->tv_sec;
 	u_nsec = tsp->tv_nsec;
 	if (u_nsec >= (NANOSECOND >> 1)) {
 		u_nsec -= NANOSECOND;
 		u_sec++;
 	}
 	v_nsec = u_nsec - pps_tf[0].tv_nsec;
 	if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
 	    MAXFREQ)
 		return;
 	pps_tf[2] = pps_tf[1];
 	pps_tf[1] = pps_tf[0];
 	pps_tf[0].tv_sec = u_sec;
 	pps_tf[0].tv_nsec = u_nsec;
 
 	/*
 	 * Compute the difference between the current and previous
 	 * counter values. If the difference exceeds 0.5 s, assume it
 	 * has wrapped around, so correct 1.0 s. If the result exceeds
 	 * the tick interval, the sample point has crossed a tick
 	 * boundary during the last second, so correct the tick. Very
 	 * intricate.
 	 */
 	u_nsec = nsec;
 	if (u_nsec > (NANOSECOND >> 1))
 		u_nsec -= NANOSECOND;
 	else if (u_nsec < -(NANOSECOND >> 1))
 		u_nsec += NANOSECOND;
 	pps_fcount += u_nsec;
 	if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
 		return;
 	time_status &= ~STA_PPSJITTER;
 
 	/*
 	 * A three-stage median filter is used to help denoise the PPS
 	 * time. The median sample becomes the time offset estimate; the
 	 * difference between the other two samples becomes the time
 	 * dispersion (jitter) estimate.
 	 */
 	if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
 		if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
 			v_nsec = pps_tf[1].tv_nsec;	/* 0 1 2 */
 			u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
 		} else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
 			v_nsec = pps_tf[0].tv_nsec;	/* 2 0 1 */
 			u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
 		} else {
 			v_nsec = pps_tf[2].tv_nsec;	/* 0 2 1 */
 			u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
 		}
 	} else {
 		if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
 			v_nsec = pps_tf[1].tv_nsec;	/* 2 1 0 */
 			u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
 		} else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
 			v_nsec = pps_tf[0].tv_nsec;	/* 1 0 2 */
 			u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
 		} else {
 			v_nsec = pps_tf[2].tv_nsec;	/* 1 2 0 */
 			u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
 		}
 	}
 
 	/*
 	 * Nominal jitter is due to PPS signal noise and interrupt
 	 * latency. If it exceeds the popcorn threshold, the sample is
 	 * discarded. otherwise, if so enabled, the time offset is
 	 * updated. We can tolerate a modest loss of data here without
 	 * much degrading time accuracy.
 	 */
 	if (u_nsec > (pps_jitter << PPS_POPCORN)) {
 		time_status |= STA_PPSJITTER;
 		pps_jitcnt++;
 	} else if (time_status & STA_PPSTIME) {
 		time_monitor = -v_nsec;
 		L_LINT(time_offset, time_monitor);
 	}
 	pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
 	u_sec = pps_tf[0].tv_sec - pps_lastsec;
 	if (u_sec < (1 << pps_shift))
 		return;
 
 	/*
 	 * At the end of the calibration interval the difference between
 	 * the first and last counter values becomes the scaled
 	 * frequency. It will later be divided by the length of the
 	 * interval to determine the frequency update. If the frequency
 	 * exceeds a sanity threshold, or if the actual calibration
 	 * interval is not equal to the expected length, the data are
 	 * discarded. We can tolerate a modest loss of data here without
 	 * much degrading frequency accuracy.
 	 */
 	pps_calcnt++;
 	v_nsec = -pps_fcount;
 	pps_lastsec = pps_tf[0].tv_sec;
 	pps_fcount = 0;
 	u_nsec = MAXFREQ << pps_shift;
 	if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
 	    pps_shift)) {
 		time_status |= STA_PPSERROR;
 		pps_errcnt++;
 		return;
 	}
 
 	/*
 	 * Here the raw frequency offset and wander (stability) is
 	 * calculated. If the wander is less than the wander threshold
 	 * for four consecutive averaging intervals, the interval is
 	 * doubled; if it is greater than the threshold for four
 	 * consecutive intervals, the interval is halved. The scaled
 	 * frequency offset is converted to frequency offset. The
 	 * stability metric is calculated as the average of recent
 	 * frequency changes, but is used only for performance
 	 * monitoring.
 	 */
 	L_LINT(ftemp, v_nsec);
 	L_RSHIFT(ftemp, pps_shift);
 	L_SUB(ftemp, pps_freq);
 	u_nsec = L_GINT(ftemp);
 	if (u_nsec > PPS_MAXWANDER) {
 		L_LINT(ftemp, PPS_MAXWANDER);
 		pps_intcnt--;
 		time_status |= STA_PPSWANDER;
 		pps_stbcnt++;
 	} else if (u_nsec < -PPS_MAXWANDER) {
 		L_LINT(ftemp, -PPS_MAXWANDER);
 		pps_intcnt--;
 		time_status |= STA_PPSWANDER;
 		pps_stbcnt++;
 	} else {
 		pps_intcnt++;
 	}
 	if (pps_intcnt >= 4) {
 		pps_intcnt = 4;
 		if (pps_shift < pps_shiftmax) {
 			pps_shift++;
 			pps_intcnt = 0;
 		}
 	} else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
 		pps_intcnt = -4;
 		if (pps_shift > PPS_FAVG) {
 			pps_shift--;
 			pps_intcnt = 0;
 		}
 	}
 	if (u_nsec < 0)
 		u_nsec = -u_nsec;
 	pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
 
 	/*
 	 * The PPS frequency is recalculated and clamped to the maximum
 	 * MAXFREQ. If enabled, the system clock frequency is updated as
 	 * well.
 	 */
 	L_ADD(pps_freq, ftemp);
 	u_nsec = L_GINT(pps_freq);
 	if (u_nsec > MAXFREQ)
 		L_LINT(pps_freq, MAXFREQ);
 	else if (u_nsec < -MAXFREQ)
 		L_LINT(pps_freq, -MAXFREQ);
 	if (time_status & STA_PPSFREQ)
 		time_freq = pps_freq;
 }
 #endif /* PPS_SYNC */
 
 #ifndef _SYS_SYSPROTO_H_
 struct adjtime_args {
 	struct timeval *delta;
 	struct timeval *olddelta;
 };
 #endif
 /* ARGSUSED */
 int
 adjtime(struct thread *td, struct adjtime_args *uap)
 {
 	struct timeval delta, olddelta, *deltap;
 	int error;
 
 	if (uap->delta) {
 		error = copyin(uap->delta, &delta, sizeof(delta));
 		if (error)
 			return (error);
 		deltap = &delta;
 	} else
 		deltap = NULL;
 	error = kern_adjtime(td, deltap, &olddelta);
 	if (uap->olddelta && error == 0)
 		error = copyout(&olddelta, uap->olddelta, sizeof(olddelta));
 	return (error);
 }
 
 int
 kern_adjtime(struct thread *td, struct timeval *delta, struct timeval *olddelta)
 {
 	struct timeval atv;
 	int error;
 
 	if ((error = priv_check(td, PRIV_ADJTIME)))
 		return (error);
 
 	mtx_lock(&Giant);
 	if (olddelta) {
 		atv.tv_sec = time_adjtime / 1000000;
 		atv.tv_usec = time_adjtime % 1000000;
 		if (atv.tv_usec < 0) {
 			atv.tv_usec += 1000000;
 			atv.tv_sec--;
 		}
 		*olddelta = atv;
 	}
 	if (delta)
 		time_adjtime = (int64_t)delta->tv_sec * 1000000 +
 		    delta->tv_usec;
 	mtx_unlock(&Giant);
 	return (error);
 }
 
Index: head/sys/kern/kern_prot.c
===================================================================
--- head/sys/kern/kern_prot.c	(revision 167231)
+++ head/sys/kern/kern_prot.c	(revision 167232)
@@ -1,2050 +1,2048 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
  *	The Regents of the University of California.
  * (c) UNIX System Laboratories, Inc.
  * Copyright (c) 2000-2001 Robert N. M. Watson.
  * All rights reserved.
  *
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_prot.c	8.6 (Berkeley) 1/21/94
  */
 
 /*
  * System calls related to processes and protection
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/acct.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/refcount.h>
 #include <sys/sx.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 static MALLOC_DEFINE(M_CRED, "cred", "credentials");
 
 SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy");
 
 #ifndef _SYS_SYSPROTO_H_
 struct getpid_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 getpid(struct thread *td, struct getpid_args *uap)
 {
 	struct proc *p = td->td_proc;
 
 	td->td_retval[0] = p->p_pid;
 #if defined(COMPAT_43)
 	PROC_LOCK(p);
 	td->td_retval[1] = p->p_pptr->p_pid;
 	PROC_UNLOCK(p);
 #endif
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getppid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 getppid(struct thread *td, struct getppid_args *uap)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK(p);
 	td->td_retval[0] = p->p_pptr->p_pid;
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * Get process group ID; note that POSIX getpgrp takes no parameter.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getpgrp_args {
         int     dummy;
 };
 #endif
 int
 getpgrp(struct thread *td, struct getpgrp_args *uap)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK(p);
 	td->td_retval[0] = p->p_pgrp->pg_id;
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /* Get an arbitary pid's process group id */
 #ifndef _SYS_SYSPROTO_H_
 struct getpgid_args {
 	pid_t	pid;
 };
 #endif
 int
 getpgid(struct thread *td, struct getpgid_args *uap)
 {
 	struct proc *p;
 	int error;
 
 	if (uap->pid == 0) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 	} else {
 		p = pfind(uap->pid);
 		if (p == NULL)
 			return (ESRCH);
 		error = p_cansee(td, p);
 		if (error) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 	}
 	td->td_retval[0] = p->p_pgrp->pg_id;
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * Get an arbitary pid's session id.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getsid_args {
 	pid_t	pid;
 };
 #endif
 int
 getsid(struct thread *td, struct getsid_args *uap)
 {
 	struct proc *p;
 	int error;
 
 	if (uap->pid == 0) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 	} else {
 		p = pfind(uap->pid);
 		if (p == NULL)
 			return (ESRCH);
 		error = p_cansee(td, p);
 		if (error) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 	}
 	td->td_retval[0] = p->p_session->s_sid;
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getuid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 getuid(struct thread *td, struct getuid_args *uap)
 {
 
 	td->td_retval[0] = td->td_ucred->cr_ruid;
 #if defined(COMPAT_43)
 	td->td_retval[1] = td->td_ucred->cr_uid;
 #endif
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct geteuid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 geteuid(struct thread *td, struct geteuid_args *uap)
 {
 
 	td->td_retval[0] = td->td_ucred->cr_uid;
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getgid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 getgid(struct thread *td, struct getgid_args *uap)
 {
 
 	td->td_retval[0] = td->td_ucred->cr_rgid;
 #if defined(COMPAT_43)
 	td->td_retval[1] = td->td_ucred->cr_groups[0];
 #endif
 	return (0);
 }
 
 /*
  * Get effective group ID.  The "egid" is groups[0], and could be obtained
  * via getgroups.  This syscall exists because it is somewhat painful to do
  * correctly in a library function.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getegid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 getegid(struct thread *td, struct getegid_args *uap)
 {
 
 	td->td_retval[0] = td->td_ucred->cr_groups[0];
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getgroups_args {
 	u_int	gidsetsize;
 	gid_t	*gidset;
 };
 #endif
 int
 getgroups(struct thread *td, register struct getgroups_args *uap)
 {
 	gid_t groups[NGROUPS];
 	u_int ngrp;
 	int error;
 
 	ngrp = MIN(uap->gidsetsize, NGROUPS);
 	error = kern_getgroups(td, &ngrp, groups);
 	if (error)
 		return (error);
 	if (uap->gidsetsize > 0)
 		error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t));
 	if (error == 0)
 		td->td_retval[0] = ngrp;
 	return (error);
 }
 
 int
 kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups)
 {
 	struct ucred *cred;
 
 	cred = td->td_ucred;
 	if (*ngrp == 0) {
 		*ngrp = cred->cr_ngroups;
 		return (0);
 	}
 	if (*ngrp < cred->cr_ngroups)
 		return (EINVAL);
 	*ngrp = cred->cr_ngroups;
 	bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t));
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setsid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 setsid(register struct thread *td, struct setsid_args *uap)
 {
 	struct pgrp *pgrp;
 	int error;
 	struct proc *p = td->td_proc;
 	struct pgrp *newpgrp;
 	struct session *newsess;
 
 	error = 0;
 	pgrp = NULL;
 
 	MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
 	MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
 
 	sx_xlock(&proctree_lock);
 
 	if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
 		if (pgrp != NULL)
 			PGRP_UNLOCK(pgrp);
 		error = EPERM;
 	} else {
 		(void)enterpgrp(p, p->p_pid, newpgrp, newsess);
 		td->td_retval[0] = p->p_pid;
 		newpgrp = NULL;
 		newsess = NULL;
 	}
 
 	sx_xunlock(&proctree_lock);
 
 	if (newpgrp != NULL)
 		FREE(newpgrp, M_PGRP);
 	if (newsess != NULL)
 		FREE(newsess, M_SESSION);
 
 	return (error);
 }
 
 /*
  * set process group (setpgid/old setpgrp)
  *
  * caller does setpgid(targpid, targpgid)
  *
  * pid must be caller or child of caller (ESRCH)
  * if a child
  *	pid must be in same session (EPERM)
  *	pid can't have done an exec (EACCES)
  * if pgid != pid
  * 	there must exist some pid in same session having pgid (EPERM)
  * pid must not be session leader (EPERM)
  */
 #ifndef _SYS_SYSPROTO_H_
 struct setpgid_args {
 	int	pid;		/* target process id */
 	int	pgid;		/* target pgrp id */
 };
 #endif
 /* ARGSUSED */
 int
 setpgid(struct thread *td, register struct setpgid_args *uap)
 {
 	struct proc *curp = td->td_proc;
 	register struct proc *targp;	/* target process */
 	register struct pgrp *pgrp;	/* target pgrp */
 	int error;
 	struct pgrp *newpgrp;
 
 	if (uap->pgid < 0)
 		return (EINVAL);
 
 	error = 0;
 
 	MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
 
 	sx_xlock(&proctree_lock);
 	if (uap->pid != 0 && uap->pid != curp->p_pid) {
 		if ((targp = pfind(uap->pid)) == NULL) {
 			error = ESRCH;
 			goto done;
 		}
 		if (!inferior(targp)) {
 			PROC_UNLOCK(targp);
 			error = ESRCH;
 			goto done;
 		}
 		if ((error = p_cansee(td, targp))) {
 			PROC_UNLOCK(targp);
 			goto done;
 		}
 		if (targp->p_pgrp == NULL ||
 		    targp->p_session != curp->p_session) {
 			PROC_UNLOCK(targp);
 			error = EPERM;
 			goto done;
 		}
 		if (targp->p_flag & P_EXEC) {
 			PROC_UNLOCK(targp);
 			error = EACCES;
 			goto done;
 		}
 		PROC_UNLOCK(targp);
 	} else
 		targp = curp;
 	if (SESS_LEADER(targp)) {
 		error = EPERM;
 		goto done;
 	}
 	if (uap->pgid == 0)
 		uap->pgid = targp->p_pid;
 	if ((pgrp = pgfind(uap->pgid)) == NULL) {
 		if (uap->pgid == targp->p_pid) {
 			error = enterpgrp(targp, uap->pgid, newpgrp,
 			    NULL);
 			if (error == 0)
 				newpgrp = NULL;
 		} else
 			error = EPERM;
 	} else {
 		if (pgrp == targp->p_pgrp) {
 			PGRP_UNLOCK(pgrp);
 			goto done;
 		}
 		if (pgrp->pg_id != targp->p_pid &&
 		    pgrp->pg_session != curp->p_session) {
 			PGRP_UNLOCK(pgrp);
 			error = EPERM;
 			goto done;
 		}
 		PGRP_UNLOCK(pgrp);
 		error = enterthispgrp(targp, pgrp);
 	}
 done:
 	sx_xunlock(&proctree_lock);
 	KASSERT((error == 0) || (newpgrp != NULL),
 	    ("setpgid failed and newpgrp is NULL"));
 	if (newpgrp != NULL)
 		FREE(newpgrp, M_PGRP);
 	return (error);
 }
 
 /*
  * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD
  * compatible.  It says that setting the uid/gid to euid/egid is a special
  * case of "appropriate privilege".  Once the rules are expanded out, this
  * basically means that setuid(nnn) sets all three id's, in all permitted
  * cases unless _POSIX_SAVED_IDS is enabled.  In that case, setuid(getuid())
  * does not set the saved id - this is dangerous for traditional BSD
  * programs.  For this reason, we *really* do not want to set
  * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2.
  */
 #define POSIX_APPENDIX_B_4_2_2
 
 #ifndef _SYS_SYSPROTO_H_
 struct setuid_args {
 	uid_t	uid;
 };
 #endif
 /* ARGSUSED */
 int
 setuid(struct thread *td, struct setuid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	uid_t uid;
 	struct uidinfo *uip;
 	int error;
 
 	uid = uap->uid;
 	AUDIT_ARG(uid, uid);
 	newcred = crget();
 	uip = uifind(uid);
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_setuid(p, oldcred, uid);
 	if (error)
 		goto fail;
 #endif
 
 	/*
 	 * See if we have "permission" by POSIX 1003.1 rules.
 	 *
 	 * Note that setuid(geteuid()) is a special case of
 	 * "appropriate privileges" in appendix B.4.2.2.  We need
 	 * to use this clause to be compatible with traditional BSD
 	 * semantics.  Basically, it means that "setuid(xx)" sets all
 	 * three id's (assuming you have privs).
 	 *
 	 * Notes on the logic.  We do things in three steps.
 	 * 1: We determine if the euid is going to change, and do EPERM
 	 *    right away.  We unconditionally change the euid later if this
 	 *    test is satisfied, simplifying that part of the logic.
 	 * 2: We determine if the real and/or saved uids are going to
 	 *    change.  Determined by compile options.
 	 * 3: Change euid last. (after tests in #2 for "appropriate privs")
 	 */
 	if (uid != oldcred->cr_ruid &&		/* allow setuid(getuid()) */
 #ifdef _POSIX_SAVED_IDS
 	    uid != oldcred->cr_svuid &&		/* allow setuid(saved gid) */
 #endif
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use BSD-compat clause from B.4.2.2 */
 	    uid != oldcred->cr_uid &&		/* allow setuid(geteuid()) */
 #endif
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETUID,
 	    SUSER_ALLOWJAIL)) != 0)
 		goto fail;
 
 	/*
 	 * Copy credentials so other references do not see our changes.
 	 */
 	crcopy(newcred, oldcred);
 #ifdef _POSIX_SAVED_IDS
 	/*
 	 * Do we have "appropriate privileges" (are we root or uid == euid)
 	 * If so, we are changing the real uid and/or saved uid.
 	 */
 	if (
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use the clause from B.4.2.2 */
 	    uid == oldcred->cr_uid ||
 #endif
 	    /* We are using privs. */
 	    priv_check_cred(oldcred, PRIV_CRED_SETUID, SUSER_ALLOWJAIL) == 0)
 #endif
 	{
 		/*
 		 * Set the real uid and transfer proc count to new user.
 		 */
 		if (uid != oldcred->cr_ruid) {
 			change_ruid(newcred, uip);
 			setsugid(p);
 		}
 		/*
 		 * Set saved uid
 		 *
 		 * XXX always set saved uid even if not _POSIX_SAVED_IDS, as
 		 * the security of seteuid() depends on it.  B.4.2.2 says it
 		 * is important that we should do this.
 		 */
 		if (uid != oldcred->cr_svuid) {
 			change_svuid(newcred, uid);
 			setsugid(p);
 		}
 	}
 
 	/*
 	 * In all permitted cases, we are changing the euid.
 	 */
 	if (uid != oldcred->cr_uid) {
 		change_euid(newcred, uip);
 		setsugid(p);
 	}
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	uifree(uip);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	uifree(uip);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct seteuid_args {
 	uid_t	euid;
 };
 #endif
 /* ARGSUSED */
 int
 seteuid(struct thread *td, struct seteuid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	uid_t euid;
 	struct uidinfo *euip;
 	int error;
 
 	euid = uap->euid;
 	AUDIT_ARG(euid, euid);
 	newcred = crget();
 	euip = uifind(euid);
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_seteuid(p, oldcred, euid);
 	if (error)
 		goto fail;
 #endif
 
 	if (euid != oldcred->cr_ruid &&		/* allow seteuid(getuid()) */
 	    euid != oldcred->cr_svuid &&	/* allow seteuid(saved uid) */
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID,
 	    SUSER_ALLOWJAIL)) != 0)
 		goto fail;
 
 	/*
 	 * Everything's okay, do it.  Copy credentials so other references do
 	 * not see our changes.
 	 */
 	crcopy(newcred, oldcred);
 	if (oldcred->cr_uid != euid) {
 		change_euid(newcred, euip);
 		setsugid(p);
 	}
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	uifree(euip);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	uifree(euip);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setgid_args {
 	gid_t	gid;
 };
 #endif
 /* ARGSUSED */
 int
 setgid(struct thread *td, struct setgid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	gid_t gid;
 	int error;
 
 	gid = uap->gid;
 	AUDIT_ARG(gid, gid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_setgid(p, oldcred, gid);
 	if (error)
 		goto fail;
 #endif
 
 	/*
 	 * See if we have "permission" by POSIX 1003.1 rules.
 	 *
 	 * Note that setgid(getegid()) is a special case of
 	 * "appropriate privileges" in appendix B.4.2.2.  We need
 	 * to use this clause to be compatible with traditional BSD
 	 * semantics.  Basically, it means that "setgid(xx)" sets all
 	 * three id's (assuming you have privs).
 	 *
 	 * For notes on the logic here, see setuid() above.
 	 */
 	if (gid != oldcred->cr_rgid &&		/* allow setgid(getgid()) */
 #ifdef _POSIX_SAVED_IDS
 	    gid != oldcred->cr_svgid &&		/* allow setgid(saved gid) */
 #endif
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use BSD-compat clause from B.4.2.2 */
 	    gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
 #endif
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETGID,
 	    SUSER_ALLOWJAIL)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
 #ifdef _POSIX_SAVED_IDS
 	/*
 	 * Do we have "appropriate privileges" (are we root or gid == egid)
 	 * If so, we are changing the real uid and saved gid.
 	 */
 	if (
 #ifdef POSIX_APPENDIX_B_4_2_2	/* use the clause from B.4.2.2 */
 	    gid == oldcred->cr_groups[0] ||
 #endif
 	    /* We are using privs. */
 	    priv_check_cred(oldcred, PRIV_CRED_SETGID, SUSER_ALLOWJAIL) == 0)
 #endif
 	{
 		/*
 		 * Set real gid
 		 */
 		if (oldcred->cr_rgid != gid) {
 			change_rgid(newcred, gid);
 			setsugid(p);
 		}
 		/*
 		 * Set saved gid
 		 *
 		 * XXX always set saved gid even if not _POSIX_SAVED_IDS, as
 		 * the security of setegid() depends on it.  B.4.2.2 says it
 		 * is important that we should do this.
 		 */
 		if (oldcred->cr_svgid != gid) {
 			change_svgid(newcred, gid);
 			setsugid(p);
 		}
 	}
 	/*
 	 * In all cases permitted cases, we are changing the egid.
 	 * Copy credentials so other references do not see our changes.
 	 */
 	if (oldcred->cr_groups[0] != gid) {
 		change_egid(newcred, gid);
 		setsugid(p);
 	}
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setegid_args {
 	gid_t	egid;
 };
 #endif
 /* ARGSUSED */
 int
 setegid(struct thread *td, struct setegid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	gid_t egid;
 	int error;
 
 	egid = uap->egid;
 	AUDIT_ARG(egid, egid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_setegid(p, oldcred, egid);
 	if (error)
 		goto fail;
 #endif
 
 	if (egid != oldcred->cr_rgid &&		/* allow setegid(getgid()) */
 	    egid != oldcred->cr_svgid &&	/* allow setegid(saved gid) */
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID,
 	    SUSER_ALLOWJAIL)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
 	if (oldcred->cr_groups[0] != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setgroups_args {
 	u_int	gidsetsize;
 	gid_t	*gidset;
 };
 #endif
 /* ARGSUSED */
 int
 setgroups(struct thread *td, struct setgroups_args *uap)
 {
 	gid_t groups[NGROUPS];
 	int error;
 
 	if (uap->gidsetsize > NGROUPS)
 		return (EINVAL);
 	error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t));
 	if (error)
 		return (error);
 	return (kern_setgroups(td, uap->gidsetsize, groups));
 }
 
 int
 kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	int error;
 
 	if (ngrp > NGROUPS)
 		return (EINVAL);
 	AUDIT_ARG(groupset, groups, ngrp);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_setgroups(p, oldcred, ngrp, groups);
 	if (error)
 		goto fail;
 #endif
 
 	error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS,
 	    SUSER_ALLOWJAIL);
 	if (error)
 		goto fail;
 
 	/*
 	 * XXX A little bit lazy here.  We could test if anything has
 	 * changed before crcopy() and setting P_SUGID.
 	 */
 	crcopy(newcred, oldcred);
 	if (ngrp < 1) {
 		/*
 		 * setgroups(0, NULL) is a legitimate way of clearing the
 		 * groups vector on non-BSD systems (which generally do not
 		 * have the egid in the groups[0]).  We risk security holes
 		 * when running non-BSD software if we do not do the same.
 		 */
 		newcred->cr_ngroups = 1;
 	} else {
 		bcopy(groups, newcred->cr_groups, ngrp * sizeof(gid_t));
 		newcred->cr_ngroups = ngrp;
 	}
 	setsugid(p);
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setreuid_args {
 	uid_t	ruid;
 	uid_t	euid;
 };
 #endif
 /* ARGSUSED */
 int
 setreuid(register struct thread *td, struct setreuid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	uid_t euid, ruid;
 	struct uidinfo *euip, *ruip;
 	int error;
 
 	euid = uap->euid;
 	ruid = uap->ruid;
 	AUDIT_ARG(euid, euid);
 	AUDIT_ARG(ruid, ruid);
 	newcred = crget();
 	euip = uifind(euid);
 	ruip = uifind(ruid);
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_setreuid(p, oldcred, ruid, euid);
 	if (error)
 		goto fail;
 #endif
 
 	if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
 	      ruid != oldcred->cr_svuid) ||
 	     (euid != (uid_t)-1 && euid != oldcred->cr_uid &&
 	      euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID,
 	     SUSER_ALLOWJAIL)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
 	if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
 		change_euid(newcred, euip);
 		setsugid(p);
 	}
 	if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
 		change_ruid(newcred, ruip);
 		setsugid(p);
 	}
 	if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) &&
 	    newcred->cr_svuid != newcred->cr_uid) {
 		change_svuid(newcred, newcred->cr_uid);
 		setsugid(p);
 	}
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	uifree(ruip);
 	uifree(euip);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	uifree(ruip);
 	uifree(euip);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setregid_args {
 	gid_t	rgid;
 	gid_t	egid;
 };
 #endif
 /* ARGSUSED */
 int
 setregid(register struct thread *td, struct setregid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	gid_t egid, rgid;
 	int error;
 
 	egid = uap->egid;
 	rgid = uap->rgid;
 	AUDIT_ARG(egid, egid);
 	AUDIT_ARG(rgid, rgid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_setregid(p, oldcred, rgid, egid);
 	if (error)
 		goto fail;
 #endif
 
 	if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
 	    rgid != oldcred->cr_svgid) ||
 	     (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
 	     egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID,
 	     SUSER_ALLOWJAIL)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
 	if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
 	if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
 		change_rgid(newcred, rgid);
 		setsugid(p);
 	}
 	if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
 	    newcred->cr_svgid != newcred->cr_groups[0]) {
 		change_svgid(newcred, newcred->cr_groups[0]);
 		setsugid(p);
 	}
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 /*
- * setresuid(ruid, euid, suid) is like setreuid except control over the
- * saved uid is explicit.
+ * setresuid(ruid, euid, suid) is like setreuid except control over the saved
+ * uid is explicit.
  */
-
 #ifndef _SYS_SYSPROTO_H_
 struct setresuid_args {
 	uid_t	ruid;
 	uid_t	euid;
 	uid_t	suid;
 };
 #endif
 /* ARGSUSED */
 int
 setresuid(register struct thread *td, struct setresuid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	uid_t euid, ruid, suid;
 	struct uidinfo *euip, *ruip;
 	int error;
 
 	euid = uap->euid;
 	ruid = uap->ruid;
 	suid = uap->suid;
 	AUDIT_ARG(euid, euid);
 	AUDIT_ARG(ruid, ruid);
 	AUDIT_ARG(suid, suid);
 	newcred = crget();
 	euip = uifind(euid);
 	ruip = uifind(ruid);
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_setresuid(p, oldcred, ruid, euid, suid);
 	if (error)
 		goto fail;
 #endif
 
 	if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
 	     ruid != oldcred->cr_svuid &&
 	      ruid != oldcred->cr_uid) ||
 	     (euid != (uid_t)-1 && euid != oldcred->cr_ruid &&
 	    euid != oldcred->cr_svuid &&
 	      euid != oldcred->cr_uid) ||
 	     (suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
 	    suid != oldcred->cr_svuid &&
 	      suid != oldcred->cr_uid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID,
 	     SUSER_ALLOWJAIL)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
 	if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
 		change_euid(newcred, euip);
 		setsugid(p);
 	}
 	if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
 		change_ruid(newcred, ruip);
 		setsugid(p);
 	}
 	if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) {
 		change_svuid(newcred, suid);
 		setsugid(p);
 	}
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	uifree(ruip);
 	uifree(euip);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	uifree(ruip);
 	uifree(euip);
 	crfree(newcred);
 	return (error);
 
 }
 
 /*
- * setresgid(rgid, egid, sgid) is like setregid except control over the
- * saved gid is explicit.
+ * setresgid(rgid, egid, sgid) is like setregid except control over the saved
+ * gid is explicit.
  */
-
 #ifndef _SYS_SYSPROTO_H_
 struct setresgid_args {
 	gid_t	rgid;
 	gid_t	egid;
 	gid_t	sgid;
 };
 #endif
 /* ARGSUSED */
 int
 setresgid(register struct thread *td, struct setresgid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	gid_t egid, rgid, sgid;
 	int error;
 
 	egid = uap->egid;
 	rgid = uap->rgid;
 	sgid = uap->sgid;
 	AUDIT_ARG(egid, egid);
 	AUDIT_ARG(rgid, rgid);
 	AUDIT_ARG(sgid, sgid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_check_proc_setresgid(p, oldcred, rgid, egid, sgid);
 	if (error)
 		goto fail;
 #endif
 
 	if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
 	      rgid != oldcred->cr_svgid &&
 	      rgid != oldcred->cr_groups[0]) ||
 	     (egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
 	      egid != oldcred->cr_svgid &&
 	      egid != oldcred->cr_groups[0]) ||
 	     (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
 	      sgid != oldcred->cr_svgid &&
 	      sgid != oldcred->cr_groups[0])) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID,
 	     SUSER_ALLOWJAIL)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
 	if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
 	if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
 		change_rgid(newcred, rgid);
 		setsugid(p);
 	}
 	if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) {
 		change_svgid(newcred, sgid);
 		setsugid(p);
 	}
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getresuid_args {
 	uid_t	*ruid;
 	uid_t	*euid;
 	uid_t	*suid;
 };
 #endif
 /* ARGSUSED */
 int
 getresuid(register struct thread *td, struct getresuid_args *uap)
 {
 	struct ucred *cred;
 	int error1 = 0, error2 = 0, error3 = 0;
 
 	cred = td->td_ucred;
 	if (uap->ruid)
 		error1 = copyout(&cred->cr_ruid,
 		    uap->ruid, sizeof(cred->cr_ruid));
 	if (uap->euid)
 		error2 = copyout(&cred->cr_uid,
 		    uap->euid, sizeof(cred->cr_uid));
 	if (uap->suid)
 		error3 = copyout(&cred->cr_svuid,
 		    uap->suid, sizeof(cred->cr_svuid));
 	return (error1 ? error1 : error2 ? error2 : error3);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getresgid_args {
 	gid_t	*rgid;
 	gid_t	*egid;
 	gid_t	*sgid;
 };
 #endif
 /* ARGSUSED */
 int
 getresgid(register struct thread *td, struct getresgid_args *uap)
 {
 	struct ucred *cred;
 	int error1 = 0, error2 = 0, error3 = 0;
 
 	cred = td->td_ucred;
 	if (uap->rgid)
 		error1 = copyout(&cred->cr_rgid,
 		    uap->rgid, sizeof(cred->cr_rgid));
 	if (uap->egid)
 		error2 = copyout(&cred->cr_groups[0],
 		    uap->egid, sizeof(cred->cr_groups[0]));
 	if (uap->sgid)
 		error3 = copyout(&cred->cr_svgid,
 		    uap->sgid, sizeof(cred->cr_svgid));
 	return (error1 ? error1 : error2 ? error2 : error3);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct issetugid_args {
 	int dummy;
 };
 #endif
 /* ARGSUSED */
 int
 issetugid(register struct thread *td, struct issetugid_args *uap)
 {
 	struct proc *p = td->td_proc;
 
 	/*
 	 * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time,
 	 * we use P_SUGID because we consider changing the owners as
 	 * "tainting" as well.
 	 * This is significant for procs that start as root and "become"
 	 * a user without an exec - programs cannot know *everything*
 	 * that libc *might* have put in their data segment.
 	 */
 	PROC_LOCK(p);
 	td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0;
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 int
 __setugid(struct thread *td, struct __setugid_args *uap)
 {
 #ifdef REGRESSION
 	struct proc *p;
 
 	p = td->td_proc;
 	switch (uap->flag) {
 	case 0:
 		PROC_LOCK(p);
 		p->p_flag &= ~P_SUGID;
 		PROC_UNLOCK(p);
 		return (0);
 	case 1:
 		PROC_LOCK(p);
 		p->p_flag |= P_SUGID;
 		PROC_UNLOCK(p);
 		return (0);
 	default:
 		return (EINVAL);
 	}
 #else /* !REGRESSION */
 
 	return (ENOSYS);
 #endif /* REGRESSION */
 }
 
 /*
  * Check if gid is a member of the group set.
  */
 int
 groupmember(gid_t gid, struct ucred *cred)
 {
 	register gid_t *gp;
 	gid_t *egp;
 
 	egp = &(cred->cr_groups[cred->cr_ngroups]);
 	for (gp = cred->cr_groups; gp < egp; gp++)
 		if (*gp == gid)
 			return (1);
 	return (0);
 }
 
 /*
  * Test the active securelevel against a given level.  securelevel_gt()
  * implements (securelevel > level).  securelevel_ge() implements
  * (securelevel >= level).  Note that the logic is inverted -- these
  * functions return EPERM on "success" and 0 on "failure".
  *
  * XXXRW: Possibly since this has to do with privilege, it should move to
  * kern_priv.c.
  */
 int
 securelevel_gt(struct ucred *cr, int level)
 {
 	int active_securelevel;
 
 	active_securelevel = securelevel;
 	KASSERT(cr != NULL, ("securelevel_gt: null cr"));
 	if (cr->cr_prison != NULL)
 		active_securelevel = imax(cr->cr_prison->pr_securelevel,
 		    active_securelevel);
 	return (active_securelevel > level ? EPERM : 0);
 }
 
 int
 securelevel_ge(struct ucred *cr, int level)
 {
 	int active_securelevel;
 
 	active_securelevel = securelevel;
 	KASSERT(cr != NULL, ("securelevel_ge: null cr"));
 	if (cr->cr_prison != NULL)
 		active_securelevel = imax(cr->cr_prison->pr_securelevel,
 		    active_securelevel);
 	return (active_securelevel >= level ? EPERM : 0);
 }
 
 /*
  * 'see_other_uids' determines whether or not visibility of processes
  * and sockets with credentials holding different real uids is possible
  * using a variety of system MIBs.
  * XXX: data declarations should be together near the beginning of the file.
  */
 static int	see_other_uids = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW,
     &see_other_uids, 0,
     "Unprivileged processes may see subjects/objects with different real uid");
 
 /*-
  * Determine if u1 "can see" the subject specified by u2, according to the
  * 'see_other_uids' policy.
  * Returns: 0 for permitted, ESRCH otherwise
  * Locks: none
  * References: *u1 and *u2 must not change during the call
  *             u1 may equal u2, in which case only one reference is required
  */
 static int
 cr_seeotheruids(struct ucred *u1, struct ucred *u2)
 {
 
 	if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) {
 		if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, SUSER_ALLOWJAIL)
 		    != 0)
 			return (ESRCH);
 	}
 	return (0);
 }
 
 /*
  * 'see_other_gids' determines whether or not visibility of processes
  * and sockets with credentials holding different real gids is possible
  * using a variety of system MIBs.
  * XXX: data declarations should be together near the beginning of the file.
  */
 static int	see_other_gids = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW,
     &see_other_gids, 0,
     "Unprivileged processes may see subjects/objects with different real gid");
 
 /*
  * Determine if u1 can "see" the subject specified by u2, according to the
  * 'see_other_gids' policy.
  * Returns: 0 for permitted, ESRCH otherwise
  * Locks: none
  * References: *u1 and *u2 must not change during the call
  *             u1 may equal u2, in which case only one reference is required
  */
 static int
 cr_seeothergids(struct ucred *u1, struct ucred *u2)
 {
 	int i, match;
 	
 	if (!see_other_gids) {
 		match = 0;
 		for (i = 0; i < u1->cr_ngroups; i++) {
 			if (groupmember(u1->cr_groups[i], u2))
 				match = 1;
 			if (match)
 				break;
 		}
 		if (!match) {
 			if (priv_check_cred(u1, PRIV_SEEOTHERGIDS,
 			    SUSER_ALLOWJAIL) != 0)
 				return (ESRCH);
 		}
 	}
 	return (0);
 }
 
 /*-
  * Determine if u1 "can see" the subject specified by u2.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: none
  * References: *u1 and *u2 must not change during the call
  *             u1 may equal u2, in which case only one reference is required
  */
 int
 cr_cansee(struct ucred *u1, struct ucred *u2)
 {
 	int error;
 
 	if ((error = prison_check(u1, u2)))
 		return (error);
 #ifdef MAC
 	if ((error = mac_check_cred_visible(u1, u2)))
 		return (error);
 #endif
 	if ((error = cr_seeotheruids(u1, u2)))
 		return (error);
 	if ((error = cr_seeothergids(u1, u2)))
 		return (error);
 	return (0);
 }
 
 /*-
  * Determine if td "can see" the subject specified by p.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect p->p_ucred must be held.  td really
  *        should be curthread.
  * References: td and p must be valid for the lifetime of the call
  */
 int
 p_cansee(struct thread *td, struct proc *p)
 {
 
 	/* Wrap cr_cansee() for all functionality. */
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	return (cr_cansee(td->td_ucred, p->p_ucred));
 }
 
 /*
  * 'conservative_signals' prevents the delivery of a broad class of
  * signals by unprivileged processes to processes that have changed their
  * credentials since the last invocation of execve().  This can prevent
  * the leakage of cached information or retained privileges as a result
  * of a common class of signal-related vulnerabilities.  However, this
  * may interfere with some applications that expect to be able to
  * deliver these signals to peer processes after having given up
  * privilege.
  */
 static int	conservative_signals = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW,
     &conservative_signals, 0, "Unprivileged processes prevented from "
     "sending certain signals to processes whose credentials have changed");
 /*-
  * Determine whether cred may deliver the specified signal to proc.
  * Returns: 0 for permitted, an errno value otherwise.
  * Locks: A lock must be held for proc.
  * References: cred and proc must be valid for the lifetime of the call.
  */
 int
 cr_cansignal(struct ucred *cred, struct proc *proc, int signum)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(proc, MA_OWNED);
 	/*
 	 * Jail semantics limit the scope of signalling to proc in the
 	 * same jail as cred, if cred is in jail.
 	 */
 	error = prison_check(cred, proc->p_ucred);
 	if (error)
 		return (error);
 #ifdef MAC
 	if ((error = mac_check_proc_signal(cred, proc, signum)))
 		return (error);
 #endif
 	if ((error = cr_seeotheruids(cred, proc->p_ucred)))
 		return (error);
 	if ((error = cr_seeothergids(cred, proc->p_ucred)))
 		return (error);
 
 	/*
 	 * UNIX signal semantics depend on the status of the P_SUGID
 	 * bit on the target process.  If the bit is set, then additional
 	 * restrictions are placed on the set of available signals.
 	 */
 	if (conservative_signals && (proc->p_flag & P_SUGID)) {
 		switch (signum) {
 		case 0:
 		case SIGKILL:
 		case SIGINT:
 		case SIGTERM:
 		case SIGALRM:
 		case SIGSTOP:
 		case SIGTTIN:
 		case SIGTTOU:
 		case SIGTSTP:
 		case SIGHUP:
 		case SIGUSR1:
 		case SIGUSR2:
 			/*
 			 * Generally, permit job and terminal control
 			 * signals.
 			 */
 			break;
 		default:
 			/* Not permitted without privilege. */
 			error = priv_check_cred(cred, PRIV_SIGNAL_SUGID,
 			    SUSER_ALLOWJAIL);
 			if (error)
 				return (error);
 		}
 	}
 
 	/*
 	 * Generally, the target credential's ruid or svuid must match the
 	 * subject credential's ruid or euid.
 	 */
 	if (cred->cr_ruid != proc->p_ucred->cr_ruid &&
 	    cred->cr_ruid != proc->p_ucred->cr_svuid &&
 	    cred->cr_uid != proc->p_ucred->cr_ruid &&
 	    cred->cr_uid != proc->p_ucred->cr_svuid) {
 		/* Not permitted without privilege. */
 		error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*-
  * Determine whether td may deliver the specified signal to p.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect various components of td and p
  *        must be held.  td must be curthread, and a lock must be
  *        held for p.
  * References: td and p must be valid for the lifetime of the call
  */
 int
 p_cansignal(struct thread *td, struct proc *p, int signum)
 {
 
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (td->td_proc == p)
 		return (0);
 
 	/*
 	 * UNIX signalling semantics require that processes in the same
 	 * session always be able to deliver SIGCONT to one another,
 	 * overriding the remaining protections.
 	 */
 	/* XXX: This will require an additional lock of some sort. */
 	if (signum == SIGCONT && td->td_proc->p_session == p->p_session)
 		return (0);
 	/*
 	 * Some compat layers use SIGTHR and higher signals for
 	 * communication between different kernel threads of the same
 	 * process, so that they expect that it's always possible to
 	 * deliver them, even for suid applications where cr_cansignal() can
 	 * deny such ability for security consideration.  It should be
 	 * pretty safe to do since the only way to create two processes
 	 * with the same p_leader is via rfork(2).
 	 */
 	if (td->td_proc->p_leader != NULL && signum >= SIGTHR &&
 	    signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader)
 		return (0);
 
 	return (cr_cansignal(td->td_ucred, p, signum));
 }
 
 /*-
  * Determine whether td may reschedule p.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect various components of td and p
  *        must be held.  td must be curthread, and a lock must
  *        be held for p.
  * References: td and p must be valid for the lifetime of the call
  */
 int
 p_cansched(struct thread *td, struct proc *p)
 {
 	int error;
 
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (td->td_proc == p)
 		return (0);
 	if ((error = prison_check(td->td_ucred, p->p_ucred)))
 		return (error);
 #ifdef MAC
 	if ((error = mac_check_proc_sched(td->td_ucred, p)))
 		return (error);
 #endif
 	if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
 		return (error);
 	if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
 		return (error);
 	if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid &&
 	    td->td_ucred->cr_uid != p->p_ucred->cr_ruid) {
 		error = priv_check_cred(td->td_ucred, PRIV_SCHED_DIFFCRED,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * The 'unprivileged_proc_debug' flag may be used to disable a variety of
  * unprivileged inter-process debugging services, including some procfs
  * functionality, ptrace(), and ktrace().  In the past, inter-process
  * debugging has been involved in a variety of security problems, and sites
  * not requiring the service might choose to disable it when hardening
  * systems.
  *
  * XXX: Should modifying and reading this variable require locking?
  * XXX: data declarations should be together near the beginning of the file.
  */
 static int	unprivileged_proc_debug = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW,
     &unprivileged_proc_debug, 0,
     "Unprivileged processes may use process debugging facilities");
 
 /*-
  * Determine whether td may debug p.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect various components of td and p
  *        must be held.  td must be curthread, and a lock must
  *        be held for p.
  * References: td and p must be valid for the lifetime of the call
  */
 int
 p_candebug(struct thread *td, struct proc *p)
 {
 	int credentialchanged, error, grpsubset, i, uidsubset;
 
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (!unprivileged_proc_debug) {
 		error = priv_check_cred(td->td_ucred, PRIV_DEBUG_UNPRIV,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			return (error);
 	}
 	if (td->td_proc == p)
 		return (0);
 	if ((error = prison_check(td->td_ucred, p->p_ucred)))
 		return (error);
 #ifdef MAC
 	if ((error = mac_check_proc_debug(td->td_ucred, p)))
 		return (error);
 #endif
 	if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
 		return (error);
 	if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
 		return (error);
 
 	/*
 	 * Is p's group set a subset of td's effective group set?  This
 	 * includes p's egid, group access list, rgid, and svgid.
 	 */
 	grpsubset = 1;
 	for (i = 0; i < p->p_ucred->cr_ngroups; i++) {
 		if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) {
 			grpsubset = 0;
 			break;
 		}
 	}
 	grpsubset = grpsubset &&
 	    groupmember(p->p_ucred->cr_rgid, td->td_ucred) &&
 	    groupmember(p->p_ucred->cr_svgid, td->td_ucred);
 
 	/*
 	 * Are the uids present in p's credential equal to td's
 	 * effective uid?  This includes p's euid, svuid, and ruid.
 	 */
 	uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid &&
 	    td->td_ucred->cr_uid == p->p_ucred->cr_svuid &&
 	    td->td_ucred->cr_uid == p->p_ucred->cr_ruid);
 
 	/*
 	 * Has the credential of the process changed since the last exec()?
 	 */
 	credentialchanged = (p->p_flag & P_SUGID);
 
 	/*
 	 * If p's gids aren't a subset, or the uids aren't a subset,
 	 * or the credential has changed, require appropriate privilege
 	 * for td to debug p.
 	 */
 	if (!grpsubset || !uidsubset) {
 		error = priv_check_cred(td->td_ucred, PRIV_DEBUG_DIFFCRED,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			return (error);
 	}
 
 	if (credentialchanged) {
 		error = priv_check_cred(td->td_ucred, PRIV_DEBUG_SUGID,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			return (error);
 	}
 
 	/* Can't trace init when securelevel > 0. */
 	if (p == initproc) {
 		error = securelevel_gt(td->td_ucred, 0);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Can't trace a process that's currently exec'ing.
 	 *
 	 * XXX: Note, this is not a security policy decision, it's a
 	 * basic correctness/functionality decision.  Therefore, this check
 	 * should be moved to the caller's of p_candebug().
 	 */
 	if ((p->p_flag & P_INEXEC) != 0)
 		return (EAGAIN);
 
 	return (0);
 }
 
 /*-
  * Determine whether the subject represented by cred can "see" a socket.
  * Returns: 0 for permitted, ENOENT otherwise.
  */
 int
 cr_canseesocket(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	error = prison_check(cred, so->so_cred);
 	if (error)
 		return (ENOENT);
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_visible(cred, so);
 	SOCK_UNLOCK(so);
 	if (error)
 		return (error);
 #endif
 	if (cr_seeotheruids(cred, so->so_cred))
 		return (ENOENT);
 	if (cr_seeothergids(cred, so->so_cred))
 		return (ENOENT);
 
 	return (0);
 }
 
 /*-
  * Determine whether td can wait for the exit of p.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect various components of td and p
  *        must be held.  td must be curthread, and a lock must
  *        be held for p.
  * References: td and p must be valid for the lifetime of the call
 
  */
 int
 p_canwait(struct thread *td, struct proc *p)
 {
 	int error;
 
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((error = prison_check(td->td_ucred, p->p_ucred)))
 		return (error);
 #ifdef MAC
 	if ((error = mac_check_proc_wait(td->td_ucred, p)))
 		return (error);
 #endif
 #if 0
 	/* XXXMAC: This could have odd effects on some shells. */
 	if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
 		return (error);
 #endif
 
 	return (0);
 }
 
 /*
  * Allocate a zeroed cred structure.
  */
 struct ucred *
 crget(void)
 {
 	register struct ucred *cr;
 
 	MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
 	refcount_init(&cr->cr_ref, 1);
 #ifdef MAC
 	mac_init_cred(cr);
 #endif
 	return (cr);
 }
 
 /*
  * Claim another reference to a ucred structure.
  */
 struct ucred *
 crhold(struct ucred *cr)
 {
 
 	refcount_acquire(&cr->cr_ref);
 	return (cr);
 }
 
 /*
  * Free a cred structure.  Throws away space when ref count gets to 0.
  */
 void
 crfree(struct ucred *cr)
 {
 
 	KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
 	KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
 	if (refcount_release(&cr->cr_ref)) {
 		/*
 		 * Some callers of crget(), such as nfs_statfs(),
 		 * allocate a temporary credential, but don't
 		 * allocate a uidinfo structure.
 		 */
 		if (cr->cr_uidinfo != NULL)
 			uifree(cr->cr_uidinfo);
 		if (cr->cr_ruidinfo != NULL)
 			uifree(cr->cr_ruidinfo);
 		/*
 		 * Free a prison, if any.
 		 */
 		if (jailed(cr))
 			prison_free(cr->cr_prison);
 #ifdef MAC
 		mac_destroy_cred(cr);
 #endif
 		FREE(cr, M_CRED);
 	}
 }
 
 /*
  * Check to see if this ucred is shared.
  */
 int
 crshared(struct ucred *cr)
 {
 
 	return (cr->cr_ref > 1);
 }
 
 /*
  * Copy a ucred's contents from a template.  Does not block.
  */
 void
 crcopy(struct ucred *dest, struct ucred *src)
 {
 
 	KASSERT(crshared(dest) == 0, ("crcopy of shared ucred"));
 	bcopy(&src->cr_startcopy, &dest->cr_startcopy,
 	    (unsigned)((caddr_t)&src->cr_endcopy -
 		(caddr_t)&src->cr_startcopy));
 	uihold(dest->cr_uidinfo);
 	uihold(dest->cr_ruidinfo);
 	if (jailed(dest))
 		prison_hold(dest->cr_prison);
 #ifdef MAC
 	mac_copy_cred(src, dest);
 #endif
 }
 
 /*
  * Dup cred struct to a new held one.
  */
 struct ucred *
 crdup(struct ucred *cr)
 {
 	struct ucred *newcr;
 
 	newcr = crget();
 	crcopy(newcr, cr);
 	return (newcr);
 }
 
 /*
  * Fill in a struct xucred based on a struct ucred.
  */
 void
 cru2x(struct ucred *cr, struct xucred *xcr)
 {
 
 	bzero(xcr, sizeof(*xcr));
 	xcr->cr_version = XUCRED_VERSION;
 	xcr->cr_uid = cr->cr_uid;
 	xcr->cr_ngroups = cr->cr_ngroups;
 	bcopy(cr->cr_groups, xcr->cr_groups, sizeof(cr->cr_groups));
 }
 
 /*
  * small routine to swap a thread's current ucred for the correct one taken
  * from the process.
  */
 void
 cred_update_thread(struct thread *td)
 {
 	struct proc *p;
 	struct ucred *cred;
 
 	p = td->td_proc;
 	cred = td->td_ucred;
 	PROC_LOCK(p);
 	td->td_ucred = crhold(p->p_ucred);
 	PROC_UNLOCK(p);
 	if (cred != NULL)
 		crfree(cred);
 }
 
 /*
  * Get login name, if available.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getlogin_args {
 	char	*namebuf;
 	u_int	namelen;
 };
 #endif
 /* ARGSUSED */
 int
 getlogin(struct thread *td, struct getlogin_args *uap)
 {
 	int error;
 	char login[MAXLOGNAME];
 	struct proc *p = td->td_proc;
 
 	if (uap->namelen > MAXLOGNAME)
 		uap->namelen = MAXLOGNAME;
 	PROC_LOCK(p);
 	SESS_LOCK(p->p_session);
 	bcopy(p->p_session->s_login, login, uap->namelen);
 	SESS_UNLOCK(p->p_session);
 	PROC_UNLOCK(p);
 	error = copyout(login, uap->namebuf, uap->namelen);
 	return(error);
 }
 
 /*
  * Set login name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct setlogin_args {
 	char	*namebuf;
 };
 #endif
 /* ARGSUSED */
 int
 setlogin(struct thread *td, struct setlogin_args *uap)
 {
 	struct proc *p = td->td_proc;
 	int error;
 	char logintmp[MAXLOGNAME];
 
 	error = priv_check_cred(td->td_ucred, PRIV_PROC_SETLOGIN,
 	    SUSER_ALLOWJAIL);
 	if (error)
 		return (error);
 	error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL);
 	if (error == ENAMETOOLONG)
 		error = EINVAL;
 	else if (!error) {
 		PROC_LOCK(p);
 		SESS_LOCK(p->p_session);
 		(void) memcpy(p->p_session->s_login, logintmp,
 		    sizeof(logintmp));
 		SESS_UNLOCK(p->p_session);
 		PROC_UNLOCK(p);
 	}
 	return (error);
 }
 
 void
 setsugid(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_flag |= P_SUGID;
 	if (!(p->p_pfsflags & PF_ISUGID))
 		p->p_stops = 0;
 }
 
 /*-
  * Change a process's effective uid.
  * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_euid(struct ucred *newcred, struct uidinfo *euip)
 {
 
 	newcred->cr_uid = euip->ui_uid;
 	uihold(euip);
 	uifree(newcred->cr_uidinfo);
 	newcred->cr_uidinfo = euip;
 }
 
 /*-
  * Change a process's effective gid.
  * Side effects: newcred->cr_gid will be modified.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_egid(struct ucred *newcred, gid_t egid)
 {
 
 	newcred->cr_groups[0] = egid;
 }
 
 /*-
  * Change a process's real uid.
  * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo
  *               will be updated, and the old and new cr_ruidinfo proc
  *               counts will be updated.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_ruid(struct ucred *newcred, struct uidinfo *ruip)
 {
 
 	(void)chgproccnt(newcred->cr_ruidinfo, -1, 0);
 	newcred->cr_ruid = ruip->ui_uid;
 	uihold(ruip);
 	uifree(newcred->cr_ruidinfo);
 	newcred->cr_ruidinfo = ruip;
 	(void)chgproccnt(newcred->cr_ruidinfo, 1, 0);
 }
 
 /*-
  * Change a process's real gid.
  * Side effects: newcred->cr_rgid will be updated.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_rgid(struct ucred *newcred, gid_t rgid)
 {
 
 	newcred->cr_rgid = rgid;
 }
 
 /*-
  * Change a process's saved uid.
  * Side effects: newcred->cr_svuid will be updated.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_svuid(struct ucred *newcred, uid_t svuid)
 {
 
 	newcred->cr_svuid = svuid;
 }
 
 /*-
  * Change a process's saved gid.
  * Side effects: newcred->cr_svgid will be updated.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_svgid(struct ucred *newcred, gid_t svgid)
 {
 
 	newcred->cr_svgid = svgid;
 }
Index: head/sys/kern/kern_resource.c
===================================================================
--- head/sys/kern/kern_resource.c	(revision 167231)
+++ head/sys/kern/kern_resource.c	(revision 167232)
@@ -1,1260 +1,1257 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_resource.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/time.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 
 
 static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
 static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
 #define	UIHASH(uid)	(&uihashtbl[(uid) & uihash])
 static struct mtx uihashtbl_mtx;
 static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
 static u_long uihash;		/* size of hash table - 1 */
 
 static void	calcru1(struct proc *p, struct rusage_ext *ruxp,
 		    struct timeval *up, struct timeval *sp);
 static int	donice(struct thread *td, struct proc *chgp, int n);
 static struct uidinfo *uilookup(uid_t uid);
 
 /*
  * Resource controls and accounting.
  */
-
 #ifndef _SYS_SYSPROTO_H_
 struct getpriority_args {
 	int	which;
 	int	who;
 };
 #endif
 int
 getpriority(td, uap)
 	struct thread *td;
 	register struct getpriority_args *uap;
 {
 	struct proc *p;
 	struct pgrp *pg;
 	int error, low;
 
 	error = 0;
 	low = PRIO_MAX + 1;
 	switch (uap->which) {
 
 	case PRIO_PROCESS:
 		if (uap->who == 0)
 			low = td->td_proc->p_nice;
 		else {
 			p = pfind(uap->who);
 			if (p == NULL)
 				break;
 			if (p_cansee(td, p) == 0)
 				low = p->p_nice;
 			PROC_UNLOCK(p);
 		}
 		break;
 
 	case PRIO_PGRP:
 		sx_slock(&proctree_lock);
 		if (uap->who == 0) {
 			pg = td->td_proc->p_pgrp;
 			PGRP_LOCK(pg);
 		} else {
 			pg = pgfind(uap->who);
 			if (pg == NULL) {
 				sx_sunlock(&proctree_lock);
 				break;
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (!p_cansee(td, p)) {
 				if (p->p_nice < low)
 					low = p->p_nice;
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pg);
 		break;
 
 	case PRIO_USER:
 		if (uap->who == 0)
 			uap->who = td->td_ucred->cr_uid;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			/* Do not bother to check PRS_NEW processes */
 			if (p->p_state == PRS_NEW)
 				continue;
 			PROC_LOCK(p);
 			if (!p_cansee(td, p) &&
 			    p->p_ucred->cr_uid == uap->who) {
 				if (p->p_nice < low)
 					low = p->p_nice;
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (low == PRIO_MAX + 1 && error == 0)
 		error = ESRCH;
 	td->td_retval[0] = low;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setpriority_args {
 	int	which;
 	int	who;
 	int	prio;
 };
 #endif
 int
 setpriority(td, uap)
 	struct thread *td;
 	struct setpriority_args *uap;
 {
 	struct proc *curp, *p;
 	struct pgrp *pg;
 	int found = 0, error = 0;
 
 	curp = td->td_proc;
 	switch (uap->which) {
 	case PRIO_PROCESS:
 		if (uap->who == 0) {
 			PROC_LOCK(curp);
 			error = donice(td, curp, uap->prio);
 			PROC_UNLOCK(curp);
 		} else {
 			p = pfind(uap->who);
 			if (p == 0)
 				break;
 			if (p_cansee(td, p) == 0)
 				error = donice(td, p, uap->prio);
 			PROC_UNLOCK(p);
 		}
 		found++;
 		break;
 
 	case PRIO_PGRP:
 		sx_slock(&proctree_lock);
 		if (uap->who == 0) {
 			pg = curp->p_pgrp;
 			PGRP_LOCK(pg);
 		} else {
 			pg = pgfind(uap->who);
 			if (pg == NULL) {
 				sx_sunlock(&proctree_lock);
 				break;
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (!p_cansee(td, p)) {
 				error = donice(td, p, uap->prio);
 				found++;
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pg);
 		break;
 
 	case PRIO_USER:
 		if (uap->who == 0)
 			uap->who = td->td_ucred->cr_uid;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_ucred->cr_uid == uap->who &&
 			    !p_cansee(td, p)) {
 				error = donice(td, p, uap->prio);
 				found++;
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (found == 0 && error == 0)
 		error = ESRCH;
 	return (error);
 }
 
 /*
  * Set "nice" for a (whole) process.
  */
 static int
 donice(struct thread *td, struct proc *p, int n)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((error = p_cansched(td, p)))
 		return (error);
 	if (n > PRIO_MAX)
 		n = PRIO_MAX;
 	if (n < PRIO_MIN)
 		n = PRIO_MIN;
  	if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
 		return (EACCES);
 	mtx_lock_spin(&sched_lock);
 	sched_nice(p, n);
 	mtx_unlock_spin(&sched_lock);
 	return (0);
 }
 
 /*
  * Set realtime priority for LWP.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rtprio_thread_args {
 	int		function;
 	lwpid_t		lwpid;
 	struct rtprio	*rtp;
 };
 #endif
-
 int
 rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
 {
 	struct proc *curp;
 	struct proc *p;
 	struct rtprio rtp;
 	struct thread *td1;
 	int cierror, error;
 
 	/* Perform copyin before acquiring locks if needed. */
 	if (uap->function == RTP_SET)
 		cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 	else
 		cierror = 0;
 
 	curp = td->td_proc;
 	/*
 	 * Though lwpid is unique, only current process is supported
 	 * since there is no efficient way to look up a LWP yet.
 	 */
 	p = curp;
 	PROC_LOCK(p);
 
 	switch (uap->function) {
 	case RTP_LOOKUP:
 		if ((error = p_cansee(td, p)))
 			break;
 		mtx_lock_spin(&sched_lock);
 		if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
 			td1 = td;
 		else
 			td1 = thread_find(p, uap->lwpid);
 		if (td1 != NULL)
 			pri_to_rtp(td1, &rtp);
 		else
 			error = ESRCH;
 		mtx_unlock_spin(&sched_lock);
 		PROC_UNLOCK(p);
 		return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 	case RTP_SET:
 		if ((error = p_cansched(td, p)) || (error = cierror))
 			break;
 
 		/* Disallow setting rtprio in most cases if not superuser. */
 		if (priv_check(td, PRIV_SCHED_RTPRIO) != 0) {
 			/* can't set realtime priority */
 /*
  * Realtime priority has to be restricted for reasons which should be
  * obvious.  However, for idle priority, there is a potential for
  * system deadlock if an idleprio process gains a lock on a resource
  * that other processes need (and the idleprio process can't run
  * due to a CPU-bound normal process).  Fix me!  XXX
  */
 #if 0
  			if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 #else
 			if (rtp.type != RTP_PRIO_NORMAL) {
 #endif
 				error = EPERM;
 				break;
 			}
 		}
 
 		mtx_lock_spin(&sched_lock);
 		if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
 			td1 = td;
 		else
 			td1 = thread_find(p, uap->lwpid);
 		if (td1 != NULL)
 			error = rtp_to_pri(&rtp, td1);
 		else
 			error = ESRCH;
 		mtx_unlock_spin(&sched_lock);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Set realtime priority.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rtprio_args {
 	int		function;
 	pid_t		pid;
 	struct rtprio	*rtp;
 };
 #endif
-
 int
 rtprio(td, uap)
 	struct thread *td;		/* curthread */
 	register struct rtprio_args *uap;
 {
 	struct proc *curp;
 	struct proc *p;
 	struct thread *tdp;
 	struct rtprio rtp;
 	int cierror, error;
 
 	/* Perform copyin before acquiring locks if needed. */
 	if (uap->function == RTP_SET)
 		cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 	else
 		cierror = 0;
 
 	curp = td->td_proc;
 	if (uap->pid == 0) {
 		p = curp;
 		PROC_LOCK(p);
 	} else {
 		p = pfind(uap->pid);
 		if (p == NULL)
 			return (ESRCH);
 	}
 
 	switch (uap->function) {
 	case RTP_LOOKUP:
 		if ((error = p_cansee(td, p)))
 			break;
 		mtx_lock_spin(&sched_lock);
 		/*
 		 * Return OUR priority if no pid specified,
 		 * or if one is, report the highest priority
 		 * in the process.  There isn't much more you can do as 
 		 * there is only room to return a single priority.
 		 * XXXKSE: maybe need a new interface to report 
 		 * priorities of multiple system scope threads.
 		 * Note: specifying our own pid is not the same
 		 * as leaving it zero.
 		 */
 		if (uap->pid == 0) {
 			pri_to_rtp(td, &rtp);
 		} else {
 			struct rtprio rtp2;
 
 			rtp.type = RTP_PRIO_IDLE;
 			rtp.prio = RTP_PRIO_MAX;
 			FOREACH_THREAD_IN_PROC(p, tdp) {
 				pri_to_rtp(tdp, &rtp2);
 				if (rtp2.type <  rtp.type ||
 				    (rtp2.type == rtp.type &&
 				    rtp2.prio < rtp.prio)) {
 					rtp.type = rtp2.type;
 					rtp.prio = rtp2.prio;
 				}
 			}
 		}
 		mtx_unlock_spin(&sched_lock);
 		PROC_UNLOCK(p);
 		return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 	case RTP_SET:
 		if ((error = p_cansched(td, p)) || (error = cierror))
 			break;
 
 		/* Disallow setting rtprio in most cases if not superuser. */
 		if (priv_check(td, PRIV_SCHED_RTPRIO) != 0) {
 			/* can't set someone else's */
 			if (uap->pid) {
 				error = EPERM;
 				break;
 			}
 			/* can't set realtime priority */
 /*
  * Realtime priority has to be restricted for reasons which should be
  * obvious.  However, for idle priority, there is a potential for
  * system deadlock if an idleprio process gains a lock on a resource
  * that other processes need (and the idleprio process can't run
  * due to a CPU-bound normal process).  Fix me!  XXX
  */
 #if 0
  			if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 #else
 			if (rtp.type != RTP_PRIO_NORMAL) {
 #endif
 				error = EPERM;
 				break;
 			}
 		}
 
 		/*
 		 * If we are setting our own priority, set just our
 		 * thread but if we are doing another process,
 		 * do all the threads on that process. If we
 		 * specify our own pid we do the latter.
 		 */
 		mtx_lock_spin(&sched_lock);
 		if (uap->pid == 0) {
 			error = rtp_to_pri(&rtp, td);
 		} else {
 			FOREACH_THREAD_IN_PROC(p, td) {
 				if ((error = rtp_to_pri(&rtp, td)) != 0)
 					break;
 			}
 		}
 		mtx_unlock_spin(&sched_lock);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 int
 rtp_to_pri(struct rtprio *rtp, struct thread *td)
 {
 	u_char	newpri;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (rtp->prio > RTP_PRIO_MAX)
 		return (EINVAL);
 	switch (RTP_PRIO_BASE(rtp->type)) {
 	case RTP_PRIO_REALTIME:
 		newpri = PRI_MIN_REALTIME + rtp->prio;
 		break;
 	case RTP_PRIO_NORMAL:
 		newpri = PRI_MIN_TIMESHARE + rtp->prio;
 		break;
 	case RTP_PRIO_IDLE:
 		newpri = PRI_MIN_IDLE + rtp->prio;
 		break;
 	default:
 		return (EINVAL);
 	}
 	sched_class(td, rtp->type);	/* XXX fix */
 	sched_user_prio(td, newpri);
 	if (curthread == td)
 		sched_prio(curthread, td->td_user_pri); /* XXX dubious */
 	return (0);
 }
 
 void
 pri_to_rtp(struct thread *td, struct rtprio *rtp)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	switch (PRI_BASE(td->td_pri_class)) {
 	case PRI_REALTIME:
 		rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
 		break;
 	case PRI_TIMESHARE:
 		rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
 		break;
 	case PRI_IDLE:
 		rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
 		break;
 	default:
 		break;
 	}
 	rtp->type = td->td_pri_class;
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osetrlimit_args {
 	u_int	which;
 	struct	orlimit *rlp;
 };
 #endif
 int
 osetrlimit(td, uap)
 	struct thread *td;
 	register struct osetrlimit_args *uap;
 {
 	struct orlimit olim;
 	struct rlimit lim;
 	int error;
 
 	if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
 		return (error);
 	lim.rlim_cur = olim.rlim_cur;
 	lim.rlim_max = olim.rlim_max;
 	error = kern_setrlimit(td, uap->which, &lim);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ogetrlimit_args {
 	u_int	which;
 	struct	orlimit *rlp;
 };
 #endif
 int
 ogetrlimit(td, uap)
 	struct thread *td;
 	register struct ogetrlimit_args *uap;
 {
 	struct orlimit olim;
 	struct rlimit rl;
 	struct proc *p;
 	int error;
 
 	if (uap->which >= RLIM_NLIMITS)
 		return (EINVAL);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	lim_rlimit(p, uap->which, &rl);
 	PROC_UNLOCK(p);
 
 	/*
 	 * XXX would be more correct to convert only RLIM_INFINITY to the
 	 * old RLIM_INFINITY and fail with EOVERFLOW for other larger
 	 * values.  Most 64->32 and 32->16 conversions, including not
 	 * unimportant ones of uids are even more broken than what we
 	 * do here (they blindly truncate).  We don't do this correctly
 	 * here since we have little experience with EOVERFLOW yet.
 	 * Elsewhere, getuid() can't fail...
 	 */
 	olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
 	olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
 	error = copyout(&olim, uap->rlp, sizeof(olim));
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct __setrlimit_args {
 	u_int	which;
 	struct	rlimit *rlp;
 };
 #endif
 int
 setrlimit(td, uap)
 	struct thread *td;
 	register struct __setrlimit_args *uap;
 {
 	struct rlimit alim;
 	int error;
 
 	if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
 		return (error);
 	error = kern_setrlimit(td, uap->which, &alim);
 	return (error);
 }
 
 int
 kern_setrlimit(td, which, limp)
 	struct thread *td;
 	u_int which;
 	struct rlimit *limp;
 {
 	struct plimit *newlim, *oldlim;
 	struct proc *p;
 	register struct rlimit *alimp;
 	rlim_t oldssiz;
 	int error;
 
 	if (which >= RLIM_NLIMITS)
 		return (EINVAL);
 
 	/*
 	 * Preserve historical bugs by treating negative limits as unsigned.
 	 */
 	if (limp->rlim_cur < 0)
 		limp->rlim_cur = RLIM_INFINITY;
 	if (limp->rlim_max < 0)
 		limp->rlim_max = RLIM_INFINITY;
 
 	oldssiz = 0;
 	p = td->td_proc;
 	newlim = lim_alloc();
 	PROC_LOCK(p);
 	oldlim = p->p_limit;
 	alimp = &oldlim->pl_rlimit[which];
 	if (limp->rlim_cur > alimp->rlim_max ||
 	    limp->rlim_max > alimp->rlim_max)
 		if ((error = priv_check_cred(td->td_ucred,
 		    PRIV_PROC_SETRLIMIT, SUSER_ALLOWJAIL))) {
 			PROC_UNLOCK(p);
 			lim_free(newlim);
 			return (error);
 		}
 	if (limp->rlim_cur > limp->rlim_max)
 		limp->rlim_cur = limp->rlim_max;
 	lim_copy(newlim, oldlim);
 	alimp = &newlim->pl_rlimit[which];
 
 	switch (which) {
 
 	case RLIMIT_CPU:
 		mtx_lock_spin(&sched_lock);
 		p->p_cpulimit = limp->rlim_cur;
 		mtx_unlock_spin(&sched_lock);
 		break;
 	case RLIMIT_DATA:
 		if (limp->rlim_cur > maxdsiz)
 			limp->rlim_cur = maxdsiz;
 		if (limp->rlim_max > maxdsiz)
 			limp->rlim_max = maxdsiz;
 		break;
 
 	case RLIMIT_STACK:
 		if (limp->rlim_cur > maxssiz)
 			limp->rlim_cur = maxssiz;
 		if (limp->rlim_max > maxssiz)
 			limp->rlim_max = maxssiz;
 		oldssiz = alimp->rlim_cur;
 		break;
 
 	case RLIMIT_NOFILE:
 		if (limp->rlim_cur > maxfilesperproc)
 			limp->rlim_cur = maxfilesperproc;
 		if (limp->rlim_max > maxfilesperproc)
 			limp->rlim_max = maxfilesperproc;
 		break;
 
 	case RLIMIT_NPROC:
 		if (limp->rlim_cur > maxprocperuid)
 			limp->rlim_cur = maxprocperuid;
 		if (limp->rlim_max > maxprocperuid)
 			limp->rlim_max = maxprocperuid;
 		if (limp->rlim_cur < 1)
 			limp->rlim_cur = 1;
 		if (limp->rlim_max < 1)
 			limp->rlim_max = 1;
 		break;
 	}
 	*alimp = *limp;
 	p->p_limit = newlim;
 	PROC_UNLOCK(p);
 	lim_free(oldlim);
 
 	if (which == RLIMIT_STACK) {
 		/*
 		 * Stack is allocated to the max at exec time with only
 		 * "rlim_cur" bytes accessible.  If stack limit is going
 		 * up make more accessible, if going down make inaccessible.
 		 */
 		if (limp->rlim_cur != oldssiz) {
 			vm_offset_t addr;
 			vm_size_t size;
 			vm_prot_t prot;
 
 			if (limp->rlim_cur > oldssiz) {
 				prot = p->p_sysent->sv_stackprot;
 				size = limp->rlim_cur - oldssiz;
 				addr = p->p_sysent->sv_usrstack -
 				    limp->rlim_cur;
 			} else {
 				prot = VM_PROT_NONE;
 				size = oldssiz - limp->rlim_cur;
 				addr = p->p_sysent->sv_usrstack - oldssiz;
 			}
 			addr = trunc_page(addr);
 			size = round_page(size);
 			(void)vm_map_protect(&p->p_vmspace->vm_map,
 			    addr, addr + size, prot, FALSE);
 		}
 	}
 
 	/*
 	 * The data size limit may need to be changed to a value
 	 * that makes sense for the 32 bit binary.
 	 */
 	if (p->p_sysent->sv_fixlimits != NULL)
 		p->p_sysent->sv_fixlimits(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __getrlimit_args {
 	u_int	which;
 	struct	rlimit *rlp;
 };
 #endif
 /* ARGSUSED */
 int
 getrlimit(td, uap)
 	struct thread *td;
 	register struct __getrlimit_args *uap;
 {
 	struct rlimit rlim;
 	struct proc *p;
 	int error;
 
 	if (uap->which >= RLIM_NLIMITS)
 		return (EINVAL);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	lim_rlimit(p, uap->which, &rlim);
 	PROC_UNLOCK(p);
 	error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
 	return (error);
 }
 
 /*
  * Transform the running time and tick information for children of proc p
  * into user and system time usage.
  */
 void
 calccru(p, up, sp)
 	struct proc *p;
 	struct timeval *up;
 	struct timeval *sp;
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	calcru1(p, &p->p_crux, up, sp);
 }
 
 /*
  * Transform the running time and tick information in proc p into user
  * and system time usage.  If appropriate, include the current time slice
  * on this CPU.
  */
 void
 calcru(struct proc *p, struct timeval *up, struct timeval *sp)
 {
 	struct rusage_ext rux;
 	struct thread *td;
 	uint64_t u;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&sched_lock, MA_NOTOWNED);
 	mtx_lock_spin(&sched_lock);
 
 	/*
 	 * If we are getting stats for the current process, then add in the
 	 * stats that this thread has accumulated in its current time slice.
 	 * We reset the thread and CPU state as if we had performed a context
 	 * switch right here.
 	 */
 	if (curthread->td_proc == p) {
 		td = curthread;
 		u = cpu_ticks();
 		p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
 		PCPU_SET(switchtime, u);
 		p->p_rux.rux_uticks += td->td_uticks;
 		td->td_uticks = 0;
 		p->p_rux.rux_iticks += td->td_iticks;
 		td->td_iticks = 0;
 		p->p_rux.rux_sticks += td->td_sticks;
 		td->td_sticks = 0;
 	}
 	/* Work on a copy of p_rux so we can let go of sched_lock */
 	rux = p->p_rux;
 	mtx_unlock_spin(&sched_lock);
 	calcru1(p, &rux, up, sp);
 	/* Update the result from the p_rux copy */
 	p->p_rux.rux_uu = rux.rux_uu;
 	p->p_rux.rux_su = rux.rux_su;
 	p->p_rux.rux_tu = rux.rux_tu;
 }
 
 static void
 calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
     struct timeval *sp)
 {
 	/* {user, system, interrupt, total} {ticks, usec}: */
 	u_int64_t ut, uu, st, su, it, tt, tu;
 
 	ut = ruxp->rux_uticks;
 	st = ruxp->rux_sticks;
 	it = ruxp->rux_iticks;
 	tt = ut + st + it;
 	if (tt == 0) {
 		/* Avoid divide by zero */
 		st = 1;
 		tt = 1;
 	}
 	tu = cputick2usec(ruxp->rux_runtime);
 	if ((int64_t)tu < 0) {
 		/* XXX: this should be an assert /phk */
 		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
 		    (intmax_t)tu, p->p_pid, p->p_comm);
 		tu = ruxp->rux_tu;
 	}
 
 	if (tu >= ruxp->rux_tu) {
 		/*
 		 * The normal case, time increased.
 		 * Enforce monotonicity of bucketed numbers.
 		 */
 		uu = (tu * ut) / tt;
 		if (uu < ruxp->rux_uu)
 			uu = ruxp->rux_uu;
 		su = (tu * st) / tt;
 		if (su < ruxp->rux_su)
 			su = ruxp->rux_su;
 	} else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
 		/* 
 		 * When we calibrate the cputicker, it is not uncommon to
 		 * see the presumably fixed frequency increase slightly over
 		 * time as a result of thermal stabilization and NTP
 		 * discipline (of the reference clock).  We therefore ignore
 		 * a bit of backwards slop because we  expect to catch up
  		 * shortly.  We use a 3 microsecond limit to catch low
 		 * counts and a 1% limit for high counts.
 		 */
 		uu = ruxp->rux_uu;
 		su = ruxp->rux_su;
 		tu = ruxp->rux_tu;
 	} else { /* tu < ruxp->rux_tu */
 		/*
 		 * What happene here was likely that a laptop, which ran at
 		 * a reduced clock frequency at boot, kicked into high gear.
 		 * The wisdom of spamming this message in that case is
 		 * dubious, but it might also be indicative of something
 		 * serious, so lets keep it and hope laptops can be made
 		 * more truthful about their CPU speed via ACPI.
 		 */
 		printf("calcru: runtime went backwards from %ju usec "
 		    "to %ju usec for pid %d (%s)\n",
 		    (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
 		    p->p_pid, p->p_comm);
 		uu = (tu * ut) / tt;
 		su = (tu * st) / tt;
 	}
 
 	ruxp->rux_uu = uu;
 	ruxp->rux_su = su;
 	ruxp->rux_tu = tu;
 
 	up->tv_sec = uu / 1000000;
 	up->tv_usec = uu % 1000000;
 	sp->tv_sec = su / 1000000;
 	sp->tv_usec = su % 1000000;
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getrusage_args {
 	int	who;
 	struct	rusage *rusage;
 };
 #endif
 int
 getrusage(td, uap)
 	register struct thread *td;
 	register struct getrusage_args *uap;
 {
 	struct rusage ru;
 	int error;
 
 	error = kern_getrusage(td, uap->who, &ru);
 	if (error == 0)
 		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 	return (error);
 }
 
 int
 kern_getrusage(td, who, rup)
 	struct thread *td;
 	int who;
 	struct rusage *rup;
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	switch (who) {
 
 	case RUSAGE_SELF:
 		*rup = p->p_stats->p_ru;
 		calcru(p, &rup->ru_utime, &rup->ru_stime);
 		break;
 
 	case RUSAGE_CHILDREN:
 		*rup = p->p_stats->p_cru;
 		calccru(p, &rup->ru_utime, &rup->ru_stime);
 		break;
 
 	default:
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 void
 ruadd(ru, rux, ru2, rux2)
 	struct rusage *ru;
 	struct rusage_ext *rux;
 	struct rusage *ru2;
 	struct rusage_ext *rux2;
 {
 	register long *ip, *ip2;
 	register int i;
 
 	rux->rux_runtime += rux2->rux_runtime;
 	rux->rux_uticks += rux2->rux_uticks;
 	rux->rux_sticks += rux2->rux_sticks;
 	rux->rux_iticks += rux2->rux_iticks;
 	rux->rux_uu += rux2->rux_uu;
 	rux->rux_su += rux2->rux_su;
 	rux->rux_tu += rux2->rux_tu;
 	if (ru->ru_maxrss < ru2->ru_maxrss)
 		ru->ru_maxrss = ru2->ru_maxrss;
 	ip = &ru->ru_first;
 	ip2 = &ru2->ru_first;
 	for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
 		*ip++ += *ip2++;
 }
 
 /*
  * Allocate a new resource limits structure and initialize its
  * reference count and mutex pointer.
  */
 struct plimit *
 lim_alloc()
 {
 	struct plimit *limp;
 
 	limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
 	refcount_init(&limp->pl_refcnt, 1);
 	return (limp);
 }
 
 struct plimit *
 lim_hold(limp)
 	struct plimit *limp;
 {
 
 	refcount_acquire(&limp->pl_refcnt);
 	return (limp);
 }
 
 void
 lim_free(limp)
 	struct plimit *limp;
 {
 
 	KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
 	if (refcount_release(&limp->pl_refcnt))
 		free((void *)limp, M_PLIMIT);
 }
 
 /*
  * Make a copy of the plimit structure.
  * We share these structures copy-on-write after fork.
  */
 void
 lim_copy(dst, src)
 	struct plimit *dst, *src;
 {
 
 	KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
 	bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
 }
 
 /*
  * Return the hard limit for a particular system resource.  The
  * which parameter specifies the index into the rlimit array.
  */
 rlim_t
 lim_max(struct proc *p, int which)
 {
 	struct rlimit rl;
 
 	lim_rlimit(p, which, &rl);
 	return (rl.rlim_max);
 }
 
 /*
  * Return the current (soft) limit for a particular system resource.
  * The which parameter which specifies the index into the rlimit array
  */
 rlim_t
 lim_cur(struct proc *p, int which)
 {
 	struct rlimit rl;
 
 	lim_rlimit(p, which, &rl);
 	return (rl.rlim_cur);
 }
 
 /*
  * Return a copy of the entire rlimit structure for the system limit
  * specified by 'which' in the rlimit structure pointed to by 'rlp'.
  */
 void
 lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(which >= 0 && which < RLIM_NLIMITS,
 	    ("request for invalid resource limit"));
 	*rlp = p->p_limit->pl_rlimit[which];
 }
 
 /*
  * Find the uidinfo structure for a uid.  This structure is used to
  * track the total resource consumption (process count, socket buffer
  * size, etc.) for the uid and impose limits.
  */
 void
 uihashinit()
 {
 
 	uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
 	mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
 }
 
 /*
  * Look up a uidinfo struct for the parameter uid.
  * uihashtbl_mtx must be locked.
  */
 static struct uidinfo *
 uilookup(uid)
 	uid_t uid;
 {
 	struct uihashhead *uipp;
 	struct uidinfo *uip;
 
 	mtx_assert(&uihashtbl_mtx, MA_OWNED);
 	uipp = UIHASH(uid);
 	LIST_FOREACH(uip, uipp, ui_hash)
 		if (uip->ui_uid == uid)
 			break;
 
 	return (uip);
 }
 
 /*
  * Find or allocate a struct uidinfo for a particular uid.
  * Increase refcount on uidinfo struct returned.
  * uifree() should be called on a struct uidinfo when released.
  */
 struct uidinfo *
 uifind(uid)
 	uid_t uid;
 {
 	struct uidinfo *old_uip, *uip;
 
 	mtx_lock(&uihashtbl_mtx);
 	uip = uilookup(uid);
 	if (uip == NULL) {
 		mtx_unlock(&uihashtbl_mtx);
 		uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
 		mtx_lock(&uihashtbl_mtx);
 		/*
 		 * There's a chance someone created our uidinfo while we
 		 * were in malloc and not holding the lock, so we have to
 		 * make sure we don't insert a duplicate uidinfo.
 		 */
 		if ((old_uip = uilookup(uid)) != NULL) {
 			/* Someone else beat us to it. */
 			free(uip, M_UIDINFO);
 			uip = old_uip;
 		} else {
 			uip->ui_mtxp = mtx_pool_alloc(mtxpool_sleep);
 			uip->ui_uid = uid;
 			LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
 		}
 	}
 	uihold(uip);
 	mtx_unlock(&uihashtbl_mtx);
 	return (uip);
 }
 
 /*
  * Place another refcount on a uidinfo struct.
  */
 void
 uihold(uip)
 	struct uidinfo *uip;
 {
 
 	UIDINFO_LOCK(uip);
 	uip->ui_ref++;
 	UIDINFO_UNLOCK(uip);
 }
 
 /*-
  * Since uidinfo structs have a long lifetime, we use an
  * opportunistic refcounting scheme to avoid locking the lookup hash
  * for each release.
  *
  * If the refcount hits 0, we need to free the structure,
  * which means we need to lock the hash.
  * Optimal case:
  *   After locking the struct and lowering the refcount, if we find
  *   that we don't need to free, simply unlock and return.
  * Suboptimal case:
  *   If refcount lowering results in need to free, bump the count
  *   back up, lose the lock and aquire the locks in the proper
  *   order to try again.
  */
 void
 uifree(uip)
 	struct uidinfo *uip;
 {
 
 	/* Prepare for optimal case. */
 	UIDINFO_LOCK(uip);
 
 	if (--uip->ui_ref != 0) {
 		UIDINFO_UNLOCK(uip);
 		return;
 	}
 
 	/* Prepare for suboptimal case. */
 	uip->ui_ref++;
 	UIDINFO_UNLOCK(uip);
 	mtx_lock(&uihashtbl_mtx);
 	UIDINFO_LOCK(uip);
 
 	/*
 	 * We must subtract one from the count again because we backed out
 	 * our initial subtraction before dropping the lock.
 	 * Since another thread may have added a reference after we dropped the
 	 * initial lock we have to test for zero again.
 	 */
 	if (--uip->ui_ref == 0) {
 		LIST_REMOVE(uip, ui_hash);
 		mtx_unlock(&uihashtbl_mtx);
 		if (uip->ui_sbsize != 0)
 			printf("freeing uidinfo: uid = %d, sbsize = %jd\n",
 			    uip->ui_uid, (intmax_t)uip->ui_sbsize);
 		if (uip->ui_proccnt != 0)
 			printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
 			    uip->ui_uid, uip->ui_proccnt);
 		UIDINFO_UNLOCK(uip);
 		FREE(uip, M_UIDINFO);
 		return;
 	}
 
 	mtx_unlock(&uihashtbl_mtx);
 	UIDINFO_UNLOCK(uip);
 }
 
 /*
  * Change the count associated with number of processes
  * a given user is using.  When 'max' is 0, don't enforce a limit
  */
 int
 chgproccnt(uip, diff, max)
 	struct	uidinfo	*uip;
 	int	diff;
 	int	max;
 {
 
 	UIDINFO_LOCK(uip);
 	/* Don't allow them to exceed max, but allow subtraction. */
 	if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
 		UIDINFO_UNLOCK(uip);
 		return (0);
 	}
 	uip->ui_proccnt += diff;
 	if (uip->ui_proccnt < 0)
 		printf("negative proccnt for uid = %d\n", uip->ui_uid);
 	UIDINFO_UNLOCK(uip);
 	return (1);
 }
 
 /*
  * Change the total socket buffer size a user has used.
  */
 int
 chgsbsize(uip, hiwat, to, max)
 	struct	uidinfo	*uip;
 	u_int  *hiwat;
 	u_int	to;
 	rlim_t	max;
 {
 	rlim_t new;
 
 	UIDINFO_LOCK(uip);
 	new = uip->ui_sbsize + to - *hiwat;
 	/* Don't allow them to exceed max, but allow subtraction. */
 	if (to > *hiwat && new > max) {
 		UIDINFO_UNLOCK(uip);
 		return (0);
 	}
 	uip->ui_sbsize = new;
 	UIDINFO_UNLOCK(uip);
 	*hiwat = to;
 	if (new < 0)
 		printf("negative sbsize for uid = %d\n", uip->ui_uid);
 	return (1);
 }
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c	(revision 167231)
+++ head/sys/kern/kern_sig.c	(revision 167232)
@@ -1,3299 +1,3289 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sig.c	8.7 (Berkeley) 4/18/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/condvar.h>
 #include <sys/event.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kse.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/timers.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 
 #include <security/audit/audit.h>
 
 #define	ONSIG	32		/* NSIG for osig* syscalls.  XXX. */
 
 static int	coredump(struct thread *);
 static char	*expand_name(const char *, uid_t, pid_t);
 static int	killpg1(struct thread *td, int sig, int pgid, int all);
 static int	issignal(struct thread *p);
 static int	sigprop(int sig);
 static void	tdsigwakeup(struct thread *, int, sig_t, int);
 static void	sig_suspend_threads(struct thread *, struct proc *, int);
 static int	filt_sigattach(struct knote *kn);
 static void	filt_sigdetach(struct knote *kn);
 static int	filt_signal(struct knote *kn, long hint);
 static struct thread *sigtd(struct proc *p, int sig, int prop);
 #ifdef KSE
 static int	do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *);
 #endif
 static void	sigqueue_start(void);
 
 static uma_zone_t	ksiginfo_zone = NULL;
 struct filterops sig_filtops =
 	{ 0, filt_sigattach, filt_sigdetach, filt_signal };
 
 static int	kern_logsigexit = 1;
 SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, 
     &kern_logsigexit, 0, 
     "Log processes quitting on abnormal signals to syslog(3)");
 
 static int	kern_forcesigexit = 1;
 SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
     &kern_forcesigexit, 0, "Force trap signal to be handled");
 
 SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal");
 
 static int	max_pending_per_proc = 128;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
     &max_pending_per_proc, 0, "Max pending signals per proc");
 
 static int	preallocate_siginfo = 1024;
 TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
     &preallocate_siginfo, 0, "Preallocated signal memory size");
 
 static int	signal_overflow = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
     &signal_overflow, 0, "Number of signals overflew");
 
 static int	signal_alloc_fail = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
     &signal_alloc_fail, 0, "signals failed to be allocated");
 
 SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
 
 /*
  * Policy -- Can ucred cr1 send SIGIO to process cr2?
  * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
  * in the right situations.
  */
 #define CANSIGIO(cr1, cr2) \
 	((cr1)->cr_uid == 0 || \
 	    (cr1)->cr_ruid == (cr2)->cr_ruid || \
 	    (cr1)->cr_uid == (cr2)->cr_ruid || \
 	    (cr1)->cr_ruid == (cr2)->cr_uid || \
 	    (cr1)->cr_uid == (cr2)->cr_uid)
 
 int sugid_coredump;
 SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, 
     &sugid_coredump, 0, "Enable coredumping set user/group ID processes");
 
 static int	do_coredump = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
 	&do_coredump, 0, "Enable/Disable coredumps");
 
 static int	set_core_nodump_flag = 0;
 SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
 	0, "Enable setting the NODUMP flag on coredump files");
 
 /*
  * Signal properties and actions.
  * The array below categorizes the signals and their default actions
  * according to the following properties:
  */
 #define	SA_KILL		0x01		/* terminates process by default */
 #define	SA_CORE		0x02		/* ditto and coredumps */
 #define	SA_STOP		0x04		/* suspend process */
 #define	SA_TTYSTOP	0x08		/* ditto, from tty */
 #define	SA_IGNORE	0x10		/* ignore by default */
 #define	SA_CONT		0x20		/* continue if suspended */
 #define	SA_CANTMASK	0x40		/* non-maskable, catchable */
 #define	SA_PROC		0x80		/* deliverable to any thread */
 
 static int sigproptbl[NSIG] = {
         SA_KILL|SA_PROC,		/* SIGHUP */
         SA_KILL|SA_PROC,		/* SIGINT */
         SA_KILL|SA_CORE|SA_PROC,	/* SIGQUIT */
         SA_KILL|SA_CORE,		/* SIGILL */
         SA_KILL|SA_CORE,		/* SIGTRAP */
         SA_KILL|SA_CORE,		/* SIGABRT */
         SA_KILL|SA_CORE|SA_PROC,	/* SIGEMT */
         SA_KILL|SA_CORE,		/* SIGFPE */
         SA_KILL|SA_PROC,		/* SIGKILL */
         SA_KILL|SA_CORE,		/* SIGBUS */
         SA_KILL|SA_CORE,		/* SIGSEGV */
         SA_KILL|SA_CORE,		/* SIGSYS */
         SA_KILL|SA_PROC,		/* SIGPIPE */
         SA_KILL|SA_PROC,		/* SIGALRM */
         SA_KILL|SA_PROC,		/* SIGTERM */
         SA_IGNORE|SA_PROC,		/* SIGURG */
         SA_STOP|SA_PROC,		/* SIGSTOP */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTSTP */
         SA_IGNORE|SA_CONT|SA_PROC,	/* SIGCONT */
         SA_IGNORE|SA_PROC,		/* SIGCHLD */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTTIN */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTTOU */
         SA_IGNORE|SA_PROC,		/* SIGIO */
         SA_KILL,			/* SIGXCPU */
         SA_KILL,			/* SIGXFSZ */
         SA_KILL|SA_PROC,		/* SIGVTALRM */
         SA_KILL|SA_PROC,		/* SIGPROF */
         SA_IGNORE|SA_PROC,		/* SIGWINCH  */
         SA_IGNORE|SA_PROC,		/* SIGINFO */
         SA_KILL|SA_PROC,		/* SIGUSR1 */
         SA_KILL|SA_PROC,		/* SIGUSR2 */
 };
 
 static void
 sigqueue_start(void)
 {
 	ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_prealloc(ksiginfo_zone, preallocate_siginfo);
 	p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
 	p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
 	p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
 }
 
 ksiginfo_t *
 ksiginfo_alloc(int wait)
 {
 	int flags;
 
 	flags = M_ZERO;
 	if (! wait)
 		flags |= M_NOWAIT;
 	if (ksiginfo_zone != NULL)
 		return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags));
 	return (NULL);
 }
 
 void
 ksiginfo_free(ksiginfo_t *ksi)
 {
 	uma_zfree(ksiginfo_zone, ksi);
 }
 
 static __inline int
 ksiginfo_tryfree(ksiginfo_t *ksi)
 {
 	if (!(ksi->ksi_flags & KSI_EXT)) {
 		uma_zfree(ksiginfo_zone, ksi);
 		return (1);
 	}
 	return (0);
 }
 
 void
 sigqueue_init(sigqueue_t *list, struct proc *p)
 {
 	SIGEMPTYSET(list->sq_signals);
 	SIGEMPTYSET(list->sq_kill);
 	TAILQ_INIT(&list->sq_list);
 	list->sq_proc = p;
 	list->sq_flags = SQ_INIT;
 }
 
 /*
  * Get a signal's ksiginfo.
  * Return:
  * 	0	-	signal not found
  *	others	-	signal number
  */ 
 int
 sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi, *next;
 	int count = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (!SIGISMEMBER(sq->sq_signals, signo))
 		return (0);
 
 	if (SIGISMEMBER(sq->sq_kill, signo)) {
 		count++;
 		SIGDELSET(sq->sq_kill, signo);
 	}
 
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
 		if (ksi->ksi_signo == signo) {
 			if (count == 0) {
 				TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 				ksi->ksi_sigq = NULL;
 				ksiginfo_copy(ksi, si);
 				if (ksiginfo_tryfree(ksi) && p != NULL)
 					p->p_pendingcnt--;
 			}
 			if (++count > 1)
 				break;
 		}
 	}
 
 	if (count <= 1)
 		SIGDELSET(sq->sq_signals, signo);
 	si->ksi_signo = signo;
 	return (signo);
 }
 
 void
 sigqueue_take(ksiginfo_t *ksi)
 {
 	struct ksiginfo *kp;
 	struct proc	*p;
 	sigqueue_t	*sq;
 
 	if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL)
 		return;
 
 	p = sq->sq_proc;
 	TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 	ksi->ksi_sigq = NULL;
 	if (!(ksi->ksi_flags & KSI_EXT) && p != NULL)
 		p->p_pendingcnt--;
 
 	for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL;
 	     kp = TAILQ_NEXT(kp, ksi_link)) {
 		if (kp->ksi_signo == ksi->ksi_signo)
 			break;
 	}
 	if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo))
 		SIGDELSET(sq->sq_signals, ksi->ksi_signo);
 }
 
 int
 sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi;
 	int ret = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 	
 	if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	/* directly insert the ksi, don't copy it */
 	if (si->ksi_flags & KSI_INS) {
 		TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
 		si->ksi_sigq = sq;
 		goto out_set_bit;
 	}
 
 	if (__predict_false(ksiginfo_zone == NULL)) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 	
 	if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
 		signal_overflow++;
 		ret = EAGAIN;
 	} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
 		signal_alloc_fail++;
 		ret = EAGAIN;
 	} else {
 		if (p != NULL)
 			p->p_pendingcnt++;
 		ksiginfo_copy(si, ksi);
 		ksi->ksi_signo = signo;
 		TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = sq;
 	}
 
 	if ((si->ksi_flags & KSI_TRAP) != 0) {
 		if (ret != 0)
 			SIGADDSET(sq->sq_kill, signo);
 		ret = 0;
 		goto out_set_bit;
 	}
 
 	if (ret != 0)
 		return (ret);
 	
 out_set_bit:
 	SIGADDSET(sq->sq_signals, signo);
 	return (ret);
 }
 
 void
 sigqueue_flush(sigqueue_t *sq)
 {
 	struct proc *p = sq->sq_proc;
 	ksiginfo_t *ksi;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (p != NULL)
 		PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) {
 		TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = NULL;
 		if (ksiginfo_tryfree(ksi) && p != NULL)
 			p->p_pendingcnt--;
 	}
 
 	SIGEMPTYSET(sq->sq_signals);
 	SIGEMPTYSET(sq->sq_kill);
 }
 
 void
 sigqueue_collect_set(sigqueue_t *sq, sigset_t *set)
 {
 	ksiginfo_t *ksi;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	TAILQ_FOREACH(ksi, &sq->sq_list, ksi_link)
 		SIGADDSET(*set, ksi->ksi_signo);
 	SIGSETOR(*set, sq->sq_kill);
 }
 
 void
 sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp)
 {
 	sigset_t tmp, set;
 	struct proc *p1, *p2;
 	ksiginfo_t *ksi, *next;
 
 	KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 	KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
 	/*
 	 * make a copy, this allows setp to point to src or dst
 	 * sq_signals without trouble.
 	 */
 	set = *setp;
 	p1 = src->sq_proc;
 	p2 = dst->sq_proc;
 	/* Move siginfo to target list */
 	TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
 		if (SIGISMEMBER(set, ksi->ksi_signo)) {
 			TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
 			if (p1 != NULL)
 				p1->p_pendingcnt--;
 			TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
 			ksi->ksi_sigq = dst;
 			if (p2 != NULL)
 				p2->p_pendingcnt++;
 		}
 	}
 
 	/* Move pending bits to target list */
 	tmp = src->sq_kill;
 	SIGSETAND(tmp, set);
 	SIGSETOR(dst->sq_kill, tmp);
 	SIGSETNAND(src->sq_kill, tmp);
 
 	tmp = src->sq_signals;
 	SIGSETAND(tmp, set);
 	SIGSETOR(dst->sq_signals, tmp);
 	SIGSETNAND(src->sq_signals, tmp);
 
 	/* Finally, rescan src queue and set pending bits for it */
 	sigqueue_collect_set(src, &src->sq_signals);
 }
 
 void
 sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_move_set(src, dst, &set);
 }
 
 void
 sigqueue_delete_set(sigqueue_t *sq, sigset_t *set)
 {
 	struct proc *p = sq->sq_proc;
 	ksiginfo_t *ksi, *next;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 
 	/* Remove siginfo queue */
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
 		if (SIGISMEMBER(*set, ksi->ksi_signo)) {
 			TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 			ksi->ksi_sigq = NULL;
 			if (ksiginfo_tryfree(ksi) && p != NULL)
 				p->p_pendingcnt--;
 		}
 	}
 	SIGSETNAND(sq->sq_kill, *set);
 	SIGSETNAND(sq->sq_signals, *set);
 	/* Finally, rescan queue and set pending bits for it */
 	sigqueue_collect_set(sq, &sq->sq_signals);
 }
 
 void
 sigqueue_delete(sigqueue_t *sq, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_delete_set(sq, &set);
 }
 
 /* Remove a set of signals for a process */
 void
 sigqueue_delete_set_proc(struct proc *p, sigset_t *set)
 {
 	sigqueue_t worklist;
 	struct thread *td0;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sigqueue_init(&worklist, NULL);
 	sigqueue_move_set(&p->p_sigqueue, &worklist, set);
 
 	mtx_lock_spin(&sched_lock);
 	FOREACH_THREAD_IN_PROC(p, td0)
 		sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
 	mtx_unlock_spin(&sched_lock);
 
 	sigqueue_flush(&worklist);
 }
 
 void
 sigqueue_delete_proc(struct proc *p, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_delete_set_proc(p, &set);
 }
 
 void
 sigqueue_delete_stopmask_proc(struct proc *p)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, SIGSTOP);
 	SIGADDSET(set, SIGTSTP);
 	SIGADDSET(set, SIGTTIN);
 	SIGADDSET(set, SIGTTOU);
 	sigqueue_delete_set_proc(p, &set);
 }
 
 /*
  * Determine signal that should be delivered to process p, the current
  * process, 0 if none.  If there is a pending stop signal with default
  * action, the process stops in issignal().
- *
- * MP SAFE.
  */
 int
 cursig(struct thread *td)
 {
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
 	mtx_assert(&sched_lock, MA_NOTOWNED);
 	return (SIGPENDING(td) ? issignal(td) : 0);
 }
 
 /*
  * Arrange for ast() to handle unmasked pending signals on return to user
  * mode.  This must be called whenever a signal is added to td_sigqueue or
  * unmasked in td_sigmask.
  */
 void
 signotify(struct thread *td)
 {
 	struct proc *p;
 #ifdef KSE
 	sigset_t set, saved;
 #else
 	sigset_t set;
 #endif
 
 	p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * If our mask changed we may have to move signal that were
 	 * previously masked by all threads to our sigqueue.
 	 */
 	set = p->p_sigqueue.sq_signals;
 #ifdef KSE
 	if (p->p_flag & P_SA)
 		saved = p->p_sigqueue.sq_signals;
 #endif
 	SIGSETNAND(set, td->td_sigmask);
 	if (! SIGISEMPTY(set))
 		sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set);
 	if (SIGPENDING(td)) {
 		mtx_lock_spin(&sched_lock);
 		td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
 		mtx_unlock_spin(&sched_lock);
 	}
 #ifdef KSE
 	if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
 		if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
 			/* pending set changed */
 			p->p_flag |= P_SIGEVENT;
 			wakeup(&p->p_siglist);
 		}
 	}
 #endif
 }
 
 int
 sigonstack(size_t sp)
 {
 	struct thread *td = curthread;
 
 	return ((td->td_pflags & TDP_ALTSTACK) ?
 #if defined(COMPAT_43)
 	    ((td->td_sigstk.ss_size == 0) ?
 		(td->td_sigstk.ss_flags & SS_ONSTACK) :
 		((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size))
 #else
 	    ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)
 #endif
 	    : 0);
 }
 
 static __inline int
 sigprop(int sig)
 {
 
 	if (sig > 0 && sig < NSIG)
 		return (sigproptbl[_SIG_IDX(sig)]);
 	return (0);
 }
 
 int
 sig_ffs(sigset_t *set)
 {
 	int i;
 
 	for (i = 0; i < _SIG_WORDS; i++)
 		if (set->__bits[i])
 			return (ffs(set->__bits[i]) + (i * 32));
 	return (0);
 }
 
 /*
  * kern_sigaction
  * sigaction
  * freebsd4_sigaction
  * osigaction
  */
 int
 kern_sigaction(td, sig, act, oact, flags)
 	struct thread *td;
 	register int sig;
 	struct sigaction *act, *oact;
 	int flags;
 {
 	struct sigacts *ps;
 	struct proc *p = td->td_proc;
 
 	if (!_SIG_VALID(sig))
 		return (EINVAL);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if (oact) {
 		oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
 		oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
 		oact->sa_flags = 0;
 		if (SIGISMEMBER(ps->ps_sigonstack, sig))
 			oact->sa_flags |= SA_ONSTACK;
 		if (!SIGISMEMBER(ps->ps_sigintr, sig))
 			oact->sa_flags |= SA_RESTART;
 		if (SIGISMEMBER(ps->ps_sigreset, sig))
 			oact->sa_flags |= SA_RESETHAND;
 		if (SIGISMEMBER(ps->ps_signodefer, sig))
 			oact->sa_flags |= SA_NODEFER;
 		if (SIGISMEMBER(ps->ps_siginfo, sig))
 			oact->sa_flags |= SA_SIGINFO;
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
 			oact->sa_flags |= SA_NOCLDSTOP;
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
 			oact->sa_flags |= SA_NOCLDWAIT;
 	}
 	if (act) {
 		if ((sig == SIGKILL || sig == SIGSTOP) &&
 		    act->sa_handler != SIG_DFL) {
 			mtx_unlock(&ps->ps_mtx);
 			PROC_UNLOCK(p);
 			return (EINVAL);
 		}
 
 		/*
 		 * Change setting atomically.
 		 */
 
 		ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
 		SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (act->sa_flags & SA_SIGINFO) {
 			ps->ps_sigact[_SIG_IDX(sig)] =
 			    (__sighandler_t *)act->sa_sigaction;
 			SIGADDSET(ps->ps_siginfo, sig);
 		} else {
 			ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
 			SIGDELSET(ps->ps_siginfo, sig);
 		}
 		if (!(act->sa_flags & SA_RESTART))
 			SIGADDSET(ps->ps_sigintr, sig);
 		else
 			SIGDELSET(ps->ps_sigintr, sig);
 		if (act->sa_flags & SA_ONSTACK)
 			SIGADDSET(ps->ps_sigonstack, sig);
 		else
 			SIGDELSET(ps->ps_sigonstack, sig);
 		if (act->sa_flags & SA_RESETHAND)
 			SIGADDSET(ps->ps_sigreset, sig);
 		else
 			SIGDELSET(ps->ps_sigreset, sig);
 		if (act->sa_flags & SA_NODEFER)
 			SIGADDSET(ps->ps_signodefer, sig);
 		else
 			SIGDELSET(ps->ps_signodefer, sig);
 		if (sig == SIGCHLD) {
 			if (act->sa_flags & SA_NOCLDSTOP)
 				ps->ps_flag |= PS_NOCLDSTOP;
 			else
 				ps->ps_flag &= ~PS_NOCLDSTOP;
 			if (act->sa_flags & SA_NOCLDWAIT) {
 				/*
 				 * Paranoia: since SA_NOCLDWAIT is implemented
 				 * by reparenting the dying child to PID 1 (and
 				 * trust it to reap the zombie), PID 1 itself
 				 * is forbidden to set SA_NOCLDWAIT.
 				 */
 				if (p->p_pid == 1)
 					ps->ps_flag &= ~PS_NOCLDWAIT;
 				else
 					ps->ps_flag |= PS_NOCLDWAIT;
 			} else
 				ps->ps_flag &= ~PS_NOCLDWAIT;
 			if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 				ps->ps_flag |= PS_CLDSIGIGN;
 			else
 				ps->ps_flag &= ~PS_CLDSIGIGN;
 		}
 		/*
 		 * Set bit in ps_sigignore for signals that are set to SIG_IGN,
 		 * and for signals set to SIG_DFL where the default is to
 		 * ignore. However, don't put SIGCONT in ps_sigignore, as we
 		 * have to restart the process.
 		 */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    (sigprop(sig) & SA_IGNORE &&
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
 #ifdef KSE
 			if ((p->p_flag & P_SA) &&
 			     SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) {
 				p->p_flag |= P_SIGEVENT;
 				wakeup(&p->p_siglist);
 			}
 #endif
 			/* never to be seen again */
 			sigqueue_delete_proc(p, sig);
 			if (sig != SIGCONT)
 				/* easier in psignal */
 				SIGADDSET(ps->ps_sigignore, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 		} else {
 			SIGDELSET(ps->ps_sigignore, sig);
 			if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
 				SIGDELSET(ps->ps_sigcatch, sig);
 			else
 				SIGADDSET(ps->ps_sigcatch, sig);
 		}
 #ifdef COMPAT_FREEBSD4
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_FREEBSD4) == 0)
 			SIGDELSET(ps->ps_freebsd4, sig);
 		else
 			SIGADDSET(ps->ps_freebsd4, sig);
 #endif
 #ifdef COMPAT_43
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_OSIGSET) == 0)
 			SIGDELSET(ps->ps_osigset, sig);
 		else
 			SIGADDSET(ps->ps_osigset, sig);
 #endif
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 int
 sigaction(td, uap)
 	struct thread *td;
 	register struct sigaction_args *uap;
 {
 	struct sigaction act, oact;
 	register struct sigaction *actp, *oactp;
 	int error;
 
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, 0);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 int
 freebsd4_sigaction(td, uap)
 	struct thread *td;
 	register struct freebsd4_sigaction_args *uap;
 {
 	struct sigaction act, oact;
 	register struct sigaction *actp, *oactp;
 	int error;
 
 
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 #endif	/* COMAPT_FREEBSD4 */
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigaction_args {
 	int	signum;
 	struct	osigaction *nsa;
 	struct	osigaction *osa;
 };
 #endif
 int
 osigaction(td, uap)
 	struct thread *td;
 	register struct osigaction_args *uap;
 {
 	struct osigaction sa;
 	struct sigaction nsa, osa;
 	register struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	nsap = (uap->nsa != NULL) ? &nsa : NULL;
 	osap = (uap->osa != NULL) ? &osa : NULL;
 
 	if (nsap) {
 		error = copyin(uap->nsa, &sa, sizeof(sa));
 		if (error)
 			return (error);
 		nsap->sa_handler = sa.sa_handler;
 		nsap->sa_flags = sa.sa_flags;
 		OSIG2SIG(sa.sa_mask, nsap->sa_mask);
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		sa.sa_handler = osap->sa_handler;
 		sa.sa_flags = osap->sa_flags;
 		SIG2OSIG(osap->sa_mask, sa.sa_mask);
 		error = copyout(&sa, uap->osa, sizeof(sa));
 	}
 	return (error);
 }
 
 #if !defined(__i386__)
 /* Avoid replicating the same stub everywhere */
 int
 osigreturn(td, uap)
 	struct thread *td;
 	struct osigreturn_args *uap;
 {
 
 	return (nosys(td, (struct nosys_args *)uap));
 }
 #endif
 #endif /* COMPAT_43 */
 
 /*
  * Initialize signal state for process 0;
  * set to ignore signals that are ignored by default.
  */
 void
 siginit(p)
 	struct proc *p;
 {
 	register int i;
 	struct sigacts *ps;
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	for (i = 1; i <= NSIG; i++)
 		if (sigprop(i) & SA_IGNORE && i != SIGCONT)
 			SIGADDSET(ps->ps_sigignore, i);
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Reset signals for an exec of the specified process.
  */
 void
 execsigs(struct proc *p)
 {
 	struct sigacts *ps;
 	int sig;
 	struct thread *td;
 
 	/*
 	 * Reset caught signals.  Held signals remain held
 	 * through td_sigmask (unless they were caught,
 	 * and are now ignored by default).
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	td = FIRST_THREAD_IN_PROC(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	while (SIGNOTEMPTY(ps->ps_sigcatch)) {
 		sig = sig_ffs(&ps->ps_sigcatch);
 		SIGDELSET(ps->ps_sigcatch, sig);
 		if (sigprop(sig) & SA_IGNORE) {
 			if (sig != SIGCONT)
 				SIGADDSET(ps->ps_sigignore, sig);
 			sigqueue_delete_proc(p, sig);
 		}
 		ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 	}
 	/*
 	 * Reset stack state to the user stack.
 	 * Clear set of signals caught on the signal stack.
 	 */
 	td->td_sigstk.ss_flags = SS_DISABLE;
 	td->td_sigstk.ss_size = 0;
 	td->td_sigstk.ss_sp = 0;
 	td->td_pflags &= ~TDP_ALTSTACK;
 	/*
 	 * Reset no zombies if child dies flag as Solaris does.
 	 */
 	ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
 	if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 		ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
 	mtx_unlock(&ps->ps_mtx);
 }
 
 /*
  * kern_sigprocmask()
  *
  *	Manipulate signal mask.
  */
 int
 kern_sigprocmask(td, how, set, oset, old)
 	struct thread *td;
 	int how;
 	sigset_t *set, *oset;
 	int old;
 {
 	int error;
 
 	PROC_LOCK(td->td_proc);
 	if (oset != NULL)
 		*oset = td->td_sigmask;
 
 	error = 0;
 	if (set != NULL) {
 		switch (how) {
 		case SIG_BLOCK:
 			SIG_CANTMASK(*set);
 			SIGSETOR(td->td_sigmask, *set);
 			break;
 		case SIG_UNBLOCK:
 			SIGSETNAND(td->td_sigmask, *set);
 			signotify(td);
 			break;
 		case SIG_SETMASK:
 			SIG_CANTMASK(*set);
 			if (old)
 				SIGSETLO(td->td_sigmask, *set);
 			else
 				td->td_sigmask = *set;
 			signotify(td);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 	}
 	PROC_UNLOCK(td->td_proc);
 	return (error);
 }
 
-/*
- * sigprocmask() - MP SAFE
- */
-
 #ifndef _SYS_SYSPROTO_H_
 struct sigprocmask_args {
 	int	how;
 	const sigset_t *set;
 	sigset_t *oset;
 };
 #endif
 int
 sigprocmask(td, uap)
 	register struct thread *td;
 	struct sigprocmask_args *uap;
 {
 	sigset_t set, oset;
 	sigset_t *setp, *osetp;
 	int error;
 
 	setp = (uap->set != NULL) ? &set : NULL;
 	osetp = (uap->oset != NULL) ? &oset : NULL;
 	if (setp) {
 		error = copyin(uap->set, setp, sizeof(set));
 		if (error)
 			return (error);
 	}
 	error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
 	if (osetp && !error) {
 		error = copyout(osetp, uap->oset, sizeof(oset));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
-/*
- * osigprocmask() - MP SAFE
- */
 #ifndef _SYS_SYSPROTO_H_
 struct osigprocmask_args {
 	int	how;
 	osigset_t mask;
 };
 #endif
 int
 osigprocmask(td, uap)
 	register struct thread *td;
 	struct osigprocmask_args *uap;
 {
 	sigset_t set, oset;
 	int error;
 
 	OSIG2SIG(uap->mask, set);
 	error = kern_sigprocmask(td, uap->how, &set, &oset, 1);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 int
 sigwait(struct thread *td, struct sigwait_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error) {
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error) {
 		if (error == ERESTART)
 			return (error);
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
 	td->td_retval[0] = error;
 	return (0);
 }
 
 int
 sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
 {
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 
 		timeout = &ts;
 	} else
 		timeout = NULL;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 int
 sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 	
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 int
 kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
 	struct timespec *timeout)
 {
 	struct sigacts *ps;
 	sigset_t savedmask;
 	struct proc *p;
 	int error, sig, hz, i, timevalid = 0;
 	struct timespec rts, ets, ts;
 	struct timeval tv;
 
 	p = td->td_proc;
 	error = 0;
 	sig = 0;
 	SIG_CANTMASK(waitset);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	savedmask = td->td_sigmask;
 	if (timeout) {
 		if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
 			timevalid = 1;
 			getnanouptime(&rts);
 		 	ets = rts;
 			timespecadd(&ets, timeout);
 		}
 	}
 
 restart:
 	for (i = 1; i <= _SIG_MAXSIG; ++i) {
 		if (!SIGISMEMBER(waitset, i))
 			continue;
 		if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) {
 			if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) {
 #ifdef KSE
 				if (p->p_flag & P_SA) {
 					p->p_flag |= P_SIGEVENT;
 					wakeup(&p->p_siglist);
 				}
 #endif
 				sigqueue_move(&p->p_sigqueue,
 					&td->td_sigqueue, i);
 			} else
 				continue;
 		}
 
 		SIGFILLSET(td->td_sigmask);
 		SIG_CANTMASK(td->td_sigmask);
 		SIGDELSET(td->td_sigmask, i);
 		mtx_lock(&ps->ps_mtx);
 		sig = cursig(td);
 		mtx_unlock(&ps->ps_mtx);
 		if (sig)
 			goto out;
 		else {
 			/*
 			 * Because cursig() may have stopped current thread,
 			 * after it is resumed, things may have already been 
 			 * changed, it should rescan any pending signals.
 			 */
 			goto restart;
 		}
 	}
 
 	if (error)
 		goto out;
 
 	/*
 	 * POSIX says this must be checked after looking for pending
 	 * signals.
 	 */
 	if (timeout) {
 		if (!timevalid) {
 			error = EINVAL;
 			goto out;
 		}
 		getnanouptime(&rts);
 		if (timespeccmp(&rts, &ets, >=)) {
 			error = EAGAIN;
 			goto out;
 		}
 		ts = ets;
 		timespecsub(&ts, &rts);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		hz = tvtohz(&tv);
 	} else
 		hz = 0;
 
 	td->td_sigmask = savedmask;
 	SIGSETNAND(td->td_sigmask, waitset);
 	signotify(td);
 	error = msleep(&ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", hz);
 	if (timeout) {
 		if (error == ERESTART) {
 			/* timeout can not be restarted. */
 			error = EINTR;
 		} else if (error == EAGAIN) {
 			/* will calculate timeout by ourself. */
 			error = 0;
 		}
 	}
 	goto restart;
 
 out:
 	td->td_sigmask = savedmask;
 	signotify(td);
 	if (sig) {
 		ksiginfo_init(ksi);
 		sigqueue_get(&td->td_sigqueue, sig, ksi);
 		ksi->ksi_signo = sig;
 		if (ksi->ksi_code == SI_TIMER)
 			itimer_accept(p, ksi->ksi_timerid, ksi);
 		error = 0;
 
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_PSIG)) {
 			sig_t action;
 
 			mtx_lock(&ps->ps_mtx);
 			action = ps->ps_sigact[_SIG_IDX(sig)];
 			mtx_unlock(&ps->ps_mtx);
 			ktrpsig(sig, action, &td->td_sigmask, 0);
 		}
 #endif
 		if (sig == SIGKILL)
 			sigexit(td, sig);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigpending_args {
 	sigset_t	*set;
 };
 #endif
 int
 sigpending(td, uap)
 	struct thread *td;
 	struct sigpending_args *uap;
 {
 	struct proc *p = td->td_proc;
 	sigset_t pending;
 
 	PROC_LOCK(p);
 	pending = p->p_sigqueue.sq_signals;
 	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 	return (copyout(&pending, uap->set, sizeof(sigset_t)));
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigpending_args {
 	int	dummy;
 };
 #endif
 int
 osigpending(td, uap)
 	struct thread *td;
 	struct osigpending_args *uap;
 {
 	struct proc *p = td->td_proc;
 	sigset_t pending;
 
 	PROC_LOCK(p);
 	pending = p->p_sigqueue.sq_signals;
 	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 	SIG2OSIG(pending, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 /*
  * Generalized interface signal handler, 4.3-compatible.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigvec_args {
 	int	signum;
 	struct	sigvec *nsv;
 	struct	sigvec *osv;
 };
 #endif
 /* ARGSUSED */
 int
 osigvec(td, uap)
 	struct thread *td;
 	register struct osigvec_args *uap;
 {
 	struct sigvec vec;
 	struct sigaction nsa, osa;
 	register struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 	nsap = (uap->nsv != NULL) ? &nsa : NULL;
 	osap = (uap->osv != NULL) ? &osa : NULL;
 	if (nsap) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		nsap->sa_handler = vec.sv_handler;
 		OSIG2SIG(vec.sv_mask, nsap->sa_mask);
 		nsap->sa_flags = vec.sv_flags;
 		nsap->sa_flags ^= SA_RESTART;	/* opposite of SV_INTERRUPT */
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		vec.sv_handler = osap->sa_handler;
 		SIG2OSIG(osap->sa_mask, vec.sv_mask);
 		vec.sv_flags = osap->sa_flags;
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigblock_args {
 	int	mask;
 };
 #endif
 int
 osigblock(td, uap)
 	register struct thread *td;
 	struct osigblock_args *uap;
 {
 	struct proc *p = td->td_proc;
 	sigset_t set;
 
 	OSIG2SIG(uap->mask, set);
 	SIG_CANTMASK(set);
 	PROC_LOCK(p);
 	SIG2OSIG(td->td_sigmask, td->td_retval[0]);
 	SIGSETOR(td->td_sigmask, set);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigsetmask_args {
 	int	mask;
 };
 #endif
 int
 osigsetmask(td, uap)
 	struct thread *td;
 	struct osigsetmask_args *uap;
 {
 	struct proc *p = td->td_proc;
 	sigset_t set;
 
 	OSIG2SIG(uap->mask, set);
 	SIG_CANTMASK(set);
 	PROC_LOCK(p);
 	SIG2OSIG(td->td_sigmask, td->td_retval[0]);
 	SIGSETLO(td->td_sigmask, set);
 	signotify(td);
 	PROC_UNLOCK(p);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 /*
- * Suspend calling thread until signal, providing mask to be set
- * in the meantime. 
+ * Suspend calling thread until signal, providing mask to be set in the
+ * meantime. 
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sigsuspend_args {
 	const sigset_t *sigmask;
 };
 #endif
 /* ARGSUSED */
 int
 sigsuspend(td, uap)
 	struct thread *td;
 	struct sigsuspend_args *uap;
 {
 	sigset_t mask;
 	int error;
 
 	error = copyin(uap->sigmask, &mask, sizeof(mask));
 	if (error)
 		return (error);
 	return (kern_sigsuspend(td, mask));
 }
 
 int
 kern_sigsuspend(struct thread *td, sigset_t mask)
 {
 	struct proc *p = td->td_proc;
 
 	/*
 	 * When returning from sigsuspend, we want
 	 * the old mask to be restored after the
 	 * signal handler has finished.  Thus, we
 	 * save it here and mark the sigacts structure
 	 * to indicate this.
 	 */
 	PROC_LOCK(p);
 	td->td_oldsigmask = td->td_sigmask;
 	td->td_pflags |= TDP_OLDMASK;
 	SIG_CANTMASK(mask);
 	td->td_sigmask = mask;
 	signotify(td);
 	while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0)
 		/* void */;
 	PROC_UNLOCK(p);
 	/* always return EINTR rather than ERESTART... */
 	return (EINTR);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 /*
  * Compatibility sigsuspend call for old binaries.  Note nonstandard calling
  * convention: libc stub passes mask, not pointer, to save a copyin.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigsuspend_args {
 	osigset_t mask;
 };
 #endif
 /* ARGSUSED */
 int
 osigsuspend(td, uap)
 	struct thread *td;
 	struct osigsuspend_args *uap;
 {
 	struct proc *p = td->td_proc;
 	sigset_t mask;
 
 	PROC_LOCK(p);
 	td->td_oldsigmask = td->td_sigmask;
 	td->td_pflags |= TDP_OLDMASK;
 	OSIG2SIG(uap->mask, mask);
 	SIG_CANTMASK(mask);
 	SIGSETLO(td->td_sigmask, mask);
 	signotify(td);
 	while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0)
 		/* void */;
 	PROC_UNLOCK(p);
 	/* always return EINTR rather than ERESTART... */
 	return (EINTR);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osigstack_args {
 	struct	sigstack *nss;
 	struct	sigstack *oss;
 };
 #endif
 /* ARGSUSED */
 int
 osigstack(td, uap)
 	struct thread *td;
 	register struct osigstack_args *uap;
 {
 	struct sigstack nss, oss;
 	int error = 0;
 
 	if (uap->nss != NULL) {
 		error = copyin(uap->nss, &nss, sizeof(nss));
 		if (error)
 			return (error);
 	}
 	oss.ss_sp = td->td_sigstk.ss_sp;
 	oss.ss_onstack = sigonstack(cpu_getstack(td));
 	if (uap->nss != NULL) {
 		td->td_sigstk.ss_sp = nss.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK;
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 	if (uap->oss != NULL)
 		error = copyout(&oss, uap->oss, sizeof(oss));
 
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaltstack_args {
 	stack_t	*ss;
 	stack_t	*oss;
 };
 #endif
 /* ARGSUSED */
 int
 sigaltstack(td, uap)
 	struct thread *td;
 	register struct sigaltstack_args *uap;
 {
 	stack_t ss, oss;
 	int error;
 
 	if (uap->ss != NULL) {
 		error = copyin(uap->ss, &ss, sizeof(ss));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
 	    (uap->oss != NULL) ? &oss : NULL);
 	if (error)
 		return (error);
 	if (uap->oss != NULL)
 		error = copyout(&oss, uap->oss, sizeof(stack_t));
 	return (error);
 }
 
 int
 kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
 {
 	struct proc *p = td->td_proc;
 	int oonstack;
 
 	oonstack = sigonstack(cpu_getstack(td));
 
 	if (oss != NULL) {
 		*oss = td->td_sigstk;
 		oss->ss_flags = (td->td_pflags & TDP_ALTSTACK)
 		    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	}
 
 	if (ss != NULL) {
 		if (oonstack)
 			return (EPERM);
 		if ((ss->ss_flags & ~SS_DISABLE) != 0)
 			return (EINVAL);
 		if (!(ss->ss_flags & SS_DISABLE)) {
 			if (ss->ss_size < p->p_sysent->sv_minsigstksz)
 				return (ENOMEM);
 
 			td->td_sigstk = *ss;
 			td->td_pflags |= TDP_ALTSTACK;
 		} else {
 			td->td_pflags &= ~TDP_ALTSTACK;
 		}
 	}
 	return (0);
 }
 
 /*
  * Common code for kill process group/broadcast kill.
  * cp is calling process.
  */
 static int
 killpg1(td, sig, pgid, all)
 	register struct thread *td;
 	int sig, pgid, all;
 {
 	register struct proc *p;
 	struct pgrp *pgrp;
 	int nfound = 0;
 
 	if (all) {
 		/*
 		 * broadcast
 		 */
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p == td->td_proc || p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			if (p_cansignal(td, p, sig) == 0) {
 				nfound++;
 				if (sig)
 					psignal(p, sig);
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 	} else {
 		sx_slock(&proctree_lock);
 		if (pgid == 0) {
 			/*
 			 * zero pgid means send to my process group.
 			 */
 			pgrp = td->td_proc->p_pgrp;
 			PGRP_LOCK(pgrp);
 		} else {
 			pgrp = pgfind(pgid);
 			if (pgrp == NULL) {
 				sx_sunlock(&proctree_lock);
 				return (ESRCH);
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);	      
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 				p->p_state == PRS_NEW ) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			if (p_cansignal(td, p, sig) == 0) {
 				nfound++;
 				if (sig)
 					psignal(p, sig);
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pgrp);
 	}
 	return (nfound ? 0 : ESRCH);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct kill_args {
 	int	pid;
 	int	signum;
 };
 #endif
 /* ARGSUSED */
 int
 kill(td, uap)
 	register struct thread *td;
 	register struct kill_args *uap;
 {
 	register struct proc *p;
 	int error;
 
 	AUDIT_ARG(signum, uap->signum);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	if (uap->pid > 0) {
 		/* kill single process */
 		if ((p = pfind(uap->pid)) == NULL) {
 			if ((p = zpfind(uap->pid)) == NULL)
 				return (ESRCH);
 		}
 		AUDIT_ARG(process, p);
 		error = p_cansignal(td, p, uap->signum);
 		if (error == 0 && uap->signum)
 			psignal(p, uap->signum);
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	AUDIT_ARG(pid, uap->pid);
 	switch (uap->pid) {
 	case -1:		/* broadcast signal */
 		return (killpg1(td, uap->signum, 0, 1));
 	case 0:			/* signal own process group */
 		return (killpg1(td, uap->signum, 0, 0));
 	default:		/* negative explicit process group */
 		return (killpg1(td, uap->signum, -uap->pid, 0));
 	}
 	/* NOTREACHED */
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct okillpg_args {
 	int	pgid;
 	int	signum;
 };
 #endif
 /* ARGSUSED */
 int
 okillpg(td, uap)
 	struct thread *td;
 	register struct okillpg_args *uap;
 {
 
 	AUDIT_ARG(signum, uap->signum);
 	AUDIT_ARG(pid, uap->pgid);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	return (killpg1(td, uap->signum, uap->pgid, 0));
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigqueue_args {
 	pid_t pid;
 	int signum;
 	/* union sigval */ void *value;
 };
 #endif
-
 int
 sigqueue(struct thread *td, struct sigqueue_args *uap)
 {
 	ksiginfo_t ksi;
 	struct proc *p;
 	int error;
 
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	/*
 	 * Specification says sigqueue can only send signal to
 	 * single process.
 	 */
 	if (uap->pid <= 0)
 		return (EINVAL);
 
 	if ((p = pfind(uap->pid)) == NULL) {
 		if ((p = zpfind(uap->pid)) == NULL)
 			return (ESRCH);
 	}
 	error = p_cansignal(td, p, uap->signum);
 	if (error == 0 && uap->signum != 0) {
 		ksiginfo_init(&ksi);
 		ksi.ksi_signo = uap->signum;
 		ksi.ksi_code = SI_QUEUE;
 		ksi.ksi_pid = td->td_proc->p_pid;
 		ksi.ksi_uid = td->td_ucred->cr_ruid;
 		ksi.ksi_value.sival_ptr = uap->value;
 		error = tdsignal(p, NULL, ksi.ksi_signo, &ksi);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Send a signal to a process group.
  */
 void
 gsignal(pgid, sig)
 	int pgid, sig;
 {
 	struct pgrp *pgrp;
 
 	if (pgid != 0) {
 		sx_slock(&proctree_lock);
 		pgrp = pgfind(pgid);
 		sx_sunlock(&proctree_lock);
 		if (pgrp != NULL) {
 			pgsignal(pgrp, sig, 0);
 			PGRP_UNLOCK(pgrp);
 		}
 	}
 }
 
 /*
  * Send a signal to a process group.  If checktty is 1,
  * limit to members which have a controlling terminal.
  */
 void
 pgsignal(pgrp, sig, checkctty)
 	struct pgrp *pgrp;
 	int sig, checkctty;
 {
 	register struct proc *p;
 
 	if (pgrp) {
 		PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (checkctty == 0 || p->p_flag & P_CONTROLT)
 				psignal(p, sig);
 			PROC_UNLOCK(p);
 		}
 	}
 }
 
 /*
  * Send a signal caused by a trap to the current thread.  If it will be
  * caught immediately, deliver it with correct code.  Otherwise, post it
  * normally.
  */
 void
 trapsignal(struct thread *td, ksiginfo_t *ksi)
 {
 	struct sigacts *ps;
 	struct proc *p;
 #ifdef KSE
 	int error;
 #endif
 	int sig;
 	int code;
 
 	p = td->td_proc;
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	KASSERT(_SIG_VALID(sig), ("invalid signal"));
 
 #ifdef KSE
 	if (td->td_pflags & TDP_SA) {
 		if (td->td_mailbox == NULL)
 			thread_user_enter(td);
 		PROC_LOCK(p);
 		SIGDELSET(td->td_sigmask, sig);
 		mtx_lock_spin(&sched_lock);
 		/*
 		 * Force scheduling an upcall, so UTS has chance to
 		 * process the signal before thread runs again in
 		 * userland.
 		 */
 		if (td->td_upcall)
 			td->td_upcall->ku_flags |= KUF_DOUPCALL;
 		mtx_unlock_spin(&sched_lock);
 	} else {
 		PROC_LOCK(p);
 	}
 #else
 	PROC_LOCK(p);
 #endif
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
 	    !SIGISMEMBER(td->td_sigmask, sig)) {
 		p->p_stats->p_ru.ru_nsignals++;
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_PSIG))
 			ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
 			    &td->td_sigmask, code);
 #endif
 #ifdef KSE
 		if (!(td->td_pflags & TDP_SA))
 			(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], 
 				ksi, &td->td_sigmask);
 #else
 		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], 
 				ksi, &td->td_sigmask);
 #endif
 #ifdef KSE
 		else if (td->td_mailbox == NULL) {
 			mtx_unlock(&ps->ps_mtx);
 			/* UTS caused a sync signal */
 			p->p_code = code;	/* XXX for core dump/debugger */
 			p->p_sig = sig;		/* XXX to verify code */
 			sigexit(td, sig);
 		} else {
 			mtx_unlock(&ps->ps_mtx);
 			SIGADDSET(td->td_sigmask, sig);
 			PROC_UNLOCK(p);
 			error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig,
 			    sizeof(siginfo_t));
 			PROC_LOCK(p);
 			/* UTS memory corrupted */
 			if (error)
 				sigexit(td, SIGSEGV);
 			mtx_lock(&ps->ps_mtx);
 		}
 #endif
 		SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(td->td_sigmask, sig);
 		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
 			/*
 			 * See kern_sigaction() for origin of this code.
 			 */
 			SIGDELSET(ps->ps_sigcatch, sig);
 			if (sig != SIGCONT &&
 			    sigprop(sig) & SA_IGNORE)
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 	} else {
 		/*
 		 * Avoid a possible infinite loop if the thread
 		 * masking the signal or process is ignoring the
 		 * signal.
 		 */
 		if (kern_forcesigexit &&
 		    (SIGISMEMBER(td->td_sigmask, sig) ||
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
 			SIGDELSET(td->td_sigmask, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 			SIGDELSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 		p->p_code = code;	/* XXX for core dump/debugger */
 		p->p_sig = sig;		/* XXX to verify code */
 		tdsignal(p, td, sig, ksi);
 	}
 	PROC_UNLOCK(p);
 }
 
 static struct thread *
 sigtd(struct proc *p, int sig, int prop)
 {
 	struct thread *td, *signal_td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Check if current thread can handle the signal without
 	 * switching conetxt to another thread.
 	 */
 	if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
 		return (curthread);
 	signal_td = NULL;
 	mtx_lock_spin(&sched_lock);
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (!SIGISMEMBER(td->td_sigmask, sig)) {
 			signal_td = td;
 			break;
 		}
 	}
 	if (signal_td == NULL)
 		signal_td = FIRST_THREAD_IN_PROC(p);
 	mtx_unlock_spin(&sched_lock);
 	return (signal_td);
 }
 
 /*
  * Send the signal to the process.  If the signal has an action, the action
  * is usually performed by the target process rather than the caller; we add
  * the signal to the set of pending signals for the process.
  *
  * Exceptions:
  *   o When a stop signal is sent to a sleeping process that takes the
  *     default action, the process is stopped without awakening it.
  *   o SIGCONT restarts stopped processes (or puts them back to sleep)
  *     regardless of the signal action (eg, blocked or ignored).
  *
  * Other ignored signals are discarded immediately.
  */
 void
 psignal(struct proc *p, int sig)
 {
 	(void) tdsignal(p, NULL, sig, NULL);
 }
 
 int
 psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
 {
 	struct thread *td = NULL;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue"));
 
 	/*
 	 * ksi_code and other fields should be set before
 	 * calling this function.
 	 */
 	ksi->ksi_signo = sigev->sigev_signo;
 	ksi->ksi_value = sigev->sigev_value;
 	if (sigev->sigev_notify == SIGEV_THREAD_ID) {
 		td = thread_find(p, sigev->sigev_notify_thread_id);
 		if (td == NULL)
 			return (ESRCH);
 	}
 	return (tdsignal(p, td, ksi->ksi_signo, ksi));
 }
 
 int
 tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 #ifdef KSE
 	sigset_t saved;
 	int ret;
 
 	if (p->p_flag & P_SA)
 		saved = p->p_sigqueue.sq_signals;
 	ret = do_tdsignal(p, td, sig, ksi);
 	if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
 		if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
 			/* pending set changed */
 			p->p_flag |= P_SIGEVENT;
 			wakeup(&p->p_siglist);
 		}
 	}
 	return (ret);
 }
 
 static int
 do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 #endif
 	sig_t action;
 	sigqueue_t *sigqueue;
 	int prop;
 	struct sigacts *ps;
 	int intrval;
 	int ret = 0;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if (!_SIG_VALID(sig))
 #ifdef KSE
 		panic("do_tdsignal(): invalid signal %d", sig);
 #else
 		panic("tdsignal(): invalid signal %d", sig);
 #endif
 
 #ifdef KSE
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue"));
 #else
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue"));
 #endif
 
 	/*
 	 * IEEE Std 1003.1-2001: return success when killing a zombie.
 	 */
 	if (p->p_state == PRS_ZOMBIE) {
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 
 	ps = p->p_sigacts;
 	KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
 	prop = sigprop(sig);
 
 	/*
 	 * If the signal is blocked and not destined for this thread, then
 	 * assign it to the process so that we can find it later in the first
 	 * thread that unblocks it.  Otherwise, assign it to this thread now.
 	 */
 	if (td == NULL) {
 		td = sigtd(p, sig, prop);
 		if (SIGISMEMBER(td->td_sigmask, sig))
 			sigqueue = &p->p_sigqueue;
 		else
 			sigqueue = &td->td_sigqueue;
 	} else {
 		KASSERT(td->td_proc == p, ("invalid thread"));
 		sigqueue = &td->td_sigqueue;
 	}
 
 	/*
 	 * If the signal is being ignored,
 	 * then we forget about it immediately.
 	 * (Note: we don't set SIGCONT in ps_sigignore,
 	 * and if it is set to SIG_IGN,
 	 * action will be SIG_DFL here.)
 	 */
 	mtx_lock(&ps->ps_mtx);
 	if (SIGISMEMBER(ps->ps_sigignore, sig)) {
 		mtx_unlock(&ps->ps_mtx);
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 	if (SIGISMEMBER(td->td_sigmask, sig))
 		action = SIG_HOLD;
 	else if (SIGISMEMBER(ps->ps_sigcatch, sig))
 		action = SIG_CATCH;
 	else
 		action = SIG_DFL;
 	if (SIGISMEMBER(ps->ps_sigintr, sig))
 		intrval = EINTR;
 	else
 		intrval = ERESTART;
 	mtx_unlock(&ps->ps_mtx);
 
 	if (prop & SA_CONT)
 		sigqueue_delete_stopmask_proc(p);
 	else if (prop & SA_STOP) {
 		/*
 		 * If sending a tty stop signal to a member of an orphaned
 		 * process group, discard the signal here if the action
 		 * is default; don't stop the process below if sleeping,
 		 * and don't clear any pending SIGCONT.
 		 */
 		if ((prop & SA_TTYSTOP) &&
 		    (p->p_pgrp->pg_jobc == 0) &&
 		    (action == SIG_DFL)) {
 			if (ksi && (ksi->ksi_flags & KSI_INS))
 				ksiginfo_tryfree(ksi);
 			return (ret);
 		}
 		sigqueue_delete_proc(p, SIGCONT);
 		if (p->p_flag & P_CONTINUED) {
 			p->p_flag &= ~P_CONTINUED;
 			PROC_LOCK(p->p_pptr);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(p->p_pptr);
 		}
 	}
 
 	ret = sigqueue_add(sigqueue, sig, ksi);
 	if (ret != 0)
 		return (ret);
 	signotify(td);
 	/*
 	 * Defer further processing for signals which are held,
 	 * except that stopped processes must be continued by SIGCONT.
 	 */
 	if (action == SIG_HOLD &&
 	    !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
 		return (ret);
 	/*
 	 * SIGKILL: Remove procfs STOPEVENTs.
 	 */
 	if (sig == SIGKILL) {
 		/* from procfs_ioctl.c: PIOCBIC */
 		p->p_stops = 0;
 		/* from procfs_ioctl.c: PIOCCONT */
 		p->p_step = 0;
 		wakeup(&p->p_step);
 	}
 	/*
 	 * Some signals have a process-wide effect and a per-thread
 	 * component.  Most processing occurs when the process next
 	 * tries to cross the user boundary, however there are some
 	 * times when processing needs to be done immediatly, such as
 	 * waking up threads so that they can cross the user boundary.
 	 * We try do the per-process part here.
 	 */
 	if (P_SHOULDSTOP(p)) {
 		/*
 		 * The process is in stopped mode. All the threads should be
 		 * either winding down or already on the suspended queue.
 		 */
 		if (p->p_flag & P_TRACED) {
 			/*
 			 * The traced process is already stopped,
 			 * so no further action is necessary.
 			 * No signal can restart us.
 			 */
 			goto out;
 		}
 
 		if (sig == SIGKILL) {
 			/*
 			 * SIGKILL sets process running.
 			 * It will die elsewhere.
 			 * All threads must be restarted.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			goto runfast;
 		}
 
 		if (prop & SA_CONT) {
 			/*
 			 * If SIGCONT is default (or ignored), we continue the
 			 * process but don't leave the signal in sigqueue as
 			 * it has no further action.  If SIGCONT is held, we
 			 * continue the process and leave the signal in
 			 * sigqueue.  If the process catches SIGCONT, let it
 			 * handle the signal itself.  If it isn't waiting on
 			 * an event, it goes back to run state.
 			 * Otherwise, process goes back to sleep state.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			if (p->p_numthreads == p->p_suspcount) {
 				p->p_flag |= P_CONTINUED;
 				p->p_xstat = SIGCONT;
 				PROC_LOCK(p->p_pptr);
 				childproc_continued(p);
 				PROC_UNLOCK(p->p_pptr);
 			}
 			if (action == SIG_DFL) {
 				sigqueue_delete(sigqueue, sig);
 			} else if (action == SIG_CATCH) {
 #ifdef KSE
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 * It would seem that the answer would be to
 				 * run an upcall in the next KSE to run, and
 				 * deliver the signal that way. In a NON KSE
 				 * process, we need to make sure that the
 				 * single thread is runnable asap.
 				 * XXXKSE for now however, make them all run.
 				 */
 #else
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 */
 #endif
 				goto runfast;
 			}
 			/*
 			 * The signal is not ignored or caught.
 			 */
 			mtx_lock_spin(&sched_lock);
 			thread_unsuspend(p);
 			mtx_unlock_spin(&sched_lock);
 			goto out;
 		}
 
 		if (prop & SA_STOP) {
 			/*
 			 * Already stopped, don't need to stop again
 			 * (If we did the shell could get confused).
 			 * Just make sure the signal STOP bit set.
 			 */
 			p->p_flag |= P_STOPPED_SIG;
 			sigqueue_delete(sigqueue, sig);
 			goto out;
 		}
 
 		/*
 		 * All other kinds of signals:
 		 * If a thread is sleeping interruptibly, simulate a
 		 * wakeup so that when it is continued it will be made
 		 * runnable and can look at the signal.  However, don't make
 		 * the PROCESS runnable, leave it stopped.
 		 * It may run a bit until it hits a thread_suspend_check().
 		 */
 		mtx_lock_spin(&sched_lock);
 		if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
 			sleepq_abort(td, intrval);
 		mtx_unlock_spin(&sched_lock);
 		goto out;
 		/*
 		 * Mutexes are short lived. Threads waiting on them will
 		 * hit thread_suspend_check() soon.
 		 */
 	} else if (p->p_state == PRS_NORMAL) {
 		if (p->p_flag & P_TRACED || action == SIG_CATCH) {
 			mtx_lock_spin(&sched_lock);
 			tdsigwakeup(td, sig, action, intrval);
 			mtx_unlock_spin(&sched_lock);
 			goto out;
 		}
 
 		MPASS(action == SIG_DFL);
 
 		if (prop & SA_STOP) {
 			if (p->p_flag & P_PPWAIT)
 				goto out;
 			p->p_flag |= P_STOPPED_SIG;
 			p->p_xstat = sig;
 			mtx_lock_spin(&sched_lock);
 			sig_suspend_threads(td, p, 1);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * only thread sending signal to another
 				 * process can reach here, if thread is sending
 				 * signal to its process, because thread does
 				 * not suspend itself here, p_numthreads
 				 * should never be equal to p_suspcount.
 				 */
 				thread_stopped(p);
 				mtx_unlock_spin(&sched_lock);
 				sigqueue_delete_proc(p, p->p_xstat);
 			} else
 				mtx_unlock_spin(&sched_lock);
 			goto out;
 		} 
 		else
 			goto runfast;
 		/* NOTREACHED */
 	} else {
 		/* Not in "NORMAL" state. discard the signal. */
 		sigqueue_delete(sigqueue, sig);
 		goto out;
 	}
 
 	/*
 	 * The process is not stopped so we need to apply the signal to all the
 	 * running threads.
 	 */
 
 runfast:
 	mtx_lock_spin(&sched_lock);
 	tdsigwakeup(td, sig, action, intrval);
 	thread_unsuspend(p);
 	mtx_unlock_spin(&sched_lock);
 out:
 	/* If we jump here, sched_lock should not be owned. */
 	mtx_assert(&sched_lock, MA_NOTOWNED);
 	return (ret);
 }
 
 /*
  * The force of a signal has been directed against a single
  * thread.  We need to see what we can do about knocking it
  * out of any sleep it may be in etc.
  */
 static void
 tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
 {
 	struct proc *p = td->td_proc;
 	register int prop;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&sched_lock, MA_OWNED);
 	prop = sigprop(sig);
 
 	/*
 	 * Bring the priority of a thread up if we want it to get
 	 * killed in this lifetime.
 	 */
 	if (action == SIG_DFL && (prop & SA_KILL)) {
 		if (p->p_nice > 0)
 			sched_nice(td->td_proc, 0);
 		if (td->td_priority > PUSER)
 			sched_prio(td, PUSER);
 	}
 
 	if (TD_ON_SLEEPQ(td)) {
 		/*
 		 * If thread is sleeping uninterruptibly
 		 * we can't interrupt the sleep... the signal will
 		 * be noticed when the process returns through
 		 * trap() or syscall().
 		 */
 		if ((td->td_flags & TDF_SINTR) == 0)
 			return;
 		/*
 		 * If SIGCONT is default (or ignored) and process is
 		 * asleep, we are finished; the process should not
 		 * be awakened.
 		 */
 		if ((prop & SA_CONT) && action == SIG_DFL) {
 			mtx_unlock_spin(&sched_lock);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			/*
 			 * It may be on either list in this state.
 			 * Remove from both for now.
 			 */
 			sigqueue_delete(&td->td_sigqueue, sig);
 			mtx_lock_spin(&sched_lock);
 			return;
 		}
 
 		/*
 		 * Give low priority threads a better chance to run.
 		 */
 		if (td->td_priority > PUSER)
 			sched_prio(td, PUSER);
 
 		sleepq_abort(td, intrval);
 	} else {
 		/*
 		 * Other states do nothing with the signal immediately,
 		 * other than kicking ourselves if we are running.
 		 * It will either never be noticed, or noticed very soon.
 		 */
 #ifdef SMP
 		if (TD_IS_RUNNING(td) && td != curthread)
 			forward_signal(td);
 #endif
 	}
 }
 
 static void
 sig_suspend_threads(struct thread *td, struct proc *p, int sending)
 {
 	struct thread *td2;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, td2) {
 		if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
 		    (td2->td_flags & TDF_SINTR) &&
 		    !TD_IS_SUSPENDED(td2)) {
 			thread_suspend_one(td2);
 		} else {
 			if (sending || td != td2)
 				td2->td_flags |= TDF_ASTPENDING;
 #ifdef SMP
 			if (TD_IS_RUNNING(td2) && td2 != td)
 				forward_signal(td2);
 #endif
 		}
 	}
 }
 
 int
 ptracestop(struct thread *td, int sig)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 	    &p->p_mtx.mtx_object, "Stopping for traced signal");
 
 	mtx_lock_spin(&sched_lock);
 	td->td_flags |= TDF_XSIG;
 	mtx_unlock_spin(&sched_lock);
 	td->td_xsig = sig;
 	while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) {
 		if (p->p_flag & P_SINGLE_EXIT) {
 			mtx_lock_spin(&sched_lock);
 			td->td_flags &= ~TDF_XSIG;
 			mtx_unlock_spin(&sched_lock);
 			return (sig);
 		}
 		/*
 		 * Just make wait() to work, the last stopped thread
 		 * will win.
 		 */
 		p->p_xstat = sig;
 		p->p_xthread = td;
 		p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
 		mtx_lock_spin(&sched_lock);
 		sig_suspend_threads(td, p, 0);
 stopme:
 		thread_stopped(p);
 		thread_suspend_one(td);
 		PROC_UNLOCK(p);
 		DROP_GIANT();
 		mi_switch(SW_VOL, NULL);
 		mtx_unlock_spin(&sched_lock);
 		PICKUP_GIANT();
 		PROC_LOCK(p);
 		if (!(p->p_flag & P_TRACED))
 			break;
 		if (td->td_flags & TDF_DBSUSPEND) {
 			if (p->p_flag & P_SINGLE_EXIT)
 				break;
 			mtx_lock_spin(&sched_lock);
 			goto stopme;
 		}
 	}
 	return (td->td_xsig);
 }
 
 /*
  * If the current process has received a signal (should be caught or cause
  * termination, should interrupt current syscall), return the signal number.
  * Stop signals with default action are processed immediately, then cleared;
  * they aren't returned.  This is checked after each entry to the system for
  * a syscall or trap (though this can usually be done without calling issignal
  * by checking the pending signal masks in cursig.) The normal call
  * sequence is
  *
  *	while (sig = cursig(curthread))
  *		postsig(sig);
  */
 static int
 issignal(td)
 	struct thread *td;
 {
 	struct proc *p;
 	struct sigacts *ps;
 	sigset_t sigpending;
 	int sig, prop, newsig;
 
 	p = td->td_proc;
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	for (;;) {
 		int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
 
 		sigpending = td->td_sigqueue.sq_signals;
 		SIGSETNAND(sigpending, td->td_sigmask);
 
 		if (p->p_flag & P_PPWAIT)
 			SIG_STOPSIGMASK(sigpending);
 		if (SIGISEMPTY(sigpending))	/* no signal to send */
 			return (0);
 		sig = sig_ffs(&sigpending);
 
 		if (p->p_stops & S_SIG) {
 			mtx_unlock(&ps->ps_mtx);
 			stopevent(p, S_SIG, sig);
 			mtx_lock(&ps->ps_mtx);
 		}
 
 		/*
 		 * We should see pending but ignored signals
 		 * only if P_TRACED was on when they were posted.
 		 */
 		if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
 			sigqueue_delete(&td->td_sigqueue, sig);
 #ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				SIGADDSET(td->td_sigmask, sig);
 #endif
 			continue;
 		}
 		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
 			/*
 			 * If traced, always stop.
 			 */
 			mtx_unlock(&ps->ps_mtx);
 			newsig = ptracestop(td, sig);
 			mtx_lock(&ps->ps_mtx);
 
 #ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				SIGADDSET(td->td_sigmask, sig);
 
 #endif
 			if (sig != newsig) {
 				ksiginfo_t ksi;
 				/*
 				 * clear old signal.
 				 * XXX shrug off debugger, it causes siginfo to
 				 * be thrown away.
 				 */
 				sigqueue_get(&td->td_sigqueue, sig, &ksi);
 
 				/*
 				 * If parent wants us to take the signal,
 				 * then it will leave it in p->p_xstat;
 				 * otherwise we just look for signals again.
 			 	*/
 				if (newsig == 0)
 					continue;
 				sig = newsig;
 
 				/*
 				 * Put the new signal into td_sigqueue. If the
 				 * signal is being masked, look for other signals.
 				 */
 				SIGADDSET(td->td_sigqueue.sq_signals, sig);
 #ifdef KSE
 				if (td->td_pflags & TDP_SA)
 					SIGDELSET(td->td_sigmask, sig);
 #endif
 				if (SIGISMEMBER(td->td_sigmask, sig))
 					continue;
 				signotify(td);
 			}
 
 			/*
 			 * If the traced bit got turned off, go back up
 			 * to the top to rescan signals.  This ensures
 			 * that p_sig* and p_sigact are consistent.
 			 */
 			if ((p->p_flag & P_TRACED) == 0)
 				continue;
 		}
 
 		prop = sigprop(sig);
 
 		/*
 		 * Decide whether the signal should be returned.
 		 * Return the signal's number, or fall through
 		 * to clear it from the pending mask.
 		 */
 		switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
 
 		case (intptr_t)SIG_DFL:
 			/*
 			 * Don't take default actions on system processes.
 			 */
 			if (p->p_pid <= 1) {
 #ifdef DIAGNOSTIC
 				/*
 				 * Are you sure you want to ignore SIGSEGV
 				 * in init? XXX
 				 */
 				printf("Process (pid %lu) got signal %d\n",
 					(u_long)p->p_pid, sig);
 #endif
 				break;		/* == ignore */
 			}
 			/*
 			 * If there is a pending stop signal to process
 			 * with default action, stop here,
 			 * then clear the signal.  However,
 			 * if process is member of an orphaned
 			 * process group, ignore tty stop signals.
 			 */
 			if (prop & SA_STOP) {
 				if (p->p_flag & P_TRACED ||
 		    		    (p->p_pgrp->pg_jobc == 0 &&
 				     prop & SA_TTYSTOP))
 					break;	/* == ignore */
 				mtx_unlock(&ps->ps_mtx);
 				WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 				    &p->p_mtx.mtx_object, "Catching SIGSTOP");
 				p->p_flag |= P_STOPPED_SIG;
 				p->p_xstat = sig;
 				mtx_lock_spin(&sched_lock);
 				sig_suspend_threads(td, p, 0);
 				thread_stopped(p);
 				thread_suspend_one(td);
 				PROC_UNLOCK(p);
 				DROP_GIANT();
 				mi_switch(SW_INVOL, NULL);
 				mtx_unlock_spin(&sched_lock);
 				PICKUP_GIANT();
 				PROC_LOCK(p);
 				mtx_lock(&ps->ps_mtx);
 				break;
 			} else if (prop & SA_IGNORE) {
 				/*
 				 * Except for SIGCONT, shouldn't get here.
 				 * Default action is to ignore; drop it.
 				 */
 				break;		/* == ignore */
 			} else
 				return (sig);
 			/*NOTREACHED*/
 
 		case (intptr_t)SIG_IGN:
 			/*
 			 * Masking above should prevent us ever trying
 			 * to take action on an ignored signal other
 			 * than SIGCONT, unless process is traced.
 			 */
 			if ((prop & SA_CONT) == 0 &&
 			    (p->p_flag & P_TRACED) == 0)
 				printf("issignal\n");
 			break;		/* == ignore */
 
 		default:
 			/*
 			 * This signal has an action, let
 			 * postsig() process it.
 			 */
 			return (sig);
 		}
 		sigqueue_delete(&td->td_sigqueue, sig);		/* take the signal! */
 	}
 	/* NOTREACHED */
 }
 
 void
 thread_stopped(struct proc *p)
 {
 	int n;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&sched_lock, MA_OWNED);
 	n = p->p_suspcount;
 	if (p == curproc)
 		n++;
 	if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) {
 		mtx_unlock_spin(&sched_lock);
 		p->p_flag &= ~P_WAITED;
 		PROC_LOCK(p->p_pptr);
 		childproc_stopped(p, (p->p_flag & P_TRACED) ?
 			CLD_TRAPPED : CLD_STOPPED);
 		PROC_UNLOCK(p->p_pptr);
 		mtx_lock_spin(&sched_lock);
 	}
 }
  
 /*
  * Take the action for the specified signal
  * from the current set of pending signals.
  */
 void
 postsig(sig)
 	register int sig;
 {
 	struct thread *td = curthread;
 	register struct proc *p = td->td_proc;
 	struct sigacts *ps;
 	sig_t action;
 	ksiginfo_t ksi;
 	sigset_t returnmask;
 	int code;
 
 	KASSERT(sig != 0, ("postsig"));
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	ksiginfo_init(&ksi);
 	sigqueue_get(&td->td_sigqueue, sig, &ksi);
 	ksi.ksi_signo = sig;
 	if (ksi.ksi_code == SI_TIMER)
 		itimer_accept(p, ksi.ksi_timerid, &ksi);
 	action = ps->ps_sigact[_SIG_IDX(sig)];
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_PSIG))
 		ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
 		    &td->td_oldsigmask : &td->td_sigmask, 0);
 #endif
 	if (p->p_stops & S_SIG) {
 		mtx_unlock(&ps->ps_mtx);
 		stopevent(p, S_SIG, sig);
 		mtx_lock(&ps->ps_mtx);
 	}
 
 #ifdef KSE
 	if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) {
 #else
 	if (action == SIG_DFL) {
 #endif
 		/*
 		 * Default action, where the default is to kill
 		 * the process.  (Other cases were ignored above.)
 		 */
 		mtx_unlock(&ps->ps_mtx);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	} else {
 #ifdef KSE
 		if (td->td_pflags & TDP_SA) {
 			if (sig == SIGKILL) {
 				mtx_unlock(&ps->ps_mtx);
 				sigexit(td, sig);
 			}
 		}
 
 #endif
 		/*
 		 * If we get here, the signal must be caught.
 		 */
 		KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig),
 		    ("postsig action"));
 		/*
 		 * Set the new mask value and also defer further
 		 * occurrences of this signal.
 		 *
 		 * Special case: user has done a sigsuspend.  Here the
 		 * current mask is not of interest, but rather the
 		 * mask from before the sigsuspend is what we want
 		 * restored after the signal processing is completed.
 		 */
 		if (td->td_pflags & TDP_OLDMASK) {
 			returnmask = td->td_oldsigmask;
 			td->td_pflags &= ~TDP_OLDMASK;
 		} else
 			returnmask = td->td_sigmask;
 
 		SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(td->td_sigmask, sig);
 
 		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
 			/*
 			 * See kern_sigaction() for origin of this code.
 			 */
 			SIGDELSET(ps->ps_sigcatch, sig);
 			if (sig != SIGCONT &&
 			    sigprop(sig) & SA_IGNORE)
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		p->p_stats->p_ru.ru_nsignals++;
 		if (p->p_sig != sig) {
 			code = 0;
 		} else {
 			code = p->p_code;
 			p->p_code = 0;
 			p->p_sig = 0;
 		}
 #ifdef KSE
 		if (td->td_pflags & TDP_SA)
 			thread_signal_add(curthread, &ksi);
 		else
 			(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 #else
 		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 #endif
 	}
 }
 
 /*
  * Kill the current process for stated reason.
  */
 void
 killproc(p, why)
 	struct proc *p;
 	char *why;
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
 		p, p->p_pid, p->p_comm);
 	log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
 		p->p_ucred ? p->p_ucred->cr_uid : -1, why);
 	psignal(p, SIGKILL);
 }
 
 /*
  * Force the current process to exit with the specified signal, dumping core
  * if appropriate.  We bypass the normal tests for masked and caught signals,
  * allowing unrecoverable failures to terminate the process without changing
  * signal state.  Mark the accounting record with the signal termination.
  * If dumping core, save the signal number for the debugger.  Calls exit and
  * does not return.
  */
 void
 sigexit(td, sig)
 	struct thread *td;
 	int sig;
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_acflag |= AXSIG;
 	/*
 	 * We must be single-threading to generate a core dump.  This
 	 * ensures that the registers in the core file are up-to-date.
 	 * Also, the ELF dump handler assumes that the thread list doesn't
 	 * change out from under it.
 	 *
 	 * XXX If another thread attempts to single-thread before us
 	 *     (e.g. via fork()), we won't get a dump at all.
 	 */
 	if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) {
 		p->p_sig = sig;
 		/*
 		 * Log signals which would cause core dumps
 		 * (Log as LOG_INFO to appease those who don't want
 		 * these messages.)
 		 * XXX : Todo, as well as euid, write out ruid too
 		 * Note that coredump() drops proc lock.
 		 */
 		if (coredump(td) == 0)
 			sig |= WCOREFLAG;
 		if (kern_logsigexit)
 			log(LOG_INFO,
 			    "pid %d (%s), uid %d: exited on signal %d%s\n",
 			    p->p_pid, p->p_comm,
 			    td->td_ucred ? td->td_ucred->cr_uid : -1,
 			    sig &~ WCOREFLAG,
 			    sig & WCOREFLAG ? " (core dumped)" : "");
 	} else
 		PROC_UNLOCK(p);
 	exit1(td, W_EXITCODE(0, sig));
 	/* NOTREACHED */
 }
 
 /*
  * Send queued SIGCHLD to parent when child process's state
  * is changed.
  */
 static void
 sigparent(struct proc *p, int reason, int status)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	if (p->p_ksi != NULL) {
 		p->p_ksi->ksi_signo  = SIGCHLD;
 		p->p_ksi->ksi_code   = reason;
 		p->p_ksi->ksi_status = status;
 		p->p_ksi->ksi_pid    = p->p_pid;
 		p->p_ksi->ksi_uid    = p->p_ucred->cr_ruid;
 		if (KSI_ONQ(p->p_ksi))
 			return;
 	}
 	tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi);
 }
 
 static void
 childproc_jobstate(struct proc *p, int reason, int status)
 {
 	struct sigacts *ps;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	/*
 	 * Wake up parent sleeping in kern_wait(), also send
 	 * SIGCHLD to parent, but SIGCHLD does not guarantee
 	 * that parent will awake, because parent may masked
 	 * the signal.
 	 */
 	p->p_pptr->p_flag |= P_STATCHILD;
 	wakeup(p->p_pptr);
 
 	ps = p->p_pptr->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
 		mtx_unlock(&ps->ps_mtx);
 		sigparent(p, reason, status);
 	} else
 		mtx_unlock(&ps->ps_mtx);
 }
 
 void
 childproc_stopped(struct proc *p, int reason)
 {
 	childproc_jobstate(p, reason, p->p_xstat);
 }
 
 void
 childproc_continued(struct proc *p)
 {
 	childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
 }
 
 void
 childproc_exited(struct proc *p)
 {
 	int reason;
 	int status = p->p_xstat; /* convert to int */
 
 	reason = CLD_EXITED;
 	if (WCOREDUMP(status))
 		reason = CLD_DUMPED;
 	else if (WIFSIGNALED(status))
 		reason = CLD_KILLED;
 	/*
 	 * XXX avoid calling wakeup(p->p_pptr), the work is
 	 * done in exit1().
 	 */
 	sigparent(p, reason, status);
 }
 
 static char corefilename[MAXPATHLEN] = {"%N.core"};
 SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
 	      sizeof(corefilename), "process corefile name format string");
 
 /*
  * expand_name(name, uid, pid)
  * Expand the name described in corefilename, using name, uid, and pid.
  * corefilename is a printf-like string, with three format specifiers:
  *	%N	name of process ("name")
  *	%P	process id (pid)
  *	%U	user id (uid)
  * For example, "%N.core" is the default; they can be disabled completely
  * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
  * This is controlled by the sysctl variable kern.corefile (see above).
  */
 
 static char *
 expand_name(name, uid, pid)
 	const char *name;
 	uid_t uid;
 	pid_t pid;
 {
 	const char *format, *appendstr;
 	char *temp;
 	char buf[11];		/* Buffer for pid/uid -- max 4B */
 	size_t i, l, n;
 
 	format = corefilename;
 	temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO);
 	if (temp == NULL)
 		return (NULL);
 	for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) {
 		switch (format[i]) {
 		case '%':	/* Format character */
 			i++;
 			switch (format[i]) {
 			case '%':
 				appendstr = "%";
 				break;
 			case 'N':	/* process name */
 				appendstr = name;
 				break;
 			case 'P':	/* process id */
 				sprintf(buf, "%u", pid);
 				appendstr = buf;
 				break;
 			case 'U':	/* user id */
 				sprintf(buf, "%u", uid);
 				appendstr = buf;
 				break;
 			default:
 				appendstr = "";
 			  	log(LOG_ERR,
 				    "Unknown format character %c in `%s'\n",
 				    format[i], format);
 			}
 			l = strlen(appendstr);
 			if ((n + l) >= MAXPATHLEN)
 				goto toolong;
 			memcpy(temp + n, appendstr, l);
 			n += l;
 			break;
 		default:
 			temp[n++] = format[i];
 		}
 	}
 	if (format[i] != '\0')
 		goto toolong;
 	return (temp);
 toolong:
 	log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n",
 	    (long)pid, name, (u_long)uid);
 	free(temp, M_TEMP);
 	return (NULL);
 }
 
 /*
  * Dump a process' core.  The main routine does some
  * policy checking, and creates the name of the coredump;
  * then it passes on a vnode and a size limit to the process-specific
  * coredump routine if there is one; if there _is not_ one, it returns
  * ENOSYS; otherwise it returns the error from the process-specific routine.
  */
 
 static int
 coredump(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	register struct vnode *vp;
 	register struct ucred *cred = td->td_ucred;
 	struct flock lf;
 	struct nameidata nd;
 	struct vattr vattr;
 	int error, error1, flags, locked;
 	struct mount *mp;
 	char *name;			/* name of corefile */
 	off_t limit;
 	int vfslocked;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
 	_STOPEVENT(p, S_CORE, 0);
 
 	if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) {
 		PROC_UNLOCK(p);
 		return (EFAULT);
 	}
 	
 	/*
 	 * Note that the bulk of limit checking is done after
 	 * the corefile is created.  The exception is if the limit
 	 * for corefiles is 0, in which case we don't bother
 	 * creating the corefile at all.  This layout means that
 	 * a corefile is truncated instead of not being created,
 	 * if it is larger than the limit.
 	 */
 	limit = (off_t)lim_cur(p, RLIMIT_CORE);
 	PROC_UNLOCK(p);
 	if (limit == 0)
 		return (EFBIG);
 
 restart:
 	name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid);
 	if (name == NULL)
 		return (EINVAL);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td);
 	flags = O_CREAT | FWRITE | O_NOFOLLOW;
 	error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, -1);
 	free(name, M_TEMP);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/* Don't dump to non-regular files or files with links. */
 	if (vp->v_type != VREG ||
 	    VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) {
 		VOP_UNLOCK(vp, 0, td);
 		error = EFAULT;
 		goto close;
 	}
 
 	VOP_UNLOCK(vp, 0, td);
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = F_WRLCK;
 	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
 
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		lf.l_type = F_UNLCK;
 		if (locked)
 			VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 		if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
 			goto out;
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto restart;
 	}
 
 	VATTR_NULL(&vattr);
 	vattr.va_size = 0;
 	if (set_core_nodump_flag)
 		vattr.va_flags = UF_NODUMP;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	VOP_LEASE(vp, td, cred, LEASE_WRITE);
 	VOP_SETATTR(vp, &vattr, cred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	PROC_LOCK(p);
 	p->p_acflag |= ACORE;
 	PROC_UNLOCK(p);
 
 	error = p->p_sysent->sv_coredump ?
 	  p->p_sysent->sv_coredump(td, vp, limit) :
 	  ENOSYS;
 
 	if (locked) {
 		lf.l_type = F_UNLCK;
 		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 	}
 close:
 	error1 = vn_close(vp, FWRITE, cred, td);
 	if (error == 0)
 		error = error1;
 out:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Nonexistent system call-- signal process (may want to handle it).  Flag
  * error in case process won't see signal immediately (blocked or ignored).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nosys_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 nosys(td, args)
 	struct thread *td;
 	struct nosys_args *args;
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK(p);
 	psignal(p, SIGSYS);
 	PROC_UNLOCK(p);
 	return (ENOSYS);
 }
 
 /*
  * Send a SIGIO or SIGURG signal to a process or process group using stored
  * credentials rather than those of the current process.
  */
 void
 pgsigio(sigiop, sig, checkctty)
 	struct sigio **sigiop;
 	int sig, checkctty;
 {
 	struct sigio *sigio;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	if (sigio->sio_pgid > 0) {
 		PROC_LOCK(sigio->sio_proc);
 		if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
 			psignal(sigio->sio_proc, sig);
 		PROC_UNLOCK(sigio->sio_proc);
 	} else if (sigio->sio_pgid < 0) {
 		struct proc *p;
 
 		PGRP_LOCK(sigio->sio_pgrp);
 		LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
 			    (checkctty == 0 || (p->p_flag & P_CONTROLT)))
 				psignal(p, sig);
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(sigio->sio_pgrp);
 	}
 	SIGIO_UNLOCK();
 }
 
 static int
 filt_sigattach(struct knote *kn)
 {
 	struct proc *p = curproc;
 
 	kn->kn_ptr.p_proc = p;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 
 	knlist_add(&p->p_klist, kn, 0);
 
 	return (0);
 }
 
 static void
 filt_sigdetach(struct knote *kn)
 {
 	struct proc *p = kn->kn_ptr.p_proc;
 
 	knlist_remove(&p->p_klist, kn, 0);
 }
 
 /*
  * signal knotes are shared with proc knotes, so we apply a mask to 
  * the hint in order to differentiate them from process hints.  This
  * could be avoided by using a signal-specific knote list, but probably
  * isn't worth the trouble.
  */
 static int
 filt_signal(struct knote *kn, long hint)
 {
 
 	if (hint & NOTE_SIGNAL) {
 		hint &= ~NOTE_SIGNAL;
 
 		if (kn->kn_id == hint)
 			kn->kn_data++;
 	}
 	return (kn->kn_data != 0);
 }
 
 struct sigacts *
 sigacts_alloc(void)
 {
 	struct sigacts *ps;
 
 	ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO);
 	ps->ps_refcnt = 1;
 	mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF);
 	return (ps);
 }
 
 void
 sigacts_free(struct sigacts *ps)
 {
 
 	mtx_lock(&ps->ps_mtx);
 	ps->ps_refcnt--;
 	if (ps->ps_refcnt == 0) {
 		mtx_destroy(&ps->ps_mtx);
 		free(ps, M_SUBPROC);
 	} else
 		mtx_unlock(&ps->ps_mtx);
 }
 
 struct sigacts *
 sigacts_hold(struct sigacts *ps)
 {
 	mtx_lock(&ps->ps_mtx);
 	ps->ps_refcnt++;
 	mtx_unlock(&ps->ps_mtx);
 	return (ps);
 }
 
 void
 sigacts_copy(struct sigacts *dest, struct sigacts *src)
 {
 
 	KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
 	mtx_lock(&src->ps_mtx);
 	bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
 	mtx_unlock(&src->ps_mtx);
 }
 
 int
 sigacts_shared(struct sigacts *ps)
 {
 	int shared;
 
 	mtx_lock(&ps->ps_mtx);
 	shared = ps->ps_refcnt > 1;
 	mtx_unlock(&ps->ps_mtx);
 	return (shared);
 }
Index: head/sys/kern/kern_synch.c
===================================================================
--- head/sys/kern/kern_synch.c	(revision 167231)
+++ head/sys/kern/kern_synch.c	(revision 167232)
@@ -1,577 +1,577 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmmeter.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 
 #include <machine/cpu.h>
 
 static void synch_setup(void *dummy);
 SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL)
 
 int	hogticks;
 int	lbolt;
 static int pause_wchan;
 
 static struct callout loadav_callout;
 static struct callout lbolt_callout;
 
 struct loadavg averunnable =
 	{ {0, 0, 0}, FSCALE };	/* load average, of runnable procs */
 /*
  * Constants for averages over 1, 5, and 15 minutes
  * when sampling at 5 second intervals.
  */
 static fixpt_t cexp[3] = {
 	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
 	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
 	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
 };
 
 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
 static int      fscale __unused = FSCALE;
 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 
 static void	loadav(void *arg);
 static void	lboltcb(void *arg);
 
 void
 sleepinit(void)
 {
 
 	hogticks = (hz / 10) * 2;	/* Default only. */
 	init_sleepqueues();
 }
 
 /*
  * General sleep call.  Suspends the current thread until a wakeup is
  * performed on the specified identifier.  The thread will then be made
  * runnable with the specified priority.  Sleeps at most timo/hz seconds
  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
  * before and after sleeping, else signals are not checked.  Returns 0 if
  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
  * signal needs to be delivered, ERESTART is returned if the current system
  * call should be restarted if possible, and EINTR is returned if the system
  * call should be interrupted by the signal (return EINTR).
  *
  * The mutex argument is unlocked before the caller is suspended, and
  * re-locked before msleep returns.  If priority includes the PDROP
  * flag the mutex is not re-locked before returning.
  */
 int
 msleep(ident, mtx, priority, wmesg, timo)
 	void *ident;
 	struct mtx *mtx;
 	int priority, timo;
 	const char *wmesg;
 {
 	struct thread *td;
 	struct proc *p;
 	int catch, rval, flags, pri;
 	WITNESS_SAVE_DECL(mtx);
 
 	td = curthread;
 	p = td->td_proc;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(1, 0);
 #endif
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, mtx == NULL ? NULL :
 	    &mtx->mtx_object, "Sleeping on \"%s\"", wmesg);
 	KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL ||
 	    ident == &lbolt, ("sleeping without a mutex"));
 	KASSERT(p != NULL, ("msleep1"));
 	KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
 
 	if (cold) {
 		/*
 		 * During autoconfiguration, just return;
 		 * don't run any other threads or panic below,
 		 * in case this is the idle thread and already asleep.
 		 * XXX: this used to do "s = splhigh(); splx(safepri);
 		 * splx(s);" to give interrupts a chance, but there is
 		 * no way to give interrupts a chance now.
 		 */
 		if (mtx != NULL && priority & PDROP)
 			mtx_unlock(mtx);
 		return (0);
 	}
 	catch = priority & PCATCH;
 	rval = 0;
 
 	/*
 	 * If we are already on a sleep queue, then remove us from that
 	 * sleep queue first.  We have to do this to handle recursive
 	 * sleeps.
 	 */
 	if (TD_ON_SLEEPQ(td))
 		sleepq_remove(td, td->td_wchan);
 
 	if (ident == &pause_wchan)
 		flags = SLEEPQ_PAUSE;
 	else
 		flags = SLEEPQ_MSLEEP;
 	if (catch)
 		flags |= SLEEPQ_INTERRUPTIBLE;
 
 	sleepq_lock(ident);
 	CTR5(KTR_PROC, "msleep: thread %ld (pid %ld, %s) on %s (%p)",
 	    td->td_tid, p->p_pid, p->p_comm, wmesg, ident);
 
 	DROP_GIANT();
 	if (mtx != NULL) {
 		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
 		WITNESS_SAVE(&mtx->mtx_object, mtx);
 		mtx_unlock(mtx);
 	}
 
 	/*
 	 * We put ourselves on the sleep queue and start our timeout
 	 * before calling thread_suspend_check, as we could stop there,
 	 * and a wakeup or a SIGCONT (or both) could occur while we were
 	 * stopped without resuming us.  Thus, we must be ready for sleep
 	 * when cursig() is called.  If the wakeup happens while we're
 	 * stopped, then td will no longer be on a sleep queue upon
 	 * return from cursig().
 	 */
 	sleepq_add(ident, ident == &lbolt ? NULL : &mtx->mtx_object, wmesg,
 	    flags, 0);
 	if (timo)
 		sleepq_set_timeout(ident, timo);
 
 	/*
 	 * Adjust this thread's priority, if necessary.
 	 */
 	pri = priority & PRIMASK;
 	if (pri != 0 && pri != td->td_priority) {
 		mtx_lock_spin(&sched_lock);
 		sched_prio(td, pri);
 		mtx_unlock_spin(&sched_lock);
 	}
 
 	if (timo && catch)
 		rval = sleepq_timedwait_sig(ident);
 	else if (timo)
 		rval = sleepq_timedwait(ident);
 	else if (catch)
 		rval = sleepq_wait_sig(ident);
 	else {
 		sleepq_wait(ident);
 		rval = 0;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(0, 0);
 #endif
 	PICKUP_GIANT();
 	if (mtx != NULL && !(priority & PDROP)) {
 		mtx_lock(mtx);
 		WITNESS_RESTORE(&mtx->mtx_object, mtx);
 	}
 	return (rval);
 }
 
 int
 msleep_spin(ident, mtx, wmesg, timo)
 	void *ident;
 	struct mtx *mtx;
 	const char *wmesg;
 	int timo;
 {
 	struct thread *td;
 	struct proc *p;
 	int rval;
 	WITNESS_SAVE_DECL(mtx);
 
 	td = curthread;
 	p = td->td_proc;
 	KASSERT(mtx != NULL, ("sleeping without a mutex"));
 	KASSERT(p != NULL, ("msleep1"));
 	KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
 
 	if (cold) {
 		/*
 		 * During autoconfiguration, just return;
 		 * don't run any other threads or panic below,
 		 * in case this is the idle thread and already asleep.
 		 * XXX: this used to do "s = splhigh(); splx(safepri);
 		 * splx(s);" to give interrupts a chance, but there is
 		 * no way to give interrupts a chance now.
 		 */
 		return (0);
 	}
 
 	sleepq_lock(ident);
 	CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
 	    td->td_tid, p->p_pid, p->p_comm, wmesg, ident);
 
 	DROP_GIANT();
 	mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
 	WITNESS_SAVE(&mtx->mtx_object, mtx);
 	mtx_unlock_spin(mtx);
 
 	/*
 	 * We put ourselves on the sleep queue and start our timeout.
 	 */
 	sleepq_add(ident, &mtx->mtx_object, wmesg, SLEEPQ_MSLEEP, 0);
 	if (timo)
 		sleepq_set_timeout(ident, timo);
 
 	/*
 	 * Can't call ktrace with any spin locks held so it can lock the
 	 * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
 	 * any spin lock.  Thus, we have to drop the sleepq spin lock while
 	 * we handle those requests.  This is safe since we have placed our
 	 * thread on the sleep queue already.
 	 */
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW)) {
 		sleepq_release(ident);
 		ktrcsw(1, 0);
 		sleepq_lock(ident);
 	}
 #endif
 #ifdef WITNESS
 	sleepq_release(ident);
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
 	    wmesg);
 	sleepq_lock(ident);
 #endif
 	if (timo)
 		rval = sleepq_timedwait(ident);
 	else {
 		sleepq_wait(ident);
 		rval = 0;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(0, 0);
 #endif
 	PICKUP_GIANT();
 	mtx_lock_spin(mtx);
 	WITNESS_RESTORE(&mtx->mtx_object, mtx);
 	return (rval);
 }
 
 /*
  * pause() is like tsleep() except that the intention is to not be
  * explicitly woken up by another thread.  Instead, the current thread
  * simply wishes to sleep until the timeout expires.  It is
  * implemented using a dummy wait channel.
  */
 int
 pause(wmesg, timo)
 	const char *wmesg;
 	int timo;
 {
 
 	KASSERT(timo != 0, ("pause: timeout required"));
 	return (tsleep(&pause_wchan, 0, wmesg, timo));
 }
 
 /*
  * Make all threads sleeping on the specified identifier runnable.
  */
 void
 wakeup(ident)
 	register void *ident;
 {
 
 	sleepq_lock(ident);
 	sleepq_broadcast(ident, SLEEPQ_MSLEEP, -1, 0);
 }
 
 /*
  * Make a thread sleeping on the specified identifier runnable.
  * May wake more than one thread if a target thread is currently
  * swapped out.
  */
 void
 wakeup_one(ident)
 	register void *ident;
 {
 
 	sleepq_lock(ident);
 	sleepq_signal(ident, SLEEPQ_MSLEEP, -1, 0);
 }
 
 /*
  * The machine independent parts of context switching.
  */
 void
 mi_switch(int flags, struct thread *newtd)
 {
 	uint64_t new_switchtime;
 	struct thread *td;
 	struct proc *p;
 
 	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
 	td = curthread;			/* XXX */
 	p = td->td_proc;		/* XXX */
 	KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
 #ifdef INVARIANTS
 	if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
 		mtx_assert(&Giant, MA_NOTOWNED);
 #endif
 	KASSERT(td->td_critnest == 1 || (td->td_critnest == 2 &&
 	    (td->td_owepreempt) && (flags & SW_INVOL) != 0 &&
 	    newtd == NULL) || panicstr,
 	    ("mi_switch: switch in a critical section"));
 	KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
 	    ("mi_switch: switch must be voluntary or involuntary"));
 	KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
 
 	/*
 	 * Don't perform context switches from the debugger.
 	 */
 	if (kdb_active) {
 		mtx_unlock_spin(&sched_lock);
 		kdb_backtrace();
 		kdb_reenter();
 		panic("%s: did not reenter debugger", __func__);
 	}
 
 	if (flags & SW_VOL)
 		p->p_stats->p_ru.ru_nvcsw++;
 	else
 		p->p_stats->p_ru.ru_nivcsw++;
 
 	/*
 	 * Compute the amount of time during which the current
 	 * process was running, and add that to its total so far.
 	 */
 	new_switchtime = cpu_ticks();
 	p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime));
 	p->p_rux.rux_uticks += td->td_uticks;
 	td->td_uticks = 0;
 	p->p_rux.rux_iticks += td->td_iticks;
 	td->td_iticks = 0;
 	p->p_rux.rux_sticks += td->td_sticks;
 	td->td_sticks = 0;
 
 	td->td_generation++;	/* bump preempt-detect counter */
 
 	/*
 	 * Check if the process exceeds its cpu resource allocation.  If
 	 * it reaches the max, arrange to kill the process in ast().
 	 */
 	if (p->p_cpulimit != RLIM_INFINITY &&
 	    p->p_rux.rux_runtime >= p->p_cpulimit * cpu_tickrate()) {
 		p->p_sflag |= PS_XCPU;
 		td->td_flags |= TDF_ASTPENDING;
 	}
 
 	/*
 	 * Finish up stats for outgoing thread.
 	 */
 	cnt.v_swtch++;
 	PCPU_SET(switchtime, new_switchtime);
 	PCPU_SET(switchticks, ticks);
 	CTR4(KTR_PROC, "mi_switch: old thread %ld (kse %p, pid %ld, %s)",
 	    td->td_tid, td->td_sched, p->p_pid, p->p_comm);
 #if (KTR_COMPILE & KTR_SCHED) != 0
 	if (td == PCPU_GET(idlethread))
 		CTR3(KTR_SCHED, "mi_switch: %p(%s) prio %d idle",
 		    td, td->td_proc->p_comm, td->td_priority);
 	else if (newtd != NULL)
 		CTR5(KTR_SCHED,
 		    "mi_switch: %p(%s) prio %d preempted by %p(%s)",
 		    td, td->td_proc->p_comm, td->td_priority, newtd,
 		    newtd->td_proc->p_comm);
 	else
 		CTR6(KTR_SCHED,
 		    "mi_switch: %p(%s) prio %d inhibit %d wmesg %s lock %s",
 		    td, td->td_proc->p_comm, td->td_priority,
 		    td->td_inhibitors, td->td_wmesg, td->td_lockname);
 #endif
 	/*
 	 * We call thread_switchout after the KTR_SCHED prints above so kse
 	 * selecting a new thread to run does not show up as a preemption.
 	 */
 #ifdef KSE
 	if ((flags & SW_VOL) && (td->td_proc->p_flag & P_SA))
 		newtd = thread_switchout(td, flags, newtd);
 #endif
 	sched_switch(td, newtd, flags);
 	CTR3(KTR_SCHED, "mi_switch: running %p(%s) prio %d",
 	    td, td->td_proc->p_comm, td->td_priority);
 
 	CTR4(KTR_PROC, "mi_switch: new thread %ld (kse %p, pid %ld, %s)",
 	    td->td_tid, td->td_sched, p->p_pid, p->p_comm);
 
 	/* 
 	 * If the last thread was exiting, finish cleaning it up.
 	 */
 	if ((td = PCPU_GET(deadthread))) {
 		PCPU_SET(deadthread, NULL);
 		thread_stash(td);
 	}
 }
 
 /*
  * Change process state to be runnable,
  * placing it on the run queue if it is in memory,
  * and awakening the swapper if it isn't in memory.
  */
 void
 setrunnable(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	mtx_assert(&sched_lock, MA_OWNED);
 	switch (p->p_state) {
 	case PRS_ZOMBIE:
 		panic("setrunnable(1)");
 	default:
 		break;
 	}
 	switch (td->td_state) {
 	case TDS_RUNNING:
 	case TDS_RUNQ:
 		return;
 	case TDS_INHIBITED:
 		/*
 		 * If we are only inhibited because we are swapped out
 		 * then arange to swap in this process. Otherwise just return.
 		 */
 		if (td->td_inhibitors != TDI_SWAPPED)
 			return;
 		/* XXX: intentional fall-through ? */
 	case TDS_CAN_RUN:
 		break;
 	default:
 		printf("state is 0x%x", td->td_state);
 		panic("setrunnable(2)");
 	}
 	if ((p->p_sflag & PS_INMEM) == 0) {
 		if ((p->p_sflag & PS_SWAPPINGIN) == 0) {
 			p->p_sflag |= PS_SWAPINREQ;
 			/*
 			 * due to a LOR between sched_lock and
 			 * the sleepqueue chain locks, use
 			 * lower level scheduling functions.
 			 */
 			kick_proc0();
 		}
 	} else
 		sched_wakeup(td);
 }
 
 /*
  * Compute a tenex style load average of a quantity on
  * 1, 5 and 15 minute intervals.
  * XXXKSE   Needs complete rewrite when correct info is available.
  * Completely Bogus.. only works with 1:1 (but compiles ok now :-)
  */
 static void
 loadav(void *arg)
 {
 	int i, nrun;
 	struct loadavg *avg;
 
 	nrun = sched_load();
 	avg = &averunnable;
 
 	for (i = 0; i < 3; i++)
 		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
 		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
 
 	/*
 	 * Schedule the next update to occur after 5 seconds, but add a
 	 * random variation to avoid synchronisation with processes that
 	 * run at regular intervals.
 	 */
 	callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
 	    loadav, NULL);
 }
 
 static void
 lboltcb(void *arg)
 {
 	wakeup(&lbolt);
 	callout_reset(&lbolt_callout, hz, lboltcb, NULL);
 }
 
 /* ARGSUSED */
 static void
 synch_setup(dummy)
 	void *dummy;
 {
 	callout_init(&loadav_callout, CALLOUT_MPSAFE);
 	callout_init(&lbolt_callout, CALLOUT_MPSAFE);
 
 	/* Kick off timeout driven events by calling first time. */
 	loadav(NULL);
 	lboltcb(NULL);
 }
 
 /*
- * General purpose yield system call
+ * General purpose yield system call.
  */
 int
 yield(struct thread *td, struct yield_args *uap)
 {
 	mtx_assert(&Giant, MA_NOTOWNED);
 	(void)uap;
 	sched_relinquish(td);
 	return (0);
 }
Index: head/sys/kern/kern_sysctl.c
===================================================================
--- head/sys/kern/kern_sysctl.c	(revision 167231)
+++ head/sys/kern/kern_sysctl.c	(revision 167232)
@@ -1,1610 +1,1609 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Mike Karels at Berkeley Software Design, Inc.
  *
  * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
  * project, to make these variables more userfriendly.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sysctl.c	8.4 (Berkeley) 4/14/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/sysproto.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
 static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
 static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
 
 /*
  * Locking - this locks the sysctl tree in memory.
  */
 static struct sx sysctllock;
 
 #define	SYSCTL_LOCK()		sx_xlock(&sysctllock)
 #define	SYSCTL_UNLOCK()		sx_xunlock(&sysctllock)
 #define	SYSCTL_INIT()		sx_init(&sysctllock, "sysctl lock")
 
 static int sysctl_root(SYSCTL_HANDLER_ARGS);
 
 struct sysctl_oid_list sysctl__children; /* root list */
 
 static struct sysctl_oid *
 sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
 {
 	struct sysctl_oid *oidp;
 
 	SLIST_FOREACH(oidp, list, oid_link) {
 		if (strcmp(oidp->oid_name, name) == 0) {
 			return (oidp);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Initialization of the MIB tree.
  *
  * Order by number in each list.
  */
 
 void
 sysctl_register_oid(struct sysctl_oid *oidp)
 {
 	struct sysctl_oid_list *parent = oidp->oid_parent;
 	struct sysctl_oid *p;
 	struct sysctl_oid *q;
 
 	/*
 	 * First check if another oid with the same name already
 	 * exists in the parent's list.
 	 */
 	p = sysctl_find_oidname(oidp->oid_name, parent);
 	if (p != NULL) {
 		if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 			p->oid_refcnt++;
 			return;
 		} else {
 			printf("can't re-use a leaf (%s)!\n", p->oid_name);
 			return;
 		}
 	}
 	/*
 	 * If this oid has a number OID_AUTO, give it a number which
 	 * is greater than any current oid.
 	 * NOTE: DO NOT change the starting value here, change it in
 	 * <sys/sysctl.h>, and make sure it is at least 256 to
 	 * accomodate e.g. net.inet.raw as a static sysctl node.
 	 */
 	if (oidp->oid_number == OID_AUTO) {
 		static int newoid = CTL_AUTO_START;
 
 		oidp->oid_number = newoid++;
 		if (newoid == 0x7fffffff)
 			panic("out of oids");
 	}
 #if 0
 	else if (oidp->oid_number >= CTL_AUTO_START) {
 		/* do not panic; this happens when unregistering sysctl sets */
 		printf("static sysctl oid too high: %d", oidp->oid_number);
 	}
 #endif
 
 	/*
 	 * Insert the oid into the parent's list in order.
 	 */
 	q = NULL;
 	SLIST_FOREACH(p, parent, oid_link) {
 		if (oidp->oid_number < p->oid_number)
 			break;
 		q = p;
 	}
 	if (q)
 		SLIST_INSERT_AFTER(q, oidp, oid_link);
 	else
 		SLIST_INSERT_HEAD(parent, oidp, oid_link);
 }
 
 void
 sysctl_unregister_oid(struct sysctl_oid *oidp)
 {
 	struct sysctl_oid *p;
 	int error;
 
 	error = ENOENT;
 	if (oidp->oid_number == OID_AUTO) {
 		error = EINVAL;
 	} else {
 		SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
 			if (p == oidp) {
 				SLIST_REMOVE(oidp->oid_parent, oidp,
 				    sysctl_oid, oid_link);
 				error = 0;
 				break;
 			}
 		}
 	}
 
 	/* 
 	 * This can happen when a module fails to register and is
 	 * being unloaded afterwards.  It should not be a panic()
 	 * for normal use.
 	 */
 	if (error)
 		printf("%s: failed to unregister sysctl\n", __func__);
 }
 
 /* Initialize a new context to keep track of dynamically added sysctls. */
 int
 sysctl_ctx_init(struct sysctl_ctx_list *c)
 {
 
 	if (c == NULL) {
 		return (EINVAL);
 	}
 	TAILQ_INIT(c);
 	return (0);
 }
 
 /* Free the context, and destroy all dynamic oids registered in this context */
 int
 sysctl_ctx_free(struct sysctl_ctx_list *clist)
 {
 	struct sysctl_ctx_entry *e, *e1;
 	int error;
 
 	error = 0;
 	/*
 	 * First perform a "dry run" to check if it's ok to remove oids.
 	 * XXX FIXME
 	 * XXX This algorithm is a hack. But I don't know any
 	 * XXX better solution for now...
 	 */
 	TAILQ_FOREACH(e, clist, link) {
 		error = sysctl_remove_oid(e->entry, 0, 0);
 		if (error)
 			break;
 	}
 	/*
 	 * Restore deregistered entries, either from the end,
 	 * or from the place where error occured.
 	 * e contains the entry that was not unregistered
 	 */
 	if (error)
 		e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
 	else
 		e1 = TAILQ_LAST(clist, sysctl_ctx_list);
 	while (e1 != NULL) {
 		sysctl_register_oid(e1->entry);
 		e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
 	}
 	if (error)
 		return(EBUSY);
 	/* Now really delete the entries */
 	e = TAILQ_FIRST(clist);
 	while (e != NULL) {
 		e1 = TAILQ_NEXT(e, link);
 		error = sysctl_remove_oid(e->entry, 1, 0);
 		if (error)
 			panic("sysctl_remove_oid: corrupt tree, entry: %s",
 			    e->entry->oid_name);
 		free(e, M_SYSCTLOID);
 		e = e1;
 	}
 	return (error);
 }
 
 /* Add an entry to the context */
 struct sysctl_ctx_entry *
 sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
 {
 	struct sysctl_ctx_entry *e;
 
 	if (clist == NULL || oidp == NULL)
 		return(NULL);
 	e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
 	e->entry = oidp;
 	TAILQ_INSERT_HEAD(clist, e, link);
 	return (e);
 }
 
 /* Find an entry in the context */
 struct sysctl_ctx_entry *
 sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
 {
 	struct sysctl_ctx_entry *e;
 
 	if (clist == NULL || oidp == NULL)
 		return(NULL);
 	TAILQ_FOREACH(e, clist, link) {
 		if(e->entry == oidp)
 			return(e);
 	}
 	return (e);
 }
 
 /*
  * Delete an entry from the context.
  * NOTE: this function doesn't free oidp! You have to remove it
  * with sysctl_remove_oid().
  */
 int
 sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
 {
 	struct sysctl_ctx_entry *e;
 
 	if (clist == NULL || oidp == NULL)
 		return (EINVAL);
 	e = sysctl_ctx_entry_find(clist, oidp);
 	if (e != NULL) {
 		TAILQ_REMOVE(clist, e, link);
 		free(e, M_SYSCTLOID);
 		return (0);
 	} else
 		return (ENOENT);
 }
 
 /*
  * Remove dynamically created sysctl trees.
  * oidp - top of the tree to be removed
  * del - if 0 - just deregister, otherwise free up entries as well
  * recurse - if != 0 traverse the subtree to be deleted
  */
 int
 sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
 {
 	struct sysctl_oid *p;
 	int error;
 
 	if (oidp == NULL)
 		return(EINVAL);
 	if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
 		printf("can't remove non-dynamic nodes!\n");
 		return (EINVAL);
 	}
 	/*
 	 * WARNING: normal method to do this should be through
 	 * sysctl_ctx_free(). Use recursing as the last resort
 	 * method to purge your sysctl tree of leftovers...
 	 * However, if some other code still references these nodes,
 	 * it will panic.
 	 */
 	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 		if (oidp->oid_refcnt == 1) {
 			SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) {
 				if (!recurse)
 					return (ENOTEMPTY);
 				error = sysctl_remove_oid(p, del, recurse);
 				if (error)
 					return (error);
 			}
 			if (del)
 				free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID);
 		}
 	}
 	if (oidp->oid_refcnt > 1 ) {
 		oidp->oid_refcnt--;
 	} else {
 		if (oidp->oid_refcnt == 0) {
 			printf("Warning: bad oid_refcnt=%u (%s)!\n",
 				oidp->oid_refcnt, oidp->oid_name);
 			return (EINVAL);
 		}
 		sysctl_unregister_oid(oidp);
 		if (del) {
 			if (oidp->oid_descr)
 				free((void *)(uintptr_t)(const void *)oidp->oid_descr, M_SYSCTLOID);
 			free((void *)(uintptr_t)(const void *)oidp->oid_name,
 			     M_SYSCTLOID);
 			free(oidp, M_SYSCTLOID);
 		}
 	}
 	return (0);
 }
 
 /*
  * Create new sysctls at run time.
  * clist may point to a valid context initialized with sysctl_ctx_init().
  */
 struct sysctl_oid *
 sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
 	int number, const char *name, int kind, void *arg1, int arg2,
 	int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
 {
 	struct sysctl_oid *oidp;
 	ssize_t len;
 	char *newname;
 
 	/* You have to hook up somewhere.. */
 	if (parent == NULL)
 		return(NULL);
 	/* Check if the node already exists, otherwise create it */
 	oidp = sysctl_find_oidname(name, parent);
 	if (oidp != NULL) {
 		if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 			oidp->oid_refcnt++;
 			/* Update the context */
 			if (clist != NULL)
 				sysctl_ctx_entry_add(clist, oidp);
 			return (oidp);
 		} else {
 			printf("can't re-use a leaf (%s)!\n", name);
 			return (NULL);
 		}
 	}
 	oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
 	oidp->oid_parent = parent;
 	SLIST_NEXT(oidp, oid_link) = NULL;
 	oidp->oid_number = number;
 	oidp->oid_refcnt = 1;
 	len = strlen(name);
 	newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK);
 	bcopy(name, newname, len + 1);
 	newname[len] = '\0';
 	oidp->oid_name = newname;
 	oidp->oid_handler = handler;
 	oidp->oid_kind = CTLFLAG_DYN | kind;
 	if ((kind & CTLTYPE) == CTLTYPE_NODE) {
 		/* Allocate space for children */
 		SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list),
 		    M_SYSCTLOID, M_WAITOK));
 		SLIST_INIT(SYSCTL_CHILDREN(oidp));
 	} else {
 		oidp->oid_arg1 = arg1;
 		oidp->oid_arg2 = arg2;
 	}
 	oidp->oid_fmt = fmt;
 	if (descr) {
 		int len = strlen(descr) + 1;
 		oidp->oid_descr = malloc(len, M_SYSCTLOID, M_WAITOK);
 		if (oidp->oid_descr)
 			strcpy((char *)(uintptr_t)(const void *)oidp->oid_descr, descr);
 	}
 	/* Update the context, if used */
 	if (clist != NULL)
 		sysctl_ctx_entry_add(clist, oidp);
 	/* Register this oid */
 	sysctl_register_oid(oidp);
 	return (oidp);
 }
 
 /*
  * Reparent an existing oid.
  */
 int
 sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
 {
 	struct sysctl_oid *oidp;
 
 	if (oid->oid_parent == parent)
 		return (0);
 	oidp = sysctl_find_oidname(oid->oid_name, parent);
 	if (oidp != NULL)
 		return (EEXIST);
 	sysctl_unregister_oid(oid);
 	oid->oid_parent = parent;
 	oid->oid_number = OID_AUTO;
 	sysctl_register_oid(oid);
 	return (0);
 }
 
 /*
  * Register the kernel's oids on startup.
  */
 SET_DECLARE(sysctl_set, struct sysctl_oid);
 
 static void
 sysctl_register_all(void *arg)
 {
 	struct sysctl_oid **oidp;
 
 	SYSCTL_INIT();
 	SET_FOREACH(oidp, sysctl_set)
 		sysctl_register_oid(*oidp);
 }
 SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0);
 
 /*
  * "Staff-functions"
  *
  * These functions implement a presently undocumented interface 
  * used by the sysctl program to walk the tree, and get the type
  * so it can print the value.
  * This interface is under work and consideration, and should probably
  * be killed with a big axe by the first person who can find the time.
  * (be aware though, that the proper interface isn't as obvious as it
  * may seem, there are various conflicting requirements.
  *
  * {0,0}	printf the entire MIB-tree.
  * {0,1,...}	return the name of the "..." OID.
  * {0,2,...}	return the next OID.
  * {0,3}	return the OID of the name in "new"
  * {0,4,...}	return the kind & format info for the "..." OID.
  * {0,5,...}	return the description the "..." OID.
  */
 
 #ifdef SYSCTL_DEBUG
 static void
 sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
 {
 	int k;
 	struct sysctl_oid *oidp;
 
 	SLIST_FOREACH(oidp, l, oid_link) {
 
 		for (k=0; k<i; k++)
 			printf(" ");
 
 		printf("%d %s ", oidp->oid_number, oidp->oid_name);
 
 		printf("%c%c",
 			oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
 			oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
 
 		if (oidp->oid_handler)
 			printf(" *Handler");
 
 		switch (oidp->oid_kind & CTLTYPE) {
 			case CTLTYPE_NODE:
 				printf(" Node\n");
 				if (!oidp->oid_handler) {
 					sysctl_sysctl_debug_dump_node(
 						oidp->oid_arg1, i+2);
 				}
 				break;
 			case CTLTYPE_INT:    printf(" Int\n"); break;
 			case CTLTYPE_STRING: printf(" String\n"); break;
 			case CTLTYPE_QUAD:   printf(" Quad\n"); break;
 			case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
 			default:	     printf("\n");
 		}
 
 	}
 }
 
 static int
 sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
 	if (error)
 		return (error);
 	sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
 	return (ENOENT);
 }
 
 SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
 	0, 0, sysctl_sysctl_debug, "-", "");
 #endif
 
 static int
 sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *) arg1;
 	u_int namelen = arg2;
 	int error = 0;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
 	char buf[10];
 
 	while (namelen) {
 		if (!lsp) {
 			snprintf(buf,sizeof(buf),"%d",*name);
 			if (req->oldidx)
 				error = SYSCTL_OUT(req, ".", 1);
 			if (!error)
 				error = SYSCTL_OUT(req, buf, strlen(buf));
 			if (error)
 				return (error);
 			namelen--;
 			name++;
 			continue;
 		}
 		lsp2 = 0;
 		SLIST_FOREACH(oid, lsp, oid_link) {
 			if (oid->oid_number != *name)
 				continue;
 
 			if (req->oldidx)
 				error = SYSCTL_OUT(req, ".", 1);
 			if (!error)
 				error = SYSCTL_OUT(req, oid->oid_name,
 					strlen(oid->oid_name));
 			if (error)
 				return (error);
 
 			namelen--;
 			name++;
 
 			if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE) 
 				break;
 
 			if (oid->oid_handler)
 				break;
 
 			lsp2 = (struct sysctl_oid_list *)oid->oid_arg1;
 			break;
 		}
 		lsp = lsp2;
 	}
 	return (SYSCTL_OUT(req, "", 1));
 }
 
 static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, "");
 
 static int
 sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen, 
 	int *next, int *len, int level, struct sysctl_oid **oidpp)
 {
 	struct sysctl_oid *oidp;
 
 	*len = level;
 	SLIST_FOREACH(oidp, lsp, oid_link) {
 		*next = oidp->oid_number;
 		*oidpp = oidp;
 
 		if (oidp->oid_kind & CTLFLAG_SKIP)
 			continue;
 
 		if (!namelen) {
 			if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) 
 				return (0);
 			if (oidp->oid_handler) 
 				/* We really should call the handler here...*/
 				return (0);
 			lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
 			if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1, 
 				len, level+1, oidpp))
 				return (0);
 			goto emptynode;
 		}
 
 		if (oidp->oid_number < *name)
 			continue;
 
 		if (oidp->oid_number > *name) {
 			if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
 				return (0);
 			if (oidp->oid_handler)
 				return (0);
 			lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
 			if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, 
 				next+1, len, level+1, oidpp))
 				return (0);
 			goto next;
 		}
 		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
 			continue;
 
 		if (oidp->oid_handler)
 			continue;
 
 		lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
 		if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1, 
 			len, level+1, oidpp))
 			return (0);
 	next:
 		namelen = 1;
 	emptynode:
 		*len = level;
 	}
 	return (1);
 }
 
 static int
 sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *) arg1;
 	u_int namelen = arg2;
 	int i, j, error;
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *lsp = &sysctl__children;
 	int newoid[CTL_MAXNAME];
 
 	i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
 	if (i)
 		return (ENOENT);
 	error = SYSCTL_OUT(req, newoid, j * sizeof (int));
 	return (error);
 }
 
 static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, "");
 
 static int
 name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp)
 {
 	int i;
 	struct sysctl_oid *oidp;
 	struct sysctl_oid_list *lsp = &sysctl__children;
 	char *p;
 
 	if (!*name)
 		return (ENOENT);
 
 	p = name + strlen(name) - 1 ;
 	if (*p == '.')
 		*p = '\0';
 
 	*len = 0;
 
 	for (p = name; *p && *p != '.'; p++) 
 		;
 	i = *p;
 	if (i == '.')
 		*p = '\0';
 
 	oidp = SLIST_FIRST(lsp);
 
 	while (oidp && *len < CTL_MAXNAME) {
 		if (strcmp(name, oidp->oid_name)) {
 			oidp = SLIST_NEXT(oidp, oid_link);
 			continue;
 		}
 		*oid++ = oidp->oid_number;
 		(*len)++;
 
 		if (!i) {
 			if (oidpp)
 				*oidpp = oidp;
 			return (0);
 		}
 
 		if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
 			break;
 
 		if (oidp->oid_handler)
 			break;
 
 		lsp = (struct sysctl_oid_list *)oidp->oid_arg1;
 		oidp = SLIST_FIRST(lsp);
 		name = p+1;
 		for (p = name; *p && *p != '.'; p++) 
 				;
 		i = *p;
 		if (i == '.')
 			*p = '\0';
 	}
 	return (ENOENT);
 }
 
 static int
 sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
 {
 	char *p;
 	int error, oid[CTL_MAXNAME], len;
 	struct sysctl_oid *op = 0;
 
 	if (!req->newlen) 
 		return (ENOENT);
 	if (req->newlen >= MAXPATHLEN)	/* XXX arbitrary, undocumented */
 		return (ENAMETOOLONG);
 
 	p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
 
 	error = SYSCTL_IN(req, p, req->newlen);
 	if (error) {
 		free(p, M_SYSCTL);
 		return (error);
 	}
 
 	p [req->newlen] = '\0';
 
 	error = name2oid(p, oid, &len, &op);
 
 	free(p, M_SYSCTL);
 
 	if (error)
 		return (error);
 
 	error = SYSCTL_OUT(req, oid, len * sizeof *oid);
 	return (error);
 }
 
 SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, 
 	sysctl_sysctl_name2oid, "I", "");
 
 static int
 sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
 {
 	struct sysctl_oid *oid;
 	int error;
 
 	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
 	if (error)
 		return (error);
 
 	if (!oid->oid_fmt)
 		return (ENOENT);
 	error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
 	return (error);
 }
 
 
 static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, "");
 
 static int
 sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
 {
 	struct sysctl_oid *oid;
 	int error;
 
 	error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
 	if (error)
 		return (error);
 
 	if (!oid->oid_descr)
 		return (ENOENT);
 	error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1);
 	return (error);
 }
 
 static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD, sysctl_sysctl_oiddescr, "");
 
 /*
  * Default "handler" functions.
  */
 
 /*
  * Handle an int, signed or unsigned.
  * Two cases:
  *     a variable:  point arg1 at it.
  *     a constant:  pass it in arg2.
  */
 
 int
 sysctl_handle_int(SYSCTL_HANDLER_ARGS)
 {
 	int tmpout, error = 0;
 
 	/*
 	 * Attempt to get a coherent snapshot by making a copy of the data.
 	 */
 	if (arg1)
 		tmpout = *(int *)arg1;
 	else
 		tmpout = arg2;
 	error = SYSCTL_OUT(req, &tmpout, sizeof(int));
 
 	if (error || !req->newptr)
 		return (error);
 
 	if (!arg1)
 		error = EPERM;
 	else
 		error = SYSCTL_IN(req, arg1, sizeof(int));
 	return (error);
 }
 
 
 /*
  * Based on on sysctl_handle_int() convert milliseconds into ticks.
  */
 
 int
 sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
 {
 	int error, s, tt;
 
 	tt = *(int *)oidp->oid_arg1;
 	s = (int)((int64_t)tt * 1000 / hz);
 
 	error = sysctl_handle_int(oidp, &s, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	tt = (int)((int64_t)s * hz / 1000);
 	if (tt < 1)
 		return (EINVAL);
 
 	*(int *)oidp->oid_arg1 = tt;
 	return (0);
 }
 
 
 /*
  * Handle a long, signed or unsigned.  arg1 points to it.
  */
 
 int
 sysctl_handle_long(SYSCTL_HANDLER_ARGS)
 {
 	int error = 0;
 	long tmplong;
 #ifdef SCTL_MASK32
 	int tmpint;
 #endif
 
 	/*
 	 * Attempt to get a coherent snapshot by making a copy of the data.
 	 */
 	if (!arg1)
 		return (EINVAL);
 	tmplong = *(long *)arg1;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		tmpint = tmplong;
 		error = SYSCTL_OUT(req, &tmpint, sizeof(int));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &tmplong, sizeof(long));
 
 	if (error || !req->newptr)
 		return (error);
 
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		error = SYSCTL_IN(req, &tmpint, sizeof(int));
 		*(long *)arg1 = (long)tmpint;
 	} else
 #endif
 		error = SYSCTL_IN(req, arg1, sizeof(long));
 	return (error);
 }
 
 /*
  * Handle our generic '\0' terminated 'C' string.
  * Two cases:
  * 	a variable string:  point arg1 at it, arg2 is max length.
  * 	a constant string:  point arg1 at it, arg2 is zero.
  */
 
 int
 sysctl_handle_string(SYSCTL_HANDLER_ARGS)
 {
 	int error=0;
 	char *tmparg;
 	size_t outlen;
 
 	/*
 	 * Attempt to get a coherent snapshot by copying to a
 	 * temporary kernel buffer.
 	 */
 retry:
 	outlen = strlen((char *)arg1)+1;
 	tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK);
 
 	if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) {
 		free(tmparg, M_SYSCTLTMP);
 		goto retry;
 	}
 
 	error = SYSCTL_OUT(req, tmparg, outlen);
 	free(tmparg, M_SYSCTLTMP);
 
 	if (error || !req->newptr)
 		return (error);
 
 	if ((req->newlen - req->newidx) >= arg2) {
 		error = EINVAL;
 	} else {
 		arg2 = (req->newlen - req->newidx);
 		error = SYSCTL_IN(req, arg1, arg2);
 		((char *)arg1)[arg2] = '\0';
 	}
 
 	return (error);
 }
 
 /*
  * Handle any kind of opaque data.
  * arg1 points to it, arg2 is the size.
  */
 
 int
 sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
 {
 	int error, tries;
 	u_int generation;
 	struct sysctl_req req2;
 
 	/*
 	 * Attempt to get a coherent snapshot, by using the thread
 	 * pre-emption counter updated from within mi_switch() to
 	 * determine if we were pre-empted during a bcopy() or
 	 * copyout(). Make 3 attempts at doing this before giving up.
 	 * If we encounter an error, stop immediately.
 	 */
 	tries = 0;
 	req2 = *req;
 retry:
 	generation = curthread->td_generation;
 	error = SYSCTL_OUT(req, arg1, arg2);
 	if (error)
 		return (error);
 	tries++;
 	if (generation != curthread->td_generation && tries < 3) {
 		*req = req2;
 		goto retry;
 	}
 
 	error = SYSCTL_IN(req, arg1, arg2);
 
 	return (error);
 }
 
 /*
  * Transfer functions to/from kernel space.
  * XXX: rather untested at this point
  */
 static int
 sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
 {
 	size_t i = 0;
 
 	if (req->oldptr) {
 		i = l;
 		if (req->oldlen <= req->oldidx)
 			i = 0;
 		else
 			if (i > req->oldlen - req->oldidx)
 				i = req->oldlen - req->oldidx;
 		if (i > 0)
 			bcopy(p, (char *)req->oldptr + req->oldidx, i);
 	}
 	req->oldidx += l;
 	if (req->oldptr && i != l)
 		return (ENOMEM);
 	return (0);
 }
 
 static int
 sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
 {
 	if (!req->newptr)
 		return (0);
 	if (req->newlen - req->newidx < l)
 		return (EINVAL);
 	bcopy((char *)req->newptr + req->newidx, p, l);
 	req->newidx += l;
 	return (0);
 }
 
 int
 kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
     size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
 {
 	int error = 0;
 	struct sysctl_req req;
 
 	bzero(&req, sizeof req);
 
 	req.td = td;
 	req.flags = flags;
 
 	if (oldlenp) {
 		req.oldlen = *oldlenp;
 	}
 	req.validlen = req.oldlen;
 
 	if (old) {
 		req.oldptr= old;
 	}
 
 	if (new != NULL) {
 		req.newlen = newlen;
 		req.newptr = new;
 	}
 
 	req.oldfunc = sysctl_old_kernel;
 	req.newfunc = sysctl_new_kernel;
 	req.lock = REQ_LOCKED;
 
 	SYSCTL_LOCK();
 
 	error = sysctl_root(0, name, namelen, &req);
 
 	if (req.lock == REQ_WIRED && req.validlen > 0)
 		vsunlock(req.oldptr, req.validlen);
 
 	SYSCTL_UNLOCK();
 
 	if (error && error != ENOMEM)
 		return (error);
 
 	if (retval) {
 		if (req.oldptr && req.oldidx > req.validlen)
 			*retval = req.validlen;
 		else
 			*retval = req.oldidx;
 	}
 	return (error);
 }
 
 int
 kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
     void *new, size_t newlen, size_t *retval, int flags)
 {
         int oid[CTL_MAXNAME];
         size_t oidlen, plen;
 	int error;
 
 	oid[0] = 0;		/* sysctl internal magic */
 	oid[1] = 3;		/* name2oid */
 	oidlen = sizeof(oid);
 
 	error = kernel_sysctl(td, oid, 2, oid, &oidlen,
 	    (void *)name, strlen(name), &plen, flags);
 	if (error)
 		return (error);
 
 	error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
 	    new, newlen, retval, flags);
 	return (error);
 }
 
 /*
  * Transfer function to/from user space.
  */
 static int
 sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
 {
 	int error = 0;
 	size_t i, len, origidx;
 
 	origidx = req->oldidx;
 	req->oldidx += l;
 	if (req->oldptr == NULL)
 		return (0);
 	/*
 	 * If we have not wired the user supplied buffer and we are currently
 	 * holding locks, drop a witness warning, as it's possible that
 	 * write operations to the user page can sleep.
 	 */
 	if (req->lock != REQ_WIRED)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "sysctl_old_user()");
 	i = l;
 	len = req->validlen;
 	if (len <= origidx)
 		i = 0;
 	else {
 		if (i > len - origidx)
 			i = len - origidx;
 		error = copyout(p, (char *)req->oldptr + origidx, i);
 	}
 	if (error)
 		return (error);
 	if (i < l)
 		return (ENOMEM);
 	return (0);
 }
 
 static int
 sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
 {
 	int error;
 
 	if (!req->newptr)
 		return (0);
 	if (req->newlen - req->newidx < l)
 		return (EINVAL);
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "sysctl_new_user()");
 	error = copyin((char *)req->newptr + req->newidx, p, l);
 	req->newidx += l;
 	return (error);
 }
 
 /*
  * Wire the user space destination buffer.  If set to a value greater than
  * zero, the len parameter limits the maximum amount of wired memory.
  */
 int
 sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
 {
 	int ret;
 	size_t i, wiredlen;
 	char *cp, dummy;
 
 	wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
 	ret = 0;
 	if (req->lock == REQ_LOCKED && req->oldptr &&
 	    req->oldfunc == sysctl_old_user) {
 		if (wiredlen != 0) {
 			ret = vslock(req->oldptr, wiredlen);
 			if (ret != 0) {
 				if (ret != ENOMEM)
 					return (ret);
 				wiredlen = 0;
 			}
 			/*
 			 * Touch all the wired pages to avoid PTE modified
 			 * bit emulation traps on Alpha while holding locks
 			 * in the sysctl handler.
 			 */
 			for (i = (wiredlen + PAGE_SIZE - 1) / PAGE_SIZE,
 			    cp = req->oldptr; i > 0; i--, cp += PAGE_SIZE) {
 				copyin(cp, &dummy, 1);
 				copyout(&dummy, cp, 1);
 			}
 		}
 		req->lock = REQ_WIRED;
 		req->validlen = wiredlen;
 	}
 	return (0);
 }
 
 int
 sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
     int *nindx, struct sysctl_req *req)
 {
 	struct sysctl_oid *oid;
 	int indx;
 
 	oid = SLIST_FIRST(&sysctl__children);
 	indx = 0;
 	while (oid && indx < CTL_MAXNAME) {
 		if (oid->oid_number == name[indx]) {
 			indx++;
 			if (oid->oid_kind & CTLFLAG_NOLOCK)
 				req->lock = REQ_UNLOCKED;
 			if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 				if (oid->oid_handler != NULL ||
 				    indx == namelen) {
 					*noid = oid;
 					if (nindx != NULL)
 						*nindx = indx;
 					return (0);
 				}
 				oid = SLIST_FIRST(
 				    (struct sysctl_oid_list *)oid->oid_arg1);
 			} else if (indx == namelen) {
 				*noid = oid;
 				if (nindx != NULL)
 					*nindx = indx;
 				return (0);
 			} else {
 				return (ENOTDIR);
 			}
 		} else {
 			oid = SLIST_NEXT(oid, oid_link);
 		}
 	}
 	return (ENOENT);
 }
 
 /*
  * Traverse our tree, and find the right node, execute whatever it points
  * to, and return the resulting error code.
  */
 
 static int
 sysctl_root(SYSCTL_HANDLER_ARGS)
 {
 	struct sysctl_oid *oid;
 	int error, indx, lvl;
 
 	error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
 	if (error)
 		return (error);
 
 	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 		/*
 		 * You can't call a sysctl when it's a node, but has
 		 * no handler.  Inform the user that it's a node.
 		 * The indx may or may not be the same as namelen.
 		 */
 		if (oid->oid_handler == NULL)
 			return (EISDIR);
 	}
 
 	/* Is this sysctl writable? */
 	if (req->newptr && !(oid->oid_kind & CTLFLAG_WR))
 		return (EPERM);
 
 	KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
 
 	/* Is this sysctl sensitive to securelevels? */
 	if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
 		lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
 		error = securelevel_gt(req->td->td_ucred, lvl);
 		if (error)
 			return (error);
 	}
 
 	/* Is this sysctl writable by only privileged users? */
 	if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
 		if (oid->oid_kind & CTLFLAG_PRISON)
 			error = priv_check_cred(req->td->td_ucred,
 			    PRIV_SYSCTL_WRITEJAIL, SUSER_ALLOWJAIL);
 		else
 			error = priv_check(req->td, PRIV_SYSCTL_WRITE);
 		if (error)
 			return (error);
 	}
 
 	if (!oid->oid_handler)
 		return (EINVAL);
 
 	if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
 		arg1 = (int *)arg1 + indx;
 		arg2 -= indx;
 	} else {
 		arg1 = oid->oid_arg1;
 		arg2 = oid->oid_arg2;
 	}
 #ifdef MAC
 	error = mac_check_system_sysctl(req->td->td_ucred, oid, arg1, arg2,
 	    req);
 	if (error != 0)
 		return (error);
 #endif
 	error = oid->oid_handler(oid, arg1, arg2, req);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sysctl_args {
 	int	*name;
 	u_int	namelen;
 	void	*old;
 	size_t	*oldlenp;
 	void	*new;
 	size_t	newlen;
 };
 #endif
-
 int
 __sysctl(struct thread *td, struct sysctl_args *uap)
 {
 	int error, name[CTL_MAXNAME];
 	size_t j;
 
 	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
 		return (EINVAL);
 
  	error = copyin(uap->name, &name, uap->namelen * sizeof(int));
  	if (error)
 		return (error);
 
 	mtx_lock(&Giant);
 
 	error = userland_sysctl(td, name, uap->namelen,
 		uap->old, uap->oldlenp, 0,
 		uap->new, uap->newlen, &j, 0);
 	if (error && error != ENOMEM)
 		goto done2;
 	if (uap->oldlenp) {
 		int i = copyout(&j, uap->oldlenp, sizeof(j));
 		if (i)
 			error = i;
 	}
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * This is used from various compatibility syscalls too.  That's why name
  * must be in kernel space.
  */
 int
 userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
     size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval,
     int flags)
 {
 	int error = 0;
 	struct sysctl_req req;
 
 	bzero(&req, sizeof req);
 
 	req.td = td;
 	req.flags = flags;
 
 	if (oldlenp) {
 		if (inkernel) {
 			req.oldlen = *oldlenp;
 		} else {
 			error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
 			if (error)
 				return (error);
 		}
 	}
 	req.validlen = req.oldlen;
 
 	if (old) {
 		if (!useracc(old, req.oldlen, VM_PROT_WRITE))
 			return (EFAULT);
 		req.oldptr= old;
 	}
 
 	if (new != NULL) {
 		if (!useracc(new, req.newlen, VM_PROT_READ))
 			return (EFAULT);
 		req.newlen = newlen;
 		req.newptr = new;
 	}
 
 	req.oldfunc = sysctl_old_user;
 	req.newfunc = sysctl_new_user;
 	req.lock = REQ_LOCKED;
 
 	SYSCTL_LOCK();
 
 	do {
 		req.oldidx = 0;
 		req.newidx = 0;
 		error = sysctl_root(0, name, namelen, &req);
 	} while (error == EAGAIN);
 
 	if (req.lock == REQ_WIRED && req.validlen > 0)
 		vsunlock(req.oldptr, req.validlen);
 
 	SYSCTL_UNLOCK();
 
 	if (error && error != ENOMEM)
 		return (error);
 
 	if (retval) {
 		if (req.oldptr && req.oldidx > req.validlen)
 			*retval = req.validlen;
 		else
 			*retval = req.oldidx;
 	}
 	return (error);
 }
 
 #ifdef COMPAT_43
 #include <sys/socket.h>
 #include <vm/vm_param.h>
 
 #define	KINFO_PROC		(0<<8)
 #define	KINFO_RT		(1<<8)
 #define	KINFO_VNODE		(2<<8)
 #define	KINFO_FILE		(3<<8)
 #define	KINFO_METER		(4<<8)
 #define	KINFO_LOADAVG		(5<<8)
 #define	KINFO_CLOCKRATE		(6<<8)
 
 /* Non-standard BSDI extension - only present on their 4.3 net-2 releases */
 #define	KINFO_BSDI_SYSINFO	(101<<8)
 
 /*
  * XXX this is bloat, but I hope it's better here than on the potentially
  * limited kernel stack...  -Peter
  */
 
 static struct {
 	int	bsdi_machine;		/* "i386" on BSD/386 */
 /*      ^^^ this is an offset to the string, relative to the struct start */
 	char	*pad0;
 	long	pad1;
 	long	pad2;
 	long	pad3;
 	u_long	pad4;
 	u_long	pad5;
 	u_long	pad6;
 
 	int	bsdi_ostype;		/* "BSD/386" on BSD/386 */
 	int	bsdi_osrelease;		/* "1.1" on BSD/386 */
 	long	pad7;
 	long	pad8;
 	char	*pad9;
 
 	long	pad10;
 	long	pad11;
 	int	pad12;
 	long	pad13;
 	quad_t	pad14;
 	long	pad15;
 
 	struct	timeval pad16;
 	/* we dont set this, because BSDI's uname used gethostname() instead */
 	int	bsdi_hostname;		/* hostname on BSD/386 */
 
 	/* the actual string data is appended here */
 
 } bsdi_si;
+
 /*
  * this data is appended to the end of the bsdi_si structure during copyout.
  * The "char *" offsets are relative to the base of the bsdi_si struct.
  * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings
  * should not exceed the length of the buffer here... (or else!! :-)
  */
 static char bsdi_strings[80];	/* It had better be less than this! */
 
 #ifndef _SYS_SYSPROTO_H_
 struct getkerninfo_args {
 	int	op;
 	char	*where;
 	size_t	*size;
 	int	arg;
 };
 #endif
-
 int
 ogetkerninfo(struct thread *td, struct getkerninfo_args *uap)
 {
 	int error, name[6];
 	size_t size;
 	u_int needed = 0;
 
 	mtx_lock(&Giant);
 
 	switch (uap->op & 0xff00) {
 
 	case KINFO_RT:
 		name[0] = CTL_NET;
 		name[1] = PF_ROUTE;
 		name[2] = 0;
 		name[3] = (uap->op & 0xff0000) >> 16;
 		name[4] = uap->op & 0xff;
 		name[5] = uap->arg;
 		error = userland_sysctl(td, name, 6, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_VNODE:
 		name[0] = CTL_KERN;
 		name[1] = KERN_VNODE;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_PROC:
 		name[0] = CTL_KERN;
 		name[1] = KERN_PROC;
 		name[2] = uap->op & 0xff;
 		name[3] = uap->arg;
 		error = userland_sysctl(td, name, 4, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_FILE:
 		name[0] = CTL_KERN;
 		name[1] = KERN_FILE;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_METER:
 		name[0] = CTL_VM;
 		name[1] = VM_TOTAL;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_LOADAVG:
 		name[0] = CTL_VM;
 		name[1] = VM_LOADAVG;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_CLOCKRATE:
 		name[0] = CTL_KERN;
 		name[1] = KERN_CLOCKRATE;
 		error = userland_sysctl(td, name, 2, uap->where, uap->size,
 			0, 0, 0, &size, 0);
 		break;
 
 	case KINFO_BSDI_SYSINFO: {
 		/*
 		 * this is pretty crude, but it's just enough for uname()
 		 * from BSDI's 1.x libc to work.
 		 *
 		 * *size gives the size of the buffer before the call, and
 		 * the amount of data copied after a successful call.
 		 * If successful, the return value is the amount of data
 		 * available, which can be larger than *size.
 		 *
 		 * BSDI's 2.x product apparently fails with ENOMEM if *size
 		 * is too small.
 		 */
 
 		u_int left;
 		char *s;
 
 		bzero((char *)&bsdi_si, sizeof(bsdi_si));
 		bzero(bsdi_strings, sizeof(bsdi_strings));
 
 		s = bsdi_strings;
 
 		bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si);
 		strcpy(s, ostype);
 		s += strlen(s) + 1;
 
 		bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si);
 		strcpy(s, osrelease);
 		s += strlen(s) + 1;
 
 		bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si);
 		strcpy(s, machine);
 		s += strlen(s) + 1;
 
 		needed = sizeof(bsdi_si) + (s - bsdi_strings);
 
 		if ((uap->where == NULL) || (uap->size == NULL)) {
 			/* process is asking how much buffer to supply.. */
 			size = needed;
 			error = 0;
 			break;
 		}
 
 		if ((error = copyin(uap->size, &size, sizeof(size))) != 0)
 			break;
 
 		/* if too much buffer supplied, trim it down */
 		if (size > needed)
 			size = needed;
 
 		/* how much of the buffer is remaining */
 		left = size;
 
 		if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0)
 			break;
 
 		/* is there any point in continuing? */
 		if (left > sizeof(bsdi_si)) {
 			left -= sizeof(bsdi_si);
 			error = copyout(&bsdi_strings,
 					uap->where + sizeof(bsdi_si), left);
 		}
 		break;
 	}
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	if (error == 0) {
 		td->td_retval[0] = needed ? needed : size;
 		if (uap->size) {
 			error = copyout(&size, uap->size, sizeof(size));
 		}
 	}
 	mtx_unlock(&Giant);
 	return (error);
 }
 #endif /* COMPAT_43 */
Index: head/sys/kern/kern_time.c
===================================================================
--- head/sys/kern/kern_time.c	(revision 167231)
+++ head/sys/kern/kern_time.c	(revision 167232)
@@ -1,1510 +1,1500 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_time.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/clock.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/time.h>
 #include <sys/timers.h>
 #include <sys/timetc.h>
 #include <sys/vnode.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #define MAX_CLOCKS 	(CLOCK_MONOTONIC+1)
 
 static struct kclock	posix_clocks[MAX_CLOCKS];
 static uma_zone_t	itimer_zone = NULL;
 
 /*
  * Time of day and interval timer support.
  *
  * These routines provide the kernel entry points to get and set
  * the time-of-day and per-process interval timers.  Subroutines
  * here provide support for adding and subtracting timeval structures
  * and decrementing interval timers, optionally reloading the interval
  * timers when they expire.
  */
 
 static int	settime(struct thread *, struct timeval *);
 static void	timevalfix(struct timeval *);
 static void	no_lease_updatetime(int);
 
 static void	itimer_start(void);
 static int	itimer_init(void *, int, int);
 static void	itimer_fini(void *, int);
 static void	itimer_enter(struct itimer *);
 static void	itimer_leave(struct itimer *);
 static struct itimer *itimer_find(struct proc *, int);
 static void	itimers_alloc(struct proc *);
 static void	itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp);
 static void	itimers_event_hook_exit(void *arg, struct proc *p);
 static int	realtimer_create(struct itimer *);
 static int	realtimer_gettime(struct itimer *, struct itimerspec *);
 static int	realtimer_settime(struct itimer *, int,
 			struct itimerspec *, struct itimerspec *);
 static int	realtimer_delete(struct itimer *);
 static void	realtimer_clocktime(clockid_t, struct timespec *);
 static void	realtimer_expire(void *);
 static int	kern_timer_create(struct thread *, clockid_t,
 			struct sigevent *, int *, int);
 static int	kern_timer_delete(struct thread *, int);
 
 int		register_posix_clock(int, struct kclock *);
 void		itimer_fire(struct itimer *it);
 int		itimespecfix(struct timespec *ts);
 
 #define CLOCK_CALL(clock, call, arglist)		\
 	((*posix_clocks[clock].call) arglist)
 
 SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL);
 
 
 static void 
 no_lease_updatetime(deltat)
 	int deltat;
 {
 }
 
 void (*lease_updatetime)(int)  = no_lease_updatetime;
 
 static int
 settime(struct thread *td, struct timeval *tv)
 {
 	struct timeval delta, tv1, tv2;
 	static struct timeval maxtime, laststep;
 	struct timespec ts;
 	int s;
 
 	s = splclock();
 	microtime(&tv1);
 	delta = *tv;
 	timevalsub(&delta, &tv1);
 
 	/*
 	 * If the system is secure, we do not allow the time to be 
 	 * set to a value earlier than 1 second less than the highest
 	 * time we have yet seen. The worst a miscreant can do in
 	 * this circumstance is "freeze" time. He couldn't go
 	 * back to the past.
 	 *
 	 * We similarly do not allow the clock to be stepped more
 	 * than one second, nor more than once per second. This allows
 	 * a miscreant to make the clock march double-time, but no worse.
 	 */
 	if (securelevel_gt(td->td_ucred, 1) != 0) {
 		if (delta.tv_sec < 0 || delta.tv_usec < 0) {
 			/*
 			 * Update maxtime to latest time we've seen.
 			 */
 			if (tv1.tv_sec > maxtime.tv_sec)
 				maxtime = tv1;
 			tv2 = *tv;
 			timevalsub(&tv2, &maxtime);
 			if (tv2.tv_sec < -1) {
 				tv->tv_sec = maxtime.tv_sec - 1;
 				printf("Time adjustment clamped to -1 second\n");
 			}
 		} else {
 			if (tv1.tv_sec == laststep.tv_sec) {
 				splx(s);
 				return (EPERM);
 			}
 			if (delta.tv_sec > 1) {
 				tv->tv_sec = tv1.tv_sec + 1;
 				printf("Time adjustment clamped to +1 second\n");
 			}
 			laststep = *tv;
 		}
 	}
 
 	ts.tv_sec = tv->tv_sec;
 	ts.tv_nsec = tv->tv_usec * 1000;
 	mtx_lock(&Giant);
 	tc_setclock(&ts);
 	(void) splsoftclock();
 	lease_updatetime(delta.tv_sec);
 	splx(s);
 	resettodr();
 	mtx_unlock(&Giant);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct clock_gettime_args {
 	clockid_t clock_id;
 	struct	timespec *tp;
 };
 #endif
-
 /* ARGSUSED */
 int
 clock_gettime(struct thread *td, struct clock_gettime_args *uap)
 {
 	struct timespec ats;
 	int error;
 
 	error = kern_clock_gettime(td, uap->clock_id, &ats);
 	if (error == 0)
 		error = copyout(&ats, uap->tp, sizeof(ats));
 
 	return (error);
 }
 
 int
 kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
 {
 	struct timeval sys, user;
 	struct proc *p;
 
 	p = td->td_proc;
 	switch (clock_id) {
 	case CLOCK_REALTIME:		/* Default to precise. */
 	case CLOCK_REALTIME_PRECISE:
 		nanotime(ats);
 		break;
 	case CLOCK_REALTIME_FAST:
 		getnanotime(ats);
 		break;
 	case CLOCK_VIRTUAL:
 		PROC_LOCK(p);
 		calcru(p, &user, &sys);
 		PROC_UNLOCK(p);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
 		break;
 	case CLOCK_PROF:
 		PROC_LOCK(p);
 		calcru(p, &user, &sys);
 		PROC_UNLOCK(p);
 		timevaladd(&user, &sys);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
 		break;
 	case CLOCK_MONOTONIC:		/* Default to precise. */
 	case CLOCK_MONOTONIC_PRECISE:
 	case CLOCK_UPTIME:
 	case CLOCK_UPTIME_PRECISE:
 		nanouptime(ats);
 		break;
 	case CLOCK_UPTIME_FAST:
 	case CLOCK_MONOTONIC_FAST:
 		getnanouptime(ats);
 		break;
 	case CLOCK_SECOND:
 		ats->tv_sec = time_second;
 		ats->tv_nsec = 0;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct clock_settime_args {
 	clockid_t clock_id;
 	const struct	timespec *tp;
 };
 #endif
-
 /* ARGSUSED */
 int
 clock_settime(struct thread *td, struct clock_settime_args *uap)
 {
 	struct timespec ats;
 	int error;
 
 	if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
 		return (error);
 	return (kern_clock_settime(td, uap->clock_id, &ats));
 }
 
 int
 kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats)
 {
 	struct timeval atv;
 	int error;
 
 #ifdef MAC
 	error = mac_check_system_settime(td->td_ucred);
 	if (error)
 		return (error);
 #endif
 	if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
 		return (error);
 	if (clock_id != CLOCK_REALTIME)
 		return (EINVAL);
 	if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
 		return (EINVAL);
 	/* XXX Don't convert nsec->usec and back */
 	TIMESPEC_TO_TIMEVAL(&atv, ats);
 	error = settime(td, &atv);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct clock_getres_args {
 	clockid_t clock_id;
 	struct	timespec *tp;
 };
 #endif
-
 int
 clock_getres(struct thread *td, struct clock_getres_args *uap)
 {
 	struct timespec ts;
 	int error;
 
 	if (uap->tp == NULL)
 		return (0);
 
 	error = kern_clock_getres(td, uap->clock_id, &ts);
 	if (error == 0)
 		error = copyout(&ts, uap->tp, sizeof(ts));
 	return (error);
 }
 
 int
 kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts)
 {
 
 	ts->tv_sec = 0;
 	switch (clock_id) {
 	case CLOCK_REALTIME:
 	case CLOCK_REALTIME_FAST:
 	case CLOCK_REALTIME_PRECISE:
 	case CLOCK_MONOTONIC:
 	case CLOCK_MONOTONIC_FAST:
 	case CLOCK_MONOTONIC_PRECISE:
 	case CLOCK_UPTIME:
 	case CLOCK_UPTIME_FAST:
 	case CLOCK_UPTIME_PRECISE:
 		/*
 		 * Round up the result of the division cheaply by adding 1.
 		 * Rounding up is especially important if rounding down
 		 * would give 0.  Perfect rounding is unimportant.
 		 */
 		ts->tv_nsec = 1000000000 / tc_getfrequency() + 1;
 		break;
 	case CLOCK_VIRTUAL:
 	case CLOCK_PROF:
 		/* Accurately round up here because we can do so cheaply. */
 		ts->tv_nsec = (1000000000 + hz - 1) / hz;
 		break;
 	case CLOCK_SECOND:
 		ts->tv_sec = 1;
 		ts->tv_nsec = 0;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int nanowait;
 
 int
 kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
 {
 	struct timespec ts, ts2, ts3;
 	struct timeval tv;
 	int error;
 
 	if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
 		return (EINVAL);
 	if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
 		return (0);
 	getnanouptime(&ts);
 	timespecadd(&ts, rqt);
 	TIMESPEC_TO_TIMEVAL(&tv, rqt);
 	for (;;) {
 		error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp",
 		    tvtohz(&tv));
 		getnanouptime(&ts2);
 		if (error != EWOULDBLOCK) {
 			if (error == ERESTART)
 				error = EINTR;
 			if (rmt != NULL) {
 				timespecsub(&ts, &ts2);
 				if (ts.tv_sec < 0)
 					timespecclear(&ts);
 				*rmt = ts;
 			}
 			return (error);
 		}
 		if (timespeccmp(&ts2, &ts, >=))
 			return (0);
 		ts3 = ts;
 		timespecsub(&ts3, &ts2);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 	}
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct nanosleep_args {
 	struct	timespec *rqtp;
 	struct	timespec *rmtp;
 };
 #endif
-
 /* ARGSUSED */
 int
 nanosleep(struct thread *td, struct nanosleep_args *uap)
 {
 	struct timespec rmt, rqt;
 	int error;
 
 	error = copyin(uap->rqtp, &rqt, sizeof(rqt));
 	if (error)
 		return (error);
 
 	if (uap->rmtp &&
 	    !useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
 			return (EFAULT);
 	error = kern_nanosleep(td, &rqt, &rmt);
 	if (error && uap->rmtp) {
 		int error2;
 
 		error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
 		if (error2)
 			error = error2;
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct gettimeofday_args {
 	struct	timeval *tp;
 	struct	timezone *tzp;
 };
 #endif
 /* ARGSUSED */
 int
 gettimeofday(struct thread *td, struct gettimeofday_args *uap)
 {
 	struct timeval atv;
 	struct timezone rtz;
 	int error = 0;
 
 	if (uap->tp) {
 		microtime(&atv);
 		error = copyout(&atv, uap->tp, sizeof (atv));
 	}
 	if (error == 0 && uap->tzp != NULL) {
 		rtz.tz_minuteswest = tz_minuteswest;
 		rtz.tz_dsttime = tz_dsttime;
 		error = copyout(&rtz, uap->tzp, sizeof (rtz));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct settimeofday_args {
 	struct	timeval *tv;
 	struct	timezone *tzp;
 };
 #endif
 /* ARGSUSED */
 int
 settimeofday(struct thread *td, struct settimeofday_args *uap)
 {
 	struct timeval atv, *tvp;
 	struct timezone atz, *tzp;
 	int error;
 
 	if (uap->tv) {
 		error = copyin(uap->tv, &atv, sizeof(atv));
 		if (error)
 			return (error);
 		tvp = &atv;
 	} else
 		tvp = NULL;
 	if (uap->tzp) {
 		error = copyin(uap->tzp, &atz, sizeof(atz));
 		if (error)
 			return (error);
 		tzp = &atz;
 	} else
 		tzp = NULL;
 	return (kern_settimeofday(td, tvp, tzp));
 }
 
 int
 kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp)
 {
 	int error;
 
 #ifdef MAC
 	error = mac_check_system_settime(td->td_ucred);
 	if (error)
 		return (error);
 #endif
 	error = priv_check(td, PRIV_SETTIMEOFDAY);
 	if (error)
 		return (error);
 	/* Verify all parameters before changing time. */
 	if (tv) {
 		if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
 			return (EINVAL);
 		error = settime(td, tv);
 	}
 	if (tzp && error == 0) {
 		tz_minuteswest = tzp->tz_minuteswest;
 		tz_dsttime = tzp->tz_dsttime;
 	}
 	return (error);
 }
 
 /*
- * Get value of an interval timer.  The process virtual and
- * profiling virtual time timers are kept in the p_stats area, since
- * they can be swapped out.  These are kept internally in the
- * way they are specified externally: in time until they expire.
+ * Get value of an interval timer.  The process virtual and profiling virtual
+ * time timers are kept in the p_stats area, since they can be swapped out.
+ * These are kept internally in the way they are specified externally: in
+ * time until they expire.
  *
- * The real time interval timer is kept in the process table slot
- * for the process, and its value (it_value) is kept as an
- * absolute time rather than as a delta, so that it is easy to keep
- * periodic real-time signals from drifting.
+ * The real time interval timer is kept in the process table slot for the
+ * process, and its value (it_value) is kept as an absolute time rather than
+ * as a delta, so that it is easy to keep periodic real-time signals from
+ * drifting.
  *
  * Virtual time timers are processed in the hardclock() routine of
- * kern_clock.c.  The real time timer is processed by a timeout
- * routine, called from the softclock() routine.  Since a callout
- * may be delayed in real time due to interrupt processing in the system,
- * it is possible for the real time timeout routine (realitexpire, given below),
- * to be delayed in real time past when it is supposed to occur.  It
- * does not suffice, therefore, to reload the real timer .it_value from the
- * real time timers .it_interval.  Rather, we compute the next time in
- * absolute time the timer should go off.
+ * kern_clock.c.  The real time timer is processed by a timeout routine,
+ * called from the softclock() routine.  Since a callout may be delayed in
+ * real time due to interrupt processing in the system, it is possible for
+ * the real time timeout routine (realitexpire, given below), to be delayed
+ * in real time past when it is supposed to occur.  It does not suffice,
+ * therefore, to reload the real timer .it_value from the real time timers
+ * .it_interval.  Rather, we compute the next time in absolute time the timer
+ * should go off.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getitimer_args {
 	u_int	which;
 	struct	itimerval *itv;
 };
 #endif
 int
 getitimer(struct thread *td, struct getitimer_args *uap)
 {
 	struct itimerval aitv;
 	int error;
 
 	error = kern_getitimer(td, uap->which, &aitv);
 	if (error != 0)
 		return (error);
 	return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
 }
 
 int
 kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv)
 {
 	struct proc *p = td->td_proc;
 	struct timeval ctv;
 
 	if (which > ITIMER_PROF)
 		return (EINVAL);
 
 	if (which == ITIMER_REAL) {
 		/*
 		 * Convert from absolute to relative time in .it_value
 		 * part of real time timer.  If time for real time timer
 		 * has passed return 0, else return difference between
 		 * current time and time for the timer to go off.
 		 */
 		PROC_LOCK(p);
 		*aitv = p->p_realtimer;
 		PROC_UNLOCK(p);
 		if (timevalisset(&aitv->it_value)) {
 			getmicrouptime(&ctv);
 			if (timevalcmp(&aitv->it_value, &ctv, <))
 				timevalclear(&aitv->it_value);
 			else
 				timevalsub(&aitv->it_value, &ctv);
 		}
 	} else {
 		mtx_lock_spin(&sched_lock);
 		*aitv = p->p_stats->p_timer[which];
 		mtx_unlock_spin(&sched_lock);
 	}
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setitimer_args {
 	u_int	which;
 	struct	itimerval *itv, *oitv;
 };
 #endif
-
 int
 setitimer(struct thread *td, struct setitimer_args *uap)
 {
 	struct itimerval aitv, oitv;
 	int error;
 
 	if (uap->itv == NULL) {
 		uap->itv = uap->oitv;
 		return (getitimer(td, (struct getitimer_args *)uap));
 	}
 
 	if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval))))
 		return (error);
 	error = kern_setitimer(td, uap->which, &aitv, &oitv);
 	if (error != 0 || uap->oitv == NULL)
 		return (error);
 	return (copyout(&oitv, uap->oitv, sizeof(struct itimerval)));
 }
 
 int
 kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv,
     struct itimerval *oitv)
 {
 	struct proc *p = td->td_proc;
 	struct timeval ctv;
 
 	if (aitv == NULL)
 		return (kern_getitimer(td, which, oitv));
 
 	if (which > ITIMER_PROF)
 		return (EINVAL);
 	if (itimerfix(&aitv->it_value))
 		return (EINVAL);
 	if (!timevalisset(&aitv->it_value))
 		timevalclear(&aitv->it_interval);
 	else if (itimerfix(&aitv->it_interval))
 		return (EINVAL);
 
 	if (which == ITIMER_REAL) {
 		PROC_LOCK(p);
 		if (timevalisset(&p->p_realtimer.it_value))
 			callout_stop(&p->p_itcallout);
 		getmicrouptime(&ctv);
 		if (timevalisset(&aitv->it_value)) {
 			callout_reset(&p->p_itcallout, tvtohz(&aitv->it_value),
 			    realitexpire, p);
 			timevaladd(&aitv->it_value, &ctv);
 		}
 		*oitv = p->p_realtimer;
 		p->p_realtimer = *aitv;
 		PROC_UNLOCK(p);
 		if (timevalisset(&oitv->it_value)) {
 			if (timevalcmp(&oitv->it_value, &ctv, <))
 				timevalclear(&oitv->it_value);
 			else
 				timevalsub(&oitv->it_value, &ctv);
 		}
 	} else {
 		mtx_lock_spin(&sched_lock);
 		*oitv = p->p_stats->p_timer[which];
 		p->p_stats->p_timer[which] = *aitv;
 		mtx_unlock_spin(&sched_lock);
 	}
 	return (0);
 }
 
 /*
  * Real interval timer expired:
  * send process whose timer expired an alarm signal.
  * If time is not set up to reload, then just return.
  * Else compute next time timer should go off which is > current time.
  * This is where delay in processing this timeout causes multiple
  * SIGALRM calls to be compressed into one.
  * tvtohz() always adds 1 to allow for the time until the next clock
  * interrupt being strictly less than 1 clock tick, but we don't want
  * that here since we want to appear to be in sync with the clock
  * interrupt even when we're delayed.
  */
 void
 realitexpire(void *arg)
 {
 	struct proc *p;
 	struct timeval ctv, ntv;
 
 	p = (struct proc *)arg;
 	PROC_LOCK(p);
 	psignal(p, SIGALRM);
 	if (!timevalisset(&p->p_realtimer.it_interval)) {
 		timevalclear(&p->p_realtimer.it_value);
 		if (p->p_flag & P_WEXIT)
 			wakeup(&p->p_itcallout);
 		PROC_UNLOCK(p);
 		return;
 	}
 	for (;;) {
 		timevaladd(&p->p_realtimer.it_value,
 		    &p->p_realtimer.it_interval);
 		getmicrouptime(&ctv);
 		if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
 			ntv = p->p_realtimer.it_value;
 			timevalsub(&ntv, &ctv);
 			callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1,
 			    realitexpire, p);
 			PROC_UNLOCK(p);
 			return;
 		}
 	}
 	/*NOTREACHED*/
 }
 
 /*
  * Check that a proposed value to load into the .it_value or
  * .it_interval part of an interval timer is acceptable, and
  * fix it to have at least minimal value (i.e. if it is less
  * than the resolution of the clock, round it up.)
  */
 int
 itimerfix(struct timeval *tv)
 {
 
 	if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
 		return (EINVAL);
 	if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
 		tv->tv_usec = tick;
 	return (0);
 }
 
 /*
  * Decrement an interval timer by a specified number
  * of microseconds, which must be less than a second,
  * i.e. < 1000000.  If the timer expires, then reload
  * it.  In this case, carry over (usec - old value) to
  * reduce the value reloaded into the timer so that
  * the timer does not drift.  This routine assumes
  * that it is called in a context where the timers
  * on which it is operating cannot change in value.
  */
 int
 itimerdecr(struct itimerval *itp, int usec)
 {
 
 	if (itp->it_value.tv_usec < usec) {
 		if (itp->it_value.tv_sec == 0) {
 			/* expired, and already in next interval */
 			usec -= itp->it_value.tv_usec;
 			goto expire;
 		}
 		itp->it_value.tv_usec += 1000000;
 		itp->it_value.tv_sec--;
 	}
 	itp->it_value.tv_usec -= usec;
 	usec = 0;
 	if (timevalisset(&itp->it_value))
 		return (1);
 	/* expired, exactly at end of interval */
 expire:
 	if (timevalisset(&itp->it_interval)) {
 		itp->it_value = itp->it_interval;
 		itp->it_value.tv_usec -= usec;
 		if (itp->it_value.tv_usec < 0) {
 			itp->it_value.tv_usec += 1000000;
 			itp->it_value.tv_sec--;
 		}
 	} else
 		itp->it_value.tv_usec = 0;		/* sec is already 0 */
 	return (0);
 }
 
 /*
  * Add and subtract routines for timevals.
  * N.B.: subtract routine doesn't deal with
  * results which are before the beginning,
  * it just gets very confused in this case.
  * Caveat emptor.
  */
 void
 timevaladd(struct timeval *t1, const struct timeval *t2)
 {
 
 	t1->tv_sec += t2->tv_sec;
 	t1->tv_usec += t2->tv_usec;
 	timevalfix(t1);
 }
 
 void
 timevalsub(struct timeval *t1, const struct timeval *t2)
 {
 
 	t1->tv_sec -= t2->tv_sec;
 	t1->tv_usec -= t2->tv_usec;
 	timevalfix(t1);
 }
 
 static void
 timevalfix(struct timeval *t1)
 {
 
 	if (t1->tv_usec < 0) {
 		t1->tv_sec--;
 		t1->tv_usec += 1000000;
 	}
 	if (t1->tv_usec >= 1000000) {
 		t1->tv_sec++;
 		t1->tv_usec -= 1000000;
 	}
 }
 
 /*
  * ratecheck(): simple time-based rate-limit checking.
  */
 int
 ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
 {
 	struct timeval tv, delta;
 	int rv = 0;
 
 	getmicrouptime(&tv);		/* NB: 10ms precision */
 	delta = tv;
 	timevalsub(&delta, lasttime);
 
 	/*
 	 * check for 0,0 is so that the message will be seen at least once,
 	 * even if interval is huge.
 	 */
 	if (timevalcmp(&delta, mininterval, >=) ||
 	    (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
 		*lasttime = tv;
 		rv = 1;
 	}
 
 	return (rv);
 }
 
 /*
  * ppsratecheck(): packets (or events) per second limitation.
  *
  * Return 0 if the limit is to be enforced (e.g. the caller
  * should drop a packet because of the rate limitation).
  *
  * maxpps of 0 always causes zero to be returned.  maxpps of -1
  * always causes 1 to be returned; this effectively defeats rate
  * limiting.
  *
  * Note that we maintain the struct timeval for compatibility
  * with other bsd systems.  We reuse the storage and just monitor
  * clock ticks for minimal overhead.  
  */
 int
 ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
 {
 	int now;
 
 	/*
 	 * Reset the last time and counter if this is the first call
 	 * or more than a second has passed since the last update of
 	 * lasttime.
 	 */
 	now = ticks;
 	if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
 		lasttime->tv_sec = now;
 		*curpps = 1;
 		return (maxpps != 0);
 	} else {
 		(*curpps)++;		/* NB: ignore potential overflow */
 		return (maxpps < 0 || *curpps < maxpps);
 	}
 }
 
 static void
 itimer_start(void)
 {
 	struct kclock rt_clock = {
 		.timer_create  = realtimer_create,
 		.timer_delete  = realtimer_delete,
 		.timer_settime = realtimer_settime,
 		.timer_gettime = realtimer_gettime,
 		.event_hook    = NULL
 	};
 
 	itimer_zone = uma_zcreate("itimer", sizeof(struct itimer),
 		NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0);
 	register_posix_clock(CLOCK_REALTIME,  &rt_clock);
 	register_posix_clock(CLOCK_MONOTONIC, &rt_clock);
 	p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L);
 	p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX);
 	p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX);
 	EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit,
 		(void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY);
 	EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec,
 		(void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY);
 }
 
 int
 register_posix_clock(int clockid, struct kclock *clk)
 {
 	if ((unsigned)clockid >= MAX_CLOCKS) {
 		printf("%s: invalid clockid\n", __func__);
 		return (0);
 	}
 	posix_clocks[clockid] = *clk;
 	return (1);
 }
 
 static int
 itimer_init(void *mem, int size, int flags)
 {
 	struct itimer *it;
 
 	it = (struct itimer *)mem;
 	mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF);
 	return (0);
 }
 
 static void
 itimer_fini(void *mem, int size)
 {
 	struct itimer *it;
 
 	it = (struct itimer *)mem;
 	mtx_destroy(&it->it_mtx);
 }
 
 static void
 itimer_enter(struct itimer *it)
 {
 
 	mtx_assert(&it->it_mtx, MA_OWNED);
 	it->it_usecount++;
 }
 
 static void
 itimer_leave(struct itimer *it)
 {
 
 	mtx_assert(&it->it_mtx, MA_OWNED);
 	KASSERT(it->it_usecount > 0, ("invalid it_usecount"));
 
 	if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0)
 		wakeup(it);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_create_args {
 	clockid_t clock_id;
 	struct sigevent * evp;
 	int * timerid;
 };
 #endif
-
 int
 ktimer_create(struct thread *td, struct ktimer_create_args *uap)
 {
 	struct sigevent *evp1, ev;
 	int id;
 	int error;
 
 	if (uap->evp != NULL) {
 		error = copyin(uap->evp, &ev, sizeof(ev));
 		if (error != 0)
 			return (error);
 		evp1 = &ev;
 	} else
 		evp1 = NULL;
 
 	error = kern_timer_create(td, uap->clock_id, evp1, &id, -1);
 
 	if (error == 0) {
 		error = copyout(&id, uap->timerid, sizeof(int));
 		if (error != 0)
 			kern_timer_delete(td, id);
 	}
 	return (error);
 }
 
 static int
 kern_timer_create(struct thread *td, clockid_t clock_id,
 	struct sigevent *evp, int *timerid, int preset_id)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
 	int id;
 	int error;
 
 	if (clock_id < 0 || clock_id >= MAX_CLOCKS)
 		return (EINVAL);
 
 	if (posix_clocks[clock_id].timer_create == NULL)
 		return (EINVAL);
 
 	if (evp != NULL) {
 		if (evp->sigev_notify != SIGEV_NONE &&
 		    evp->sigev_notify != SIGEV_SIGNAL &&
 		    evp->sigev_notify != SIGEV_THREAD_ID)
 			return (EINVAL);
 		if ((evp->sigev_notify == SIGEV_SIGNAL ||
 		     evp->sigev_notify == SIGEV_THREAD_ID) &&
 			!_SIG_VALID(evp->sigev_signo))
 			return (EINVAL);
 	}
 	
 	if (p->p_itimers == NULL)
 		itimers_alloc(p);
 	
 	it = uma_zalloc(itimer_zone, M_WAITOK);
 	it->it_flags = 0;
 	it->it_usecount = 0;
 	it->it_active = 0;
 	timespecclear(&it->it_time.it_value);
 	timespecclear(&it->it_time.it_interval);
 	it->it_overrun = 0;
 	it->it_overrun_last = 0;
 	it->it_clockid = clock_id;
 	it->it_timerid = -1;
 	it->it_proc = p;
 	ksiginfo_init(&it->it_ksi);
 	it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 	error = CLOCK_CALL(clock_id, timer_create, (it));
 	if (error != 0)
 		goto out;
 
 	PROC_LOCK(p);
 	if (preset_id != -1) {
 		KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id"));
 		id = preset_id;
 		if (p->p_itimers->its_timers[id] != NULL) {
 			PROC_UNLOCK(p);
 			error = 0;
 			goto out;
 		}
 	} else {
 		/*
 		 * Find a free timer slot, skipping those reserved
 		 * for setitimer().
 		 */
 		for (id = 3; id < TIMER_MAX; id++)
 			if (p->p_itimers->its_timers[id] == NULL)
 				break;
 		if (id == TIMER_MAX) {
 			PROC_UNLOCK(p);
 			error = EAGAIN;
 			goto out;
 		}
 	}
 	it->it_timerid = id;
 	p->p_itimers->its_timers[id] = it;
 	if (evp != NULL)
 		it->it_sigev = *evp;
 	else {
 		it->it_sigev.sigev_notify = SIGEV_SIGNAL;
 		switch (clock_id) {
 		default:
 		case CLOCK_REALTIME:
 			it->it_sigev.sigev_signo = SIGALRM;
 			break;
 		case CLOCK_VIRTUAL:
  			it->it_sigev.sigev_signo = SIGVTALRM;
 			break;
 		case CLOCK_PROF:
 			it->it_sigev.sigev_signo = SIGPROF;
 			break;
 		}
 		it->it_sigev.sigev_value.sival_int = id;
 	}
 
 	if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
 	    it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
 		it->it_ksi.ksi_signo = it->it_sigev.sigev_signo;
 		it->it_ksi.ksi_code = SI_TIMER;
 		it->it_ksi.ksi_value = it->it_sigev.sigev_value;
 		it->it_ksi.ksi_timerid = id;
 	}
 	PROC_UNLOCK(p);
 	*timerid = id;
 	return (0);
 
 out:
 	ITIMER_LOCK(it);
 	CLOCK_CALL(it->it_clockid, timer_delete, (it));
 	ITIMER_UNLOCK(it);
 	uma_zfree(itimer_zone, it);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_delete_args {
 	int timerid;
 };
 #endif
-
 int
 ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
 {
 	return (kern_timer_delete(td, uap->timerid));
 }
 
 static struct itimer *
 itimer_find(struct proc *p, int timerid)
 {
 	struct itimer *it;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_itimers == NULL) || (timerid >= TIMER_MAX) ||
 	    (it = p->p_itimers->its_timers[timerid]) == NULL) {
 		return (NULL);
 	}
 	ITIMER_LOCK(it);
 	if ((it->it_flags & ITF_DELETING) != 0) {
 		ITIMER_UNLOCK(it);
 		it = NULL;
 	}
 	return (it);
 }
 
 static int
 kern_timer_delete(struct thread *td, int timerid)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
 
 	PROC_LOCK(p);
 	it = itimer_find(p, timerid);
 	if (it == NULL) {
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
 	PROC_UNLOCK(p);
 
 	it->it_flags |= ITF_DELETING;
 	while (it->it_usecount > 0) {
 		it->it_flags |= ITF_WANTED;
 		msleep(it, &it->it_mtx, PPAUSE, "itimer", 0);
 	}
 	it->it_flags &= ~ITF_WANTED;
 	CLOCK_CALL(it->it_clockid, timer_delete, (it));
 	ITIMER_UNLOCK(it);
 
 	PROC_LOCK(p);
 	if (KSI_ONQ(&it->it_ksi))
 		sigqueue_take(&it->it_ksi);
 	p->p_itimers->its_timers[timerid] = NULL;
 	PROC_UNLOCK(p);
 	uma_zfree(itimer_zone, it);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_settime_args {
 	int timerid;
 	int flags;
 	const struct itimerspec * value;
 	struct itimerspec * ovalue;
 };
 #endif
-
 int
 ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
 	struct itimerspec val, oval, *ovalp;
 	int error;
 
 	error = copyin(uap->value, &val, sizeof(val));
 	if (error != 0)
 		return (error);
 	
 	if (uap->ovalue != NULL)
 		ovalp = &oval;
 	else
 		ovalp = NULL;
 
 	PROC_LOCK(p);
 	if (uap->timerid < 3 ||
 	    (it = itimer_find(p, uap->timerid)) == NULL) {
 		PROC_UNLOCK(p);
 		error = EINVAL;
 	} else {
 		PROC_UNLOCK(p);
 		itimer_enter(it);
 		error = CLOCK_CALL(it->it_clockid, timer_settime,
 				(it, uap->flags, &val, ovalp));
 		itimer_leave(it);
 		ITIMER_UNLOCK(it);
 	}
 	if (error == 0 && uap->ovalue != NULL)
 		error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ktimer_gettime_args {
 	int timerid;
 	struct itimerspec * value;
 };
 #endif
-
 int
 ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
 	struct itimerspec val;
 	int error;
 
 	PROC_LOCK(p);
 	if (uap->timerid < 3 ||
 	   (it = itimer_find(p, uap->timerid)) == NULL) {
 		PROC_UNLOCK(p);
 		error = EINVAL;
 	} else {
 		PROC_UNLOCK(p);
 		itimer_enter(it);
 		error = CLOCK_CALL(it->it_clockid, timer_gettime,
 				(it, &val));
 		itimer_leave(it);
 		ITIMER_UNLOCK(it);
 	}
 	if (error == 0)
 		error = copyout(&val, uap->value, sizeof(val));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct timer_getoverrun_args {
 	int timerid;
 };
 #endif
-
 int
 ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
 	int error ;
 
 	PROC_LOCK(p);
 	if (uap->timerid < 3 ||
 	    (it = itimer_find(p, uap->timerid)) == NULL) {
 		PROC_UNLOCK(p);
 		error = EINVAL;
 	} else {
 		td->td_retval[0] = it->it_overrun_last;
 		ITIMER_UNLOCK(it);
 		PROC_UNLOCK(p);
 		error = 0;
 	}
 	return (error);
 }
 
 static int
 realtimer_create(struct itimer *it)
 {
 	callout_init_mtx(&it->it_callout, &it->it_mtx, 0);
 	return (0);
 }
 
 static int
 realtimer_delete(struct itimer *it)
 {
 	mtx_assert(&it->it_mtx, MA_OWNED);
 	
 	ITIMER_UNLOCK(it);
 	callout_drain(&it->it_callout);
 	ITIMER_LOCK(it);
 	return (0);
 }
 
 static int
 realtimer_gettime(struct itimer *it, struct itimerspec *ovalue)
 {
 	struct timespec cts;
 
 	mtx_assert(&it->it_mtx, MA_OWNED);
 
 	realtimer_clocktime(it->it_clockid, &cts);
 	*ovalue = it->it_time;
 	if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) {
 		timespecsub(&ovalue->it_value, &cts);
 		if (ovalue->it_value.tv_sec < 0 ||
 		    (ovalue->it_value.tv_sec == 0 &&
 		     ovalue->it_value.tv_nsec == 0)) {
 			ovalue->it_value.tv_sec  = 0;
 			ovalue->it_value.tv_nsec = 1;
 		}
 	}
 	return (0);
 }
 
 static int
 realtimer_settime(struct itimer *it, int flags,
 	struct itimerspec *value, struct itimerspec *ovalue)
 {
 	struct timespec cts, ts;
 	struct timeval tv;
 	struct itimerspec val;
 
 	mtx_assert(&it->it_mtx, MA_OWNED);
 
 	val = *value;
 	if (itimespecfix(&val.it_value))
 		return (EINVAL);
 
 	if (timespecisset(&val.it_value)) {
 		if (itimespecfix(&val.it_interval))
 			return (EINVAL);
 	} else {
 		timespecclear(&val.it_interval);
 	}
 	
 	if (ovalue != NULL)
 		realtimer_gettime(it, ovalue);
 
 	it->it_time = val;
 	if (timespecisset(&val.it_value)) {
 		realtimer_clocktime(it->it_clockid, &cts);
 		ts = val.it_value;
 		if ((flags & TIMER_ABSTIME) == 0) {
 			/* Convert to absolute time. */
 			timespecadd(&it->it_time.it_value, &cts);
 		} else {
 			timespecsub(&ts, &cts);
 			/*
 			 * We don't care if ts is negative, tztohz will
 			 * fix it.
 			 */
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&it->it_callout, tvtohz(&tv),
 			realtimer_expire, it);
 	} else {
 		callout_stop(&it->it_callout);
 	}
 
 	return (0);
 }
 
 static void
 realtimer_clocktime(clockid_t id, struct timespec *ts)
 {
 	if (id == CLOCK_REALTIME)
 		getnanotime(ts);
 	else	/* CLOCK_MONOTONIC */
 		getnanouptime(ts);
 }
 
 int
 itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi)
 {
 	struct itimer *it;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	it = itimer_find(p, timerid);
 	if (it != NULL) {
 		ksi->ksi_overrun = it->it_overrun;
 		it->it_overrun_last = it->it_overrun;
 		it->it_overrun = 0;
 		ITIMER_UNLOCK(it);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 int
 itimespecfix(struct timespec *ts)
 {
 
 	if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
 		return (EINVAL);
 	if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
 		ts->tv_nsec = tick * 1000;
 	return (0);
 }
 
 /* Timeout callback for realtime timer */
 static void
 realtimer_expire(void *arg)
 {
 	struct timespec cts, ts;
 	struct timeval tv;
 	struct itimer *it;
 	struct proc *p;
 
 	it = (struct itimer *)arg;
 	p = it->it_proc;
 
 	realtimer_clocktime(it->it_clockid, &cts);
 	/* Only fire if time is reached. */
 	if (timespeccmp(&cts, &it->it_time.it_value, >=)) {
 		if (timespecisset(&it->it_time.it_interval)) {
 			timespecadd(&it->it_time.it_value,
 				    &it->it_time.it_interval);
 			while (timespeccmp(&cts, &it->it_time.it_value, >=)) {
 				if (it->it_overrun < INT_MAX)
 					it->it_overrun++;
 				else
 					it->it_ksi.ksi_errno = ERANGE;
 				timespecadd(&it->it_time.it_value,
 					    &it->it_time.it_interval);
 			}
 		} else {
 			/* single shot timer ? */
 			timespecclear(&it->it_time.it_value);
 		}
 		if (timespecisset(&it->it_time.it_value)) {
 			ts = it->it_time.it_value;
 			timespecsub(&ts, &cts);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
 			callout_reset(&it->it_callout, tvtohz(&tv),
 				 realtimer_expire, it);
 		}
 		ITIMER_UNLOCK(it);
 		itimer_fire(it);
 		ITIMER_LOCK(it);
 	} else if (timespecisset(&it->it_time.it_value)) {
 		ts = it->it_time.it_value;
 		timespecsub(&ts, &cts);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire,
  			it);
 	}
 }
 
 void
 itimer_fire(struct itimer *it)
 {
 	struct proc *p = it->it_proc;
 	int ret;
 
 	if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
 	    it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
 		PROC_LOCK(p);
 		if (!KSI_ONQ(&it->it_ksi)) {
 			it->it_ksi.ksi_errno = 0;
 			ret = psignal_event(p, &it->it_sigev, &it->it_ksi);
 			if (__predict_false(ret != 0)) {
 				it->it_overrun++;
 				/*
 				 * Broken userland code, thread went
 				 * away, disarm the timer.
 				 */
 				if (ret == ESRCH) {
 					ITIMER_LOCK(it);
 					timespecclear(&it->it_time.it_value);
 					timespecclear(&it->it_time.it_interval);
 					callout_stop(&it->it_callout);
 					ITIMER_UNLOCK(it);
 				}
 			}
 		} else {
 			if (it->it_overrun < INT_MAX)
 				it->it_overrun++;
 			else
 				it->it_ksi.ksi_errno = ERANGE;
 		}
 		PROC_UNLOCK(p);
 	}
 }
 
 static void
 itimers_alloc(struct proc *p)
 {
 	struct itimers *its;
 	int i;
 
 	its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO);
 	LIST_INIT(&its->its_virtual);
 	LIST_INIT(&its->its_prof);
 	TAILQ_INIT(&its->its_worklist);
 	for (i = 0; i < TIMER_MAX; i++)
 		its->its_timers[i] = NULL;
 	PROC_LOCK(p);
 	if (p->p_itimers == NULL) {
 		p->p_itimers = its;
 		PROC_UNLOCK(p);
 	}
 	else {
 		PROC_UNLOCK(p);
 		free(its, M_SUBPROC);
 	}
 }
 
 static void
 itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
 {
 	itimers_event_hook_exit(arg, p);
 }
 
 /* Clean up timers when some process events are being triggered. */
 static void
 itimers_event_hook_exit(void *arg, struct proc *p)
 {
 	struct itimers *its;
 	struct itimer *it;
 	int event = (int)(intptr_t)arg;
 	int i;
 
 	if (p->p_itimers != NULL) {
 		its = p->p_itimers;
 		for (i = 0; i < MAX_CLOCKS; ++i) {
 			if (posix_clocks[i].event_hook != NULL)
 				CLOCK_CALL(i, event_hook, (p, i, event));
 		}
 		/*
 		 * According to susv3, XSI interval timers should be inherited
 		 * by new image.
 		 */
 		if (event == ITIMER_EV_EXEC)
 			i = 3;
 		else if (event == ITIMER_EV_EXIT)
 			i = 0;
 		else
 			panic("unhandled event");
 		for (; i < TIMER_MAX; ++i) {
 			if ((it = its->its_timers[i]) != NULL)
 				kern_timer_delete(curthread, i);
 		}
 		if (its->its_timers[0] == NULL &&
 		    its->its_timers[1] == NULL &&
 		    its->its_timers[2] == NULL) {
 			free(its, M_SUBPROC);
 			p->p_itimers = NULL;
 		}
 	}
 }
Index: head/sys/kern/kern_umtx.c
===================================================================
--- head/sys/kern/kern_umtx.c	(revision 167231)
+++ head/sys/kern/kern_umtx.c	(revision 167232)
@@ -1,2760 +1,2759 @@
 /*-
  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/umtx.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
 #include <machine/cpu.h>
 
 #ifdef COMPAT_IA32
 #include <compat/freebsd32/freebsd32_proto.h>
 #endif
 
 #define TYPE_SIMPLE_LOCK	0
 #define TYPE_SIMPLE_WAIT	1
 #define TYPE_NORMAL_UMUTEX	2
 #define TYPE_PI_UMUTEX		3
 #define TYPE_PP_UMUTEX		4
 #define TYPE_CV			5	
 
 /* Key to represent a unique userland synchronous object */
 struct umtx_key {
 	int	hash;
 	int	type;
 	int	shared;
 	union {
 		struct {
 			vm_object_t	object;
 			uintptr_t	offset;
 		} shared;
 		struct {
 			struct vmspace	*vs;
 			uintptr_t	addr;
 		} private;
 		struct {
 			void		*a;
 			uintptr_t	b;
 		} both;
 	} info;
 };
 
 /* Priority inheritance mutex info. */
 struct umtx_pi {
 	/* Owner thread */
 	struct thread		*pi_owner;
 
 	/* Reference count */
 	int			pi_refcount;
 
  	/* List entry to link umtx holding by thread */
 	TAILQ_ENTRY(umtx_pi)	pi_link;
 
 	/* List entry in hash */
 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
 
 	/* List for waiters */
 	TAILQ_HEAD(,umtx_q)	pi_blocked;
 
 	/* Identify a userland lock object */
 	struct umtx_key		pi_key;
 };
 
 /* A userland synchronous object user. */
 struct umtx_q {
 	/* Linked list for the hash. */
 	TAILQ_ENTRY(umtx_q)	uq_link;
 
 	/* Umtx key. */
 	struct umtx_key		uq_key;
 
 	/* Umtx flags. */
 	int			uq_flags;
 #define UQF_UMTXQ	0x0001
 
 	/* The thread waits on. */
 	struct thread		*uq_thread;
 
 	/*
 	 * Blocked on PI mutex. read can use chain lock
 	 * or sched_lock, write must have both chain lock and
 	 * sched_lock being hold.
 	 */
 	struct umtx_pi		*uq_pi_blocked;
 
 	/* On blocked list */
 	TAILQ_ENTRY(umtx_q)	uq_lockq;
 
 	/* Thread contending with us */
 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
 
 	/* Inherited priority from PP mutex */
 	u_char			uq_inherited_pri;
 };
 
 TAILQ_HEAD(umtxq_head, umtx_q);
 
 /* Userland lock object's wait-queue chain */
 struct umtxq_chain {
 	/* Lock for this chain. */
 	struct mtx		uc_lock;
 
 	/* List of sleep queues. */
 	struct umtxq_head	uc_queue;
 
 	/* Busy flag */
 	char			uc_busy;
 
 	/* Chain lock waiters */
 	int			uc_waiters;
 
 	/* All PI in the list */
 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
 };
 
 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
 
 /*
  * Don't propagate time-sharing priority, there is a security reason,
  * a user can simply introduce PI-mutex, let thread A lock the mutex,
  * and let another thread B block on the mutex, because B is
  * sleeping, its priority will be boosted, this causes A's priority to
  * be boosted via priority propagating too and will never be lowered even
  * if it is using 100%CPU, this is unfair to other processes.
  */
 
 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
 
 #define	GOLDEN_RATIO_PRIME	2654404609U
 #define	UMTX_CHAINS		128
 #define	UMTX_SHIFTS		(__WORD_BIT - 7)
 
 #define THREAD_SHARE		0
 #define PROCESS_SHARE		1
 #define AUTO_SHARE		2
 
 #define	GET_SHARE(flags)	\
     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
 
 static uma_zone_t		umtx_pi_zone;
 static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
 static int			umtx_pi_allocated;
 
 SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
     &umtx_pi_allocated, 0, "Allocated umtx_pi");
 SYSCTL_DECL(_kern_threads);
 static int			umtx_dflt_spins = 0;
 SYSCTL_INT(_kern_threads, OID_AUTO, umtx_dflt_spins, CTLFLAG_RW,
     &umtx_dflt_spins, 0, "default umtx spin count");
 static int			umtx_max_spins = 3000;
 SYSCTL_INT(_kern_threads, OID_AUTO, umtx_max_spins, CTLFLAG_RW,
     &umtx_max_spins, 0, "max umtx spin count");
 
 static void umtxq_sysinit(void *);
 static void umtxq_hash(struct umtx_key *key);
 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
 static void umtxq_lock(struct umtx_key *key);
 static void umtxq_unlock(struct umtx_key *key);
 static void umtxq_busy(struct umtx_key *key);
 static void umtxq_unbusy(struct umtx_key *key);
 static void umtxq_insert(struct umtx_q *uq);
 static void umtxq_remove(struct umtx_q *uq);
 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
 static int umtxq_count(struct umtx_key *key);
 static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
 static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
 static int umtx_key_get(void *addr, int type, int share,
 	struct umtx_key *key);
 static void umtx_key_release(struct umtx_key *key);
 static struct umtx_pi *umtx_pi_alloc(int);
 static void umtx_pi_free(struct umtx_pi *pi);
 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
 static void umtx_thread_cleanup(struct thread *td);
 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
 	struct image_params *imgp __unused);
 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
 
 static void
 umtxq_sysinit(void *arg __unused)
 {
 	int i;
 
 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	for (i = 0; i < UMTX_CHAINS; ++i) {
 		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
 			 MTX_DEF | MTX_DUPOK);
 		TAILQ_INIT(&umtxq_chains[i].uc_queue);
 		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
 		umtxq_chains[i].uc_busy = 0;
 		umtxq_chains[i].uc_waiters = 0;
 	}
 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 struct umtx_q *
 umtxq_alloc(void)
 {
 	struct umtx_q *uq;
 
 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&uq->uq_pi_contested);
 	uq->uq_inherited_pri = PRI_MAX;
 	return (uq);
 }
 
 void
 umtxq_free(struct umtx_q *uq)
 {
 	free(uq, M_UMTX);
 }
 
 static inline void
 umtxq_hash(struct umtx_key *key)
 {
 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
 }
 
 static inline int
 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
 {
 	return (k1->type == k2->type &&
 		k1->info.both.a == k2->info.both.a &&
 	        k1->info.both.b == k2->info.both.b);
 }
 
 static inline struct umtxq_chain *
 umtxq_getchain(struct umtx_key *key)
 {
 	return (&umtxq_chains[key->hash]);
 }
 
 /*
  * Set chain to busy state when following operation
  * may be blocked (kernel mutex can not be used).
  */
 static inline void
 umtxq_busy(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_assert(&uc->uc_lock, MA_OWNED);
 	while (uc->uc_busy != 0) {
 		uc->uc_waiters++;
 		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
 		uc->uc_waiters--;
 	}
 	uc->uc_busy = 1;
 }
 
 /*
  * Unbusy a chain.
  */
 static inline void
 umtxq_unbusy(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_assert(&uc->uc_lock, MA_OWNED);
 	KASSERT(uc->uc_busy != 0, ("not busy"));
 	uc->uc_busy = 0;
 	if (uc->uc_waiters)
 		wakeup_one(uc);
 }
 
 /*
  * Lock a chain.
  */
 static inline void
 umtxq_lock(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_lock(&uc->uc_lock);
 }
 
 /*
  * Unlock a chain.
  */
 static inline void
 umtxq_unlock(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_unlock(&uc->uc_lock);
 }
 
 /*
  * Insert a thread onto the umtx queue.
  */
 static inline void
 umtxq_insert(struct umtx_q *uq)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
 	uq->uq_flags |= UQF_UMTXQ;
 }
 
 /*
  * Remove thread from the umtx queue.
  */
 static inline void
 umtxq_remove(struct umtx_q *uq)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	if (uq->uq_flags & UQF_UMTXQ) {
 		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
 		uq->uq_flags &= ~UQF_UMTXQ;
 	}
 }
 
 /*
  * Check if there are multiple waiters
  */
 static int
 umtxq_count(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 	struct umtx_q *uq;
 	int count = 0;
 
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
 		if (umtx_key_match(&uq->uq_key, key)) {
 			if (++count > 1)
 				break;
 		}
 	}
 	return (count);
 }
 
 /*
  * Check if there are multiple PI waiters and returns first
  * waiter.
  */
 static int
 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
 {
 	struct umtxq_chain *uc;
 	struct umtx_q *uq;
 	int count = 0;
 
 	*first = NULL;
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
 		if (umtx_key_match(&uq->uq_key, key)) {
 			if (++count > 1)
 				break;
 			*first = uq;
 		}
 	}
 	return (count);
 }
 
 /*
  * Wake up threads waiting on an userland object.
  */
 static int
 umtxq_signal(struct umtx_key *key, int n_wake)
 {
 	struct umtxq_chain *uc;
 	struct umtx_q *uq, *next;
 	int ret;
 
 	ret = 0;
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
 		if (umtx_key_match(&uq->uq_key, key)) {
 			umtxq_remove(uq);
 			wakeup(uq);
 			if (++ret >= n_wake)
 				break;
 		}
 	}
 	return (ret);
 }
 
 /*
  * Wake up specified thread.
  */
 static inline void
 umtxq_signal_thread(struct umtx_q *uq)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	umtxq_remove(uq);
 	wakeup(uq);
 }
 
 /*
  * Put thread into sleep state, before sleeping, check if
  * thread was removed from umtx queue.
  */
 static inline int
 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
 {
 	struct umtxq_chain *uc;
 	int error;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	if (!(uq->uq_flags & UQF_UMTXQ))
 		return (0);
 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
 	if (error == EWOULDBLOCK)
 		error = ETIMEDOUT;
 	return (error);
 }
 
 /*
  * Convert userspace address into unique logical address.
  */
 static int
 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
 {
 	struct thread *td = curthread;
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 
 	key->type = type;
 	if (share == THREAD_SHARE) {
 		key->shared = 0;
 		key->info.private.vs = td->td_proc->p_vmspace;
 		key->info.private.addr = (uintptr_t)addr;
 	} else {
 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
 		map = &td->td_proc->p_vmspace->vm_map;
 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
 		    &entry, &key->info.shared.object, &pindex, &prot,
 		    &wired) != KERN_SUCCESS) {
 			return EFAULT;
 		}
 
 		if ((share == PROCESS_SHARE) ||
 		    (share == AUTO_SHARE &&
 		     VM_INHERIT_SHARE == entry->inheritance)) {
 			key->shared = 1;
 			key->info.shared.offset = entry->offset + entry->start -
 				(vm_offset_t)addr;
 			vm_object_reference(key->info.shared.object);
 		} else {
 			key->shared = 0;
 			key->info.private.vs = td->td_proc->p_vmspace;
 			key->info.private.addr = (uintptr_t)addr;
 		}
 		vm_map_lookup_done(map, entry);
 	}
 
 	umtxq_hash(key);
 	return (0);
 }
 
 /*
  * Release key.
  */
 static inline void
 umtx_key_release(struct umtx_key *key)
 {
 	if (key->shared)
 		vm_object_deallocate(key->info.shared.object);
 }
 
 /*
  * Lock a umtx object.
  */
 static int
 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
 {
 	struct umtx_q *uq;
 	u_long owner;
 	u_long old;
 	int error = 0;
 
 	uq = td->td_umtxq;
 
 	/*
 	 * Care must be exercised when dealing with umtx structure. It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
 
 		/* The acquire succeeded. */
 		if (owner == UMTX_UNOWNED)
 			return (0);
 
 		/* The address was invalid. */
 		if (owner == -1)
 			return (EFAULT);
 
 		/* If no one owns it but it is contested try to acquire it. */
 		if (owner == UMTX_CONTESTED) {
 			owner = casuword(&umtx->u_owner,
 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
 
 			if (owner == UMTX_CONTESTED)
 				return (0);
 
 			/* The address was invalid. */
 			if (owner == -1)
 				return (EFAULT);
 
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			return (error);
 
 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
 			AUTO_SHARE, &uq->uq_key)) != 0)
 			return (error);
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
 
 		/* The address was invalid. */
 		if (old == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		if (old == owner)
 			error = umtxq_sleep(uq, "umtx", timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 	}
 
 	return (0);
 }
 
 /*
  * Lock a umtx object.
  */
 static int
 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
 	struct timespec *timeout)
 {
 	struct timespec ts, ts2, ts3;
 	struct timeval tv;
 	int error;
 
 	if (timeout == NULL) {
 		error = _do_lock_umtx(td, umtx, id, 0);
 		/* Mutex locking is restarted if it is interrupted. */
 		if (error == EINTR)
 			error = ERESTART;
 	} else {
 		getnanouptime(&ts);
 		timespecadd(&ts, timeout);
 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
 		for (;;) {
 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
 			if (error != ETIMEDOUT)
 				break;
 			getnanouptime(&ts2);
 			if (timespeccmp(&ts2, &ts, >=)) {
 				error = ETIMEDOUT;
 				break;
 			}
 			ts3 = ts;
 			timespecsub(&ts3, &ts2);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 		}
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Unlock a umtx object.
  */
 static int
 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
 {
 	struct umtx_key key;
 	u_long owner;
 	u_long old;
 	int error;
 	int count;
 
 	/*
 	 * Make sure we own this mtx.
 	 */
 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
 	if (owner == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMTX_CONTESTED) != id)
 		return (EPERM);
 
 	/* This should be done in userland */
 	if ((owner & UMTX_CONTESTED) == 0) {
 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
 		if (old == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
 		&key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
 	old = casuword(&umtx->u_owner, owner,
 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
 	umtxq_lock(&key);
 	umtxq_signal(&key,1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	if (old == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 #ifdef COMPAT_IA32
 
 /*
  * Lock a umtx object.
  */
 static int
 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
 {
 	struct umtx_q *uq;
 	uint32_t owner;
 	uint32_t old;
 	int error = 0;
 
 	uq = td->td_umtxq;
 
 	/*
 	 * Care must be exercised when dealing with umtx structure. It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
 		owner = casuword32(m, UMUTEX_UNOWNED, id);
 
 		/* The acquire succeeded. */
 		if (owner == UMUTEX_UNOWNED)
 			return (0);
 
 		/* The address was invalid. */
 		if (owner == -1)
 			return (EFAULT);
 
 		/* If no one owns it but it is contested try to acquire it. */
 		if (owner == UMUTEX_CONTESTED) {
 			owner = casuword32(m,
 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
 			if (owner == UMUTEX_CONTESTED)
 				return (0);
 
 			/* The address was invalid. */
 			if (owner == -1)
 				return (EFAULT);
 
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			return (error);
 
 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
 			AUTO_SHARE, &uq->uq_key)) != 0)
 			return (error);
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (old == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		if (old == owner)
 			error = umtxq_sleep(uq, "umtx", timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 	}
 
 	return (0);
 }
 
 /*
  * Lock a umtx object.
  */
 static int
 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
 	struct timespec *timeout)
 {
 	struct timespec ts, ts2, ts3;
 	struct timeval tv;
 	int error;
 
 	if (timeout == NULL) {
 		error = _do_lock_umtx32(td, m, id, 0);
 		/* Mutex locking is restarted if it is interrupted. */
 		if (error == EINTR)
 			error = ERESTART;
 	} else {
 		getnanouptime(&ts);
 		timespecadd(&ts, timeout);
 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
 		for (;;) {
 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
 			if (error != ETIMEDOUT)
 				break;
 			getnanouptime(&ts2);
 			if (timespeccmp(&ts2, &ts, >=)) {
 				error = ETIMEDOUT;
 				break;
 			}
 			ts3 = ts;
 			timespecsub(&ts3, &ts2);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 		}
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Unlock a umtx object.
  */
 static int
 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
 {
 	struct umtx_key key;
 	uint32_t owner;
 	uint32_t old;
 	int error;
 	int count;
 
 	/*
 	 * Make sure we own this mtx.
 	 */
 	owner = fuword32(m);
 	if (owner == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	/* This should be done in userland */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
 		old = casuword32(m, owner, UMUTEX_UNOWNED);
 		if (old == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
 		&key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
 	old = casuword32(m, owner,
 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
 	umtxq_lock(&key);
 	umtxq_signal(&key,1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	if (old == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 #endif
 
 /*
  * Fetch and compare value, sleep on the address if value is not changed.
  */
 static int
 do_wait(struct thread *td, void *addr, u_long id,
 	struct timespec *timeout, int compat32)
 {
 	struct umtx_q *uq;
 	struct timespec ts, ts2, ts3;
 	struct timeval tv;
 	u_long tmp;
 	int error = 0;
 
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
 	    &uq->uq_key)) != 0)
 		return (error);
 
 	umtxq_lock(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	if (compat32 == 0)
 		tmp = fuword(addr);
         else
 		tmp = fuword32(addr);
 	if (tmp != id) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	} else if (timeout == NULL) {
 		umtxq_lock(&uq->uq_key);
 		error = umtxq_sleep(uq, "uwait", 0);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	} else {
 		getnanouptime(&ts);
 		timespecadd(&ts, timeout);
 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
 		umtxq_lock(&uq->uq_key);
 		for (;;) {
 			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
 			if (!(uq->uq_flags & UQF_UMTXQ))
 				break;
 			if (error != ETIMEDOUT)
 				break;
 			umtxq_unlock(&uq->uq_key);
 			getnanouptime(&ts2);
 			if (timespeccmp(&ts2, &ts, >=)) {
 				error = ETIMEDOUT;
 				umtxq_lock(&uq->uq_key);
 				break;
 			}
 			ts3 = ts;
 			timespecsub(&ts3, &ts2);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 			umtxq_lock(&uq->uq_key);
 		}
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	}
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 /*
  * Wake up threads sleeping on the specified address.
  */
 int
 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
 {
 	struct umtx_key key;
 	int ret;
 	
 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
 	   &key)) != 0)
 		return (ret);
 	umtxq_lock(&key);
 	ret = umtxq_signal(&key, n_wake);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (0);
 }
 
 /*
  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
  */
 static int
 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
 	int try)
 {
 	struct umtx_q *uq;
 	uint32_t owner, old, id;
 #ifdef SMP
 	int spincount;
 #endif
 	int error = 0;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		spincount = fuword32(&m->m_spincount);
 		if (spincount == 0)
 			spincount = umtx_dflt_spins;
 		if (spincount > umtx_max_spins)
 			spincount = umtx_max_spins;
 	} else
 		spincount = 0;
 #endif
 
 	/*
 	 * Care must be exercised when dealing with umtx structure. It
 	 * can fault on any access.
 	 */
 	for (;;) {
 #ifdef SMP
 try_unowned:
 #endif
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
 
 		/* The acquire succeeded. */
 		if (owner == UMUTEX_UNOWNED)
 			return (0);
 
 		/* The address was invalid. */
 		if (owner == -1)
 			return (EFAULT);
 
 		/* If no one owns it but it is contested try to acquire it. */
 		if (owner == UMUTEX_CONTESTED) {
 #ifdef SMP
 try_contested:
 #endif
 			owner = casuword32(&m->m_owner,
 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
 
 			if (owner == UMUTEX_CONTESTED)
 				return (0);
 
 			/* The address was invalid. */
 			if (owner == -1)
 				return (EFAULT);
 
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
 
 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
 		    (owner & ~UMUTEX_CONTESTED) == id)
 			return (EDEADLK);
 
 		if (try != 0)
 			return (EBUSY);
 
 #ifdef SMP
 		if (spincount > 0 && (owner & ~UMUTEX_CONTESTED) != id) {
 			int i, found = 0;
 			struct pcpu *pcpu = NULL;
 
 			/* Look for a cpu the owner is running on */
 			for (i = 0; i < MAXCPU; i++) {
 				if (CPU_ABSENT(i))
 					continue;
 				pcpu = pcpu_find(i);
 				if ((owner & ~UMUTEX_CONTESTED) == pcpu->pc_curtid) {
 					found = 1;
 					break;
 				}
 			}
 
 			if (__predict_false(!found))
 				goto end_spin;
 
 			while ((owner & ~UMUTEX_CONTESTED) == pcpu->pc_curtid &&
 			       (owner & ~UMUTEX_CONTESTED) != id) {
 				if (--spincount <= 0)
 					break;
 				if ((td->td_flags &
 			    	    (TDF_NEEDRESCHED|TDF_ASTPENDING|TDF_NEEDSIGCHK)) ||
 				     P_SHOULDSTOP(td->td_proc))
 					break;
 				owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
 				if (owner == UMUTEX_UNOWNED)
 					goto try_unowned;
  				if (owner == UMUTEX_CONTESTED)
 					goto try_contested;
 				cpu_spinwait();
 			}
 		}
 end_spin:
 		spincount = 0;
 
 #endif
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			return (error);
 
 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
 		    GET_SHARE(flags), &uq->uq_key)) != 0)
 			return (error);
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (old == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		if (old == owner)
 			error = umtxq_sleep(uq, "umtxn", timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 	}
 
 	return (0);
 }
 
 /*
  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
  */
 /*
  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
  */
 static int
 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
 {
 	struct umtx_key key;
 	uint32_t owner, old, id;
 	int error;
 	int count;
 
 	id = td->td_tid;
 	/*
 	 * Make sure we own this mtx.
 	 */
 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
 	if (owner == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	/* This should be done in userland */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
 		if (old == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
 	old = casuword32(&m->m_owner, owner,
 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
 	umtxq_lock(&key);
 	umtxq_signal(&key,1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	if (old == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 static inline struct umtx_pi *
 umtx_pi_alloc(int flags)
 {
 	struct umtx_pi *pi;
 
 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
 	TAILQ_INIT(&pi->pi_blocked);
 	atomic_add_int(&umtx_pi_allocated, 1);
 	return (pi);
 }
 
 static inline void
 umtx_pi_free(struct umtx_pi *pi)
 {
 	uma_zfree(umtx_pi_zone, pi);
 	atomic_add_int(&umtx_pi_allocated, -1);
 }
 
 /*
  * Adjust the thread's position on a pi_state after its priority has been
  * changed.
  */
 static int
 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
 {
 	struct umtx_q *uq, *uq1, *uq2;
 	struct thread *td1;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (pi == NULL)
 		return (0);
 
 	uq = td->td_umtxq;
 
 	/*
 	 * Check if the thread needs to be moved on the blocked chain.
 	 * It needs to be moved if either its priority is lower than
 	 * the previous thread or higher than the next thread.
 	 */
 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
 	uq2 = TAILQ_NEXT(uq, uq_lockq);
 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
 		/*
 		 * Remove thread from blocked chain and determine where
 		 * it should be moved to.
 		 */
 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
 			td1 = uq1->uq_thread;
 			MPASS(td1->td_proc->p_magic == P_MAGIC);
 			if (UPRI(td1) > UPRI(td))
 				break;
 		}
 
 		if (uq1 == NULL)
 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
 		else
 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
 	}
 	return (1);
 }
 
 /*
  * Propagate priority when a thread is blocked on POSIX
  * PI mutex.
  */ 
 static void
 umtx_propagate_priority(struct thread *td)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
 	int pri;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	pri = UPRI(td);
 	uq = td->td_umtxq;
 	pi = uq->uq_pi_blocked;
 	if (pi == NULL)
 		return;
 
 	for (;;) {
 		td = pi->pi_owner;
 		if (td == NULL)
 			return;
 
 		MPASS(td->td_proc != NULL);
 		MPASS(td->td_proc->p_magic == P_MAGIC);
 
 		if (UPRI(td) <= pri)
 			return;
 
 		sched_lend_user_prio(td, pri);
 
 		/*
 		 * Pick up the lock that td is blocked on.
 		 */
 		uq = td->td_umtxq;
 		pi = uq->uq_pi_blocked;
 		/* Resort td on the list if needed. */
 		if (!umtx_pi_adjust_thread(pi, td))
 			break;
 	}
 }
 
 /*
  * Unpropagate priority for a PI mutex when a thread blocked on
  * it is interrupted by signal or resumed by others.
  */
 static void
 umtx_unpropagate_priority(struct umtx_pi *pi)
 {
 	struct umtx_q *uq, *uq_owner;
 	struct umtx_pi *pi2;
 	int pri;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	while (pi != NULL && pi->pi_owner != NULL) {
 		pri = PRI_MAX;
 		uq_owner = pi->pi_owner->td_umtxq;
 
 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
 			uq = TAILQ_FIRST(&pi2->pi_blocked);
 			if (uq != NULL) {
 				if (pri > UPRI(uq->uq_thread))
 					pri = UPRI(uq->uq_thread);
 			}
 		}
 
 		if (pri > uq_owner->uq_inherited_pri)
 			pri = uq_owner->uq_inherited_pri;
 		sched_unlend_user_prio(pi->pi_owner, pri);
 		pi = uq_owner->uq_pi_blocked;
 	}
 }
 
 /*
  * Insert a PI mutex into owned list.
  */
 static void
 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
 {
 	struct umtx_q *uq_owner;
 
 	uq_owner = owner->td_umtxq;
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (pi->pi_owner != NULL)
 		panic("pi_ower != NULL");
 	pi->pi_owner = owner;
 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
 }
 
 /*
  * Claim ownership of a PI mutex.
  */
 static int
 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
 {
 	struct umtx_q *uq, *uq_owner;
 
 	uq_owner = owner->td_umtxq;
 	mtx_lock_spin(&sched_lock);
 	if (pi->pi_owner == owner) {
 		mtx_unlock_spin(&sched_lock);
 		return (0);
 	}
 
 	if (pi->pi_owner != NULL) {
 		/*
 		 * userland may have already messed the mutex, sigh.
 		 */
 		mtx_unlock_spin(&sched_lock);
 		return (EPERM);
 	}
 	umtx_pi_setowner(pi, owner);
 	uq = TAILQ_FIRST(&pi->pi_blocked);
 	if (uq != NULL) {
 		int pri;
 
 		pri = UPRI(uq->uq_thread);
 		if (pri < UPRI(owner))
 			sched_lend_user_prio(owner, pri);
 	}
 	mtx_unlock_spin(&sched_lock);
 	return (0);
 }
 
 /*
  * Adjust a thread's order position in its blocked PI mutex,
  * this may result new priority propagating process.
  */
 void
 umtx_pi_adjust(struct thread *td, u_char oldpri)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
 
 	uq = td->td_umtxq;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	MPASS(TD_ON_UPILOCK(td));
 
 	/*
 	 * Pick up the lock that td is blocked on.
 	 */
 	pi = uq->uq_pi_blocked;
 	MPASS(pi != NULL);
 
 	/* Resort the turnstile on the list. */
 	if (!umtx_pi_adjust_thread(pi, td))
 		return;
 
 	/*
 	 * If our priority was lowered and we are at the head of the
 	 * turnstile, then propagate our new priority up the chain.
 	 */
 	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
 		umtx_propagate_priority(td);
 }
 
 /*
  * Sleep on a PI mutex.
  */
 static int
 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
 	uint32_t owner, const char *wmesg, int timo)
 {
 	struct umtxq_chain *uc;
 	struct thread *td, *td1;
 	struct umtx_q *uq1;
 	int pri;
 	int error = 0;
 
 	td = uq->uq_thread;
 	KASSERT(td == curthread, ("inconsistent uq_thread"));
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	umtxq_insert(uq);
 	if (pi->pi_owner == NULL) {
 		/* XXX
 		 * Current, We only support process private PI-mutex,
 		 * non-contended PI-mutexes are locked in userland.
 		 * Process shared PI-mutex should always be initialized
 		 * by kernel and be registered in kernel, locking should
 		 * always be done by kernel to avoid security problems.
 		 * For process private PI-mutex, we can find owner
 		 * thread and boost its priority safely.
 		 */
 		PROC_LOCK(curproc);
 		td1 = thread_find(curproc, owner);
 		mtx_lock_spin(&sched_lock);
 		if (td1 != NULL && pi->pi_owner == NULL) {
 			uq1 = td1->td_umtxq;
 			umtx_pi_setowner(pi, td1);
 		}
 		PROC_UNLOCK(curproc);
 	} else {
 		mtx_lock_spin(&sched_lock);
 	}
 
 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
 		pri = UPRI(uq1->uq_thread);
 		if (pri > UPRI(td))
 			break;
 	}
 
 	if (uq1 != NULL)
 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
 	else
 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
 
 	uq->uq_pi_blocked = pi;
 	td->td_flags |= TDF_UPIBLOCKED;
 	mtx_unlock_spin(&sched_lock);
 	umtxq_unlock(&uq->uq_key);
 
 	mtx_lock_spin(&sched_lock);
 	umtx_propagate_priority(td);
 	mtx_unlock_spin(&sched_lock);
 
 	umtxq_lock(&uq->uq_key);
 	if (uq->uq_flags & UQF_UMTXQ) {
 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
 		if (error == EWOULDBLOCK)
 			error = ETIMEDOUT;
 		if (uq->uq_flags & UQF_UMTXQ) {
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unbusy(&uq->uq_key);
 		}
 	}
 	umtxq_unlock(&uq->uq_key);
 
 	mtx_lock_spin(&sched_lock);
 	uq->uq_pi_blocked = NULL;
 	td->td_flags &= ~TDF_UPIBLOCKED;
 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 	umtx_unpropagate_priority(pi);
 	mtx_unlock_spin(&sched_lock);
 
 	umtxq_lock(&uq->uq_key);
 
 	return (error);
 }
 
 /*
  * Add reference count for a PI mutex.
  */
 static void
 umtx_pi_ref(struct umtx_pi *pi)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	pi->pi_refcount++;
 }
 
 /*
  * Decrease reference count for a PI mutex, if the counter
  * is decreased to zero, its memory space is freed.
  */ 
 static void
 umtx_pi_unref(struct umtx_pi *pi)
 {
 	struct umtxq_chain *uc;
 	int free = 0;
 
 	uc = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
 	if (--pi->pi_refcount == 0) {
 		mtx_lock_spin(&sched_lock);
 		if (pi->pi_owner != NULL) {
 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
 				pi, pi_link);
 			pi->pi_owner = NULL;
 		}
 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
 			("blocked queue not empty"));
 		mtx_unlock_spin(&sched_lock);
 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
 		free = 1;
 	}
 	if (free)
 		umtx_pi_free(pi);
 }
 
 /*
  * Find a PI mutex in hash table.
  */
 static struct umtx_pi *
 umtx_pi_lookup(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 	struct umtx_pi *pi;
 
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 
 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
 		if (umtx_key_match(&pi->pi_key, key)) {
 			return (pi);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Insert a PI mutex into hash table.
  */
 static inline void
 umtx_pi_insert(struct umtx_pi *pi)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
 }
 
 /*
  * Lock a PI mutex.
  */
 static int
 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
 	int try)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi, *new_pi;
 	uint32_t id, owner, old;
 	int error;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 
 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 	umtxq_lock(&uq->uq_key);
 	pi = umtx_pi_lookup(&uq->uq_key);
 	if (pi == NULL) {
 		new_pi = umtx_pi_alloc(M_NOWAIT);
 		if (new_pi == NULL) {
 			umtxq_unlock(&uq->uq_key);
 			new_pi = umtx_pi_alloc(M_WAITOK);
 			new_pi->pi_key = uq->uq_key;
 			umtxq_lock(&uq->uq_key);
 			pi = umtx_pi_lookup(&uq->uq_key);
 			if (pi != NULL) {
 				umtx_pi_free(new_pi);
 				new_pi = NULL;
 			}
 		}
 		if (new_pi != NULL) {
 			new_pi->pi_key = uq->uq_key;
 			umtx_pi_insert(new_pi);
 			pi = new_pi;
 		}
 	}
 	umtx_pi_ref(pi);
 	umtxq_unlock(&uq->uq_key);
 
 	/*
 	 * Care must be exercised when dealing with umtx structure.  It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
 
 		/* The acquire succeeded. */
 		if (owner == UMUTEX_UNOWNED) {
 			error = 0;
 			break;
 		}
 
 		/* The address was invalid. */
 		if (owner == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		/* If no one owns it but it is contested try to acquire it. */
 		if (owner == UMUTEX_CONTESTED) {
 			owner = casuword32(&m->m_owner,
 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
 
 			if (owner == UMUTEX_CONTESTED) {
 				umtxq_lock(&uq->uq_key);
 				error = umtx_pi_claim(pi, td);
 				umtxq_unlock(&uq->uq_key);
 				break;
 			}
 
 			/* The address was invalid. */
 			if (owner == -1) {
 				error = EFAULT;
 				break;
 			}
 
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
 
 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
 		    (owner & ~UMUTEX_CONTESTED) == id) {
 			error = EDEADLK;
 			break;
 		}
 
 		if (try != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 			
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (old == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		if (old == owner)
 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
 				 "umtxpi", timo);
 		umtxq_unlock(&uq->uq_key);
 	}
 
 	umtxq_lock(&uq->uq_key);
 	umtx_pi_unref(pi);
 	umtxq_unlock(&uq->uq_key);
 
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Unlock a PI mutex.
  */
 static int
 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
 {
 	struct umtx_key key;
 	struct umtx_q *uq_first, *uq_first2, *uq_me;
 	struct umtx_pi *pi, *pi2;
 	uint32_t owner, old, id;
 	int error;
 	int count;
 	int pri;
 
 	id = td->td_tid;
 	/*
 	 * Make sure we own this mtx.
 	 */
 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
 	if (owner == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	/* This should be done in userland */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
 		if (old == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count_pi(&key, &uq_first);
 	if (uq_first != NULL) {
 		pi = uq_first->uq_pi_blocked;
 		if (pi->pi_owner != curthread) {
 			umtxq_unbusy(&key);
 			umtxq_unlock(&key);
 			/* userland messed the mutex */
 			return (EPERM);
 		}
 		uq_me = curthread->td_umtxq;
 		mtx_lock_spin(&sched_lock);
 		pi->pi_owner = NULL;
 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
 			if (uq_first2 != NULL) {
 				if (pri > UPRI(uq_first2->uq_thread))
 					pri = UPRI(uq_first2->uq_thread);
 			}
 		}
 		sched_unlend_user_prio(curthread, pri);
 		mtx_unlock_spin(&sched_lock);
 	}
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
 	old = casuword32(&m->m_owner, owner,
 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
 
 	umtxq_lock(&key);
 	if (uq_first != NULL)
 		umtxq_signal_thread(uq_first);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	if (old == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Lock a PP mutex.
  */
 static int
 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
 	int try)
 {
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
 	uint32_t ceiling;
 	uint32_t owner, id;
 	int error, pri, old_inherited_pri, su;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 	for (;;) {
 		old_inherited_pri = uq->uq_inherited_pri;
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
 		if (ceiling > RTP_PRIO_MAX) {
 			error = EINVAL;
 			goto out;
 		}
 
 		mtx_lock_spin(&sched_lock);
 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
 			mtx_unlock_spin(&sched_lock);
 			error = EINVAL;
 			goto out;
 		}
 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
 			if (uq->uq_inherited_pri < UPRI(td))
 				sched_lend_user_prio(td, uq->uq_inherited_pri);
 		}
 		mtx_unlock_spin(&sched_lock);
 
 		owner = casuword32(&m->m_owner,
 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
 
 		if (owner == UMUTEX_CONTESTED) {
 			error = 0;
 			break;
 		}
 
 		/* The address was invalid. */
 		if (owner == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
 		    (owner & ~UMUTEX_CONTESTED) == id) {
 			error = EDEADLK;
 			break;
 		}
 
 		if (try != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		error = umtxq_sleep(uq, "umtxpp", timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 
 		mtx_lock_spin(&sched_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		sched_unlend_user_prio(td, pri);
 		mtx_unlock_spin(&sched_lock);
 	}
 
 	if (error != 0) {
 		mtx_lock_spin(&sched_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		sched_unlend_user_prio(td, pri);
 		mtx_unlock_spin(&sched_lock);
 	}
 
 out:
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Unlock a PP mutex.
  */
 static int
 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
 {
 	struct umtx_key key;
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
 	uint32_t owner, id;
 	uint32_t rceiling;
 	int error, pri, new_inherited_pri, su;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 
 	/*
 	 * Make sure we own this mtx.
 	 */
 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
 	if (owner == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
 	if (error != 0)
 		return (error);
 
 	if (rceiling == -1)
 		new_inherited_pri = PRI_MAX;
 	else {
 		rceiling = RTP_PRIO_MAX - rceiling;
 		if (rceiling > RTP_PRIO_MAX)
 			return (EINVAL);
 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
 	}
 
 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	umtxq_unlock(&key);
 	/*
 	 * For priority protected mutex, always set unlocked state
 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
 	 * to lock the mutex, it is necessary because thread priority
 	 * has to be adjusted for such mutex.
 	 */
 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
 		UMUTEX_CONTESTED);
 
 	umtxq_lock(&key);
 	if (error == 0)
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 
 	if (error == -1)
 		error = EFAULT;
 	else {
 		mtx_lock_spin(&sched_lock);
 		if (su != 0)
 			uq->uq_inherited_pri = new_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		sched_unlend_user_prio(td, pri);
 		mtx_unlock_spin(&sched_lock);
 	}
 	umtx_key_release(&key);
 	return (error);
 }
 
 static int
 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
 	uint32_t *old_ceiling)
 {
 	struct umtx_q *uq;
 	uint32_t save_ceiling;
 	uint32_t owner, id;
 	uint32_t flags;
 	int error;
 
 	flags = fuword32(&m->m_flags);
 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
 		return (EINVAL);
 	if (ceiling > RTP_PRIO_MAX)
 		return (EINVAL);
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
 	   &uq->uq_key)) != 0)
 		return (error);
 	for (;;) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		save_ceiling = fuword32(&m->m_ceilings[0]);
 
 		owner = casuword32(&m->m_owner,
 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
 
 		if (owner == UMUTEX_CONTESTED) {
 			suword32(&m->m_ceilings[0], ceiling);
 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
 				UMUTEX_CONTESTED);
 			error = 0;
 			break;
 		}
 
 		/* The address was invalid. */
 		if (owner == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		if ((owner & ~UMUTEX_CONTESTED) == id) {
 			suword32(&m->m_ceilings[0], ceiling);
 			error = 0;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		error = umtxq_sleep(uq, "umtxpp", 0);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	}
 	umtxq_lock(&uq->uq_key);
 	if (error == 0)
 		umtxq_signal(&uq->uq_key, INT_MAX);
 	umtxq_unbusy(&uq->uq_key);
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	if (error == 0 && old_ceiling != NULL)
 		suword32(old_ceiling, save_ceiling);
 	return (error);
 }
 
 static int
 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
 	int try)
 {
 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
 	case 0:
 		return (_do_lock_normal(td, m, flags, timo, try));
 	case UMUTEX_PRIO_INHERIT:
 		return (_do_lock_pi(td, m, flags, timo, try));
 	case UMUTEX_PRIO_PROTECT:
 		return (_do_lock_pp(td, m, flags, timo, try));
 	}
 	return (EINVAL);
 }
 
 /*
  * Lock a userland POSIX mutex.
  */
 static int
 do_lock_umutex(struct thread *td, struct umutex *m,
 	struct timespec *timeout, int try)
 {
 	struct timespec ts, ts2, ts3;
 	struct timeval tv;
 	uint32_t flags;
 	int error;
 
 	flags = fuword32(&m->m_flags);
 	if (flags == -1)
 		return (EFAULT);
 
 	if (timeout == NULL) {
 		error = _do_lock_umutex(td, m, flags, 0, try);
 		/* Mutex locking is restarted if it is interrupted. */
 		if (error == EINTR)
 			error = ERESTART;
 	} else {
 		getnanouptime(&ts);
 		timespecadd(&ts, timeout);
 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
 		for (;;) {
 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
 			if (error != ETIMEDOUT)
 				break;
 			getnanouptime(&ts2);
 			if (timespeccmp(&ts2, &ts, >=)) {
 				error = ETIMEDOUT;
 				break;
 			}
 			ts3 = ts;
 			timespecsub(&ts3, &ts2);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 		}
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Unlock a userland POSIX mutex.
  */
 static int
 do_unlock_umutex(struct thread *td, struct umutex *m)
 {
 	uint32_t flags;
 
 	flags = fuword32(&m->m_flags);
 	if (flags == -1)
 		return (EFAULT);
 
 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
 	case 0:
 		return (do_unlock_normal(td, m, flags));
 	case UMUTEX_PRIO_INHERIT:
 		return (do_unlock_pi(td, m, flags));
 	case UMUTEX_PRIO_PROTECT:
 		return (do_unlock_pp(td, m, flags));
 	}
 
 	return (EINVAL);
 }
 
 static int
 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
 	struct timespec *timeout, u_long wflags)
 {
 	struct umtx_q *uq;
 	struct timeval tv;
 	struct timespec cts, ets, tts;
 	uint32_t flags;
 	int error;
 
 	uq = td->td_umtxq;
 	flags = fuword32(&cv->c_flags);
 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 
 	/*
 	 * The magic thing is we should set c_has_waiters to 1 before
 	 * releasing user mutex.
 	 */
 	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
 
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
 	umtxq_unlock(&uq->uq_key);
 
 	error = do_unlock_umutex(td, m);
 	
 	umtxq_lock(&uq->uq_key);
 	if (error == 0) {
 		if ((wflags & UMTX_CHECK_UNPARKING) &&
 		    (td->td_pflags & TDP_WAKEUP)) {
 			td->td_pflags &= ~TDP_WAKEUP;
 			error = EINTR;
 		} else if (timeout == NULL) {
 			error = umtxq_sleep(uq, "ucond", 0);
 		} else {
 			getnanouptime(&ets);
 			timespecadd(&ets, timeout);
 			TIMESPEC_TO_TIMEVAL(&tv, timeout);
 			for (;;) {
 				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
 				if (error != ETIMEDOUT)
 					break;
 				getnanouptime(&cts);
 				if (timespeccmp(&cts, &ets, >=)) {
 					error = ETIMEDOUT;
 					break;
 				}
 				tts = ets;
 				timespecsub(&tts, &cts);
 				TIMESPEC_TO_TIMEVAL(&tv, &tts);
 			}
 		}
 	}
 
 	if (error != 0) {
 		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
 			/*
 			 * If we concurrently got do_cv_signal()d
 			 * and we got an error or UNIX signals or a timeout,
 			 * then, perform another umtxq_signal to avoid
 			 * consuming the wakeup. This may cause supurious
 			 * wakeup for another thread which was just queued,
 			 * but SUSV3 explicitly allows supurious wakeup to
 			 * occur, and indeed a kernel based implementation
 			 * can not avoid it.
 			 */ 
 			if (!umtxq_signal(&uq->uq_key, 1))
 				error = 0;
 		}
 		if (error == ERESTART)
 			error = EINTR;
 	}
 	umtxq_remove(uq);
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland condition variable.
  */
 static int
 do_cv_signal(struct thread *td, struct ucond *cv)
 {
 	struct umtx_key key;
 	int error, cnt, nwake;
 	uint32_t flags;
 
 	flags = fuword32(&cv->c_flags);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	nwake = umtxq_signal(&key, 1);
 	if (cnt <= nwake) {
 		umtxq_unlock(&key);
 		error = suword32(
 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
 		umtxq_lock(&key);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 static int
 do_cv_broadcast(struct thread *td, struct ucond *cv)
 {
 	struct umtx_key key;
 	int error;
 	uint32_t flags;
 
 	flags = fuword32(&cv->c_flags);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	umtxq_signal(&key, INT_MAX);
 	umtxq_unlock(&key);
 
 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
 
 	umtxq_lock(&key);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 
 	umtx_key_release(&key);
 	return (error);
 }
 
 int
 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
     /* struct umtx *umtx */
 {
 	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
 }
 
 int
 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
     /* struct umtx *umtx */
 {
 	return do_unlock_umtx(td, uap->umtx, td->td_tid);
 }
 
 static int
 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
 		if (error != 0)
 			return (error);
 		if (timeout.tv_nsec >= 1000000000 ||
 		    timeout.tv_nsec < 0) {
 			return (EINVAL);
 		}
 		ts = &timeout;
 	}
 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
 }
 
 static int
 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
 {
 	return (do_unlock_umtx(td, uap->obj, uap->val));
 }
 
 static int
 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
 		if (error != 0)
 			return (error);
 		if (timeout.tv_nsec >= 1000000000 ||
 		    timeout.tv_nsec < 0)
 			return (EINVAL);
 		ts = &timeout;
 	}
 	return do_wait(td, uap->obj, uap->val, ts, 0);
 }
 
 static int
 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 	return (kern_umtx_wake(td, uap->obj, uap->val));
 }
 
 static int
 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = copyin(uap->uaddr2, &timeout,
 		    sizeof(timeout));
 		if (error != 0)
 			return (error);
 		if (timeout.tv_nsec >= 1000000000 ||
 		    timeout.tv_nsec < 0) {
 			return (EINVAL);
 		}
 		ts = &timeout;
 	}
 	return do_lock_umutex(td, uap->obj, ts, 0);
 }
 
 static int
 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	return do_lock_umutex(td, uap->obj, NULL, 1);
 }
 
 static int
 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	return do_unlock_umutex(td, uap->obj);
 }
 
 static int
 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
 {
 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
 }
 
 static int
 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = copyin(uap->uaddr2, &timeout,
 		    sizeof(timeout));
 		if (error != 0)
 			return (error);
 		if (timeout.tv_nsec >= 1000000000 ||
 		    timeout.tv_nsec < 0) {
 			return (EINVAL);
 		}
 		ts = &timeout;
 	}
 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
 }
 
 static int
 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
 {
 	return do_cv_signal(td, uap->obj);
 }
 
 static int
 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
 {
 	return do_cv_broadcast(td, uap->obj);
 }
 
 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
 
 static _umtx_op_func op_table[] = {
 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
 	__umtx_op_wait,			/* UMTX_OP_WAIT */
 	__umtx_op_wake,			/* UMTX_OP_WAKE */
 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
 	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
 };
 
 int
 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
 {
 	if ((unsigned)uap->op < UMTX_OP_MAX)
 		return (*op_table[uap->op])(td, uap);
 	return (EINVAL);
 }
 
 #ifdef COMPAT_IA32
-
 int
 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
     /* struct umtx *umtx */
 {
 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
 }
 
 int
 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
     /* struct umtx *umtx */
 {
 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
 }
 
 struct timespec32 {
 	u_int32_t tv_sec;
 	u_int32_t tv_nsec;
 };
 
 static inline int
 copyin_timeout32(void *addr, struct timespec *tsp)
 {
 	struct timespec32 ts32;
 	int error;
 
 	error = copyin(addr, &ts32, sizeof(struct timespec32));
 	if (error == 0) {
 		tsp->tv_sec = ts32.tv_sec;
 		tsp->tv_nsec = ts32.tv_nsec;
 	}
 	return (error);
 }
 
 static int
 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = copyin_timeout32(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		if (timeout.tv_nsec >= 1000000000 ||
 		    timeout.tv_nsec < 0) {
 			return (EINVAL);
 		}
 		ts = &timeout;
 	}
 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
 }
 
 static int
 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
 }
 
 static int
 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = copyin_timeout32(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		if (timeout.tv_nsec >= 1000000000 ||
 		    timeout.tv_nsec < 0)
 			return (EINVAL);
 		ts = &timeout;
 	}
 	return do_wait(td, uap->obj, uap->val, ts, 1);
 }
 
 static int
 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = copyin_timeout32(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		if (timeout.tv_nsec >= 1000000000 ||
 		    timeout.tv_nsec < 0)
 			return (EINVAL);
 		ts = &timeout;
 	}
 	return do_lock_umutex(td, uap->obj, ts, 0);
 }
 
 static int
 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = copyin_timeout32(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		if (timeout.tv_nsec >= 1000000000 ||
 		    timeout.tv_nsec < 0)
 			return (EINVAL);
 		ts = &timeout;
 	}
 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
 }
 
 static _umtx_op_func op_table_compat32[] = {
 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
 	__umtx_op_wake,			/* UMTX_OP_WAKE */
 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_LOCK */
 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_TRYLOCK */
 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
 	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
 };
 
 int
 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
 {
 	if ((unsigned)uap->op < UMTX_OP_MAX)
 		return (*op_table_compat32[uap->op])(td,
 			(struct _umtx_op_args *)uap);
 	return (EINVAL);
 }
 #endif
 
 void
 umtx_thread_init(struct thread *td)
 {
 	td->td_umtxq = umtxq_alloc();
 	td->td_umtxq->uq_thread = td;
 }
 
 void
 umtx_thread_fini(struct thread *td)
 {
 	umtxq_free(td->td_umtxq);
 }
 
 /*
  * It will be called when new thread is created, e.g fork().
  */
 void
 umtx_thread_alloc(struct thread *td)
 {
 	struct umtx_q *uq;
 
 	uq = td->td_umtxq;
 	uq->uq_inherited_pri = PRI_MAX;
 
 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
 }
 
 /*
  * exec() hook.
  */
 static void
 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
 	struct image_params *imgp __unused)
 {
 	umtx_thread_cleanup(curthread);
 }
 
 /*
  * thread_exit() hook.
  */
 void
 umtx_thread_exit(struct thread *td)
 {
 	umtx_thread_cleanup(td);
 }
 
 /*
  * clean up umtx data.
  */
 static void
 umtx_thread_cleanup(struct thread *td)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
 
 	if ((uq = td->td_umtxq) == NULL)
 		return;
 
 	mtx_lock_spin(&sched_lock);
 	uq->uq_inherited_pri = PRI_MAX;
 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
 		pi->pi_owner = NULL;
 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
 	}
 	td->td_flags &= ~TDF_UBORROWING;
 	mtx_unlock_spin(&sched_lock);
 }
Index: head/sys/kern/kern_uuid.c
===================================================================
--- head/sys/kern/kern_uuid.c	(revision 167231)
+++ head/sys/kern/kern_uuid.c	(revision 167232)
@@ -1,362 +1,361 @@
 /*-
  * Copyright (c) 2002 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/socket.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/uuid.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 
 /*
  * See also:
  *	http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
  *	http://www.opengroup.org/onlinepubs/009629399/apdxa.htm
  *
  * Note that the generator state is itself an UUID, but the time and clock
  * sequence fields are written in the native byte order.
  */
 
 CTASSERT(sizeof(struct uuid) == 16);
 
 /* We use an alternative, more convenient representation in the generator. */
 struct uuid_private {
 	union {
 		uint64_t	ll;		/* internal. */
 		struct {
 			uint32_t	low;
 			uint16_t	mid;
 			uint16_t	hi;
 		} x;
 	} time;
 	uint16_t	seq;			/* Big-endian. */
 	uint16_t	node[UUID_NODE_LEN>>1];
 };
 
 CTASSERT(sizeof(struct uuid_private) == 16);
 
 static struct uuid_private uuid_last;
 
 static struct mtx uuid_mutex;
 MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF);
 
 /*
  * Return the first MAC address we encounter or, if none was found,
  * construct a sufficiently random multicast address. We don't try
  * to return the same MAC address as previously returned. We always
  * generate a new multicast address if no MAC address exists in the
  * system.
  * It would be nice to know if 'ifnet' or any of its sub-structures
  * has been changed in any way. If not, we could simply skip the
  * scan and safely return the MAC address we returned before.
  */
 static void
 uuid_node(uint16_t *node)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	int i;
 
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
 		/* Walk the address list */
 		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sdl = (struct sockaddr_dl*)ifa->ifa_addr;
 			if (sdl != NULL && sdl->sdl_family == AF_LINK &&
 			    sdl->sdl_type == IFT_ETHER) {
 				/* Got a MAC address. */
 				bcopy(LLADDR(sdl), node, UUID_NODE_LEN);
 				IFNET_RUNLOCK();
 				return;
 			}
 		}
 	}
 	IFNET_RUNLOCK();
 
 	for (i = 0; i < (UUID_NODE_LEN>>1); i++)
 		node[i] = (uint16_t)arc4random();
 	*((uint8_t*)node) |= 0x01;
 }
 
 /*
  * Get the current time as a 60 bit count of 100-nanosecond intervals
  * since 00:00:00.00, October 15,1582. We apply a magic offset to convert
  * the Unix time since 00:00:00.00, Januari 1, 1970 to the date of the
  * Gregorian reform to the Christian calendar.
  */
 static uint64_t
 uuid_time(void)
 {
 	struct bintime bt;
 	uint64_t time = 0x01B21DD213814000LL;
 
 	bintime(&bt);
 	time += (uint64_t)bt.sec * 10000000LL;
 	time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32;
 	return (time & ((1LL << 60) - 1LL));
 }
 
 struct uuid *
 kern_uuidgen(struct uuid *store, size_t count)
 {
 	struct uuid_private uuid;
 	uint64_t time;
 	size_t n;
 
 	mtx_lock(&uuid_mutex);
 
 	uuid_node(uuid.node);
 	time = uuid_time();
 
 	if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] ||
 	    uuid_last.node[1] != uuid.node[1] ||
 	    uuid_last.node[2] != uuid.node[2])
 		uuid.seq = (uint16_t)arc4random() & 0x3fff;
 	else if (uuid_last.time.ll >= time)
 		uuid.seq = (uuid_last.seq + 1) & 0x3fff;
 	else
 		uuid.seq = uuid_last.seq;
 
 	uuid_last = uuid;
 	uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL);
 
 	mtx_unlock(&uuid_mutex);
 
 	/* Set sequence and variant and deal with byte order. */
 	uuid.seq = htobe16(uuid.seq | 0x8000);
 
 	for (n = 0; n < count; n++) {
 		/* Set time and version (=1). */
 		uuid.time.x.low = (uint32_t)time;
 		uuid.time.x.mid = (uint16_t)(time >> 32);
 		uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
 		store[n] = *(struct uuid *)&uuid;
 		time++;
 	}
 
 	return (store);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct uuidgen_args {
 	struct uuid *store;
 	int	count;
 };
 #endif
-
 int
 uuidgen(struct thread *td, struct uuidgen_args *uap)
 {
 	struct uuid *store;
 	size_t count;
 	int error;
 
 	/*
 	 * Limit the number of UUIDs that can be created at the same time
 	 * to some arbitrary number. This isn't really necessary, but I
 	 * like to have some sort of upper-bound that's less than 2G :-)
 	 * XXX probably needs to be tunable.
 	 */
 	if (uap->count < 1 || uap->count > 2048)
 		return (EINVAL);
 
 	count = uap->count;
 	store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK);
 	kern_uuidgen(store, count);
 	error = copyout(store, uap->store, count * sizeof(struct uuid));
 	free(store, M_TEMP);
 	return (error);
 }
 
 int
 snprintf_uuid(char *buf, size_t sz, struct uuid *uuid)
 {
 	struct uuid_private *id;
 	int cnt;
 
 	id = (struct uuid_private *)uuid;
 	cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x",
 	    id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq),
 	    be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2]));
 	return (cnt);
 }
 
 int
 printf_uuid(struct uuid *uuid)
 {
 	char buf[38];
 
 	snprintf_uuid(buf, sizeof(buf), uuid);
 	return (printf("%s", buf));
 }
 
 int
 sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid)
 {
 	char buf[38];
 
 	snprintf_uuid(buf, sizeof(buf), uuid);
 	return (sbuf_printf(sb, "%s", buf));
 }
 
 /*
  * Encode/Decode UUID into byte-stream.
  *   http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
  *
  * 0                   1                   2                   3
  *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *  |                          time_low                             |
  *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *  |       time_mid                |         time_hi_and_version   |
  *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *  |clk_seq_hi_res |  clk_seq_low  |         node (0-1)            |
  *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *  |                         node (2-5)                            |
  *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  */
 
 void
 le_uuid_enc(void *buf, struct uuid const *uuid)
 {
 	u_char *p;
 	int i;
 
 	p = buf;
 	le32enc(p, uuid->time_low);
 	le16enc(p + 4, uuid->time_mid);
 	le16enc(p + 6, uuid->time_hi_and_version);
 	p[8] = uuid->clock_seq_hi_and_reserved;
 	p[9] = uuid->clock_seq_low;
 	for (i = 0; i < _UUID_NODE_LEN; i++)
 		p[10 + i] = uuid->node[i];
 }
 
 void
 le_uuid_dec(void const *buf, struct uuid *uuid)
 {
 	u_char const *p;
 	int i;
 
 	p = buf;
 	uuid->time_low = le32dec(p);
 	uuid->time_mid = le16dec(p + 4);
 	uuid->time_hi_and_version = le16dec(p + 6);
 	uuid->clock_seq_hi_and_reserved = p[8];
 	uuid->clock_seq_low = p[9];
 	for (i = 0; i < _UUID_NODE_LEN; i++)
 		uuid->node[i] = p[10 + i];
 }
 
 void
 be_uuid_enc(void *buf, struct uuid const *uuid)
 {
 	u_char *p;
 	int i;
 
 	p = buf;
 	be32enc(p, uuid->time_low);
 	be16enc(p + 4, uuid->time_mid);
 	be16enc(p + 6, uuid->time_hi_and_version);
 	p[8] = uuid->clock_seq_hi_and_reserved;
 	p[9] = uuid->clock_seq_low;
 	for (i = 0; i < _UUID_NODE_LEN; i++)
 		p[10 + i] = uuid->node[i];
 }
 
 void
 be_uuid_dec(void const *buf, struct uuid *uuid)
 {
 	u_char const *p;
 	int i;
 
 	p = buf;
 	uuid->time_low = be32dec(p);
 	uuid->time_mid = le16dec(p + 4);
 	uuid->time_hi_and_version = be16dec(p + 6);
 	uuid->clock_seq_hi_and_reserved = p[8];
 	uuid->clock_seq_low = p[9];
 	for (i = 0; i < _UUID_NODE_LEN; i++)
 		uuid->node[i] = p[10 + i];
 }
 
 int
 parse_uuid(const char *str, struct uuid *uuid)
 {
 	u_int c[11];
 	int n;
 
 	/* An empty string represents a nil UUID. */
 	if (*str == '\0') {
 		bzero(uuid, sizeof(*uuid));
 		return (0);
 	}
 
 	/* The UUID string representation has a fixed length. */
 	if (strlen(str) != 36)
 		return (EINVAL);
 
 	/*
 	 * We only work with "new" UUIDs. New UUIDs have the form:
 	 *      01234567-89ab-cdef-0123-456789abcdef
 	 * The so called "old" UUIDs, which we don't support, have the form:
 	 *      0123456789ab.cd.ef.01.23.45.67.89.ab
 	 */
 	if (str[8] != '-')
 		return (EINVAL);
 
 	n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1,
 	    c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10);
 	/* Make sure we have all conversions. */
 	if (n != 11)
 		return (EINVAL);
 
 	/* Successful scan. Build the UUID. */
 	uuid->time_low = c[0];
 	uuid->time_mid = c[1];
 	uuid->time_hi_and_version = c[2];
 	uuid->clock_seq_hi_and_reserved = c[3];
 	uuid->clock_seq_low = c[4];
 	for (n = 0; n < 6; n++)
 		uuid->node[n] = c[n + 5];
 
 	/* Check semantics... */
 	return (((c[3] & 0x80) != 0x00 &&		/* variant 0? */
 	    (c[3] & 0xc0) != 0x80 &&			/* variant 1? */
 	    (c[3] & 0xe0) != 0xc0) ? EINVAL : 0);	/* variant 2? */
 }
Index: head/sys/kern/kern_xxx.c
===================================================================
--- head/sys/kern/kern_xxx.c	(revision 167231)
+++ head/sys/kern/kern_xxx.c	(revision 167232)
@@ -1,290 +1,288 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_xxx.c	8.2 (Berkeley) 11/14/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/utsname.h>
 
 
 #if defined(COMPAT_43)
 
 #ifndef _SYS_SYSPROTO_H_
 struct gethostname_args {
 	char	*hostname;
 	u_int	len;
 };
 #endif
 /* ARGSUSED */
 int
 ogethostname(td, uap)
 	struct thread *td;
 	struct gethostname_args *uap;
 {
 	int name[2];
 	int error;
 	size_t len = uap->len;
 
 	name[0] = CTL_KERN;
 	name[1] = KERN_HOSTNAME;
 	mtx_lock(&Giant);
 	error = userland_sysctl(td, name, 2, uap->hostname, &len,
 	    1, 0, 0, 0, 0);
 	mtx_unlock(&Giant);
 	return(error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sethostname_args {
 	char	*hostname;
 	u_int	len;
 };
 #endif
 /* ARGSUSED */
 int
 osethostname(td, uap)
 	struct thread *td;
 	register struct sethostname_args *uap;
 {
 	int name[2];
 	int error;
 
 	name[0] = CTL_KERN;
 	name[1] = KERN_HOSTNAME;
 	mtx_lock(&Giant);
 	error = userland_sysctl(td, name, 2, 0, 0, 0, uap->hostname,
 	    uap->len, 0, 0);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ogethostid_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 ogethostid(td, uap)
 	struct thread *td;
 	struct ogethostid_args *uap;
 {
 
 	*(long *)(td->td_retval) = hostid;
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_43
 #ifndef _SYS_SYSPROTO_H_
 struct osethostid_args {
 	long	hostid;
 };
 #endif
 /* ARGSUSED */
 int
 osethostid(td, uap)
 	struct thread *td;
 	struct osethostid_args *uap;
 {
 	int error;
 
 	error = priv_check(td, PRIV_SETHOSTID);
 	if (error)
 		return (error);
 	mtx_lock(&Giant);
 	hostid = uap->hostid;
 	mtx_unlock(&Giant);
 	return (0);
 }
 
 int
 oquota(td, uap)
 	struct thread *td;
 	struct oquota_args *uap;
 {
+
 	return (ENOSYS);
 }
 #endif /* COMPAT_43 */
 
 /*
- * This is the FreeBSD-1.1 compatable uname(2) interface.  These
- * days it is done in libc as a wrapper around a bunch of sysctl's.
- * This must maintain the old 1.1 binary ABI.
+ * This is the FreeBSD-1.1 compatable uname(2) interface.  These days it is
+ * done in libc as a wrapper around a bunch of sysctl's.  This must maintain
+ * the old 1.1 binary ABI.
  */
 #if SYS_NMLN != 32
 #error "FreeBSD-1.1 uname syscall has been broken"
 #endif
 #ifndef _SYS_SYSPROTO_H_
 struct uname_args {
         struct utsname  *name;
 };
 #endif
-
 /* ARGSUSED */
 int
 uname(td, uap)
 	struct thread *td;
 	struct uname_args *uap;
 {
 	int name[2], error;
 	size_t len;
 	char *s, *us;
 
 	name[0] = CTL_KERN;
 	name[1] = KERN_OSTYPE;
 	len = sizeof (uap->name->sysname);
 	mtx_lock(&Giant);
 	error = userland_sysctl(td, name, 2, uap->name->sysname, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		goto done2;
 	subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0);
 
 	name[1] = KERN_HOSTNAME;
 	len = sizeof uap->name->nodename;
 	error = userland_sysctl(td, name, 2, uap->name->nodename, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		goto done2;
 	subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0);
 
 	name[1] = KERN_OSRELEASE;
 	len = sizeof uap->name->release;
 	error = userland_sysctl(td, name, 2, uap->name->release, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		goto done2;
 	subyte( uap->name->release + sizeof(uap->name->release) - 1, 0);
 
 /*
 	name = KERN_VERSION;
 	len = sizeof uap->name->version;
 	error = userland_sysctl(td, name, 2, uap->name->version, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		goto done2;
 	subyte( uap->name->version + sizeof(uap->name->version) - 1, 0);
 */
 
 /*
  * this stupid hackery to make the version field look like FreeBSD 1.1
  */
 	for(s = version; *s && *s != '#'; s++);
 
 	for(us = uap->name->version; *s && *s != ':'; s++) {
 		error = subyte( us++, *s);
 		if (error)
 			goto done2;
 	}
 	error = subyte( us++, 0);
 	if (error)
 		goto done2;
 
 	name[0] = CTL_HW;
 	name[1] = HW_MACHINE;
 	len = sizeof uap->name->machine;
 	error = userland_sysctl(td, name, 2, uap->name->machine, &len, 
 		1, 0, 0, 0, 0);
 	if (error)
 		goto done2;
 	subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0);
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getdomainname_args {
         char    *domainname;
         int     len;
 };
 #endif
-
 /* ARGSUSED */
 int
 getdomainname(td, uap)
         struct thread *td;
         struct getdomainname_args *uap;
 {
 	int domainnamelen;
 	int error;
 
 	mtx_lock(&Giant);
 	domainnamelen = strlen(domainname) + 1;
 	if ((u_int)uap->len > domainnamelen)
 		uap->len = domainnamelen;
 	error = copyout(domainname, uap->domainname, uap->len);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setdomainname_args {
         char    *domainname;
         int     len;
 };
 #endif
-
 /* ARGSUSED */
 int
 setdomainname(td, uap)
         struct thread *td;
         struct setdomainname_args *uap;
 {
         int error, domainnamelen;
 
 	error = priv_check(td, PRIV_SETDOMAINNAME);
 	if (error)
 		return (error);
 	mtx_lock(&Giant);
         if ((u_int)uap->len > sizeof (domainname) - 1) {
 		error = EINVAL;
 		goto done2;
 	}
         domainnamelen = uap->len;
         error = copyin(uap->domainname, domainname, uap->len);
         domainname[domainnamelen] = 0;
 done2:
 	mtx_unlock(&Giant);
         return (error);
 }
Index: head/sys/kern/p1003_1b.c
===================================================================
--- head/sys/kern/p1003_1b.c	(revision 167231)
+++ head/sys/kern/p1003_1b.c	(revision 167232)
@@ -1,320 +1,320 @@
 /*-
  * Copyright (c) 1996, 1997, 1998
  *	HD Associates, Inc.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by HD Associates, Inc
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /* p1003_1b: Real Time common code.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_posix.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 
 MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
 
-/* The system calls return ENOSYS if an entry is called that is
- * not run-time supported.  I am also logging since some programs
- * start to use this when they shouldn't.  That will be removed if annoying.
+/* The system calls return ENOSYS if an entry is called that is not run-time
+ * supported.  I am also logging since some programs start to use this when
+ * they shouldn't.  That will be removed if annoying.
  */
 int
 syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap)
 {
 	log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
 			td->td_proc->p_comm, td->td_proc->p_pid, s);
 
 	/* a " return nosys(p, uap); " here causes a core dump.
 	 */
 
 	return ENOSYS;
 }
 
 #if !defined(_KPOSIX_PRIORITY_SCHEDULING)
 
 /* Not configured but loadable via a module:
  */
 
 static int
 sched_attach(void)
 {
 	return 0;
 }
 
 SYSCALL_NOT_PRESENT_GEN(sched_setparam)
 SYSCALL_NOT_PRESENT_GEN(sched_getparam)
 SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
 SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
 SYSCALL_NOT_PRESENT_GEN(sched_yield)
 SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
 SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
 SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
 #else
 
 /* Configured in kernel version:
  */
 static struct ksched *ksched;
 
 static int
 sched_attach(void)
 {
 	int ret = ksched_attach(&ksched);
 
 	if (ret == 0)
 		p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1);
 
 	return ret;
 }
 
 int
 sched_setparam(struct thread *td, struct sched_setparam_args *uap)
 {
 	struct thread *targettd;
 	struct proc *targetp;
 	int e;
 	struct sched_param sched_param;
 
 	e = copyin(uap->param, &sched_param, sizeof(sched_param));
 	if (e)
 		return (e);
 
 	if (uap->pid == 0) {
 		targetp = td->td_proc;
 		targettd = td;
 		PROC_LOCK(targetp);
 	} else {
 		targetp = pfind(uap->pid);
 		if (targetp == NULL)
 			return (ESRCH);
 		targettd = FIRST_THREAD_IN_PROC(targetp);
 	}
 
 	e = p_cansched(td, targetp);
 	if (e == 0) {
 		e = ksched_setparam(ksched, targettd,
 			(const struct sched_param *)&sched_param);
 	}
 	PROC_UNLOCK(targetp);
 	return (e);
 }
 
 int
 sched_getparam(struct thread *td, struct sched_getparam_args *uap)
 {
 	int e;
 	struct sched_param sched_param;
 	struct thread *targettd;
 	struct proc *targetp;
 
 	if (uap->pid == 0) {
 		targetp = td->td_proc;
 		targettd = td;
 		PROC_LOCK(targetp);
 	} else {
 		targetp = pfind(uap->pid);
 		if (targetp == NULL) {
 			return (ESRCH);
 		}
 		targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
 	}
 
 	e = p_cansee(td, targetp);
 	if (e == 0) {
 		e = ksched_getparam(ksched, targettd, &sched_param);
 	}
 	PROC_UNLOCK(targetp);
 	if (e == 0)
 		e = copyout(&sched_param, uap->param, sizeof(sched_param));
 	return (e);
 }
 
 int
 sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
 {
 	int e;
 	struct sched_param sched_param;
 	struct thread *targettd;
 	struct proc *targetp;
 
 	/* Don't allow non root user to set a scheduler policy. */
 	e = priv_check(td, PRIV_SCHED_SET);
 	if (e)
 		return (e);
 
 	e = copyin(uap->param, &sched_param, sizeof(sched_param));
 	if (e)
 		return (e);
 
 	if (uap->pid == 0) {
 		targetp = td->td_proc;
 		targettd = td;
 		PROC_LOCK(targetp);
 	} else {
 		targetp = pfind(uap->pid);
 		if (targetp == NULL)
 			return (ESRCH);
 		targettd = FIRST_THREAD_IN_PROC(targetp);
 	}
 
 	e = p_cansched(td, targetp);
 	if (e == 0) {
 		e = ksched_setscheduler(ksched, targettd,
 			uap->policy, (const struct sched_param *)&sched_param);
 	}
 	PROC_UNLOCK(targetp);
 	return (e);
 }
 
 int
 sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
 {
 	int e, policy;
 	struct thread *targettd;
 	struct proc *targetp;
 
 	if (uap->pid == 0) {
 		targetp = td->td_proc;
 		targettd = td;
 		PROC_LOCK(targetp);
 	} else {
 		targetp = pfind(uap->pid);
 		if (targetp == NULL) {
 			e = ESRCH;
 			goto done2;
 		}
 		targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
 	}
 
 	e = p_cansee(td, targetp);
 	if (e == 0) {
 		e = ksched_getscheduler(ksched, targettd, &policy);
 		td->td_retval[0] = policy;
 	}
 	PROC_UNLOCK(targetp);
 
 done2:
 	return (e);
 }
 
 int
 sched_yield(struct thread *td, struct sched_yield_args *uap)
 {
 
 	return (ksched_yield(ksched));
 }
 
 int
 sched_get_priority_max(struct thread *td,
     struct sched_get_priority_max_args *uap)
 {
 	int error, prio;
 
 	error = ksched_get_priority_max(ksched, uap->policy, &prio);
 	td->td_retval[0] = prio;
 	return (error);
 }
 
 int
 sched_get_priority_min(struct thread *td,
     struct sched_get_priority_min_args *uap)
 {
 	int error, prio;
 
 	error = ksched_get_priority_min(ksched, uap->policy, &prio);
 	td->td_retval[0] = prio;
 	return (error);
 }
 
 int
 sched_rr_get_interval(struct thread *td,
     struct sched_rr_get_interval_args *uap)
 {
 	struct timespec timespec;
 	int error;
 
 	error = kern_sched_rr_get_interval(td, uap->pid, &timespec);
 	if (error == 0)
 		error = copyout(&timespec, uap->interval, sizeof(timespec));
 	return (error);
 }
 
 int
 kern_sched_rr_get_interval(struct thread *td, pid_t pid,
     struct timespec *ts)
 {
 	int e;
 	struct thread *targettd;
 	struct proc *targetp;
 
 	if (pid == 0) {
 		targettd = td;
 		targetp = td->td_proc;
 		PROC_LOCK(targetp);
 	} else {
 		targetp = td->td_proc;
 		PROC_LOCK(targetp);
 		targettd = thread_find(targetp, pid);
 		if (targettd == NULL) {
 			PROC_UNLOCK(targetp);
 			return (ESRCH);
 		}
 	}
 
 	e = p_cansee(td, targetp);
 	if (e == 0)
 		e = ksched_rr_get_interval(ksched, targettd, ts);
 	PROC_UNLOCK(targetp);
 	return (e);
 }
 
 #endif
 
 static void
 p31binit(void *notused)
 {
 	(void) sched_attach();
 	p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
 }
 
 SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
Index: head/sys/kern/sys_generic.c
===================================================================
--- head/sys/kern/sys_generic.c	(revision 167231)
+++ head/sys/kern/sys_generic.c	(revision 167232)
@@ -1,1147 +1,1132 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
 #include <sys/resourcevar.h>
 #include <sys/selinfo.h>
 #include <sys/sleepqueue.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/condvar.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 
 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
 
 static int	pollscan(struct thread *, struct pollfd *, u_int);
 static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
 static int	dofileread(struct thread *, int, struct file *, struct uio *,
 		    off_t, int);
 static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
 		    off_t, int);
 static void	doselwakeup(struct selinfo *, int);
 
-/*
- * Read system call.
- */
 #ifndef _SYS_SYSPROTO_H_
 struct read_args {
 	int	fd;
 	void	*buf;
 	size_t	nbyte;
 };
 #endif
 int
 read(td, uap)
 	struct thread *td;
 	struct read_args *uap;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > INT_MAX)
 		return (EINVAL);
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_readv(td, uap->fd, &auio);
 	return(error);
 }
 
 /*
  * Positioned read system call
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pread_args {
 	int	fd;
 	void	*buf;
 	size_t	nbyte;
 	int	pad;
 	off_t	offset;
 };
 #endif
 int
 pread(td, uap)
 	struct thread *td;
 	struct pread_args *uap;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > INT_MAX)
 		return (EINVAL);
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_preadv(td, uap->fd, &auio, uap->offset);
 	return(error);
 }
 
 /*
  * Scatter read system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct readv_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 };
 #endif
 int
 readv(struct thread *td, struct readv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_readv(td, uap->fd, auio);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 kern_readv(struct thread *td, int fd, struct uio *auio)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_read(td, fd, &fp);
 	if (error)
 		return (error);
 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Scatter positioned read system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct preadv_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 	off_t	offset;
 };
 #endif
 int
 preadv(struct thread *td, struct preadv_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_preadv(td, uap->fd, auio, uap->offset);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 kern_preadv(td, fd, auio, offset)
 	struct thread *td;
 	int fd;
 	struct uio *auio;
 	off_t offset;
 {
 	struct file *fp;
 	int error;
 
 	error = fget_read(td, fd, &fp);
 	if (error)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
 		error = EINVAL;
 	else
 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common code for readv and preadv that reads data in
  * from a file using the passed in uio, offset, and flags.
  */
 static int
 dofileread(td, fd, fp, auio, offset, flags)
 	struct thread *td;
 	int fd;
 	struct file *fp;
 	struct uio *auio;
 	off_t offset;
 	int flags;
 {
 	ssize_t cnt;
 	int error;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
 	/* Finish zero length reads right here */
 	if (auio->uio_resid == 0) {
 		td->td_retval[0] = 0;
 		return(0);
 	}
 	auio->uio_rw = UIO_READ;
 	auio->uio_offset = offset;
 	auio->uio_td = td;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO)) 
 		ktruio = cloneuio(auio);
 #endif
 	cnt = auio->uio_resid;
 	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
 		if (auio->uio_resid != cnt && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	}
 	cnt -= auio->uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = cnt;
 		ktrgenio(fd, UIO_READ, ktruio, error);
 	}
 #endif
 	td->td_retval[0] = cnt;
 	return (error);
 }
 
-/*
- * Write system call
- */
 #ifndef _SYS_SYSPROTO_H_
 struct write_args {
 	int	fd;
 	const void *buf;
 	size_t	nbyte;
 };
 #endif
 int
 write(td, uap)
 	struct thread *td;
 	struct write_args *uap;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > INT_MAX)
 		return (EINVAL);
 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_writev(td, uap->fd, &auio);
 	return(error);
 }
 
 /*
  * Positioned write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pwrite_args {
 	int	fd;
 	const void *buf;
 	size_t	nbyte;
 	int	pad;
 	off_t	offset;
 };
 #endif
 int
 pwrite(td, uap)
 	struct thread *td;
 	struct pwrite_args *uap;
 {
 	struct uio auio;
 	struct iovec aiov;
 	int error;
 
 	if (uap->nbyte > INT_MAX)
 		return (EINVAL);
 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
 	aiov.iov_len = uap->nbyte;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_resid = uap->nbyte;
 	auio.uio_segflg = UIO_USERSPACE;
 	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
 	return(error);
 }
 
 /*
  * Gather write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct writev_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 };
 #endif
 int
 writev(struct thread *td, struct writev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_writev(td, uap->fd, auio);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 kern_writev(struct thread *td, int fd, struct uio *auio)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_write(td, fd, &fp);
 	if (error)
 		return (error);
 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Gather positioned write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pwritev_args {
 	int	fd;
 	struct	iovec *iovp;
 	u_int	iovcnt;
 	off_t	offset;
 };
 #endif
 int
 pwritev(struct thread *td, struct pwritev_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error)
 		return (error);
 	error = kern_pwritev(td, uap->fd, auio, uap->offset);
 	free(auio, M_IOV);
 	return (error);
 }
 
 int
 kern_pwritev(td, fd, auio, offset)
 	struct thread *td;
 	struct uio *auio;
 	int fd;
 	off_t offset;
 {
 	struct file *fp;
 	int error;
 
 	error = fget_write(td, fd, &fp);
 	if (error)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
 		error = EINVAL;
 	else
 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common code for writev and pwritev that writes data to
  * a file using the passed in uio, offset, and flags.
  */
 static int
 dofilewrite(td, fd, fp, auio, offset, flags)
 	struct thread *td;
 	int fd;
 	struct file *fp;
 	struct uio *auio;
 	off_t offset;
 	int flags;
 {
 	ssize_t cnt;
 	int error;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
 	auio->uio_rw = UIO_WRITE;
 	auio->uio_td = td;
 	auio->uio_offset = offset;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(auio);
 #endif
 	cnt = auio->uio_resid;
 	if (fp->f_type == DTYPE_VNODE)
 		bwillwrite();
 	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
 		if (auio->uio_resid != cnt && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Socket layer is responsible for issuing SIGPIPE. */
 		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
 			PROC_LOCK(td->td_proc);
 			psignal(td->td_proc, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	cnt -= auio->uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = cnt;
 		ktrgenio(fd, UIO_WRITE, ktruio, error);
 	}
 #endif
 	td->td_retval[0] = cnt;
 	return (error);
 }
 
-/*
- * Ioctl system call.
- */
 #ifndef _SYS_SYSPROTO_H_
 struct ioctl_args {
 	int	fd;
 	u_long	com;
 	caddr_t	data;
 };
 #endif
 /* ARGSUSED */
 int
 ioctl(struct thread *td, struct ioctl_args *uap)
 {
 	u_long com;
 	int arg, error;
 	u_int size;
 	caddr_t data;
 
 	if (uap->com > 0xffffffff) {
 		printf(
 		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
 		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
 		uap->com &= 0xffffffff;
 	}
 	com = uap->com;
 
 	/*
 	 * Interpret high order word to find amount of data to be
 	 * copied to/from the user's address space.
 	 */
 	size = IOCPARM_LEN(com);
 	if ((size > IOCPARM_MAX) ||
 	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
 	    ((com & IOC_OUT) && size == 0) ||
 #else
 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
 #endif
 	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
 		return (ENOTTY);
 
 	if (size > 0) {
 		if (!(com & IOC_VOID))
 			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
 		else {
 			/* Integer argument. */
 			arg = (intptr_t)uap->data;
 			data = (void *)&arg;
 			size = 0;
 		}
 	} else
 		data = (void *)&uap->data;
 	if (com & IOC_IN) {
 		error = copyin(uap->data, data, (u_int)size);
 		if (error) {
 			if (size > 0)
 				free(data, M_IOCTLOPS);
 			return (error);
 		}
 	} else if (com & IOC_OUT) {
 		/*
 		 * Zero the buffer so the user always
 		 * gets back something deterministic.
 		 */
 		bzero(data, size);
 	}
 
 	error = kern_ioctl(td, uap->fd, com, data);
 
 	if (error == 0 && (com & IOC_OUT))
 		error = copyout(data, uap->data, (u_int)size);
 
 	if (size > 0)
 		free(data, M_IOCTLOPS);
 	return (error);
 }
 
 int
 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 	int error;
 	int tmp;
 
 	if ((error = fget(td, fd, &fp)) != 0)
 		return (error);
 	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	fdp = td->td_proc->p_fd;
 	switch (com) {
 	case FIONCLEX:
 		FILEDESC_LOCK_FAST(fdp);
 		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
 		FILEDESC_UNLOCK_FAST(fdp);
 		goto out;
 	case FIOCLEX:
 		FILEDESC_LOCK_FAST(fdp);
 		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
 		FILEDESC_UNLOCK_FAST(fdp);
 		goto out;
 	case FIONBIO:
 		FILE_LOCK(fp);
 		if ((tmp = *(int *)data))
 			fp->f_flag |= FNONBLOCK;
 		else
 			fp->f_flag &= ~FNONBLOCK;
 		FILE_UNLOCK(fp);
 		data = (void *)&tmp;
 		break;
 	case FIOASYNC:
 		FILE_LOCK(fp);
 		if ((tmp = *(int *)data))
 			fp->f_flag |= FASYNC;
 		else
 			fp->f_flag &= ~FASYNC;
 		FILE_UNLOCK(fp);
 		data = (void *)&tmp;
 		break;
 	}
 
 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * sellock and selwait are initialized in selectinit() via SYSINIT.
  */
 struct mtx	sellock;
 struct cv	selwait;
 u_int		nselcoll;	/* Select collisions since boot */
 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
 
-/*
- * Select system call.
- */
 #ifndef _SYS_SYSPROTO_H_
 struct select_args {
 	int	nd;
 	fd_set	*in, *ou, *ex;
 	struct	timeval *tv;
 };
 #endif
 int
 select(td, uap)
 	register struct thread *td;
 	register struct select_args *uap;
 {
 	struct timeval tv, *tvp;
 	int error;
 
 	if (uap->tv != NULL) {
 		error = copyin(uap->tv, &tv, sizeof(tv));
 		if (error)
 			return (error);
 		tvp = &tv;
 	} else
 		tvp = NULL;
 
 	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
 }
 
 int
 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
     fd_set *fd_ex, struct timeval *tvp)
 {
 	struct filedesc *fdp;
 	/*
 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
 	 * of 256.
 	 */
 	fd_mask s_selbits[howmany(2048, NFDBITS)];
 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
 	struct timeval atv, rtv, ttv;
 	int error, timo;
 	u_int ncoll, nbufbytes, ncpbytes, nfdbits;
 
 	if (nd < 0)
 		return (EINVAL);
 	fdp = td->td_proc->p_fd;
 	
 	FILEDESC_LOCK_FAST(fdp);
 
 	if (nd > td->td_proc->p_fd->fd_nfiles)
 		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
 	FILEDESC_UNLOCK_FAST(fdp);
 
 	/*
 	 * Allocate just enough bits for the non-null fd_sets.  Use the
 	 * preallocated auto buffer if possible.
 	 */
 	nfdbits = roundup(nd, NFDBITS);
 	ncpbytes = nfdbits / NBBY;
 	nbufbytes = 0;
 	if (fd_in != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (fd_ou != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (fd_ex != NULL)
 		nbufbytes += 2 * ncpbytes;
 	if (nbufbytes <= sizeof s_selbits)
 		selbits = &s_selbits[0];
 	else
 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
 
 	/*
 	 * Assign pointers into the bit buffers and fetch the input bits.
 	 * Put the output buffers together so that they can be bzeroed
 	 * together.
 	 */
 	sbp = selbits;
 #define	getbits(name, x) \
 	do {								\
 		if (name == NULL)					\
 			ibits[x] = NULL;				\
 		else {							\
 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
 			obits[x] = sbp;					\
 			sbp += ncpbytes / sizeof *sbp;			\
 			error = copyin(name, ibits[x], ncpbytes);	\
 			if (error != 0)					\
 				goto done_nosellock;			\
 		}							\
 	} while (0)
 	getbits(fd_in, 0);
 	getbits(fd_ou, 1);
 	getbits(fd_ex, 2);
 #undef	getbits
 	if (nbufbytes != 0)
 		bzero(selbits, nbufbytes / 2);
 
 	if (tvp != NULL) {
 		atv = *tvp;
 		if (itimerfix(&atv)) {
 			error = EINVAL;
 			goto done_nosellock;
 		}
 		getmicrouptime(&rtv);
 		timevaladd(&atv, &rtv);
 	} else {
 		atv.tv_sec = 0;
 		atv.tv_usec = 0;
 	}
 	timo = 0;
 	TAILQ_INIT(&td->td_selq);
 	mtx_lock(&sellock);
 retry:
 	ncoll = nselcoll;
 	mtx_lock_spin(&sched_lock);
 	td->td_flags |= TDF_SELECT;
 	mtx_unlock_spin(&sched_lock);
 	mtx_unlock(&sellock);
 
 	error = selscan(td, ibits, obits, nd);
 	mtx_lock(&sellock);
 	if (error || td->td_retval[0])
 		goto done;
 	if (atv.tv_sec || atv.tv_usec) {
 		getmicrouptime(&rtv);
 		if (timevalcmp(&rtv, &atv, >=))
 			goto done;
 		ttv = atv;
 		timevalsub(&ttv, &rtv);
 		timo = ttv.tv_sec > 24 * 60 * 60 ?
 		    24 * 60 * 60 * hz : tvtohz(&ttv);
 	}
 
 	/*
 	 * An event of interest may occur while we do not hold
 	 * sellock, so check TDF_SELECT and the number of
 	 * collisions and rescan the file descriptors if
 	 * necessary.
 	 */
 	mtx_lock_spin(&sched_lock);
 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
 		mtx_unlock_spin(&sched_lock);
 		goto retry;
 	}
 	mtx_unlock_spin(&sched_lock);
 
 	if (timo > 0)
 		error = cv_timedwait_sig(&selwait, &sellock, timo);
 	else
 		error = cv_wait_sig(&selwait, &sellock);
 	
 	if (error == 0)
 		goto retry;
 
 done:
 	clear_selinfo_list(td);
 	mtx_lock_spin(&sched_lock);
 	td->td_flags &= ~TDF_SELECT;
 	mtx_unlock_spin(&sched_lock);
 	mtx_unlock(&sellock);
 
 done_nosellock:
 	/* select is not restarted after signals... */
 	if (error == ERESTART)
 		error = EINTR;
 	if (error == EWOULDBLOCK)
 		error = 0;
 #define	putbits(name, x) \
 	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
 		error = error2;
 	if (error == 0) {
 		int error2;
 
 		putbits(fd_in, 0);
 		putbits(fd_ou, 1);
 		putbits(fd_ex, 2);
 #undef putbits
 	}
 	if (selbits != &s_selbits[0])
 		free(selbits, M_SELECT);
 
 	return (error);
 }
 
 static int
 selscan(td, ibits, obits, nfd)
 	struct thread *td;
 	fd_mask **ibits, **obits;
 	int nfd;
 {
 	int msk, i, fd;
 	fd_mask bits;
 	struct file *fp;
 	int n = 0;
 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
 	struct filedesc *fdp = td->td_proc->p_fd;
 
 	FILEDESC_LOCK(fdp);
 	for (msk = 0; msk < 3; msk++) {
 		if (ibits[msk] == NULL)
 			continue;
 		for (i = 0; i < nfd; i += NFDBITS) {
 			bits = ibits[msk][i/NFDBITS];
 			/* ffs(int mask) not portable, fd_mask is long */
 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
 				if (!(bits & 1))
 					continue;
 				if ((fp = fget_locked(fdp, fd)) == NULL) {
 					FILEDESC_UNLOCK(fdp);
 					return (EBADF);
 				}
 				if (fo_poll(fp, flag[msk], td->td_ucred,
 				    td)) {
 					obits[msk][(fd)/NFDBITS] |=
 					    ((fd_mask)1 << ((fd) % NFDBITS));
 					n++;
 				}
 			}
 		}
 	}
 	FILEDESC_UNLOCK(fdp);
 	td->td_retval[0] = n;
 	return (0);
 }
 
-/*
- * Poll system call.
- */
 #ifndef _SYS_SYSPROTO_H_
 struct poll_args {
 	struct pollfd *fds;
 	u_int	nfds;
 	int	timeout;
 };
 #endif
 int
 poll(td, uap)
 	struct thread *td;
 	struct poll_args *uap;
 {
 	struct pollfd *bits;
 	struct pollfd smallbits[32];
 	struct timeval atv, rtv, ttv;
 	int error = 0, timo;
 	u_int ncoll, nfds;
 	size_t ni;
 
 	nfds = uap->nfds;
 
 	/*
 	 * This is kinda bogus.  We have fd limits, but that is not
 	 * really related to the size of the pollfd array.  Make sure
 	 * we let the process use at least FD_SETSIZE entries and at
 	 * least enough for the current limits.  We want to be reasonably
 	 * safe, but not overly restrictive.
 	 */
 	PROC_LOCK(td->td_proc);
 	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
 	    (nfds > FD_SETSIZE)) {
 		PROC_UNLOCK(td->td_proc);
 		error = EINVAL;
 		goto done2;
 	}
 	PROC_UNLOCK(td->td_proc);
 	ni = nfds * sizeof(struct pollfd);
 	if (ni > sizeof(smallbits))
 		bits = malloc(ni, M_TEMP, M_WAITOK);
 	else
 		bits = smallbits;
 	error = copyin(uap->fds, bits, ni);
 	if (error)
 		goto done_nosellock;
 	if (uap->timeout != INFTIM) {
 		atv.tv_sec = uap->timeout / 1000;
 		atv.tv_usec = (uap->timeout % 1000) * 1000;
 		if (itimerfix(&atv)) {
 			error = EINVAL;
 			goto done_nosellock;
 		}
 		getmicrouptime(&rtv);
 		timevaladd(&atv, &rtv);
 	} else {
 		atv.tv_sec = 0;
 		atv.tv_usec = 0;
 	}
 	timo = 0;
 	TAILQ_INIT(&td->td_selq);
 	mtx_lock(&sellock);
 retry:
 	ncoll = nselcoll;
 	mtx_lock_spin(&sched_lock);
 	td->td_flags |= TDF_SELECT;
 	mtx_unlock_spin(&sched_lock);
 	mtx_unlock(&sellock);
 
 	error = pollscan(td, bits, nfds);
 	mtx_lock(&sellock);
 	if (error || td->td_retval[0])
 		goto done;
 	if (atv.tv_sec || atv.tv_usec) {
 		getmicrouptime(&rtv);
 		if (timevalcmp(&rtv, &atv, >=))
 			goto done;
 		ttv = atv;
 		timevalsub(&ttv, &rtv);
 		timo = ttv.tv_sec > 24 * 60 * 60 ?
 		    24 * 60 * 60 * hz : tvtohz(&ttv);
 	}
 	/*
 	 * An event of interest may occur while we do not hold
 	 * sellock, so check TDF_SELECT and the number of collisions
 	 * and rescan the file descriptors if necessary.
 	 */
 	mtx_lock_spin(&sched_lock);
 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
 		mtx_unlock_spin(&sched_lock);
 		goto retry;
 	}
 	mtx_unlock_spin(&sched_lock);
 
 	if (timo > 0)
 		error = cv_timedwait_sig(&selwait, &sellock, timo);
 	else
 		error = cv_wait_sig(&selwait, &sellock);
 
 	if (error == 0)
 		goto retry;
 
 done:
 	clear_selinfo_list(td);
 	mtx_lock_spin(&sched_lock);
 	td->td_flags &= ~TDF_SELECT;
 	mtx_unlock_spin(&sched_lock);
 	mtx_unlock(&sellock);
 
 done_nosellock:
 	/* poll is not restarted after signals... */
 	if (error == ERESTART)
 		error = EINTR;
 	if (error == EWOULDBLOCK)
 		error = 0;
 	if (error == 0) {
 		error = copyout(bits, uap->fds, ni);
 		if (error)
 			goto out;
 	}
 out:
 	if (ni > sizeof(smallbits))
 		free(bits, M_TEMP);
 done2:
 	return (error);
 }
 
 static int
 pollscan(td, fds, nfd)
 	struct thread *td;
 	struct pollfd *fds;
 	u_int nfd;
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
 	int i;
 	struct file *fp;
 	int n = 0;
 
 	FILEDESC_LOCK(fdp);
 	for (i = 0; i < nfd; i++, fds++) {
 		if (fds->fd >= fdp->fd_nfiles) {
 			fds->revents = POLLNVAL;
 			n++;
 		} else if (fds->fd < 0) {
 			fds->revents = 0;
 		} else {
 			fp = fdp->fd_ofiles[fds->fd];
 			if (fp == NULL) {
 				fds->revents = POLLNVAL;
 				n++;
 			} else {
 				/*
 				 * Note: backend also returns POLLHUP and
 				 * POLLERR if appropriate.
 				 */
 				fds->revents = fo_poll(fp, fds->events,
 				    td->td_ucred, td);
 				if (fds->revents != 0)
 					n++;
 			}
 		}
 	}
 	FILEDESC_UNLOCK(fdp);
 	td->td_retval[0] = n;
 	return (0);
 }
 
 /*
  * OpenBSD poll system call.
  *
  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct openbsd_poll_args {
 	struct pollfd *fds;
 	u_int	nfds;
 	int	timeout;
 };
 #endif
 int
 openbsd_poll(td, uap)
 	register struct thread *td;
 	register struct openbsd_poll_args *uap;
 {
 	return (poll(td, (struct poll_args *)uap));
 }
 
 /*
  * Remove the references to the thread from all of the objects we were
  * polling.
  *
  * This code assumes that the underlying owner of the selinfo structure will
  * hold sellock before it changes it, and that it will unlink itself from our
  * list if it goes away.
  */
 void
 clear_selinfo_list(td)
 	struct thread *td;
 {
 	struct selinfo *si;
 
 	mtx_assert(&sellock, MA_OWNED);
 	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
 		si->si_thread = NULL;
 	TAILQ_INIT(&td->td_selq);
 }
 
 /*
  * Record a select request.
  */
 void
 selrecord(selector, sip)
 	struct thread *selector;
 	struct selinfo *sip;
 {
 
 	mtx_lock(&sellock);
 	/*
 	 * If the selinfo's thread pointer is NULL then take ownership of it.
 	 *
 	 * If the thread pointer is not NULL and it points to another
 	 * thread, then we have a collision.
 	 *
 	 * If the thread pointer is not NULL and points back to us then leave
 	 * it alone as we've already added pointed it at us and added it to
 	 * our list.
 	 */
 	if (sip->si_thread == NULL) {
 		sip->si_thread = selector;
 		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
 	} else if (sip->si_thread != selector) {
 		sip->si_flags |= SI_COLL;
 	}
 
 	mtx_unlock(&sellock);
 }
 
 /* Wake up a selecting thread. */
 void
 selwakeup(sip)
 	struct selinfo *sip;
 {
 	doselwakeup(sip, -1);
 }
 
 /* Wake up a selecting thread, and set its priority. */
 void
 selwakeuppri(sip, pri)
 	struct selinfo *sip;
 	int pri;
 {
 	doselwakeup(sip, pri);
 }
 
 /*
  * Do a wakeup when a selectable event occurs.
  */
 static void
 doselwakeup(sip, pri)
 	struct selinfo *sip;
 	int pri;
 {
 	struct thread *td;
 
 	mtx_lock(&sellock);
 	td = sip->si_thread;
 	if ((sip->si_flags & SI_COLL) != 0) {
 		nselcoll++;
 		sip->si_flags &= ~SI_COLL;
 		cv_broadcastpri(&selwait, pri);
 	}
 	if (td == NULL) {
 		mtx_unlock(&sellock);
 		return;
 	}
 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 	sip->si_thread = NULL;
 	mtx_lock_spin(&sched_lock);
 	td->td_flags &= ~TDF_SELECT;
 	mtx_unlock_spin(&sched_lock);
 	sleepq_remove(td, &selwait);
 	mtx_unlock(&sellock);
 }
 
 static void selectinit(void *);
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
 
 /* ARGSUSED*/
 static void
 selectinit(dummy)
 	void *dummy;
 {
 	cv_init(&selwait, "select");
 	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
 }
Index: head/sys/kern/sys_pipe.c
===================================================================
--- head/sys/kern/sys_pipe.c	(revision 167231)
+++ head/sys/kern/sys_pipe.c	(revision 167232)
@@ -1,1631 +1,1630 @@
 /*-
  * Copyright (c) 1996 John S. Dyson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice immediately at the beginning of the file, without modification,
  *    this list of conditions, and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Absolutely no warranty of function or purpose is made by the author
  *    John S. Dyson.
  * 4. Modifications may be freely made to this file if the above conditions
  *    are met.
  */
 
 /*
  * This file contains a high-performance replacement for the socket-based
  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
  * all features of sockets, but does do everything that pipes normally
  * do.
  */
 
 /*
  * This code has two modes of operation, a small write mode and a large
  * write mode.  The small write mode acts like conventional pipes with
  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
  * the receiving process can copy it directly from the pages in the sending
  * process.
  *
  * If the sending process receives a signal, it is possible that it will
  * go away, and certainly its address space can change, because control
  * is returned back to the user-mode side.  In that case, the pipe code
  * arranges to copy the buffer supplied by the user process, to a pageable
  * kernel buffer, and the receiving process will grab the data from the
  * pageable kernel buffer.  Since signals don't happen all that often,
  * the copy operation is normally eliminated.
  *
  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
  * happen for small transfers so that the system will not spend all of
  * its time context switching.
  *
  * In order to limit the resource use of pipes, two sysctls exist:
  *
  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
  * address space available to us in pipe_map. This value is normally
  * autotuned, but may also be loader tuned.
  *
  * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
  * memory in use by pipes.
  *
  * Based on how large pipekva is relative to maxpipekva, the following
  * will happen:
  *
  * 0% - 50%:
  *     New pipes are given 16K of memory backing, pipes may dynamically
  *     grow to as large as 64K where needed.
  * 50% - 75%:
  *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
  *     existing pipes may NOT grow.
  * 75% - 100%:
  *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
  *     existing pipes will be shrunk down to 4K whenever possible.
  *
  * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
  * that is set,  the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
  * resize which MUST occur for reverse-direction pipes when they are
  * first used.
  *
  * Additional information about the current state of pipes may be obtained
  * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
  * and kern.ipc.piperesizefail.
  *
  * Locking rules:  There are two locks present here:  A mutex, used via
  * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
  * the flag, as mutexes can not persist over uiomove.  The mutex
  * exists only to guard access to the flag, and is not in itself a
  * locking mechanism.  Also note that there is only a single mutex for
  * both directions of a pipe.
  *
  * As pipelock() may have to sleep before it can acquire the flag, it
  * is important to reread all data after a call to pipelock(); everything
  * in the structure may have changed.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/ttycom.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/pipe.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/uio.h>
 #include <sys/event.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 /*
  * Use this define if you want to disable *fancy* VM things.  Expect an
  * approx 30% decrease in transfer rate.  This could be useful for
  * NetBSD or OpenBSD.
  */
 /* #define PIPE_NODIRECT */
 
 /*
  * interfaces to the outside world
  */
 static fo_rdwr_t	pipe_read;
 static fo_rdwr_t	pipe_write;
 static fo_ioctl_t	pipe_ioctl;
 static fo_poll_t	pipe_poll;
 static fo_kqfilter_t	pipe_kqfilter;
 static fo_stat_t	pipe_stat;
 static fo_close_t	pipe_close;
 
 static struct fileops pipeops = {
 	.fo_read = pipe_read,
 	.fo_write = pipe_write,
 	.fo_ioctl = pipe_ioctl,
 	.fo_poll = pipe_poll,
 	.fo_kqfilter = pipe_kqfilter,
 	.fo_stat = pipe_stat,
 	.fo_close = pipe_close,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static void	filt_pipedetach(struct knote *kn);
 static int	filt_piperead(struct knote *kn, long hint);
 static int	filt_pipewrite(struct knote *kn, long hint);
 
 static struct filterops pipe_rfiltops =
 	{ 1, NULL, filt_pipedetach, filt_piperead };
 static struct filterops pipe_wfiltops =
 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
 
 /*
  * Default pipe buffer size(s), this can be kind-of large now because pipe
  * space is pageable.  The pipe code will try to maintain locality of
  * reference for performance reasons, so small amounts of outstanding I/O
  * will not wipe the cache.
  */
 #define MINPIPESIZE (PIPE_SIZE/3)
 #define MAXPIPESIZE (2*PIPE_SIZE/3)
 
 static int amountpipes;
 static int amountpipekva;
 static int pipefragretry;
 static int pipeallocfail;
 static int piperesizefail;
 static int piperesizeallowed = 1;
 
 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
 	   &maxpipekva, 0, "Pipe KVA limit");
 SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
 	   &amountpipes, 0, "Current # of pipes");
 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
 	   &amountpipekva, 0, "Pipe KVA usage");
 SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
 	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
 SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
 	  &pipeallocfail, 0, "Pipe allocation failures");
 SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
 	  &piperesizefail, 0, "Pipe resize failures");
 SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
 	  &piperesizeallowed, 0, "Pipe resizing allowed");
 
 static void pipeinit(void *dummy __unused);
 static void pipeclose(struct pipe *cpipe);
 static void pipe_free_kmem(struct pipe *cpipe);
 static int pipe_create(struct pipe *pipe, int backing);
 static __inline int pipelock(struct pipe *cpipe, int catch);
 static __inline void pipeunlock(struct pipe *cpipe);
 static __inline void pipeselwakeup(struct pipe *cpipe);
 #ifndef PIPE_NODIRECT
 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
 static void pipe_destroy_write_buffer(struct pipe *wpipe);
 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
 static void pipe_clone_write_buffer(struct pipe *wpipe);
 #endif
 static int pipespace(struct pipe *cpipe, int size);
 static int pipespace_new(struct pipe *cpipe, int size);
 
 static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
 static void	pipe_zone_dtor(void *mem, int size, void *arg);
 static int	pipe_zone_init(void *mem, int size, int flags);
 static void	pipe_zone_fini(void *mem, int size);
 
 static uma_zone_t pipe_zone;
 
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
 
 static void
 pipeinit(void *dummy __unused)
 {
 
 	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
 	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
 	    UMA_ALIGN_PTR, 0);
 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
 }
 
 static int
 pipe_zone_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct pipepair *pp;
 	struct pipe *rpipe, *wpipe;
 
 	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
 
 	pp = (struct pipepair *)mem;
 
 	/*
 	 * We zero both pipe endpoints to make sure all the kmem pointers
 	 * are NULL, flag fields are zero'd, etc.  We timestamp both
 	 * endpoints with the same time.
 	 */
 	rpipe = &pp->pp_rpipe;
 	bzero(rpipe, sizeof(*rpipe));
 	vfs_timestamp(&rpipe->pipe_ctime);
 	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
 
 	wpipe = &pp->pp_wpipe;
 	bzero(wpipe, sizeof(*wpipe));
 	wpipe->pipe_ctime = rpipe->pipe_ctime;
 	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
 
 	rpipe->pipe_peer = wpipe;
 	rpipe->pipe_pair = pp;
 	wpipe->pipe_peer = rpipe;
 	wpipe->pipe_pair = pp;
 
 	/*
 	 * Mark both endpoints as present; they will later get free'd
 	 * one at a time.  When both are free'd, then the whole pair
 	 * is released.
 	 */
 	rpipe->pipe_present = 1;
 	wpipe->pipe_present = 1;
 
 	/*
 	 * Eventually, the MAC Framework may initialize the label
 	 * in ctor or init, but for now we do it elswhere to avoid
 	 * blocking in ctor or init.
 	 */
 	pp->pp_label = NULL;
 
 	atomic_add_int(&amountpipes, 2);
 	return (0);
 }
 
 static void
 pipe_zone_dtor(void *mem, int size, void *arg)
 {
 	struct pipepair *pp;
 
 	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
 
 	pp = (struct pipepair *)mem;
 
 	atomic_subtract_int(&amountpipes, 2);
 }
 
 static int
 pipe_zone_init(void *mem, int size, int flags)
 {
 	struct pipepair *pp;
 
 	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
 
 	pp = (struct pipepair *)mem;
 
 	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
 	return (0);
 }
 
 static void
 pipe_zone_fini(void *mem, int size)
 {
 	struct pipepair *pp;
 
 	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
 
 	pp = (struct pipepair *)mem;
 
 	mtx_destroy(&pp->pp_mtx);
 }
 
 /*
- * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
- * let the zone pick up the pieces via pipeclose().
+ * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
+ * the zone pick up the pieces via pipeclose().
  */
-
 /* ARGSUSED */
 int
 pipe(td, uap)
 	struct thread *td;
 	struct pipe_args /* {
 		int	dummy;
 	} */ *uap;
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *rf, *wf;
 	struct pipepair *pp;
 	struct pipe *rpipe, *wpipe;
 	int fd, error;
 
 	pp = uma_zalloc(pipe_zone, M_WAITOK);
 #ifdef MAC
 	/*
 	 * The MAC label is shared between the connected endpoints.  As a
 	 * result mac_init_pipe() and mac_create_pipe() are called once
 	 * for the pair, and not on the endpoints.
 	 */
 	mac_init_pipe(pp);
 	mac_create_pipe(td->td_ucred, pp);
 #endif
 	rpipe = &pp->pp_rpipe;
 	wpipe = &pp->pp_wpipe;
 
 	knlist_init(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe), NULL, NULL,
 	    NULL);
 	knlist_init(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe), NULL, NULL,
 	    NULL);
 
 	/* Only the forward direction pipe is backed by default */
 	if ((error = pipe_create(rpipe, 1)) != 0 ||
 	    (error = pipe_create(wpipe, 0)) != 0) {
 		pipeclose(rpipe);
 		pipeclose(wpipe);
 		return (error);
 	}
 
 	rpipe->pipe_state |= PIPE_DIRECTOK;
 	wpipe->pipe_state |= PIPE_DIRECTOK;
 
 	error = falloc(td, &rf, &fd);
 	if (error) {
 		pipeclose(rpipe);
 		pipeclose(wpipe);
 		return (error);
 	}
 	/* An extra reference on `rf' has been held for us by falloc(). */
 	td->td_retval[0] = fd;
 
 	/*
 	 * Warning: once we've gotten past allocation of the fd for the
 	 * read-side, we can only drop the read side via fdrop() in order
 	 * to avoid races against processes which manage to dup() the read
 	 * side while we are blocked trying to allocate the write side.
 	 */
 	FILE_LOCK(rf);
 	rf->f_flag = FREAD | FWRITE;
 	rf->f_type = DTYPE_PIPE;
 	rf->f_data = rpipe;
 	rf->f_ops = &pipeops;
 	FILE_UNLOCK(rf);
 	error = falloc(td, &wf, &fd);
 	if (error) {
 		fdclose(fdp, rf, td->td_retval[0], td);
 		fdrop(rf, td);
 		/* rpipe has been closed by fdrop(). */
 		pipeclose(wpipe);
 		return (error);
 	}
 	/* An extra reference on `wf' has been held for us by falloc(). */
 	FILE_LOCK(wf);
 	wf->f_flag = FREAD | FWRITE;
 	wf->f_type = DTYPE_PIPE;
 	wf->f_data = wpipe;
 	wf->f_ops = &pipeops;
 	FILE_UNLOCK(wf);
 	fdrop(wf, td);
 	td->td_retval[1] = fd;
 	fdrop(rf, td);
 
 	return (0);
 }
 
 /*
  * Allocate kva for pipe circular buffer, the space is pageable
  * This routine will 'realloc' the size of a pipe safely, if it fails
  * it will retain the old buffer.
  * If it fails it will return ENOMEM.
  */
 static int
 pipespace_new(cpipe, size)
 	struct pipe *cpipe;
 	int size;
 {
 	caddr_t buffer;
 	int error, cnt, firstseg;
 	static int curfail = 0;
 	static struct timeval lastfail;
 
 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
 	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
 		("pipespace: resize of direct writes not allowed"));
 retry:
 	cnt = cpipe->pipe_buffer.cnt;
 	if (cnt > size)
 		size = cnt;
 
 	size = round_page(size);
 	buffer = (caddr_t) vm_map_min(pipe_map);
 
 	error = vm_map_find(pipe_map, NULL, 0,
 		(vm_offset_t *) &buffer, size, 1,
 		VM_PROT_ALL, VM_PROT_ALL, 0);
 	if (error != KERN_SUCCESS) {
 		if ((cpipe->pipe_buffer.buffer == NULL) &&
 			(size > SMALL_PIPE_SIZE)) {
 			size = SMALL_PIPE_SIZE;
 			pipefragretry++;
 			goto retry;
 		}
 		if (cpipe->pipe_buffer.buffer == NULL) {
 			pipeallocfail++;
 			if (ppsratecheck(&lastfail, &curfail, 1))
 				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
 		} else {
 			piperesizefail++;
 		}
 		return (ENOMEM);
 	}
 
 	/* copy data, then free old resources if we're resizing */
 	if (cnt > 0) {
 		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
 			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
 			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
 				buffer, firstseg);
 			if ((cnt - firstseg) > 0)
 				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
 					cpipe->pipe_buffer.in);
 		} else {
 			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
 				buffer, cnt);
 		}
 	}
 	pipe_free_kmem(cpipe);
 	cpipe->pipe_buffer.buffer = buffer;
 	cpipe->pipe_buffer.size = size;
 	cpipe->pipe_buffer.in = cnt;
 	cpipe->pipe_buffer.out = 0;
 	cpipe->pipe_buffer.cnt = cnt;
 	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
 	return (0);
 }
 
 /*
  * Wrapper for pipespace_new() that performs locking assertions.
  */
 static int
 pipespace(cpipe, size)
 	struct pipe *cpipe;
 	int size;
 {
 
 	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
 		("Unlocked pipe passed to pipespace"));
 	return (pipespace_new(cpipe, size));
 }
 
 /*
  * lock a pipe for I/O, blocking other access
  */
 static __inline int
 pipelock(cpipe, catch)
 	struct pipe *cpipe;
 	int catch;
 {
 	int error;
 
 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
 	while (cpipe->pipe_state & PIPE_LOCKFL) {
 		cpipe->pipe_state |= PIPE_LWANT;
 		error = msleep(cpipe, PIPE_MTX(cpipe),
 		    catch ? (PRIBIO | PCATCH) : PRIBIO,
 		    "pipelk", 0);
 		if (error != 0)
 			return (error);
 	}
 	cpipe->pipe_state |= PIPE_LOCKFL;
 	return (0);
 }
 
 /*
  * unlock a pipe I/O lock
  */
 static __inline void
 pipeunlock(cpipe)
 	struct pipe *cpipe;
 {
 
 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
 	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
 		("Unlocked pipe passed to pipeunlock"));
 	cpipe->pipe_state &= ~PIPE_LOCKFL;
 	if (cpipe->pipe_state & PIPE_LWANT) {
 		cpipe->pipe_state &= ~PIPE_LWANT;
 		wakeup(cpipe);
 	}
 }
 
 static __inline void
 pipeselwakeup(cpipe)
 	struct pipe *cpipe;
 {
 
 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
 	if (cpipe->pipe_state & PIPE_SEL) {
 		cpipe->pipe_state &= ~PIPE_SEL;
 		selwakeuppri(&cpipe->pipe_sel, PSOCK);
 	}
 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
 	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
 }
 
 /*
  * Initialize and allocate VM and memory for pipe.  The structure
  * will start out zero'd from the ctor, so we just manage the kmem.
  */
 static int
 pipe_create(pipe, backing)
 	struct pipe *pipe;
 	int backing;
 {
 	int error;
 
 	if (backing) {
 		if (amountpipekva > maxpipekva / 2)
 			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
 		else
 			error = pipespace_new(pipe, PIPE_SIZE);
 	} else {
 		/* If we're not backing this pipe, no need to do anything. */
 		error = 0;
 	}
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 pipe_read(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	struct thread *td;
 	int flags;
 {
 	struct pipe *rpipe = fp->f_data;
 	int error;
 	int nread = 0;
 	u_int size;
 
 	PIPE_LOCK(rpipe);
 	++rpipe->pipe_busy;
 	error = pipelock(rpipe, 1);
 	if (error)
 		goto unlocked_error;
 
 #ifdef MAC
 	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
 	if (error)
 		goto locked_error;
 #endif
 	if (amountpipekva > (3 * maxpipekva) / 4) {
 		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
 			(rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
 			(rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
 			(piperesizeallowed == 1)) {
 			PIPE_UNLOCK(rpipe);
 			pipespace(rpipe, SMALL_PIPE_SIZE);
 			PIPE_LOCK(rpipe);
 		}
 	}
 
 	while (uio->uio_resid) {
 		/*
 		 * normal pipe buffer receive
 		 */
 		if (rpipe->pipe_buffer.cnt > 0) {
 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
 			if (size > rpipe->pipe_buffer.cnt)
 				size = rpipe->pipe_buffer.cnt;
 			if (size > (u_int) uio->uio_resid)
 				size = (u_int) uio->uio_resid;
 
 			PIPE_UNLOCK(rpipe);
 			error = uiomove(
 			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
 			    size, uio);
 			PIPE_LOCK(rpipe);
 			if (error)
 				break;
 
 			rpipe->pipe_buffer.out += size;
 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
 				rpipe->pipe_buffer.out = 0;
 
 			rpipe->pipe_buffer.cnt -= size;
 
 			/*
 			 * If there is no more to read in the pipe, reset
 			 * its pointers to the beginning.  This improves
 			 * cache hit stats.
 			 */
 			if (rpipe->pipe_buffer.cnt == 0) {
 				rpipe->pipe_buffer.in = 0;
 				rpipe->pipe_buffer.out = 0;
 			}
 			nread += size;
 #ifndef PIPE_NODIRECT
 		/*
 		 * Direct copy, bypassing a kernel buffer.
 		 */
 		} else if ((size = rpipe->pipe_map.cnt) &&
 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
 			if (size > (u_int) uio->uio_resid)
 				size = (u_int) uio->uio_resid;
 
 			PIPE_UNLOCK(rpipe);
 			error = uiomove_fromphys(rpipe->pipe_map.ms,
 			    rpipe->pipe_map.pos, size, uio);
 			PIPE_LOCK(rpipe);
 			if (error)
 				break;
 			nread += size;
 			rpipe->pipe_map.pos += size;
 			rpipe->pipe_map.cnt -= size;
 			if (rpipe->pipe_map.cnt == 0) {
 				rpipe->pipe_state &= ~PIPE_DIRECTW;
 				wakeup(rpipe);
 			}
 #endif
 		} else {
 			/*
 			 * detect EOF condition
 			 * read returns 0 on EOF, no need to set error
 			 */
 			if (rpipe->pipe_state & PIPE_EOF)
 				break;
 
 			/*
 			 * If the "write-side" has been blocked, wake it up now.
 			 */
 			if (rpipe->pipe_state & PIPE_WANTW) {
 				rpipe->pipe_state &= ~PIPE_WANTW;
 				wakeup(rpipe);
 			}
 
 			/*
 			 * Break if some data was read.
 			 */
 			if (nread > 0)
 				break;
 
 			/*
 			 * Unlock the pipe buffer for our remaining processing.
 			 * We will either break out with an error or we will
 			 * sleep and relock to loop.
 			 */
 			pipeunlock(rpipe);
 
 			/*
 			 * Handle non-blocking mode operation or
 			 * wait for more data.
 			 */
 			if (fp->f_flag & FNONBLOCK) {
 				error = EAGAIN;
 			} else {
 				rpipe->pipe_state |= PIPE_WANTR;
 				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
 				    PRIBIO | PCATCH,
 				    "piperd", 0)) == 0)
 					error = pipelock(rpipe, 1);
 			}
 			if (error)
 				goto unlocked_error;
 		}
 	}
 #ifdef MAC
 locked_error:
 #endif
 	pipeunlock(rpipe);
 
 	/* XXX: should probably do this before getting any locks. */
 	if (error == 0)
 		vfs_timestamp(&rpipe->pipe_atime);
 unlocked_error:
 	--rpipe->pipe_busy;
 
 	/*
 	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
 	 */
 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
 		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
 		wakeup(rpipe);
 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
 		/*
 		 * Handle write blocking hysteresis.
 		 */
 		if (rpipe->pipe_state & PIPE_WANTW) {
 			rpipe->pipe_state &= ~PIPE_WANTW;
 			wakeup(rpipe);
 		}
 	}
 
 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
 		pipeselwakeup(rpipe);
 
 	PIPE_UNLOCK(rpipe);
 	return (error);
 }
 
 #ifndef PIPE_NODIRECT
 /*
  * Map the sending processes' buffer into kernel space and wire it.
  * This is similar to a physical write operation.
  */
 static int
 pipe_build_write_buffer(wpipe, uio)
 	struct pipe *wpipe;
 	struct uio *uio;
 {
 	pmap_t pmap;
 	u_int size;
 	int i, j;
 	vm_offset_t addr, endaddr;
 
 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
 	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
 		("Clone attempt on non-direct write pipe!"));
 
 	size = (u_int) uio->uio_iov->iov_len;
 	if (size > wpipe->pipe_buffer.size)
 		size = wpipe->pipe_buffer.size;
 
 	pmap = vmspace_pmap(curproc->p_vmspace);
 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
 		/*
 		 * vm_fault_quick() can sleep.  Consequently,
 		 * vm_page_lock_queue() and vm_page_unlock_queue()
 		 * should not be performed outside of this loop.
 		 */
 	race:
 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
 			vm_page_lock_queues();
 			for (j = 0; j < i; j++)
 				vm_page_unhold(wpipe->pipe_map.ms[j]);
 			vm_page_unlock_queues();
 			return (EFAULT);
 		}
 		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
 		    VM_PROT_READ);
 		if (wpipe->pipe_map.ms[i] == NULL)
 			goto race;
 	}
 
 /*
  * set up the control block
  */
 	wpipe->pipe_map.npages = i;
 	wpipe->pipe_map.pos =
 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
 	wpipe->pipe_map.cnt = size;
 
 /*
  * and update the uio data
  */
 
 	uio->uio_iov->iov_len -= size;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
 	if (uio->uio_iov->iov_len == 0)
 		uio->uio_iov++;
 	uio->uio_resid -= size;
 	uio->uio_offset += size;
 	return (0);
 }
 
 /*
  * unmap and unwire the process buffer
  */
 static void
 pipe_destroy_write_buffer(wpipe)
 	struct pipe *wpipe;
 {
 	int i;
 
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	vm_page_lock_queues();
 	for (i = 0; i < wpipe->pipe_map.npages; i++) {
 		vm_page_unhold(wpipe->pipe_map.ms[i]);
 	}
 	vm_page_unlock_queues();
 	wpipe->pipe_map.npages = 0;
 }
 
 /*
  * In the case of a signal, the writing process might go away.  This
  * code copies the data into the circular buffer so that the source
  * pages can be freed without loss of data.
  */
 static void
 pipe_clone_write_buffer(wpipe)
 	struct pipe *wpipe;
 {
 	struct uio uio;
 	struct iovec iov;
 	int size;
 	int pos;
 
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	size = wpipe->pipe_map.cnt;
 	pos = wpipe->pipe_map.pos;
 
 	wpipe->pipe_buffer.in = size;
 	wpipe->pipe_buffer.out = 0;
 	wpipe->pipe_buffer.cnt = size;
 	wpipe->pipe_state &= ~PIPE_DIRECTW;
 
 	PIPE_UNLOCK(wpipe);
 	iov.iov_base = wpipe->pipe_buffer.buffer;
 	iov.iov_len = size;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = size;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = curthread;
 	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
 	PIPE_LOCK(wpipe);
 	pipe_destroy_write_buffer(wpipe);
 }
 
 /*
  * This implements the pipe buffer write mechanism.  Note that only
  * a direct write OR a normal pipe write can be pending at any given time.
  * If there are any characters in the pipe buffer, the direct write will
  * be deferred until the receiving process grabs all of the bytes from
  * the pipe buffer.  Then the direct mapping write is set-up.
  */
 static int
 pipe_direct_write(wpipe, uio)
 	struct pipe *wpipe;
 	struct uio *uio;
 {
 	int error;
 
 retry:
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	error = pipelock(wpipe, 1);
 	if (wpipe->pipe_state & PIPE_EOF)
 		error = EPIPE;
 	if (error) {
 		pipeunlock(wpipe);
 		goto error1;
 	}
 	while (wpipe->pipe_state & PIPE_DIRECTW) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 		wpipe->pipe_state |= PIPE_WANTW;
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe),
 		    PRIBIO | PCATCH, "pipdww", 0);
 		if (error)
 			goto error1;
 		else
 			goto retry;
 	}
 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
 	if (wpipe->pipe_buffer.cnt > 0) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 		wpipe->pipe_state |= PIPE_WANTW;
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe),
 		    PRIBIO | PCATCH, "pipdwc", 0);
 		if (error)
 			goto error1;
 		else
 			goto retry;
 	}
 
 	wpipe->pipe_state |= PIPE_DIRECTW;
 
 	PIPE_UNLOCK(wpipe);
 	error = pipe_build_write_buffer(wpipe, uio);
 	PIPE_LOCK(wpipe);
 	if (error) {
 		wpipe->pipe_state &= ~PIPE_DIRECTW;
 		pipeunlock(wpipe);
 		goto error1;
 	}
 
 	error = 0;
 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
 		if (wpipe->pipe_state & PIPE_EOF) {
 			pipe_destroy_write_buffer(wpipe);
 			pipeselwakeup(wpipe);
 			pipeunlock(wpipe);
 			error = EPIPE;
 			goto error1;
 		}
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 		pipeselwakeup(wpipe);
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
 		    "pipdwt", 0);
 		pipelock(wpipe, 0);
 	}
 
 	if (wpipe->pipe_state & PIPE_EOF)
 		error = EPIPE;
 	if (wpipe->pipe_state & PIPE_DIRECTW) {
 		/*
 		 * this bit of trickery substitutes a kernel buffer for
 		 * the process that might be going away.
 		 */
 		pipe_clone_write_buffer(wpipe);
 	} else {
 		pipe_destroy_write_buffer(wpipe);
 	}
 	pipeunlock(wpipe);
 	return (error);
 
 error1:
 	wakeup(wpipe);
 	return (error);
 }
 #endif
 
 static int
 pipe_write(fp, uio, active_cred, flags, td)
 	struct file *fp;
 	struct uio *uio;
 	struct ucred *active_cred;
 	struct thread *td;
 	int flags;
 {
 	int error = 0;
 	int desiredsize, orig_resid;
 	struct pipe *wpipe, *rpipe;
 
 	rpipe = fp->f_data;
 	wpipe = rpipe->pipe_peer;
 
 	PIPE_LOCK(rpipe);
 	error = pipelock(wpipe, 1);
 	if (error) {
 		PIPE_UNLOCK(rpipe);
 		return (error);
 	}
 	/*
 	 * detect loss of pipe read side, issue SIGPIPE if lost.
 	 */
 	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
 		pipeunlock(wpipe);
 		PIPE_UNLOCK(rpipe);
 		return (EPIPE);
 	}
 #ifdef MAC
 	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
 	if (error) {
 		pipeunlock(wpipe);
 		PIPE_UNLOCK(rpipe);
 		return (error);
 	}
 #endif
 	++wpipe->pipe_busy;
 
 	/* Choose a larger size if it's advantageous */
 	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
 	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
 		if (piperesizeallowed != 1)
 			break;
 		if (amountpipekva > maxpipekva / 2)
 			break;
 		if (desiredsize == BIG_PIPE_SIZE)
 			break;
 		desiredsize = desiredsize * 2;
 	}
 
 	/* Choose a smaller size if we're in a OOM situation */
 	if ((amountpipekva > (3 * maxpipekva) / 4) &&
 		(wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
 		(wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
 		(piperesizeallowed == 1))
 		desiredsize = SMALL_PIPE_SIZE;
 
 	/* Resize if the above determined that a new size was necessary */
 	if ((desiredsize != wpipe->pipe_buffer.size) &&
 		((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
 		PIPE_UNLOCK(wpipe);
 		pipespace(wpipe, desiredsize);
 		PIPE_LOCK(wpipe);
 	}
 	if (wpipe->pipe_buffer.size == 0) {
 		/*
 		 * This can only happen for reverse direction use of pipes
 		 * in a complete OOM situation.
 		 */
 		error = ENOMEM;
 		--wpipe->pipe_busy;
 		pipeunlock(wpipe);
 		PIPE_UNLOCK(wpipe);
 		return (error);
 	}
 
 	pipeunlock(wpipe);
 
 	orig_resid = uio->uio_resid;
 
 	while (uio->uio_resid) {
 		int space;
 
 		pipelock(wpipe, 0);
 		if (wpipe->pipe_state & PIPE_EOF) {
 			pipeunlock(wpipe);
 			error = EPIPE;
 			break;
 		}
 #ifndef PIPE_NODIRECT
 		/*
 		 * If the transfer is large, we can gain performance if
 		 * we do process-to-process copies directly.
 		 * If the write is non-blocking, we don't use the
 		 * direct write mechanism.
 		 *
 		 * The direct write mechanism will detect the reader going
 		 * away on us.
 		 */
 		if (uio->uio_segflg == UIO_USERSPACE &&
 		    uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
 		    wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
 		    (fp->f_flag & FNONBLOCK) == 0) {
 			pipeunlock(wpipe);
 			error = pipe_direct_write(wpipe, uio);
 			if (error)
 				break;
 			continue;
 		}
 #endif
 
 		/*
 		 * Pipe buffered writes cannot be coincidental with
 		 * direct writes.  We wait until the currently executing
 		 * direct write is completed before we start filling the
 		 * pipe buffer.  We break out if a signal occurs or the
 		 * reader goes away.
 		 */
 		if (wpipe->pipe_state & PIPE_DIRECTW) {
 			if (wpipe->pipe_state & PIPE_WANTR) {
 				wpipe->pipe_state &= ~PIPE_WANTR;
 				wakeup(wpipe);
 			}
 			pipeunlock(wpipe);
 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
 			    "pipbww", 0);
 			if (error)
 				break;
 			else
 				continue;
 		}
 
 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
 
 		/* Writes of size <= PIPE_BUF must be atomic. */
 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
 			space = 0;
 
 		if (space > 0) {
 			int size;	/* Transfer size */
 			int segsize;	/* first segment to transfer */
 
 			/*
 			 * Transfer size is minimum of uio transfer
 			 * and free space in pipe buffer.
 			 */
 			if (space > uio->uio_resid)
 				size = uio->uio_resid;
 			else
 				size = space;
 			/*
 			 * First segment to transfer is minimum of
 			 * transfer size and contiguous space in
 			 * pipe buffer.  If first segment to transfer
 			 * is less than the transfer size, we've got
 			 * a wraparound in the buffer.
 			 */
 			segsize = wpipe->pipe_buffer.size -
 				wpipe->pipe_buffer.in;
 			if (segsize > size)
 				segsize = size;
 
 			/* Transfer first segment */
 
 			PIPE_UNLOCK(rpipe);
 			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
 					segsize, uio);
 			PIPE_LOCK(rpipe);
 
 			if (error == 0 && segsize < size) {
 				KASSERT(wpipe->pipe_buffer.in + segsize ==
 					wpipe->pipe_buffer.size,
 					("Pipe buffer wraparound disappeared"));
 				/*
 				 * Transfer remaining part now, to
 				 * support atomic writes.  Wraparound
 				 * happened.
 				 */
 
 				PIPE_UNLOCK(rpipe);
 				error = uiomove(
 				    &wpipe->pipe_buffer.buffer[0],
 				    size - segsize, uio);
 				PIPE_LOCK(rpipe);
 			}
 			if (error == 0) {
 				wpipe->pipe_buffer.in += size;
 				if (wpipe->pipe_buffer.in >=
 				    wpipe->pipe_buffer.size) {
 					KASSERT(wpipe->pipe_buffer.in ==
 						size - segsize +
 						wpipe->pipe_buffer.size,
 						("Expected wraparound bad"));
 					wpipe->pipe_buffer.in = size - segsize;
 				}
 
 				wpipe->pipe_buffer.cnt += size;
 				KASSERT(wpipe->pipe_buffer.cnt <=
 					wpipe->pipe_buffer.size,
 					("Pipe buffer overflow"));
 			}
 			pipeunlock(wpipe);
 			if (error != 0)
 				break;
 		} else {
 			/*
 			 * If the "read-side" has been blocked, wake it up now.
 			 */
 			if (wpipe->pipe_state & PIPE_WANTR) {
 				wpipe->pipe_state &= ~PIPE_WANTR;
 				wakeup(wpipe);
 			}
 
 			/*
 			 * don't block on non-blocking I/O
 			 */
 			if (fp->f_flag & FNONBLOCK) {
 				error = EAGAIN;
 				pipeunlock(wpipe);
 				break;
 			}
 
 			/*
 			 * We have no more space and have something to offer,
 			 * wake up select/poll.
 			 */
 			pipeselwakeup(wpipe);
 
 			wpipe->pipe_state |= PIPE_WANTW;
 			pipeunlock(wpipe);
 			error = msleep(wpipe, PIPE_MTX(rpipe),
 			    PRIBIO | PCATCH, "pipewr", 0);
 			if (error != 0)
 				break;
 		}
 	}
 
 	pipelock(wpipe, 0);
 	--wpipe->pipe_busy;
 
 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
 		wakeup(wpipe);
 	} else if (wpipe->pipe_buffer.cnt > 0) {
 		/*
 		 * If we have put any characters in the buffer, we wake up
 		 * the reader.
 		 */
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 	}
 
 	/*
 	 * Don't return EPIPE if I/O was successful
 	 */
 	if ((wpipe->pipe_buffer.cnt == 0) &&
 	    (uio->uio_resid == 0) &&
 	    (error == EPIPE)) {
 		error = 0;
 	}
 
 	if (error == 0)
 		vfs_timestamp(&wpipe->pipe_mtime);
 
 	/*
 	 * We have something to offer,
 	 * wake up select/poll.
 	 */
 	if (wpipe->pipe_buffer.cnt)
 		pipeselwakeup(wpipe);
 
 	pipeunlock(wpipe);
 	PIPE_UNLOCK(rpipe);
 	return (error);
 }
 
 /*
  * we implement a very minimal set of ioctls for compatibility with sockets.
  */
 static int
 pipe_ioctl(fp, cmd, data, active_cred, td)
 	struct file *fp;
 	u_long cmd;
 	void *data;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct pipe *mpipe = fp->f_data;
 	int error;
 
 	PIPE_LOCK(mpipe);
 
 #ifdef MAC
 	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
 	if (error) {
 		PIPE_UNLOCK(mpipe);
 		return (error);
 	}
 #endif
 
 	error = 0;
 	switch (cmd) {
 
 	case FIONBIO:
 		break;
 
 	case FIOASYNC:
 		if (*(int *)data) {
 			mpipe->pipe_state |= PIPE_ASYNC;
 		} else {
 			mpipe->pipe_state &= ~PIPE_ASYNC;
 		}
 		break;
 
 	case FIONREAD:
 		if (mpipe->pipe_state & PIPE_DIRECTW)
 			*(int *)data = mpipe->pipe_map.cnt;
 		else
 			*(int *)data = mpipe->pipe_buffer.cnt;
 		break;
 
 	case FIOSETOWN:
 		PIPE_UNLOCK(mpipe);
 		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
 		goto out_unlocked;
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&mpipe->pipe_sigio);
 		break;
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		PIPE_UNLOCK(mpipe);
 		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
 		goto out_unlocked;
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)data = -fgetown(&mpipe->pipe_sigio);
 		break;
 
 	default:
 		error = ENOTTY;
 		break;
 	}
 	PIPE_UNLOCK(mpipe);
 out_unlocked:
 	return (error);
 }
 
 static int
 pipe_poll(fp, events, active_cred, td)
 	struct file *fp;
 	int events;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct pipe *rpipe = fp->f_data;
 	struct pipe *wpipe;
 	int revents = 0;
 #ifdef MAC
 	int error;
 #endif
 
 	wpipe = rpipe->pipe_peer;
 	PIPE_LOCK(rpipe);
 #ifdef MAC
 	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
 	if (error)
 		goto locked_error;
 #endif
 	if (events & (POLLIN | POLLRDNORM))
 		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
 		    (rpipe->pipe_buffer.cnt > 0) ||
 		    (rpipe->pipe_state & PIPE_EOF))
 			revents |= events & (POLLIN | POLLRDNORM);
 
 	if (events & (POLLOUT | POLLWRNORM))
 		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
 		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
 		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
 			revents |= events & (POLLOUT | POLLWRNORM);
 
 	if ((rpipe->pipe_state & PIPE_EOF) ||
 	    (!wpipe->pipe_present) ||
 	    (wpipe->pipe_state & PIPE_EOF))
 		revents |= POLLHUP;
 
 	if (revents == 0) {
 		if (events & (POLLIN | POLLRDNORM)) {
 			selrecord(td, &rpipe->pipe_sel);
 			rpipe->pipe_state |= PIPE_SEL;
 		}
 
 		if (events & (POLLOUT | POLLWRNORM)) {
 			selrecord(td, &wpipe->pipe_sel);
 			wpipe->pipe_state |= PIPE_SEL;
 		}
 	}
 #ifdef MAC
 locked_error:
 #endif
 	PIPE_UNLOCK(rpipe);
 
 	return (revents);
 }
 
 /*
  * We shouldn't need locks here as we're doing a read and this should
  * be a natural race.
  */
 static int
 pipe_stat(fp, ub, active_cred, td)
 	struct file *fp;
 	struct stat *ub;
 	struct ucred *active_cred;
 	struct thread *td;
 {
 	struct pipe *pipe = fp->f_data;
 #ifdef MAC
 	int error;
 
 	PIPE_LOCK(pipe);
 	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
 	PIPE_UNLOCK(pipe);
 	if (error)
 		return (error);
 #endif
 	bzero(ub, sizeof(*ub));
 	ub->st_mode = S_IFIFO;
 	ub->st_blksize = PAGE_SIZE;
 	if (pipe->pipe_state & PIPE_DIRECTW)
 		ub->st_size = pipe->pipe_map.cnt;
 	else
 		ub->st_size = pipe->pipe_buffer.cnt;
 	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
 	ub->st_atimespec = pipe->pipe_atime;
 	ub->st_mtimespec = pipe->pipe_mtime;
 	ub->st_ctimespec = pipe->pipe_ctime;
 	ub->st_uid = fp->f_cred->cr_uid;
 	ub->st_gid = fp->f_cred->cr_gid;
 	/*
 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
 	 * XXX (st_dev, st_ino) should be unique.
 	 */
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 pipe_close(fp, td)
 	struct file *fp;
 	struct thread *td;
 {
 	struct pipe *cpipe = fp->f_data;
 
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	funsetown(&cpipe->pipe_sigio);
 	pipeclose(cpipe);
 	return (0);
 }
 
 static void
 pipe_free_kmem(cpipe)
 	struct pipe *cpipe;
 {
 
 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
 	    ("pipe_free_kmem: pipe mutex locked"));
 
 	if (cpipe->pipe_buffer.buffer != NULL) {
 		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
 		vm_map_remove(pipe_map,
 		    (vm_offset_t)cpipe->pipe_buffer.buffer,
 		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
 		cpipe->pipe_buffer.buffer = NULL;
 	}
 #ifndef PIPE_NODIRECT
 	{
 		cpipe->pipe_map.cnt = 0;
 		cpipe->pipe_map.pos = 0;
 		cpipe->pipe_map.npages = 0;
 	}
 #endif
 }
 
 /*
  * shutdown the pipe
  */
 static void
 pipeclose(cpipe)
 	struct pipe *cpipe;
 {
 	struct pipepair *pp;
 	struct pipe *ppipe;
 
 	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
 
 	PIPE_LOCK(cpipe);
 	pipelock(cpipe, 0);
 	pp = cpipe->pipe_pair;
 
 	pipeselwakeup(cpipe);
 
 	/*
 	 * If the other side is blocked, wake it up saying that
 	 * we want to close it down.
 	 */
 	cpipe->pipe_state |= PIPE_EOF;
 	while (cpipe->pipe_busy) {
 		wakeup(cpipe);
 		cpipe->pipe_state |= PIPE_WANT;
 		pipeunlock(cpipe);
 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
 		pipelock(cpipe, 0);
 	}
 
 
 	/*
 	 * Disconnect from peer, if any.
 	 */
 	ppipe = cpipe->pipe_peer;
 	if (ppipe->pipe_present != 0) {
 		pipeselwakeup(ppipe);
 
 		ppipe->pipe_state |= PIPE_EOF;
 		wakeup(ppipe);
 		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
 	}
 
 	/*
 	 * Mark this endpoint as free.  Release kmem resources.  We
 	 * don't mark this endpoint as unused until we've finished
 	 * doing that, or the pipe might disappear out from under
 	 * us.
 	 */
 	PIPE_UNLOCK(cpipe);
 	pipe_free_kmem(cpipe);
 	PIPE_LOCK(cpipe);
 	cpipe->pipe_present = 0;
 	pipeunlock(cpipe);
 	knlist_clear(&cpipe->pipe_sel.si_note, 1);
 	knlist_destroy(&cpipe->pipe_sel.si_note);
 
 	/*
 	 * If both endpoints are now closed, release the memory for the
 	 * pipe pair.  If not, unlock.
 	 */
 	if (ppipe->pipe_present == 0) {
 		PIPE_UNLOCK(cpipe);
 #ifdef MAC
 		mac_destroy_pipe(pp);
 #endif
 		uma_zfree(pipe_zone, cpipe->pipe_pair);
 	} else
 		PIPE_UNLOCK(cpipe);
 }
 
 /*ARGSUSED*/
 static int
 pipe_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct pipe *cpipe;
 
 	cpipe = kn->kn_fp->f_data;
 	PIPE_LOCK(cpipe);
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &pipe_rfiltops;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &pipe_wfiltops;
 		if (!cpipe->pipe_peer->pipe_present) {
 			/* other end of pipe has been closed */
 			PIPE_UNLOCK(cpipe);
 			return (EPIPE);
 		}
 		cpipe = cpipe->pipe_peer;
 		break;
 	default:
 		PIPE_UNLOCK(cpipe);
 		return (EINVAL);
 	}
 
 	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
 	PIPE_UNLOCK(cpipe);
 	return (0);
 }
 
 static void
 filt_pipedetach(struct knote *kn)
 {
 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
 
 	PIPE_LOCK(cpipe);
 	if (kn->kn_filter == EVFILT_WRITE) {
 		if (!cpipe->pipe_peer->pipe_present) {
 			PIPE_UNLOCK(cpipe);
 			return;
 		}
 		cpipe = cpipe->pipe_peer;
 	}
 	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
 	PIPE_UNLOCK(cpipe);
 }
 
 /*ARGSUSED*/
 static int
 filt_piperead(struct knote *kn, long hint)
 {
 	struct pipe *rpipe = kn->kn_fp->f_data;
 	struct pipe *wpipe = rpipe->pipe_peer;
 	int ret;
 
 	PIPE_LOCK(rpipe);
 	kn->kn_data = rpipe->pipe_buffer.cnt;
 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
 		kn->kn_data = rpipe->pipe_map.cnt;
 
 	if ((rpipe->pipe_state & PIPE_EOF) ||
 	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
 		kn->kn_flags |= EV_EOF;
 		PIPE_UNLOCK(rpipe);
 		return (1);
 	}
 	ret = kn->kn_data > 0;
 	PIPE_UNLOCK(rpipe);
 	return ret;
 }
 
 /*ARGSUSED*/
 static int
 filt_pipewrite(struct knote *kn, long hint)
 {
 	struct pipe *rpipe = kn->kn_fp->f_data;
 	struct pipe *wpipe = rpipe->pipe_peer;
 
 	PIPE_LOCK(rpipe);
 	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
 		kn->kn_data = 0;
 		kn->kn_flags |= EV_EOF;
 		PIPE_UNLOCK(rpipe);
 		return (1);
 	}
 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
 	if (wpipe->pipe_state & PIPE_DIRECTW)
 		kn->kn_data = 0;
 
 	PIPE_UNLOCK(rpipe);
 	return (kn->kn_data >= PIPE_BUF);
 }
Index: head/sys/kern/sysv_msg.c
===================================================================
--- head/sys/kern/sysv_msg.c	(revision 167231)
+++ head/sys/kern/sysv_msg.c	(revision 167232)
@@ -1,1298 +1,1294 @@
 /*-
  * Implementation of SVID messages
  *
  * Author:  Daniel Boulet
  *
  * Copyright 1993 Daniel Boulet and RTMX Inc.
  *
  * This system call was implemented by Daniel Boulet under contract from RTMX.
  *
  * Redistribution and use in source forms, with and without modification,
  * are permitted provided that this entire comment appears intact.
  *
  * Redistribution in binary form may occur without any restrictions.
  * Obviously, it would be nice if you gave credit where credit is due
  * but requiring it would be too onerous.
  *
  * This software is provided ``AS IS'' without any warranties of any kind.
  */
 /*-
  * Copyright (c) 2003-2005 McAfee, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
  * program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_sysvipc.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/msg.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/jail.h>
 
 #include <security/mac/mac_framework.h>
 
 static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
 
 static void msginit(void);
 static int msgunload(void);
 static int sysvmsg_modload(struct module *, int, void *);
 
 #ifdef MSG_DEBUG
 #define DPRINTF(a)	printf a
 #else
 #define DPRINTF(a)
 #endif
 
 static void msg_freehdr(struct msg *msghdr);
 
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
 static sy_call_t *msgcalls[] = {
 	(sy_call_t *)msgctl, (sy_call_t *)msgget,
 	(sy_call_t *)msgsnd, (sy_call_t *)msgrcv
 };
 
 #ifndef MSGSSZ
 #define MSGSSZ	8		/* Each segment must be 2^N long */
 #endif
 #ifndef MSGSEG
 #define MSGSEG	2048		/* must be less than 32767 */
 #endif
 #define MSGMAX	(MSGSSZ*MSGSEG)
 #ifndef MSGMNB
 #define MSGMNB	2048		/* max # of bytes in a queue */
 #endif
 #ifndef MSGMNI
 #define MSGMNI	40
 #endif
 #ifndef MSGTQL
 #define MSGTQL	40
 #endif
 
 /*
  * Based on the configuration parameters described in an SVR2 (yes, two)
  * config(1m) man page.
  *
  * Each message is broken up and stored in segments that are msgssz bytes
  * long.  For efficiency reasons, this should be a power of two.  Also,
  * it doesn't make sense if it is less than 8 or greater than about 256.
  * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of
  * two between 8 and 1024 inclusive (and panic's if it isn't).
  */
 struct msginfo msginfo = {
                 MSGMAX,         /* max chars in a message */
                 MSGMNI,         /* # of message queue identifiers */
                 MSGMNB,         /* max chars in a queue */
                 MSGTQL,         /* max messages in system */
                 MSGSSZ,         /* size of a message segment */
                 		/* (must be small power of 2 greater than 4) */
                 MSGSEG          /* number of message segments */
 };
 
 /*
  * macros to convert between msqid_ds's and msqid's.
  * (specific to this implementation)
  */
 #define MSQID(ix,ds)	((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000))
 #define MSQID_IX(id)	((id) & 0xffff)
 #define MSQID_SEQ(id)	(((id) >> 16) & 0xffff)
 
 /*
  * The rest of this file is specific to this particular implementation.
  */
 
 struct msgmap {
 	short	next;		/* next segment in buffer */
     				/* -1 -> available */
     				/* 0..(MSGSEG-1) -> index of next segment */
 };
 
 #define MSG_LOCKED	01000	/* Is this msqid_ds locked? */
 
 static int nfree_msgmaps;	/* # of free map entries */
 static short free_msgmaps;	/* head of linked list of free map entries */
 static struct msg *free_msghdrs;/* list of free msg headers */
 static char *msgpool;		/* MSGMAX byte long msg buffer pool */
 static struct msgmap *msgmaps;	/* MSGSEG msgmap structures */
 static struct msg *msghdrs;	/* MSGTQL msg headers */
 static struct msqid_kernel *msqids;	/* MSGMNI msqid_kernel struct's */
 static struct mtx msq_mtx;	/* global mutex for message queues. */
 
 static void
 msginit()
 {
 	register int i;
 
 	TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg);
 	TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz);
 	msginfo.msgmax = msginfo.msgseg * msginfo.msgssz;
 	TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni);
 	TUNABLE_INT_FETCH("kern.ipc.msgmnb", &msginfo.msgmnb);
 	TUNABLE_INT_FETCH("kern.ipc.msgtql", &msginfo.msgtql);
 
 	msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK);
 	if (msgpool == NULL)
 		panic("msgpool is NULL");
 	msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK);
 	if (msgmaps == NULL)
 		panic("msgmaps is NULL");
 	msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK);
 	if (msghdrs == NULL)
 		panic("msghdrs is NULL");
 	msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG,
 	    M_WAITOK);
 	if (msqids == NULL)
 		panic("msqids is NULL");
 
 	/*
 	 * msginfo.msgssz should be a power of two for efficiency reasons.
 	 * It is also pretty silly if msginfo.msgssz is less than 8
 	 * or greater than about 256 so ...
 	 */
 
 	i = 8;
 	while (i < 1024 && i != msginfo.msgssz)
 		i <<= 1;
     	if (i != msginfo.msgssz) {
 		DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
 		    msginfo.msgssz));
 		panic("msginfo.msgssz not a small power of 2");
 	}
 
 	if (msginfo.msgseg > 32767) {
 		DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg));
 		panic("msginfo.msgseg > 32767");
 	}
 
 	if (msgmaps == NULL)
 		panic("msgmaps is NULL");
 
 	for (i = 0; i < msginfo.msgseg; i++) {
 		if (i > 0)
 			msgmaps[i-1].next = i;
 		msgmaps[i].next = -1;	/* implies entry is available */
 	}
 	free_msgmaps = 0;
 	nfree_msgmaps = msginfo.msgseg;
 
 	if (msghdrs == NULL)
 		panic("msghdrs is NULL");
 
 	for (i = 0; i < msginfo.msgtql; i++) {
 		msghdrs[i].msg_type = 0;
 		if (i > 0)
 			msghdrs[i-1].msg_next = &msghdrs[i];
 		msghdrs[i].msg_next = NULL;
 #ifdef MAC
 		mac_init_sysv_msgmsg(&msghdrs[i]);
 #endif
     	}
 	free_msghdrs = &msghdrs[0];
 
 	if (msqids == NULL)
 		panic("msqids is NULL");
 
 	for (i = 0; i < msginfo.msgmni; i++) {
 		msqids[i].u.msg_qbytes = 0;	/* implies entry is available */
 		msqids[i].u.msg_perm.seq = 0;	/* reset to a known value */
 		msqids[i].u.msg_perm.mode = 0;
 #ifdef MAC
 		mac_init_sysv_msgqueue(&msqids[i]);
 #endif
 	}
 	mtx_init(&msq_mtx, "msq", NULL, MTX_DEF);
 }
 
 static int
 msgunload()
 {
 	struct msqid_kernel *msqkptr;
 	int msqid;
 #ifdef MAC
 	int i;
 #endif
 
 	for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
 		/*
 		 * Look for an unallocated and unlocked msqid_ds.
 		 * msqid_ds's can be locked by msgsnd or msgrcv while
 		 * they are copying the message in/out.  We can't
 		 * re-use the entry until they release it.
 		 */
 		msqkptr = &msqids[msqid];
 		if (msqkptr->u.msg_qbytes != 0 ||
 		    (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
 			break;
 	}
 	if (msqid != msginfo.msgmni)
 		return (EBUSY);
 
 #ifdef MAC
 	for (i = 0; i < msginfo.msgtql; i++)
 		mac_destroy_sysv_msgmsg(&msghdrs[i]);
 	for (msqid = 0; msqid < msginfo.msgmni; msqid++)
 		mac_destroy_sysv_msgqueue(&msqids[msqid]);
 #endif
 	free(msgpool, M_MSG);
 	free(msgmaps, M_MSG);
 	free(msghdrs, M_MSG);
 	free(msqids, M_MSG);
 	mtx_destroy(&msq_mtx);
 	return (0);
 }
 
 
 static int
 sysvmsg_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		msginit();
 		break;
 	case MOD_UNLOAD:
 		error = msgunload();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t sysvmsg_mod = {
 	"sysvmsg",
 	&sysvmsg_modload,
 	NULL
 };
 
 SYSCALL_MODULE_HELPER(msgsys);
 SYSCALL_MODULE_HELPER(msgctl);
 SYSCALL_MODULE_HELPER(msgget);
 SYSCALL_MODULE_HELPER(msgsnd);
 SYSCALL_MODULE_HELPER(msgrcv);
 
 DECLARE_MODULE(sysvmsg, sysvmsg_mod,
 	SI_SUB_SYSV_MSG, SI_ORDER_FIRST);
 MODULE_VERSION(sysvmsg, 1);
 
 /*
  * Entry point for all MSG calls.
  */
 int
 msgsys(td, uap)
 	struct thread *td;
 	/* XXX actually varargs. */
 	struct msgsys_args /* {
 		int	which;
 		int	a2;
 		int	a3;
 		int	a4;
 		int	a5;
 		int	a6;
 	} */ *uap;
 {
 	int error;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 	if (uap->which < 0 ||
 	    uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0]))
 		return (EINVAL);
 	error = (*msgcalls[uap->which])(td, &uap->a2);
 	return (error);
 }
 
 static void
 msg_freehdr(msghdr)
 	struct msg *msghdr;
 {
 	while (msghdr->msg_ts > 0) {
 		short next;
 		if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
 			panic("msghdr->msg_spot out of range");
 		next = msgmaps[msghdr->msg_spot].next;
 		msgmaps[msghdr->msg_spot].next = free_msgmaps;
 		free_msgmaps = msghdr->msg_spot;
 		nfree_msgmaps++;
 		msghdr->msg_spot = next;
 		if (msghdr->msg_ts >= msginfo.msgssz)
 			msghdr->msg_ts -= msginfo.msgssz;
 		else
 			msghdr->msg_ts = 0;
 	}
 	if (msghdr->msg_spot != -1)
 		panic("msghdr->msg_spot != -1");
 	msghdr->msg_next = free_msghdrs;
 	free_msghdrs = msghdr;
 #ifdef MAC
 	mac_cleanup_sysv_msgmsg(msghdr);
 #endif
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct msgctl_args {
 	int	msqid;
 	int	cmd;
 	struct	msqid_ds *buf;
 };
 #endif
-
 int
 msgctl(td, uap)
 	struct thread *td;
 	register struct msgctl_args *uap;
 {
 	int msqid = uap->msqid;
 	int cmd = uap->cmd;
 	struct msqid_ds msqbuf;
 	int error;
 
 	DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf));
 	if (cmd == IPC_SET &&
 	    (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0)
 		return (error);
 	error = kern_msgctl(td, msqid, cmd, &msqbuf);
 	if (cmd == IPC_STAT && error == 0)
 		error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds));
 	return (error);
 }
 
 int
 kern_msgctl(td, msqid, cmd, msqbuf)
 	struct thread *td;
 	int msqid;
 	int cmd;
 	struct msqid_ds *msqbuf;
 {
 	int rval, error, msqix;
 	register struct msqid_kernel *msqkptr;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	msqix = IPCID_TO_IX(msqid);
 
 	if (msqix < 0 || msqix >= msginfo.msgmni) {
 		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
 		    msginfo.msgmni));
 		return (EINVAL);
 	}
 
 	msqkptr = &msqids[msqix];
 
 	mtx_lock(&msq_mtx);
 	if (msqkptr->u.msg_qbytes == 0) {
 		DPRINTF(("no such msqid\n"));
 		error = EINVAL;
 		goto done2;
 	}
 	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 		DPRINTF(("wrong sequence number\n"));
 		error = EINVAL;
 		goto done2;
 	}
 #ifdef MAC
 	error = mac_check_sysv_msqctl(td->td_ucred, msqkptr, cmd);
 	if (error != 0)
 		goto done2;
 #endif
 
 	error = 0;
 	rval = 0;
 
 	switch (cmd) {
 
 	case IPC_RMID:
 	{
 		struct msg *msghdr;
 		if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
 			goto done2;
 
 #ifdef MAC
 		/*
 		 * Check that the thread has MAC access permissions to
 		 * individual msghdrs.  Note: We need to do this in a
 		 * separate loop because the actual loop alters the
 		 * msq/msghdr info as it progresses, and there is no going
 		 * back if half the way through we discover that the
 		 * thread cannot free a certain msghdr.  The msq will get
 		 * into an inconsistent state.
 		 */
 		for (msghdr = msqkptr->u.msg_first; msghdr != NULL;
 		    msghdr = msghdr->msg_next) {
 			error = mac_check_sysv_msgrmid(td->td_ucred, msghdr);
 			if (error != 0)
 				goto done2;
 		}
 #endif
 
 		/* Free the message headers */
 		msghdr = msqkptr->u.msg_first;
 		while (msghdr != NULL) {
 			struct msg *msghdr_tmp;
 
 			/* Free the segments of each message */
 			msqkptr->u.msg_cbytes -= msghdr->msg_ts;
 			msqkptr->u.msg_qnum--;
 			msghdr_tmp = msghdr;
 			msghdr = msghdr->msg_next;
 			msg_freehdr(msghdr_tmp);
 		}
 
 		if (msqkptr->u.msg_cbytes != 0)
 			panic("msg_cbytes is screwed up");
 		if (msqkptr->u.msg_qnum != 0)
 			panic("msg_qnum is screwed up");
 
 		msqkptr->u.msg_qbytes = 0;	/* Mark it as free */
 
 #ifdef MAC
 		mac_cleanup_sysv_msgqueue(msqkptr);
 #endif
 
 		wakeup(msqkptr);
 	}
 
 		break;
 
 	case IPC_SET:
 		if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
 			goto done2;
 		if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) {
 			error = priv_check_cred(td->td_ucred,
 			    PRIV_IPC_MSGSIZE, SUSER_ALLOWJAIL);
 			if (error)
 				goto done2;
 		}
 		if (msqbuf->msg_qbytes > msginfo.msgmnb) {
 			DPRINTF(("can't increase msg_qbytes beyond %d"
 			    "(truncating)\n", msginfo.msgmnb));
 			msqbuf->msg_qbytes = msginfo.msgmnb;	/* silently restrict qbytes to system limit */
 		}
 		if (msqbuf->msg_qbytes == 0) {
 			DPRINTF(("can't reduce msg_qbytes to 0\n"));
 			error = EINVAL;		/* non-standard errno! */
 			goto done2;
 		}
 		msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid;	/* change the owner */
 		msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid;	/* change the owner */
 		msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) |
 		    (msqbuf->msg_perm.mode & 0777);
 		msqkptr->u.msg_qbytes = msqbuf->msg_qbytes;
 		msqkptr->u.msg_ctime = time_second;
 		break;
 
 	case IPC_STAT:
 		if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
 			DPRINTF(("requester doesn't have read access\n"));
 			goto done2;
 		}
 		*msqbuf = msqkptr->u;
 		break;
 
 	default:
 		DPRINTF(("invalid command %d\n", cmd));
 		error = EINVAL;
 		goto done2;
 	}
 
 	if (error == 0)
 		td->td_retval[0] = rval;
 done2:
 	mtx_unlock(&msq_mtx);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct msgget_args {
 	key_t	key;
 	int	msgflg;
 };
 #endif
-
 int
 msgget(td, uap)
 	struct thread *td;
 	register struct msgget_args *uap;
 {
 	int msqid, error = 0;
 	int key = uap->key;
 	int msgflg = uap->msgflg;
 	struct ucred *cred = td->td_ucred;
 	register struct msqid_kernel *msqkptr = NULL;
 
 	DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg));
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	mtx_lock(&msq_mtx);
 	if (key != IPC_PRIVATE) {
 		for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
 			msqkptr = &msqids[msqid];
 			if (msqkptr->u.msg_qbytes != 0 &&
 			    msqkptr->u.msg_perm.key == key)
 				break;
 		}
 		if (msqid < msginfo.msgmni) {
 			DPRINTF(("found public key\n"));
 			if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
 				DPRINTF(("not exclusive\n"));
 				error = EEXIST;
 				goto done2;
 			}
 			if ((error = ipcperm(td, &msqkptr->u.msg_perm,
 			    msgflg & 0700))) {
 				DPRINTF(("requester doesn't have 0%o access\n",
 				    msgflg & 0700));
 				goto done2;
 			}
 #ifdef MAC
 			error = mac_check_sysv_msqget(cred, msqkptr);
 			if (error != 0)
 				goto done2;
 #endif
 			goto found;
 		}
 	}
 
 	DPRINTF(("need to allocate the msqid_ds\n"));
 	if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
 		for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
 			/*
 			 * Look for an unallocated and unlocked msqid_ds.
 			 * msqid_ds's can be locked by msgsnd or msgrcv while
 			 * they are copying the message in/out.  We can't
 			 * re-use the entry until they release it.
 			 */
 			msqkptr = &msqids[msqid];
 			if (msqkptr->u.msg_qbytes == 0 &&
 			    (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0)
 				break;
 		}
 		if (msqid == msginfo.msgmni) {
 			DPRINTF(("no more msqid_ds's available\n"));
 			error = ENOSPC;
 			goto done2;
 		}
 		DPRINTF(("msqid %d is available\n", msqid));
 		msqkptr->u.msg_perm.key = key;
 		msqkptr->u.msg_perm.cuid = cred->cr_uid;
 		msqkptr->u.msg_perm.uid = cred->cr_uid;
 		msqkptr->u.msg_perm.cgid = cred->cr_gid;
 		msqkptr->u.msg_perm.gid = cred->cr_gid;
 		msqkptr->u.msg_perm.mode = (msgflg & 0777);
 		/* Make sure that the returned msqid is unique */
 		msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff;
 		msqkptr->u.msg_first = NULL;
 		msqkptr->u.msg_last = NULL;
 		msqkptr->u.msg_cbytes = 0;
 		msqkptr->u.msg_qnum = 0;
 		msqkptr->u.msg_qbytes = msginfo.msgmnb;
 		msqkptr->u.msg_lspid = 0;
 		msqkptr->u.msg_lrpid = 0;
 		msqkptr->u.msg_stime = 0;
 		msqkptr->u.msg_rtime = 0;
 		msqkptr->u.msg_ctime = time_second;
 #ifdef MAC
 		mac_create_sysv_msgqueue(cred, msqkptr);
 #endif
 	} else {
 		DPRINTF(("didn't find it and wasn't asked to create it\n"));
 		error = ENOENT;
 		goto done2;
 	}
 
 found:
 	/* Construct the unique msqid */
 	td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm);
 done2:
 	mtx_unlock(&msq_mtx);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct msgsnd_args {
 	int	msqid;
 	const void	*msgp;
 	size_t	msgsz;
 	int	msgflg;
 };
 #endif
-
 int
 kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
 	struct thread *td;
 	int msqid;
 	const void *msgp;	/* XXX msgp is actually mtext. */
 	size_t msgsz;
 	int msgflg;
 	long mtype;
 {
 	int msqix, segs_needed, error = 0;
 	register struct msqid_kernel *msqkptr;
 	register struct msg *msghdr;
 	short next;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	mtx_lock(&msq_mtx);
 	msqix = IPCID_TO_IX(msqid);
 
 	if (msqix < 0 || msqix >= msginfo.msgmni) {
 		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
 		    msginfo.msgmni));
 		error = EINVAL;
 		goto done2;
 	}
 
 	msqkptr = &msqids[msqix];
 	if (msqkptr->u.msg_qbytes == 0) {
 		DPRINTF(("no such message queue id\n"));
 		error = EINVAL;
 		goto done2;
 	}
 	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 		DPRINTF(("wrong sequence number\n"));
 		error = EINVAL;
 		goto done2;
 	}
 
 	if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) {
 		DPRINTF(("requester doesn't have write access\n"));
 		goto done2;
 	}
 
 #ifdef MAC
 	error = mac_check_sysv_msqsnd(td->td_ucred, msqkptr);
 	if (error != 0)
 		goto done2;
 #endif
 
 	segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
 	DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
 	    msginfo.msgssz, segs_needed));
 	for (;;) {
 		int need_more_resources = 0;
 
 		/*
 		 * check msgsz
 		 * (inside this loop in case msg_qbytes changes while we sleep)
 		 */
 
 		if (msgsz > msqkptr->u.msg_qbytes) {
 			DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n"));
 			error = EINVAL;
 			goto done2;
 		}
 
 		if (msqkptr->u.msg_perm.mode & MSG_LOCKED) {
 			DPRINTF(("msqid is locked\n"));
 			need_more_resources = 1;
 		}
 		if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) {
 			DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n"));
 			need_more_resources = 1;
 		}
 		if (segs_needed > nfree_msgmaps) {
 			DPRINTF(("segs_needed > nfree_msgmaps\n"));
 			need_more_resources = 1;
 		}
 		if (free_msghdrs == NULL) {
 			DPRINTF(("no more msghdrs\n"));
 			need_more_resources = 1;
 		}
 
 		if (need_more_resources) {
 			int we_own_it;
 
 			if ((msgflg & IPC_NOWAIT) != 0) {
 				DPRINTF(("need more resources but caller "
 				    "doesn't want to wait\n"));
 				error = EAGAIN;
 				goto done2;
 			}
 
 			if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) {
 				DPRINTF(("we don't own the msqid_ds\n"));
 				we_own_it = 0;
 			} else {
 				/* Force later arrivals to wait for our
 				   request */
 				DPRINTF(("we own the msqid_ds\n"));
 				msqkptr->u.msg_perm.mode |= MSG_LOCKED;
 				we_own_it = 1;
 			}
 			DPRINTF(("msgsnd:  goodnight\n"));
 			error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
 			    "msgsnd", hz);
 			DPRINTF(("msgsnd:  good morning, error=%d\n", error));
 			if (we_own_it)
 				msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 			if (error == EWOULDBLOCK) {
 				DPRINTF(("msgsnd:  timed out\n"));
 				continue;
 			}
 			if (error != 0) {
 				DPRINTF(("msgsnd:  interrupted system call\n"));
 				error = EINTR;
 				goto done2;
 			}
 
 			/*
 			 * Make sure that the msq queue still exists
 			 */
 
 			if (msqkptr->u.msg_qbytes == 0) {
 				DPRINTF(("msqid deleted\n"));
 				error = EIDRM;
 				goto done2;
 			}
 
 		} else {
 			DPRINTF(("got all the resources that we need\n"));
 			break;
 		}
 	}
 
 	/*
 	 * We have the resources that we need.
 	 * Make sure!
 	 */
 
 	if (msqkptr->u.msg_perm.mode & MSG_LOCKED)
 		panic("msg_perm.mode & MSG_LOCKED");
 	if (segs_needed > nfree_msgmaps)
 		panic("segs_needed > nfree_msgmaps");
 	if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes)
 		panic("msgsz + msg_cbytes > msg_qbytes");
 	if (free_msghdrs == NULL)
 		panic("no more msghdrs");
 
 	/*
 	 * Re-lock the msqid_ds in case we page-fault when copying in the
 	 * message
 	 */
 
 	if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
 		panic("msqid_ds is already locked");
 	msqkptr->u.msg_perm.mode |= MSG_LOCKED;
 
 	/*
 	 * Allocate a message header
 	 */
 
 	msghdr = free_msghdrs;
 	free_msghdrs = msghdr->msg_next;
 	msghdr->msg_spot = -1;
 	msghdr->msg_ts = msgsz;
 	msghdr->msg_type = mtype;
 #ifdef MAC
 	/*
 	 * XXXMAC: Should the mac_check_sysv_msgmsq check follow here
 	 * immediately?  Or, should it be checked just before the msg is
 	 * enqueued in the msgq (as it is done now)?
 	 */
 	mac_create_sysv_msgmsg(td->td_ucred, msqkptr, msghdr);
 #endif
 
 	/*
 	 * Allocate space for the message
 	 */
 
 	while (segs_needed > 0) {
 		if (nfree_msgmaps <= 0)
 			panic("not enough msgmaps");
 		if (free_msgmaps == -1)
 			panic("nil free_msgmaps");
 		next = free_msgmaps;
 		if (next <= -1)
 			panic("next too low #1");
 		if (next >= msginfo.msgseg)
 			panic("next out of range #1");
 		DPRINTF(("allocating segment %d to message\n", next));
 		free_msgmaps = msgmaps[next].next;
 		nfree_msgmaps--;
 		msgmaps[next].next = msghdr->msg_spot;
 		msghdr->msg_spot = next;
 		segs_needed--;
 	}
 
 	/*
 	 * Validate the message type
 	 */
 
 	if (msghdr->msg_type < 1) {
 		msg_freehdr(msghdr);
 		msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 		wakeup(msqkptr);
 		DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
 		error = EINVAL;
 		goto done2;
 	}
 
 	/*
 	 * Copy in the message body
 	 */
 
 	next = msghdr->msg_spot;
 	while (msgsz > 0) {
 		size_t tlen;
 		if (msgsz > msginfo.msgssz)
 			tlen = msginfo.msgssz;
 		else
 			tlen = msgsz;
 		if (next <= -1)
 			panic("next too low #2");
 		if (next >= msginfo.msgseg)
 			panic("next out of range #2");
 		mtx_unlock(&msq_mtx);
 		if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz],
 		    tlen)) != 0) {
 			mtx_lock(&msq_mtx);
 			DPRINTF(("error %d copying in message segment\n",
 			    error));
 			msg_freehdr(msghdr);
 			msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 			wakeup(msqkptr);
 			goto done2;
 		}
 		mtx_lock(&msq_mtx);
 		msgsz -= tlen;
 		msgp = (const char *)msgp + tlen;
 		next = msgmaps[next].next;
 	}
 	if (next != -1)
 		panic("didn't use all the msg segments");
 
 	/*
 	 * We've got the message.  Unlock the msqid_ds.
 	 */
 
 	msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 
 	/*
 	 * Make sure that the msqid_ds is still allocated.
 	 */
 
 	if (msqkptr->u.msg_qbytes == 0) {
 		msg_freehdr(msghdr);
 		wakeup(msqkptr);
 		error = EIDRM;
 		goto done2;
 	}
 
 #ifdef MAC
 	/*
 	 * Note: Since the task/thread allocates the msghdr and usually
 	 * primes it with its own MAC label, for a majority of policies, it
 	 * won't be necessary to check whether the msghdr has access
 	 * permissions to the msgq.  The mac_check_sysv_msqsnd check would
 	 * suffice in that case.  However, this hook may be required where
 	 * individual policies derive a non-identical label for the msghdr
 	 * from the current thread label and may want to check the msghdr
 	 * enqueue permissions, along with read/write permissions to the
 	 * msgq.
 	 */
 	error = mac_check_sysv_msgmsq(td->td_ucred, msghdr, msqkptr);
 	if (error != 0) {
 		msg_freehdr(msghdr);
 		wakeup(msqkptr);
 		goto done2;
 	}
 #endif
 
 	/*
 	 * Put the message into the queue
 	 */
 	if (msqkptr->u.msg_first == NULL) {
 		msqkptr->u.msg_first = msghdr;
 		msqkptr->u.msg_last = msghdr;
 	} else {
 		msqkptr->u.msg_last->msg_next = msghdr;
 		msqkptr->u.msg_last = msghdr;
 	}
 	msqkptr->u.msg_last->msg_next = NULL;
 
 	msqkptr->u.msg_cbytes += msghdr->msg_ts;
 	msqkptr->u.msg_qnum++;
 	msqkptr->u.msg_lspid = td->td_proc->p_pid;
 	msqkptr->u.msg_stime = time_second;
 
 	wakeup(msqkptr);
 	td->td_retval[0] = 0;
 done2:
 	mtx_unlock(&msq_mtx);
 	return (error);
 }
 
 int
 msgsnd(td, uap)
 	struct thread *td;
 	register struct msgsnd_args *uap;
 {
 	int error;
 	long mtype;
 
 	DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp,
 	    uap->msgsz, uap->msgflg));
 
 	if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) {
 		DPRINTF(("error %d copying the message type\n", error));
 		return (error);
 	}
 	return (kern_msgsnd(td, uap->msqid,
 	    (const char *)uap->msgp + sizeof(mtype),
 	    uap->msgsz, uap->msgflg, mtype));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct msgrcv_args {
 	int	msqid;
 	void	*msgp;
 	size_t	msgsz;
 	long	msgtyp;
 	int	msgflg;
 };
 #endif
-
 int
 kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype)
 	struct thread *td;
 	int msqid;
 	void *msgp;	/* XXX msgp is actually mtext. */
 	size_t msgsz;
 	long msgtyp;
 	int msgflg;
 	long *mtype;
 {
 	size_t len;
 	register struct msqid_kernel *msqkptr;
 	register struct msg *msghdr;
 	int msqix, error = 0;
 	short next;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	msqix = IPCID_TO_IX(msqid);
 
 	if (msqix < 0 || msqix >= msginfo.msgmni) {
 		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
 		    msginfo.msgmni));
 		return (EINVAL);
 	}
 
 	msqkptr = &msqids[msqix];
 	mtx_lock(&msq_mtx);
 	if (msqkptr->u.msg_qbytes == 0) {
 		DPRINTF(("no such message queue id\n"));
 		error = EINVAL;
 		goto done2;
 	}
 	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 		DPRINTF(("wrong sequence number\n"));
 		error = EINVAL;
 		goto done2;
 	}
 
 	if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
 		DPRINTF(("requester doesn't have read access\n"));
 		goto done2;
 	}
 
 #ifdef MAC
 	error = mac_check_sysv_msqrcv(td->td_ucred, msqkptr);
 	if (error != 0)
 		goto done2;
 #endif
 
 	msghdr = NULL;
 	while (msghdr == NULL) {
 		if (msgtyp == 0) {
 			msghdr = msqkptr->u.msg_first;
 			if (msghdr != NULL) {
 				if (msgsz < msghdr->msg_ts &&
 				    (msgflg & MSG_NOERROR) == 0) {
 					DPRINTF(("first message on the queue "
 					    "is too big (want %zu, got %d)\n",
 					    msgsz, msghdr->msg_ts));
 					error = E2BIG;
 					goto done2;
 				}
 #ifdef MAC
 				error = mac_check_sysv_msgrcv(td->td_ucred,
 				    msghdr);
 				if (error != 0)
 					goto done2;
 #endif
 				if (msqkptr->u.msg_first == msqkptr->u.msg_last) {
 					msqkptr->u.msg_first = NULL;
 					msqkptr->u.msg_last = NULL;
 				} else {
 					msqkptr->u.msg_first = msghdr->msg_next;
 					if (msqkptr->u.msg_first == NULL)
 						panic("msg_first/last screwed up #1");
 				}
 			}
 		} else {
 			struct msg *previous;
 			struct msg **prev;
 
 			previous = NULL;
 			prev = &(msqkptr->u.msg_first);
 			while ((msghdr = *prev) != NULL) {
 				/*
 				 * Is this message's type an exact match or is
 				 * this message's type less than or equal to
 				 * the absolute value of a negative msgtyp?
 				 * Note that the second half of this test can
 				 * NEVER be true if msgtyp is positive since
 				 * msg_type is always positive!
 				 */
 
 				if (msgtyp == msghdr->msg_type ||
 				    msghdr->msg_type <= -msgtyp) {
 					DPRINTF(("found message type %ld, "
 					    "requested %ld\n",
 					    msghdr->msg_type, msgtyp));
 					if (msgsz < msghdr->msg_ts &&
 					    (msgflg & MSG_NOERROR) == 0) {
 						DPRINTF(("requested message "
 						    "on the queue is too big "
 						    "(want %zu, got %hu)\n",
 						    msgsz, msghdr->msg_ts));
 						error = E2BIG;
 						goto done2;
 					}
 #ifdef MAC
 					error = mac_check_sysv_msgrcv(
 					    td->td_ucred, msghdr);
 					if (error != 0)
 						goto done2;
 #endif
 					*prev = msghdr->msg_next;
 					if (msghdr == msqkptr->u.msg_last) {
 						if (previous == NULL) {
 							if (prev !=
 							    &msqkptr->u.msg_first)
 								panic("msg_first/last screwed up #2");
 							msqkptr->u.msg_first =
 							    NULL;
 							msqkptr->u.msg_last =
 							    NULL;
 						} else {
 							if (prev ==
 							    &msqkptr->u.msg_first)
 								panic("msg_first/last screwed up #3");
 							msqkptr->u.msg_last =
 							    previous;
 						}
 					}
 					break;
 				}
 				previous = msghdr;
 				prev = &(msghdr->msg_next);
 			}
 		}
 
 		/*
 		 * We've either extracted the msghdr for the appropriate
 		 * message or there isn't one.
 		 * If there is one then bail out of this loop.
 		 */
 
 		if (msghdr != NULL)
 			break;
 
 		/*
 		 * Hmph!  No message found.  Does the user want to wait?
 		 */
 
 		if ((msgflg & IPC_NOWAIT) != 0) {
 			DPRINTF(("no appropriate message found (msgtyp=%ld)\n",
 			    msgtyp));
 			/* The SVID says to return ENOMSG. */
 			error = ENOMSG;
 			goto done2;
 		}
 
 		/*
 		 * Wait for something to happen
 		 */
 
 		DPRINTF(("msgrcv:  goodnight\n"));
 		error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
 		    "msgrcv", 0);
 		DPRINTF(("msgrcv:  good morning (error=%d)\n", error));
 
 		if (error != 0) {
 			DPRINTF(("msgrcv:  interrupted system call\n"));
 			error = EINTR;
 			goto done2;
 		}
 
 		/*
 		 * Make sure that the msq queue still exists
 		 */
 
 		if (msqkptr->u.msg_qbytes == 0 ||
 		    msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 			DPRINTF(("msqid deleted\n"));
 			error = EIDRM;
 			goto done2;
 		}
 	}
 
 	/*
 	 * Return the message to the user.
 	 *
 	 * First, do the bookkeeping (before we risk being interrupted).
 	 */
 
 	msqkptr->u.msg_cbytes -= msghdr->msg_ts;
 	msqkptr->u.msg_qnum--;
 	msqkptr->u.msg_lrpid = td->td_proc->p_pid;
 	msqkptr->u.msg_rtime = time_second;
 
 	/*
 	 * Make msgsz the actual amount that we'll be returning.
 	 * Note that this effectively truncates the message if it is too long
 	 * (since msgsz is never increased).
 	 */
 
 	DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz,
 	    msghdr->msg_ts));
 	if (msgsz > msghdr->msg_ts)
 		msgsz = msghdr->msg_ts;
 	*mtype = msghdr->msg_type;
 
 	/*
 	 * Return the segments to the user
 	 */
 
 	next = msghdr->msg_spot;
 	for (len = 0; len < msgsz; len += msginfo.msgssz) {
 		size_t tlen;
 
 		if (msgsz - len > msginfo.msgssz)
 			tlen = msginfo.msgssz;
 		else
 			tlen = msgsz - len;
 		if (next <= -1)
 			panic("next too low #3");
 		if (next >= msginfo.msgseg)
 			panic("next out of range #3");
 		mtx_unlock(&msq_mtx);
 		error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen);
 		mtx_lock(&msq_mtx);
 		if (error != 0) {
 			DPRINTF(("error (%d) copying out message segment\n",
 			    error));
 			msg_freehdr(msghdr);
 			wakeup(msqkptr);
 			goto done2;
 		}
 		msgp = (char *)msgp + tlen;
 		next = msgmaps[next].next;
 	}
 
 	/*
 	 * Done, return the actual number of bytes copied out.
 	 */
 
 	msg_freehdr(msghdr);
 	wakeup(msqkptr);
 	td->td_retval[0] = msgsz;
 done2:
 	mtx_unlock(&msq_mtx);
 	return (error);
 }
 
 int
 msgrcv(td, uap)
 	struct thread *td;
 	register struct msgrcv_args *uap;
 {
 	int error;
 	long mtype;
 
 	DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid,
 	    uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg));
 
 	if ((error = kern_msgrcv(td, uap->msqid,
 	    (char *)uap->msgp + sizeof(mtype), uap->msgsz,
 	    uap->msgtyp, uap->msgflg, &mtype)) != 0)
 		return (error);
 	if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0)
 		DPRINTF(("error %d copying the message type\n", error));
 	return (error);
 }
 
 static int
 sysctl_msqids(SYSCTL_HANDLER_ARGS)
 {
 
 	return (SYSCTL_OUT(req, msqids,
 	    sizeof(struct msqid_kernel) * msginfo.msgmni));
 }
 
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0,
     "Maximum message size");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0,
     "Number of message queue identifiers");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0,
     "Maximum number of bytes in a queue");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0,
     "Maximum number of messages in the system");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0,
     "Size of a message segment");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0,
     "Number of message segments");
 SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLFLAG_RD,
     NULL, 0, sysctl_msqids, "", "Message queue IDs");
Index: head/sys/kern/sysv_sem.c
===================================================================
--- head/sys/kern/sysv_sem.c	(revision 167231)
+++ head/sys/kern/sysv_sem.c	(revision 167232)
@@ -1,1364 +1,1361 @@
 /*-
  * Implementation of SVID semaphores
  *
  * Author:  Daniel Boulet
  *
  * This software is provided ``AS IS'' without any warranties of any kind.
  */
 /*-
  * Copyright (c) 2003-2005 McAfee, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
  * program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_sysvipc.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/sem.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/jail.h>
 
 #include <security/mac/mac_framework.h>
 
 static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
 
 #ifdef SEM_DEBUG
 #define DPRINTF(a)	printf a
 #else
 #define DPRINTF(a)
 #endif
 
 static void seminit(void);
 static int sysvsem_modload(struct module *, int, void *);
 static int semunload(void);
 static void semexit_myhook(void *arg, struct proc *p);
 static int sysctl_sema(SYSCTL_HANDLER_ARGS);
 static int semvalid(int semid, struct semid_kernel *semakptr);
 
 #ifndef _SYS_SYSPROTO_H_
 struct __semctl_args;
 int __semctl(struct thread *td, struct __semctl_args *uap);
 struct semget_args;
 int semget(struct thread *td, struct semget_args *uap);
 struct semop_args;
 int semop(struct thread *td, struct semop_args *uap);
 #endif
 
 static struct sem_undo *semu_alloc(struct thread *td);
 static int semundo_adjust(struct thread *td, struct sem_undo **supptr,
 		int semid, int semnum, int adjval);
 static void semundo_clear(int semid, int semnum);
 
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
 static sy_call_t *semcalls[] = {
 	(sy_call_t *)__semctl, (sy_call_t *)semget,
 	(sy_call_t *)semop
 };
 
 static struct mtx	sem_mtx;	/* semaphore global lock */
 static int	semtot = 0;
 static struct semid_kernel *sema;	/* semaphore id pool */
 static struct mtx *sema_mtx;	/* semaphore id pool mutexes*/
 static struct sem *sem;		/* semaphore pool */
 SLIST_HEAD(, sem_undo) semu_list;	/* list of active undo structures */
 static int	*semu;		/* undo structure pool */
 static eventhandler_tag semexit_tag;
 
 #define SEMUNDO_MTX		sem_mtx
 #define SEMUNDO_LOCK()		mtx_lock(&SEMUNDO_MTX);
 #define SEMUNDO_UNLOCK()	mtx_unlock(&SEMUNDO_MTX);
 #define SEMUNDO_LOCKASSERT(how)	mtx_assert(&SEMUNDO_MTX, (how));
 
 struct sem {
 	u_short	semval;		/* semaphore value */
 	pid_t	sempid;		/* pid of last operation */
 	u_short	semncnt;	/* # awaiting semval > cval */
 	u_short	semzcnt;	/* # awaiting semval = 0 */
 };
 
 /*
  * Undo structure (one per process)
  */
 struct sem_undo {
 	SLIST_ENTRY(sem_undo) un_next;	/* ptr to next active undo structure */
 	struct	proc *un_proc;		/* owner of this structure */
 	short	un_cnt;			/* # of active entries */
 	struct undo {
 		short	un_adjval;	/* adjust on exit values */
 		short	un_num;		/* semaphore # */
 		int	un_id;		/* semid */
 	} un_ent[1];			/* undo entries */
 };
 
 /*
  * Configuration parameters
  */
 #ifndef SEMMNI
 #define SEMMNI	10		/* # of semaphore identifiers */
 #endif
 #ifndef SEMMNS
 #define SEMMNS	60		/* # of semaphores in system */
 #endif
 #ifndef SEMUME
 #define SEMUME	10		/* max # of undo entries per process */
 #endif
 #ifndef SEMMNU
 #define SEMMNU	30		/* # of undo structures in system */
 #endif
 
 /* shouldn't need tuning */
 #ifndef SEMMAP
 #define SEMMAP	30		/* # of entries in semaphore map */
 #endif
 #ifndef SEMMSL
 #define SEMMSL	SEMMNS		/* max # of semaphores per id */
 #endif
 #ifndef SEMOPM
 #define SEMOPM	100		/* max # of operations per semop call */
 #endif
 
 #define SEMVMX	32767		/* semaphore maximum value */
 #define SEMAEM	16384		/* adjust on exit max value */
 
 /*
  * Due to the way semaphore memory is allocated, we have to ensure that
  * SEMUSZ is properly aligned.
  */
 
 #define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
 
 /* actual size of an undo structure */
 #define SEMUSZ	SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME]))
 
 /*
  * Macro to find a particular sem_undo vector
  */
 #define SEMU(ix) \
 	((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz))
 
 /*
  * semaphore info struct
  */
 struct seminfo seminfo = {
                 SEMMAP,         /* # of entries in semaphore map */
                 SEMMNI,         /* # of semaphore identifiers */
                 SEMMNS,         /* # of semaphores in system */
                 SEMMNU,         /* # of undo structures in system */
                 SEMMSL,         /* max # of semaphores per id */
                 SEMOPM,         /* max # of operations per semop call */
                 SEMUME,         /* max # of undo entries per process */
                 SEMUSZ,         /* size in bytes of undo structure */
                 SEMVMX,         /* semaphore maximum value */
                 SEMAEM          /* adjust on exit max value */
 };
 
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0,
     "Number of entries in the semaphore map");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
     "Number of semaphore identifiers");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0,
     "Maximum number of semaphores in the system");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0,
     "Maximum number of undo structures in the system");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0,
     "Max semaphores per id");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0,
     "Max operations per semop call");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0,
     "Max undo entries per process");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0,
     "Size in bytes of undo structure");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0,
     "Semaphore maximum value");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0,
     "Adjust on exit max value");
 SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLFLAG_RD,
     NULL, 0, sysctl_sema, "", "");
 
 static void
 seminit(void)
 {
 	int i;
 
 	TUNABLE_INT_FETCH("kern.ipc.semmap", &seminfo.semmap);
 	TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni);
 	TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns);
 	TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu);
 	TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl);
 	TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm);
 	TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume);
 	TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz);
 	TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx);
 	TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem);
 
 	sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK);
 	sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM,
 	    M_WAITOK);
 	sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM,
 	    M_WAITOK | M_ZERO);
 	semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK);
 
 	for (i = 0; i < seminfo.semmni; i++) {
 		sema[i].u.sem_base = 0;
 		sema[i].u.sem_perm.mode = 0;
 		sema[i].u.sem_perm.seq = 0;
 #ifdef MAC
 		mac_init_sysv_sem(&sema[i]);
 #endif
 	}
 	for (i = 0; i < seminfo.semmni; i++)
 		mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF);
 	for (i = 0; i < seminfo.semmnu; i++) {
 		struct sem_undo *suptr = SEMU(i);
 		suptr->un_proc = NULL;
 	}
 	SLIST_INIT(&semu_list);
 	mtx_init(&sem_mtx, "sem", NULL, MTX_DEF);
 	semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 static int
 semunload(void)
 {
 	int i;
 
 	if (semtot != 0)
 		return (EBUSY);
 
 	EVENTHANDLER_DEREGISTER(process_exit, semexit_tag);
 #ifdef MAC
 	for (i = 0; i < seminfo.semmni; i++)
 		mac_destroy_sysv_sem(&sema[i]);
 #endif
 	free(sem, M_SEM);
 	free(sema, M_SEM);
 	free(semu, M_SEM);
 	for (i = 0; i < seminfo.semmni; i++)
 		mtx_destroy(&sema_mtx[i]);
 	mtx_destroy(&sem_mtx);
 	return (0);
 }
 
 static int
 sysvsem_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		seminit();
 		break;
 	case MOD_UNLOAD:
 		error = semunload();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t sysvsem_mod = {
 	"sysvsem",
 	&sysvsem_modload,
 	NULL
 };
 
 SYSCALL_MODULE_HELPER(semsys);
 SYSCALL_MODULE_HELPER(__semctl);
 SYSCALL_MODULE_HELPER(semget);
 SYSCALL_MODULE_HELPER(semop);
 
 DECLARE_MODULE(sysvsem, sysvsem_mod,
 	SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
 MODULE_VERSION(sysvsem, 1);
 
 /*
  * Entry point for all SEM calls.
  */
 int
 semsys(td, uap)
 	struct thread *td;
 	/* XXX actually varargs. */
 	struct semsys_args /* {
 		int	which;
 		int	a2;
 		int	a3;
 		int	a4;
 		int	a5;
 	} */ *uap;
 {
 	int error;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 	if (uap->which < 0 ||
 	    uap->which >= sizeof(semcalls)/sizeof(semcalls[0]))
 		return (EINVAL);
 	error = (*semcalls[uap->which])(td, &uap->a2);
 	return (error);
 }
 
 /*
  * Allocate a new sem_undo structure for a process
  * (returns ptr to structure or NULL if no more room)
  */
 
 static struct sem_undo *
 semu_alloc(td)
 	struct thread *td;
 {
 	int i;
 	struct sem_undo *suptr;
 	struct sem_undo **supptr;
 	int attempt;
 
 	SEMUNDO_LOCKASSERT(MA_OWNED);
 	/*
 	 * Try twice to allocate something.
 	 * (we'll purge an empty structure after the first pass so
 	 * two passes are always enough)
 	 */
 
 	for (attempt = 0; attempt < 2; attempt++) {
 		/*
 		 * Look for a free structure.
 		 * Fill it in and return it if we find one.
 		 */
 
 		for (i = 0; i < seminfo.semmnu; i++) {
 			suptr = SEMU(i);
 			if (suptr->un_proc == NULL) {
 				SLIST_INSERT_HEAD(&semu_list, suptr, un_next);
 				suptr->un_cnt = 0;
 				suptr->un_proc = td->td_proc;
 				return(suptr);
 			}
 		}
 
 		/*
 		 * We didn't find a free one, if this is the first attempt
 		 * then try to free a structure.
 		 */
 
 		if (attempt == 0) {
 			/* All the structures are in use - try to free one */
 			int did_something = 0;
 
 			SLIST_FOREACH_PREVPTR(suptr, supptr, &semu_list,
 			    un_next) {
 				if (suptr->un_cnt == 0) {
 					suptr->un_proc = NULL;
 					did_something = 1;
 					*supptr = SLIST_NEXT(suptr, un_next);
 					break;
 				}
 			}
 
 			/* If we didn't free anything then just give-up */
 			if (!did_something)
 				return(NULL);
 		} else {
 			/*
 			 * The second pass failed even though we freed
 			 * something after the first pass!
 			 * This is IMPOSSIBLE!
 			 */
 			panic("semu_alloc - second attempt failed");
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Adjust a particular entry for a particular proc
  */
 
 static int
 semundo_adjust(td, supptr, semid, semnum, adjval)
 	struct thread *td;
 	struct sem_undo **supptr;
 	int semid, semnum;
 	int adjval;
 {
 	struct proc *p = td->td_proc;
 	struct sem_undo *suptr;
 	struct undo *sunptr;
 	int i;
 
 	SEMUNDO_LOCKASSERT(MA_OWNED);
 	/* Look for and remember the sem_undo if the caller doesn't provide
 	   it */
 
 	suptr = *supptr;
 	if (suptr == NULL) {
 		SLIST_FOREACH(suptr, &semu_list, un_next) {
 			if (suptr->un_proc == p) {
 				*supptr = suptr;
 				break;
 			}
 		}
 		if (suptr == NULL) {
 			if (adjval == 0)
 				return(0);
 			suptr = semu_alloc(td);
 			if (suptr == NULL)
 				return(ENOSPC);
 			*supptr = suptr;
 		}
 	}
 
 	/*
 	 * Look for the requested entry and adjust it (delete if adjval becomes
 	 * 0).
 	 */
 	sunptr = &suptr->un_ent[0];
 	for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
 		if (sunptr->un_id != semid || sunptr->un_num != semnum)
 			continue;
 		if (adjval != 0) {
 			adjval += sunptr->un_adjval;
 			if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
 				return (ERANGE);
 		}
 		sunptr->un_adjval = adjval;
 		if (sunptr->un_adjval == 0) {
 			suptr->un_cnt--;
 			if (i < suptr->un_cnt)
 				suptr->un_ent[i] =
 				    suptr->un_ent[suptr->un_cnt];
 		}
 		return(0);
 	}
 
 	/* Didn't find the right entry - create it */
 	if (adjval == 0)
 		return(0);
 	if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
 		return (ERANGE);
 	if (suptr->un_cnt != seminfo.semume) {
 		sunptr = &suptr->un_ent[suptr->un_cnt];
 		suptr->un_cnt++;
 		sunptr->un_adjval = adjval;
 		sunptr->un_id = semid; sunptr->un_num = semnum;
 	} else
 		return(EINVAL);
 	return(0);
 }
 
 static void
 semundo_clear(semid, semnum)
 	int semid, semnum;
 {
 	struct sem_undo *suptr;
 
 	SEMUNDO_LOCKASSERT(MA_OWNED);
 	SLIST_FOREACH(suptr, &semu_list, un_next) {
 		struct undo *sunptr = &suptr->un_ent[0];
 		int i = 0;
 
 		while (i < suptr->un_cnt) {
 			if (sunptr->un_id == semid) {
 				if (semnum == -1 || sunptr->un_num == semnum) {
 					suptr->un_cnt--;
 					if (i < suptr->un_cnt) {
 						suptr->un_ent[i] =
 						  suptr->un_ent[suptr->un_cnt];
 						continue;
 					}
 				}
 				if (semnum != -1)
 					break;
 			}
 			i++, sunptr++;
 		}
 	}
 }
 
 static int
 semvalid(semid, semakptr)
 	int semid;
 	struct semid_kernel *semakptr;
 {
 
 	return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
 	    semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0);
 }
 
 /*
  * Note that the user-mode half of this passes a union, not a pointer.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct __semctl_args {
 	int	semid;
 	int	semnum;
 	int	cmd;
 	union	semun *arg;
 };
 #endif
-
 int
 __semctl(td, uap)
 	struct thread *td;
 	struct __semctl_args *uap;
 {
 	struct semid_ds dsbuf;
 	union semun arg, semun;
 	register_t rval;
 	int error;
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_SET:
 	case IPC_STAT:
 	case GETALL:
 	case SETVAL:
 	case SETALL:
 		error = copyin(uap->arg, &arg, sizeof(arg));
 		if (error)
 			return (error);
 		break;
 	}
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		semun.buf = &dsbuf;
 		break;
 	case IPC_SET:
 		error = copyin(arg.buf, &dsbuf, sizeof(dsbuf));
 		if (error)
 			return (error);
 		semun.buf = &dsbuf;
 		break;
 	case GETALL:
 	case SETALL:
 		semun.array = arg.array;
 		break;
 	case SETVAL:
 		semun.val = arg.val;
 		break;		
 	}
 
 	error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
 	    &rval);
 	if (error)
 		return (error);
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		error = copyout(&dsbuf, arg.buf, sizeof(dsbuf));
 		break;
 	}
 
 	if (error == 0)
 		td->td_retval[0] = rval;
 	return (error);
 }
 
 int
 kern_semctl(struct thread *td, int semid, int semnum, int cmd,
     union semun *arg, register_t *rval)
 {
 	u_short *array;
 	struct ucred *cred = td->td_ucred;
 	int i, error;
 	struct semid_ds *sbuf;
 	struct semid_kernel *semakptr;
 	struct mtx *sema_mtxp;
 	u_short usval, count;
 	int semidx;
 
 	DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n",
 	    semid, semnum, cmd, arg));
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	array = NULL;
 
 	switch(cmd) {
 	case SEM_STAT:
 		/*
 		 * For this command we assume semid is an array index
 		 * rather than an IPC id.
 		 */
 		if (semid < 0 || semid >= seminfo.semmni)
 			return (EINVAL);
 		semakptr = &sema[semid];
 		sema_mtxp = &sema_mtx[semid];
 		mtx_lock(sema_mtxp);
 		if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
 			error = EINVAL;
 			goto done2;
 		}
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 #ifdef MAC
 		error = mac_check_sysv_semctl(cred, semakptr, cmd);
 		if (error != 0)
 			goto done2;
 #endif
 		bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
 		*rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
 		mtx_unlock(sema_mtxp);
 		return (0);
 	}
 
 	semidx = IPCID_TO_IX(semid);
 	if (semidx < 0 || semidx >= seminfo.semmni)
 		return (EINVAL);
 
 	semakptr = &sema[semidx];
 	sema_mtxp = &sema_mtx[semidx];
 	mtx_lock(sema_mtxp);
 #ifdef MAC
 	error = mac_check_sysv_semctl(cred, semakptr, cmd);
 	if (error != 0)
 		goto done2;
 #endif
 
 	error = 0;
 	*rval = 0;
 
 	switch (cmd) {
 	case IPC_RMID:
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
 			goto done2;
 		semakptr->u.sem_perm.cuid = cred->cr_uid;
 		semakptr->u.sem_perm.uid = cred->cr_uid;
 		semtot -= semakptr->u.sem_nsems;
 		for (i = semakptr->u.sem_base - sem; i < semtot; i++)
 			sem[i] = sem[i + semakptr->u.sem_nsems];
 		for (i = 0; i < seminfo.semmni; i++) {
 			if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
 			    sema[i].u.sem_base > semakptr->u.sem_base)
 				sema[i].u.sem_base -= semakptr->u.sem_nsems;
 		}
 		semakptr->u.sem_perm.mode = 0;
 #ifdef MAC
 		mac_cleanup_sysv_sem(semakptr);
 #endif
 		SEMUNDO_LOCK();
 		semundo_clear(semidx, -1);
 		SEMUNDO_UNLOCK();
 		wakeup(semakptr);
 		break;
 
 	case IPC_SET:
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
 			goto done2;
 		sbuf = arg->buf;
 		semakptr->u.sem_perm.uid = sbuf->sem_perm.uid;
 		semakptr->u.sem_perm.gid = sbuf->sem_perm.gid;
 		semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode &
 		    ~0777) | (sbuf->sem_perm.mode & 0777);
 		semakptr->u.sem_ctime = time_second;
 		break;
 
 	case IPC_STAT:
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
 		break;
 
 	case GETNCNT:
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		*rval = semakptr->u.sem_base[semnum].semncnt;
 		break;
 
 	case GETPID:
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		*rval = semakptr->u.sem_base[semnum].sempid;
 		break;
 
 	case GETVAL:
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		*rval = semakptr->u.sem_base[semnum].semval;
 		break;
 
 	case GETALL:
 		/*
 		 * Unfortunately, callers of this function don't know
 		 * in advance how many semaphores are in this set.
 		 * While we could just allocate the maximum size array
 		 * and pass the actual size back to the caller, that
 		 * won't work for SETALL since we can't copyin() more
 		 * data than the user specified as we may return a
 		 * spurious EFAULT.
 		 * 
 		 * Note that the number of semaphores in a set is
 		 * fixed for the life of that set.  The only way that
 		 * the 'count' could change while are blocked in
 		 * malloc() is if this semaphore set were destroyed
 		 * and a new one created with the same index.
 		 * However, semvalid() will catch that due to the
 		 * sequence number unless exactly 0x8000 (or a
 		 * multiple thereof) semaphore sets for the same index
 		 * are created and destroyed while we are in malloc!
 		 *
 		 */
 		count = semakptr->u.sem_nsems;
 		mtx_unlock(sema_mtxp);		    
 		array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
 		mtx_lock(sema_mtxp);
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		for (i = 0; i < semakptr->u.sem_nsems; i++)
 			array[i] = semakptr->u.sem_base[i].semval;
 		mtx_unlock(sema_mtxp);
 		error = copyout(array, arg->array, count * sizeof(*array));
 		mtx_lock(sema_mtxp);
 		break;
 
 	case GETZCNT:
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		*rval = semakptr->u.sem_base[semnum].semzcnt;
 		break;
 
 	case SETVAL:
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		if (arg->val < 0 || arg->val > seminfo.semvmx) {
 			error = ERANGE;
 			goto done2;
 		}
 		semakptr->u.sem_base[semnum].semval = arg->val;
 		SEMUNDO_LOCK();
 		semundo_clear(semidx, semnum);
 		SEMUNDO_UNLOCK();
 		wakeup(semakptr);
 		break;
 
 	case SETALL:
 		/*
 		 * See comment on GETALL for why 'count' shouldn't change
 		 * and why we require a userland buffer.
 		 */
 		count = semakptr->u.sem_nsems;
 		mtx_unlock(sema_mtxp);		    
 		array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
 		error = copyin(arg->array, array, count * sizeof(*array));
 		if (error)
 			break;
 		mtx_lock(sema_mtxp);
 		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
 			goto done2;
 		for (i = 0; i < semakptr->u.sem_nsems; i++) {
 			usval = array[i];
 			if (usval > seminfo.semvmx) {
 				error = ERANGE;
 				break;
 			}
 			semakptr->u.sem_base[i].semval = usval;
 		}
 		SEMUNDO_LOCK();
 		semundo_clear(semidx, -1);
 		SEMUNDO_UNLOCK();
 		wakeup(semakptr);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 done2:
 	mtx_unlock(sema_mtxp);
 	if (array != NULL)
 		free(array, M_TEMP);
 	return(error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct semget_args {
 	key_t	key;
 	int	nsems;
 	int	semflg;
 };
 #endif
-
 int
 semget(td, uap)
 	struct thread *td;
 	struct semget_args *uap;
 {
 	int semid, error = 0;
 	int key = uap->key;
 	int nsems = uap->nsems;
 	int semflg = uap->semflg;
 	struct ucred *cred = td->td_ucred;
 
 	DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg));
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	mtx_lock(&Giant);
 	if (key != IPC_PRIVATE) {
 		for (semid = 0; semid < seminfo.semmni; semid++) {
 			if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) &&
 			    sema[semid].u.sem_perm.key == key)
 				break;
 		}
 		if (semid < seminfo.semmni) {
 			DPRINTF(("found public key\n"));
 			if ((error = ipcperm(td, &sema[semid].u.sem_perm,
 			    semflg & 0700))) {
 				goto done2;
 			}
 			if (nsems > 0 && sema[semid].u.sem_nsems < nsems) {
 				DPRINTF(("too small\n"));
 				error = EINVAL;
 				goto done2;
 			}
 			if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
 				DPRINTF(("not exclusive\n"));
 				error = EEXIST;
 				goto done2;
 			}
 #ifdef MAC
 			error = mac_check_sysv_semget(cred, &sema[semid]);
 			if (error != 0)
 				goto done2;
 #endif
 			goto found;
 		}
 	}
 
 	DPRINTF(("need to allocate the semid_kernel\n"));
 	if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
 		if (nsems <= 0 || nsems > seminfo.semmsl) {
 			DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems,
 			    seminfo.semmsl));
 			error = EINVAL;
 			goto done2;
 		}
 		if (nsems > seminfo.semmns - semtot) {
 			DPRINTF((
 			    "not enough semaphores left (need %d, got %d)\n",
 			    nsems, seminfo.semmns - semtot));
 			error = ENOSPC;
 			goto done2;
 		}
 		for (semid = 0; semid < seminfo.semmni; semid++) {
 			if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0)
 				break;
 		}
 		if (semid == seminfo.semmni) {
 			DPRINTF(("no more semid_kernel's available\n"));
 			error = ENOSPC;
 			goto done2;
 		}
 		DPRINTF(("semid %d is available\n", semid));
 		sema[semid].u.sem_perm.key = key;
 		sema[semid].u.sem_perm.cuid = cred->cr_uid;
 		sema[semid].u.sem_perm.uid = cred->cr_uid;
 		sema[semid].u.sem_perm.cgid = cred->cr_gid;
 		sema[semid].u.sem_perm.gid = cred->cr_gid;
 		sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
 		sema[semid].u.sem_perm.seq =
 		    (sema[semid].u.sem_perm.seq + 1) & 0x7fff;
 		sema[semid].u.sem_nsems = nsems;
 		sema[semid].u.sem_otime = 0;
 		sema[semid].u.sem_ctime = time_second;
 		sema[semid].u.sem_base = &sem[semtot];
 		semtot += nsems;
 		bzero(sema[semid].u.sem_base,
 		    sizeof(sema[semid].u.sem_base[0])*nsems);
 #ifdef MAC
 		mac_create_sysv_sem(cred, &sema[semid]);
 #endif
 		DPRINTF(("sembase = %p, next = %p\n",
 		    sema[semid].u.sem_base, &sem[semtot]));
 	} else {
 		DPRINTF(("didn't find it and wasn't asked to create it\n"));
 		error = ENOENT;
 		goto done2;
 	}
 
 found:
 	td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm);
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct semop_args {
 	int	semid;
 	struct	sembuf *sops;
 	size_t	nsops;
 };
 #endif
-
 int
 semop(td, uap)
 	struct thread *td;
 	struct semop_args *uap;
 {
 #define SMALL_SOPS	8
 	struct sembuf small_sops[SMALL_SOPS];
 	int semid = uap->semid;
 	size_t nsops = uap->nsops;
 	struct sembuf *sops;
 	struct semid_kernel *semakptr;
 	struct sembuf *sopptr = 0;
 	struct sem *semptr = 0;
 	struct sem_undo *suptr;
 	struct mtx *sema_mtxp;
 	size_t i, j, k;
 	int error;
 	int do_wakeup, do_undos;
 
 #ifdef SEM_DEBUG
 	sops = NULL;
 #endif
 	DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops));
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	semid = IPCID_TO_IX(semid);	/* Convert back to zero origin */
 
 	if (semid < 0 || semid >= seminfo.semmni)
 		return (EINVAL);
 
 	/* Allocate memory for sem_ops */
 	if (nsops <= SMALL_SOPS)
 		sops = small_sops;
 	else if (nsops <= seminfo.semopm)
 		sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK);
 	else {
 		DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm,
 		    nsops));
 		return (E2BIG);
 	}
 	if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
 		DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error,
 		    uap->sops, sops, nsops * sizeof(sops[0])));
 		if (sops != small_sops)
 			free(sops, M_SEM);
 		return (error);
 	}
 
 	semakptr = &sema[semid];
 	sema_mtxp = &sema_mtx[semid];
 	mtx_lock(sema_mtxp);
 	if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
 		error = EINVAL;
 		goto done2;
 	}
 	if (semakptr->u.sem_perm.seq != IPCID_TO_SEQ(uap->semid)) {
 		error = EINVAL;
 		goto done2;
 	}
 	/*
 	 * Initial pass thru sops to see what permissions are needed.
 	 * Also perform any checks that don't need repeating on each
 	 * attempt to satisfy the request vector.
 	 */
 	j = 0;		/* permission needed */
 	do_undos = 0;
 	for (i = 0; i < nsops; i++) {
 		sopptr = &sops[i];
 		if (sopptr->sem_num >= semakptr->u.sem_nsems) {
 			error = EFBIG;
 			goto done2;
 		}
 		if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0)
 			do_undos = 1;
 		j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A;
 	}
 
 	if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) {
 		DPRINTF(("error = %d from ipaccess\n", error));
 		goto done2;
 	}
 #ifdef MAC
 	error = mac_check_sysv_semop(td->td_ucred, semakptr, j);
 	if (error != 0)
 		goto done2;
 #endif
 
 	/*
 	 * Loop trying to satisfy the vector of requests.
 	 * If we reach a point where we must wait, any requests already
 	 * performed are rolled back and we go to sleep until some other
 	 * process wakes us up.  At this point, we start all over again.
 	 *
 	 * This ensures that from the perspective of other tasks, a set
 	 * of requests is atomic (never partially satisfied).
 	 */
 	for (;;) {
 		do_wakeup = 0;
 		error = 0;	/* error return if necessary */
 
 		for (i = 0; i < nsops; i++) {
 			sopptr = &sops[i];
 			semptr = &semakptr->u.sem_base[sopptr->sem_num];
 
 			DPRINTF((
 			    "semop:  semakptr=%p, sem_base=%p, "
 			    "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n",
 			    semakptr, semakptr->u.sem_base, semptr,
 			    sopptr->sem_num, semptr->semval, sopptr->sem_op,
 			    (sopptr->sem_flg & IPC_NOWAIT) ?
 			    "nowait" : "wait"));
 
 			if (sopptr->sem_op < 0) {
 				if (semptr->semval + sopptr->sem_op < 0) {
 					DPRINTF(("semop:  can't do it now\n"));
 					break;
 				} else {
 					semptr->semval += sopptr->sem_op;
 					if (semptr->semval == 0 &&
 					    semptr->semzcnt > 0)
 						do_wakeup = 1;
 				}
 			} else if (sopptr->sem_op == 0) {
 				if (semptr->semval != 0) {
 					DPRINTF(("semop:  not zero now\n"));
 					break;
 				}
 			} else if (semptr->semval + sopptr->sem_op >
 			    seminfo.semvmx) {
 				error = ERANGE;
 				break;
 			} else {
 				if (semptr->semncnt > 0)
 					do_wakeup = 1;
 				semptr->semval += sopptr->sem_op;
 			}
 		}
 
 		/*
 		 * Did we get through the entire vector?
 		 */
 		if (i >= nsops)
 			goto done;
 
 		/*
 		 * No ... rollback anything that we've already done
 		 */
 		DPRINTF(("semop:  rollback 0 through %d\n", i-1));
 		for (j = 0; j < i; j++)
 			semakptr->u.sem_base[sops[j].sem_num].semval -=
 			    sops[j].sem_op;
 
 		/* If we detected an error, return it */
 		if (error != 0)
 			goto done2;
 
 		/*
 		 * If the request that we couldn't satisfy has the
 		 * NOWAIT flag set then return with EAGAIN.
 		 */
 		if (sopptr->sem_flg & IPC_NOWAIT) {
 			error = EAGAIN;
 			goto done2;
 		}
 
 		if (sopptr->sem_op == 0)
 			semptr->semzcnt++;
 		else
 			semptr->semncnt++;
 
 		DPRINTF(("semop:  good night!\n"));
 		error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH,
 		    "semwait", 0);
 		DPRINTF(("semop:  good morning (error=%d)!\n", error));
 		/* return code is checked below, after sem[nz]cnt-- */
 
 		/*
 		 * Make sure that the semaphore still exists
 		 */
 		if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
 		    semakptr->u.sem_perm.seq != IPCID_TO_SEQ(uap->semid)) {
 			error = EIDRM;
 			goto done2;
 		}
 
 		/*
 		 * The semaphore is still alive.  Readjust the count of
 		 * waiting processes.
 		 */
 		if (sopptr->sem_op == 0)
 			semptr->semzcnt--;
 		else
 			semptr->semncnt--;
 
 		/*
 		 * Is it really morning, or was our sleep interrupted?
 		 * (Delayed check of msleep() return code because we
 		 * need to decrement sem[nz]cnt either way.)
 		 */
 		if (error != 0) {
 			error = EINTR;
 			goto done2;
 		}
 		DPRINTF(("semop:  good morning!\n"));
 	}
 
 done:
 	/*
 	 * Process any SEM_UNDO requests.
 	 */
 	if (do_undos) {
 		SEMUNDO_LOCK();
 		suptr = NULL;
 		for (i = 0; i < nsops; i++) {
 			/*
 			 * We only need to deal with SEM_UNDO's for non-zero
 			 * op's.
 			 */
 			int adjval;
 
 			if ((sops[i].sem_flg & SEM_UNDO) == 0)
 				continue;
 			adjval = sops[i].sem_op;
 			if (adjval == 0)
 				continue;
 			error = semundo_adjust(td, &suptr, semid,
 			    sops[i].sem_num, -adjval);
 			if (error == 0)
 				continue;
 
 			/*
 			 * Oh-Oh!  We ran out of either sem_undo's or undo's.
 			 * Rollback the adjustments to this point and then
 			 * rollback the semaphore ups and down so we can return
 			 * with an error with all structures restored.  We
 			 * rollback the undo's in the exact reverse order that
 			 * we applied them.  This guarantees that we won't run
 			 * out of space as we roll things back out.
 			 */
 			for (j = 0; j < i; j++) {
 				k = i - j - 1;
 				if ((sops[k].sem_flg & SEM_UNDO) == 0)
 					continue;
 				adjval = sops[k].sem_op;
 				if (adjval == 0)
 					continue;
 				if (semundo_adjust(td, &suptr, semid,
 				    sops[k].sem_num, adjval) != 0)
 					panic("semop - can't undo undos");
 			}
 
 			for (j = 0; j < nsops; j++)
 				semakptr->u.sem_base[sops[j].sem_num].semval -=
 				    sops[j].sem_op;
 
 			DPRINTF(("error = %d from semundo_adjust\n", error));
 			SEMUNDO_UNLOCK();
 			goto done2;
 		} /* loop through the sops */
 		SEMUNDO_UNLOCK();
 	} /* if (do_undos) */
 
 	/* We're definitely done - set the sempid's and time */
 	for (i = 0; i < nsops; i++) {
 		sopptr = &sops[i];
 		semptr = &semakptr->u.sem_base[sopptr->sem_num];
 		semptr->sempid = td->td_proc->p_pid;
 	}
 	semakptr->u.sem_otime = time_second;
 
 	/*
 	 * Do a wakeup if any semaphore was up'd whilst something was
 	 * sleeping on it.
 	 */
 	if (do_wakeup) {
 		DPRINTF(("semop:  doing wakeup\n"));
 		wakeup(semakptr);
 		DPRINTF(("semop:  back from wakeup\n"));
 	}
 	DPRINTF(("semop:  done\n"));
 	td->td_retval[0] = 0;
 done2:
 	mtx_unlock(sema_mtxp);
 	if (sops != small_sops)
 		free(sops, M_SEM);
 	return (error);
 }
 
 /*
  * Go through the undo structures for this process and apply the adjustments to
  * semaphores.
  */
 static void
 semexit_myhook(arg, p)
 	void *arg;
 	struct proc *p;
 {
 	struct sem_undo *suptr;
 	struct sem_undo **supptr;
 
 	/*
 	 * Go through the chain of undo vectors looking for one
 	 * associated with this process.
 	 */
 	SEMUNDO_LOCK();
 	SLIST_FOREACH_PREVPTR(suptr, supptr, &semu_list, un_next) {
 		if (suptr->un_proc == p)
 			break;
 	}
 	SEMUNDO_UNLOCK();
 
 	if (suptr == NULL)
 		return;
 
 	DPRINTF(("proc @%p has undo structure with %d entries\n", p,
 	    suptr->un_cnt));
 
 	/*
 	 * If there are any active undo elements then process them.
 	 */
 	if (suptr->un_cnt > 0) {
 		int ix;
 
 		for (ix = 0; ix < suptr->un_cnt; ix++) {
 			int semid = suptr->un_ent[ix].un_id;
 			int semnum = suptr->un_ent[ix].un_num;
 			int adjval = suptr->un_ent[ix].un_adjval;
 			struct semid_kernel *semakptr;
 			struct mtx *sema_mtxp;
 
 			semakptr = &sema[semid];
 			sema_mtxp = &sema_mtx[semid];
 			mtx_lock(sema_mtxp);
 			SEMUNDO_LOCK();
 			if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0)
 				panic("semexit - semid not allocated");
 			if (semnum >= semakptr->u.sem_nsems)
 				panic("semexit - semnum out of range");
 
 			DPRINTF((
 			    "semexit:  %p id=%d num=%d(adj=%d) ; sem=%d\n",
 			    suptr->un_proc, suptr->un_ent[ix].un_id,
 			    suptr->un_ent[ix].un_num,
 			    suptr->un_ent[ix].un_adjval,
 			    semakptr->u.sem_base[semnum].semval));
 
 			if (adjval < 0) {
 				if (semakptr->u.sem_base[semnum].semval <
 				    -adjval)
 					semakptr->u.sem_base[semnum].semval = 0;
 				else
 					semakptr->u.sem_base[semnum].semval +=
 					    adjval;
 			} else
 				semakptr->u.sem_base[semnum].semval += adjval;
 
 			wakeup(semakptr);
 			DPRINTF(("semexit:  back from wakeup\n"));
 			mtx_unlock(sema_mtxp);
 			SEMUNDO_UNLOCK();
 		}
 	}
 
 	/*
 	 * Deallocate the undo vector.
 	 */
 	DPRINTF(("removing vector\n"));
 	suptr->un_proc = NULL;
 	*supptr = SLIST_NEXT(suptr, un_next);
 }
 
 static int
 sysctl_sema(SYSCTL_HANDLER_ARGS)
 {
 
 	return (SYSCTL_OUT(req, sema,
 	    sizeof(struct semid_kernel) * seminfo.semmni));
 }
Index: head/sys/kern/sysv_shm.c
===================================================================
--- head/sys/kern/sysv_shm.c	(revision 167231)
+++ head/sys/kern/sysv_shm.c	(revision 167232)
@@ -1,1023 +1,1018 @@
 /*	$NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $	*/
 /*-
  * Copyright (c) 1994 Adam Glass and Charles Hannum.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Adam Glass and Charles
  *	Hannum.
  * 4. The names of the authors may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Copyright (c) 2003-2005 McAfee, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
  * program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_sysvipc.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/shm.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
 
 #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 struct oshmctl_args;
 static int oshmctl(struct thread *td, struct oshmctl_args *uap);
 #endif
 
 static int shmget_allocate_segment(struct thread *td,
     struct shmget_args *uap, int mode);
 static int shmget_existing(struct thread *td, struct shmget_args *uap,
     int mode, int segnum);
 
 #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
 static sy_call_t *shmcalls[] = {
 	(sy_call_t *)shmat, (sy_call_t *)oshmctl,
 	(sy_call_t *)shmdt, (sy_call_t *)shmget,
 	(sy_call_t *)shmctl
 };
 #endif
 
 #define	SHMSEG_FREE     	0x0200
 #define	SHMSEG_REMOVED  	0x0400
 #define	SHMSEG_ALLOCATED	0x0800
 #define	SHMSEG_WANTED		0x1000
 
 static int shm_last_free, shm_nused, shm_committed, shmalloced;
 static struct shmid_kernel	*shmsegs;
 
 struct shmmap_state {
 	vm_offset_t va;
 	int shmid;
 };
 
 static void shm_deallocate_segment(struct shmid_kernel *);
 static int shm_find_segment_by_key(key_t);
 static struct shmid_kernel *shm_find_segment_by_shmid(int);
 static struct shmid_kernel *shm_find_segment_by_shmidx(int);
 static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *);
 static void shmrealloc(void);
 static void shminit(void);
 static int sysvshm_modload(struct module *, int, void *);
 static int shmunload(void);
 static void shmexit_myhook(struct vmspace *vm);
 static void shmfork_myhook(struct proc *p1, struct proc *p2);
 static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS);
 
 /*
  * Tuneable values.
  */
 #ifndef SHMMAXPGS
 #define	SHMMAXPGS	8192	/* Note: sysv shared memory is swap backed. */
 #endif
 #ifndef SHMMAX
 #define	SHMMAX	(SHMMAXPGS*PAGE_SIZE)
 #endif
 #ifndef SHMMIN
 #define	SHMMIN	1
 #endif
 #ifndef SHMMNI
 #define	SHMMNI	192
 #endif
 #ifndef SHMSEG
 #define	SHMSEG	128
 #endif
 #ifndef SHMALL
 #define	SHMALL	(SHMMAXPGS)
 #endif
 
 struct	shminfo shminfo = {
 	SHMMAX,
 	SHMMIN,
 	SHMMNI,
 	SHMSEG,
 	SHMALL
 };
 
 static int shm_use_phys;
 static int shm_allow_removed;
 
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
     "Maximum shared memory segment size");
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
     "Minimum shared memory segment size");
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
     "Number of shared memory identifiers");
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
     "Number of segments per process");
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
     "Maximum number of pages available for shared memory");
 SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW,
     &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
 SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RW,
     &shm_allow_removed, 0,
     "Enable/Disable attachment to attached segments marked for removal");
 SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLFLAG_RD,
     NULL, 0, sysctl_shmsegs, "",
     "Current number of shared memory segments allocated");
 
 static int
 shm_find_segment_by_key(key)
 	key_t key;
 {
 	int i;
 
 	for (i = 0; i < shmalloced; i++)
 		if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) &&
 		    shmsegs[i].u.shm_perm.key == key)
 			return (i);
 	return (-1);
 }
 
 static struct shmid_kernel *
 shm_find_segment_by_shmid(int shmid)
 {
 	int segnum;
 	struct shmid_kernel *shmseg;
 
 	segnum = IPCID_TO_IX(shmid);
 	if (segnum < 0 || segnum >= shmalloced)
 		return (NULL);
 	shmseg = &shmsegs[segnum];
 	if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
 	    (!shm_allow_removed &&
 	     (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) ||
 	    shmseg->u.shm_perm.seq != IPCID_TO_SEQ(shmid))
 		return (NULL);
 	return (shmseg);
 }
 
 static struct shmid_kernel *
 shm_find_segment_by_shmidx(int segnum)
 {
 	struct shmid_kernel *shmseg;
 
 	if (segnum < 0 || segnum >= shmalloced)
 		return (NULL);
 	shmseg = &shmsegs[segnum];
 	if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
 	    (!shm_allow_removed &&
 	     (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0))
 		return (NULL);
 	return (shmseg);
 }
 
 static void
 shm_deallocate_segment(shmseg)
 	struct shmid_kernel *shmseg;
 {
 	size_t size;
 
 	GIANT_REQUIRED;
 
 	vm_object_deallocate(shmseg->u.shm_internal);
 	shmseg->u.shm_internal = NULL;
 	size = round_page(shmseg->u.shm_segsz);
 	shm_committed -= btoc(size);
 	shm_nused--;
 	shmseg->u.shm_perm.mode = SHMSEG_FREE;
 #ifdef MAC
 	mac_cleanup_sysv_shm(shmseg);
 #endif
 }
 
 static int
 shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s)
 {
 	struct shmid_kernel *shmseg;
 	int segnum, result;
 	size_t size;
 
 	GIANT_REQUIRED;
 
 	segnum = IPCID_TO_IX(shmmap_s->shmid);
 	shmseg = &shmsegs[segnum];
 	size = round_page(shmseg->u.shm_segsz);
 	result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size);
 	if (result != KERN_SUCCESS)
 		return (EINVAL);
 	shmmap_s->shmid = -1;
 	shmseg->u.shm_dtime = time_second;
 	if ((--shmseg->u.shm_nattch <= 0) &&
 	    (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) {
 		shm_deallocate_segment(shmseg);
 		shm_last_free = segnum;
 	}
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmdt_args {
 	const void *shmaddr;
 };
 #endif
-
 int
 shmdt(td, uap)
 	struct thread *td;
 	struct shmdt_args *uap;
 {
 	struct proc *p = td->td_proc;
 	struct shmmap_state *shmmap_s;
 #ifdef MAC
 	struct shmid_kernel *shmsegptr;
 #endif
 	int i;
 	int error = 0;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 	mtx_lock(&Giant);
 	shmmap_s = p->p_vmspace->vm_shm;
  	if (shmmap_s == NULL) {
 		error = EINVAL;
 		goto done2;
 	}
 	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
 		if (shmmap_s->shmid != -1 &&
 		    shmmap_s->va == (vm_offset_t)uap->shmaddr) {
 			break;
 		}
 	}
 	if (i == shminfo.shmseg) {
 		error = EINVAL;
 		goto done2;
 	}
 #ifdef MAC
 	shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)];
 	error = mac_check_sysv_shmdt(td->td_ucred, shmsegptr);
 	if (error != 0)
 		goto done2;
 #endif
 	error = shm_delete_mapping(p->p_vmspace, shmmap_s);
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmat_args {
 	int shmid;
 	const void *shmaddr;
 	int shmflg;
 };
 #endif
-
 int
 kern_shmat(td, shmid, shmaddr, shmflg)
 	struct thread *td;
 	int shmid;
 	const void *shmaddr;
 	int shmflg;
 {
 	struct proc *p = td->td_proc;
 	int i, flags;
 	struct shmid_kernel *shmseg;
 	struct shmmap_state *shmmap_s = NULL;
 	vm_offset_t attach_va;
 	vm_prot_t prot;
 	vm_size_t size;
 	int rv;
 	int error = 0;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 	mtx_lock(&Giant);
 	shmmap_s = p->p_vmspace->vm_shm;
 	if (shmmap_s == NULL) {
 		size = shminfo.shmseg * sizeof(struct shmmap_state);
 		shmmap_s = malloc(size, M_SHM, M_WAITOK);
 		for (i = 0; i < shminfo.shmseg; i++)
 			shmmap_s[i].shmid = -1;
 		p->p_vmspace->vm_shm = shmmap_s;
 	}
 	shmseg = shm_find_segment_by_shmid(shmid);
 	if (shmseg == NULL) {
 		error = EINVAL;
 		goto done2;
 	}
 	error = ipcperm(td, &shmseg->u.shm_perm,
 	    (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
 	if (error)
 		goto done2;
 #ifdef MAC
 	error = mac_check_sysv_shmat(td->td_ucred, shmseg, shmflg);
 	if (error != 0)
 		goto done2;
 #endif
 	for (i = 0; i < shminfo.shmseg; i++) {
 		if (shmmap_s->shmid == -1)
 			break;
 		shmmap_s++;
 	}
 	if (i >= shminfo.shmseg) {
 		error = EMFILE;
 		goto done2;
 	}
 	size = round_page(shmseg->u.shm_segsz);
 #ifdef VM_PROT_READ_IS_EXEC
 	prot = VM_PROT_READ | VM_PROT_EXECUTE;
 #else
 	prot = VM_PROT_READ;
 #endif
 	if ((shmflg & SHM_RDONLY) == 0)
 		prot |= VM_PROT_WRITE;
 	flags = MAP_ANON | MAP_SHARED;
 	if (shmaddr) {
 		flags |= MAP_FIXED;
 		if (shmflg & SHM_RND) {
 			attach_va = (vm_offset_t)shmaddr & ~(SHMLBA-1);
 		} else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) {
 			attach_va = (vm_offset_t)shmaddr;
 		} else {
 			error = EINVAL;
 			goto done2;
 		}
 	} else {
 		/*
 		 * This is just a hint to vm_map_find() about where to
 		 * put it.
 		 */
 		PROC_LOCK(p);
 		attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
 		    lim_max(p, RLIMIT_DATA));
 		PROC_UNLOCK(p);
 	}
 
 	vm_object_reference(shmseg->u.shm_internal);
 	rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->u.shm_internal,
 		0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0);
 	if (rv != KERN_SUCCESS) {
 		vm_object_deallocate(shmseg->u.shm_internal);
 		error = ENOMEM;
 		goto done2;
 	}
 	vm_map_inherit(&p->p_vmspace->vm_map,
 		attach_va, attach_va + size, VM_INHERIT_SHARE);
 
 	shmmap_s->va = attach_va;
 	shmmap_s->shmid = shmid;
 	shmseg->u.shm_lpid = p->p_pid;
 	shmseg->u.shm_atime = time_second;
 	shmseg->u.shm_nattch++;
 	td->td_retval[0] = attach_va;
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 int
 shmat(td, uap)
 	struct thread *td;
 	struct shmat_args *uap;
 {
 	return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg);
 }
 
 #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 struct oshmid_ds {
 	struct	ipc_perm shm_perm;	/* operation perms */
 	int	shm_segsz;		/* size of segment (bytes) */
 	u_short	shm_cpid;		/* pid, creator */
 	u_short	shm_lpid;		/* pid, last operation */
 	short	shm_nattch;		/* no. of current attaches */
 	time_t	shm_atime;		/* last attach time */
 	time_t	shm_dtime;		/* last detach time */
 	time_t	shm_ctime;		/* last change time */
 	void	*shm_handle;		/* internal handle for shm segment */
 };
 
 struct oshmctl_args {
 	int shmid;
 	int cmd;
 	struct oshmid_ds *ubuf;
 };
-
 static int
 oshmctl(td, uap)
 	struct thread *td;
 	struct oshmctl_args *uap;
 {
 #ifdef COMPAT_43
 	int error = 0;
 	struct shmid_kernel *shmseg;
 	struct oshmid_ds outbuf;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 	mtx_lock(&Giant);
 	shmseg = shm_find_segment_by_shmid(uap->shmid);
 	if (shmseg == NULL) {
 		error = EINVAL;
 		goto done2;
 	}
 	switch (uap->cmd) {
 	case IPC_STAT:
 		error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
 		if (error)
 			goto done2;
 #ifdef MAC
 		error = mac_check_sysv_shmctl(td->td_ucred, shmseg, uap->cmd);
 		if (error != 0)
 			goto done2;
 #endif
 		outbuf.shm_perm = shmseg->u.shm_perm;
 		outbuf.shm_segsz = shmseg->u.shm_segsz;
 		outbuf.shm_cpid = shmseg->u.shm_cpid;
 		outbuf.shm_lpid = shmseg->u.shm_lpid;
 		outbuf.shm_nattch = shmseg->u.shm_nattch;
 		outbuf.shm_atime = shmseg->u.shm_atime;
 		outbuf.shm_dtime = shmseg->u.shm_dtime;
 		outbuf.shm_ctime = shmseg->u.shm_ctime;
 		outbuf.shm_handle = shmseg->u.shm_internal;
 		error = copyout(&outbuf, uap->ubuf, sizeof(outbuf));
 		if (error)
 			goto done2;
 		break;
 	default:
 		error = shmctl(td, (struct shmctl_args *)uap);
 		break;
 	}
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 #else
 	return (EINVAL);
 #endif
 }
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmctl_args {
 	int shmid;
 	int cmd;
 	struct shmid_ds *buf;
 };
 #endif
-
 int
 kern_shmctl(td, shmid, cmd, buf, bufsz)
 	struct thread *td;
 	int shmid;
 	int cmd;
 	void *buf;
 	size_t *bufsz;
 {
 	int error = 0;
 	struct shmid_kernel *shmseg;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	mtx_lock(&Giant);
 	switch (cmd) {
 	case IPC_INFO:
 		memcpy(buf, &shminfo, sizeof(shminfo));
 		if (bufsz)
 			*bufsz = sizeof(shminfo);
 		td->td_retval[0] = shmalloced;
 		goto done2;
 	case SHM_INFO: {
 		struct shm_info shm_info;
 		shm_info.used_ids = shm_nused;
 		shm_info.shm_rss = 0;	/*XXX where to get from ? */
 		shm_info.shm_tot = 0;	/*XXX where to get from ? */
 		shm_info.shm_swp = 0;	/*XXX where to get from ? */
 		shm_info.swap_attempts = 0;	/*XXX where to get from ? */
 		shm_info.swap_successes = 0;	/*XXX where to get from ? */
 		memcpy(buf, &shm_info, sizeof(shm_info));
 		if (bufsz)
 			*bufsz = sizeof(shm_info);
 		td->td_retval[0] = shmalloced;
 		goto done2;
 	}
 	}
 	if (cmd == SHM_STAT)
 		shmseg = shm_find_segment_by_shmidx(shmid);
 	else
 		shmseg = shm_find_segment_by_shmid(shmid);
 	if (shmseg == NULL) {
 		error = EINVAL;
 		goto done2;
 	}
 #ifdef MAC
 	error = mac_check_sysv_shmctl(td->td_ucred, shmseg, cmd);
 	if (error != 0)
 		goto done2;
 #endif
 	switch (cmd) {
 	case SHM_STAT:
 	case IPC_STAT:
 		error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
 		if (error)
 			goto done2;
 		memcpy(buf, &shmseg->u, sizeof(struct shmid_ds));
 		if (bufsz)
 			*bufsz = sizeof(struct shmid_ds);
 		if (cmd == SHM_STAT)
 			td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm);
 		break;
 	case IPC_SET: {
 		struct shmid_ds *shmid;
 
 		shmid = (struct shmid_ds *)buf;
 		error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
 		if (error)
 			goto done2;
 		shmseg->u.shm_perm.uid = shmid->shm_perm.uid;
 		shmseg->u.shm_perm.gid = shmid->shm_perm.gid;
 		shmseg->u.shm_perm.mode =
 		    (shmseg->u.shm_perm.mode & ~ACCESSPERMS) |
 		    (shmid->shm_perm.mode & ACCESSPERMS);
 		shmseg->u.shm_ctime = time_second;
 		break;
 	}
 	case IPC_RMID:
 		error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
 		if (error)
 			goto done2;
 		shmseg->u.shm_perm.key = IPC_PRIVATE;
 		shmseg->u.shm_perm.mode |= SHMSEG_REMOVED;
 		if (shmseg->u.shm_nattch <= 0) {
 			shm_deallocate_segment(shmseg);
 			shm_last_free = IPCID_TO_IX(shmid);
 		}
 		break;
 #if 0
 	case SHM_LOCK:
 	case SHM_UNLOCK:
 #endif
 	default:
 		error = EINVAL;
 		break;
 	}
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 int
 shmctl(td, uap)
 	struct thread *td;
 	struct shmctl_args *uap;
 {
 	int error = 0;
 	struct shmid_ds buf;
 	size_t bufsz;
 	
 	/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
 	if (uap->cmd == IPC_SET) {
 		if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds))))
 			goto done;
 	}
 	
 	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
 	if (error)
 		goto done;
 	
 	/* Cases in which we need to copyout */
 	switch (uap->cmd) {
 	case IPC_INFO:
 	case SHM_INFO:
 	case SHM_STAT:
 	case IPC_STAT:
 		error = copyout(&buf, uap->buf, bufsz);
 		break;
 	}
 
 done:
 	if (error) {
 		/* Invalidate the return value */
 		td->td_retval[0] = -1;
 	}
 	return (error);
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmget_args {
 	key_t key;
 	size_t size;
 	int shmflg;
 };
 #endif
-
 static int
 shmget_existing(td, uap, mode, segnum)
 	struct thread *td;
 	struct shmget_args *uap;
 	int mode;
 	int segnum;
 {
 	struct shmid_kernel *shmseg;
 	int error;
 
 	shmseg = &shmsegs[segnum];
 	if (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) {
 		/*
 		 * This segment is in the process of being allocated.  Wait
 		 * until it's done, and look the key up again (in case the
 		 * allocation failed or it was freed).
 		 */
 		shmseg->u.shm_perm.mode |= SHMSEG_WANTED;
 		error = tsleep(shmseg, PLOCK | PCATCH, "shmget", 0);
 		if (error)
 			return (error);
 		return (EAGAIN);
 	}
 	if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
 		return (EEXIST);
 #ifdef MAC
 	error = mac_check_sysv_shmget(td->td_ucred, shmseg, uap->shmflg);
 	if (error != 0)
 		return (error);
 #endif
 	if (uap->size && uap->size > shmseg->u.shm_segsz)
 		return (EINVAL);
 	td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
 	return (0);
 }
 
 static int
 shmget_allocate_segment(td, uap, mode)
 	struct thread *td;
 	struct shmget_args *uap;
 	int mode;
 {
 	int i, segnum, shmid, size;
 	struct ucred *cred = td->td_ucred;
 	struct shmid_kernel *shmseg;
 	vm_object_t shm_object;
 
 	GIANT_REQUIRED;
 
 	if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
 		return (EINVAL);
 	if (shm_nused >= shminfo.shmmni) /* Any shmids left? */
 		return (ENOSPC);
 	size = round_page(uap->size);
 	if (shm_committed + btoc(size) > shminfo.shmall)
 		return (ENOMEM);
 	if (shm_last_free < 0) {
 		shmrealloc();	/* Maybe expand the shmsegs[] array. */
 		for (i = 0; i < shmalloced; i++)
 			if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE)
 				break;
 		if (i == shmalloced)
 			return (ENOSPC);
 		segnum = i;
 	} else  {
 		segnum = shm_last_free;
 		shm_last_free = -1;
 	}
 	shmseg = &shmsegs[segnum];
 	/*
 	 * In case we sleep in malloc(), mark the segment present but deleted
 	 * so that noone else tries to create the same key.
 	 */
 	shmseg->u.shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
 	shmseg->u.shm_perm.key = uap->key;
 	shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff;
 	shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
 	
 	/*
 	 * We make sure that we have allocated a pager before we need
 	 * to.
 	 */
 	if (shm_use_phys) {
 		shm_object =
 		    vm_pager_allocate(OBJT_PHYS, 0, size, VM_PROT_DEFAULT, 0);
 	} else {
 		shm_object =
 		    vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0);
 	}
 	VM_OBJECT_LOCK(shm_object);
 	vm_object_clear_flag(shm_object, OBJ_ONEMAPPING);
 	vm_object_set_flag(shm_object, OBJ_NOSPLIT);
 	VM_OBJECT_UNLOCK(shm_object);
 
 	shmseg->u.shm_internal = shm_object;
 	shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid;
 	shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid;
 	shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) |
 	    (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
 	shmseg->u.shm_segsz = uap->size;
 	shmseg->u.shm_cpid = td->td_proc->p_pid;
 	shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0;
 	shmseg->u.shm_atime = shmseg->u.shm_dtime = 0;
 #ifdef MAC
 	mac_create_sysv_shm(cred, shmseg);
 #endif
 	shmseg->u.shm_ctime = time_second;
 	shm_committed += btoc(size);
 	shm_nused++;
 	if (shmseg->u.shm_perm.mode & SHMSEG_WANTED) {
 		/*
 		 * Somebody else wanted this key while we were asleep.  Wake
 		 * them up now.
 		 */
 		shmseg->u.shm_perm.mode &= ~SHMSEG_WANTED;
 		wakeup(shmseg);
 	}
 	td->td_retval[0] = shmid;
 	return (0);
 }
 
 int
 shmget(td, uap)
 	struct thread *td;
 	struct shmget_args *uap;
 {
 	int segnum, mode;
 	int error;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 	mtx_lock(&Giant);
 	mode = uap->shmflg & ACCESSPERMS;
 	if (uap->key != IPC_PRIVATE) {
 	again:
 		segnum = shm_find_segment_by_key(uap->key);
 		if (segnum >= 0) {
 			error = shmget_existing(td, uap, mode, segnum);
 			if (error == EAGAIN)
 				goto again;
 			goto done2;
 		}
 		if ((uap->shmflg & IPC_CREAT) == 0) {
 			error = ENOENT;
 			goto done2;
 		}
 	}
 	error = shmget_allocate_segment(td, uap, mode);
 done2:
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 int
 shmsys(td, uap)
 	struct thread *td;
 	/* XXX actually varargs. */
 	struct shmsys_args /* {
 		int	which;
 		int	a2;
 		int	a3;
 		int	a4;
 	} */ *uap;
 {
 #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 	int error;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 	if (uap->which < 0 ||
 	    uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0]))
 		return (EINVAL);
 	mtx_lock(&Giant);
 	error = (*shmcalls[uap->which])(td, &uap->a2);
 	mtx_unlock(&Giant);
 	return (error);
 #else
 	return (nosys(td, NULL));
 #endif
 }
 
 static void
 shmfork_myhook(p1, p2)
 	struct proc *p1, *p2;
 {
 	struct shmmap_state *shmmap_s;
 	size_t size;
 	int i;
 
 	mtx_lock(&Giant);
 	size = shminfo.shmseg * sizeof(struct shmmap_state);
 	shmmap_s = malloc(size, M_SHM, M_WAITOK);
 	bcopy(p1->p_vmspace->vm_shm, shmmap_s, size);
 	p2->p_vmspace->vm_shm = shmmap_s;
 	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
 		if (shmmap_s->shmid != -1)
 			shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++;
 	mtx_unlock(&Giant);
 }
 
 static void
 shmexit_myhook(struct vmspace *vm)
 {
 	struct shmmap_state *base, *shm;
 	int i;
 
 	if ((base = vm->vm_shm) != NULL) {
 		vm->vm_shm = NULL;
 		mtx_lock(&Giant);
 		for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) {
 			if (shm->shmid != -1)
 				shm_delete_mapping(vm, shm);
 		}
 		mtx_unlock(&Giant);
 		free(base, M_SHM);
 	}
 }
 
 static void
 shmrealloc(void)
 {
 	int i;
 	struct shmid_kernel *newsegs;
 
 	if (shmalloced >= shminfo.shmmni)
 		return;
 
 	newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK);
 	if (newsegs == NULL)
 		return;
 	for (i = 0; i < shmalloced; i++)
 		bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0]));
 	for (; i < shminfo.shmmni; i++) {
 		shmsegs[i].u.shm_perm.mode = SHMSEG_FREE;
 		shmsegs[i].u.shm_perm.seq = 0;
 #ifdef MAC
 		mac_init_sysv_shm(&shmsegs[i]);
 #endif
 	}
 	free(shmsegs, M_SHM);
 	shmsegs = newsegs;
 	shmalloced = shminfo.shmmni;
 }
 
 static void
 shminit()
 {
 	int i;
 
 	TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall);
 	for (i = PAGE_SIZE; i > 0; i--) {
 		shminfo.shmmax = shminfo.shmall * i;
 		if (shminfo.shmmax >= shminfo.shmall)
 			break;
 	}
 	TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
 	TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
 	TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
 	TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys);
 
 	shmalloced = shminfo.shmmni;
 	shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK);
 	if (shmsegs == NULL)
 		panic("cannot allocate initial memory for sysvshm");
 	for (i = 0; i < shmalloced; i++) {
 		shmsegs[i].u.shm_perm.mode = SHMSEG_FREE;
 		shmsegs[i].u.shm_perm.seq = 0;
 #ifdef MAC
 		mac_init_sysv_shm(&shmsegs[i]);
 #endif
 	}
 	shm_last_free = 0;
 	shm_nused = 0;
 	shm_committed = 0;
 	shmexit_hook = &shmexit_myhook;
 	shmfork_hook = &shmfork_myhook;
 }
 
 static int
 shmunload()
 {
 #ifdef MAC
 	int i;	
 #endif
 
 	if (shm_nused > 0)
 		return (EBUSY);
 
 #ifdef MAC
 	for (i = 0; i < shmalloced; i++)
 		mac_destroy_sysv_shm(&shmsegs[i]);
 #endif
 	free(shmsegs, M_SHM);
 	shmexit_hook = NULL;
 	shmfork_hook = NULL;
 	return (0);
 }
 
 static int
 sysctl_shmsegs(SYSCTL_HANDLER_ARGS)
 {
 
 	return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0])));
 }
 
 static int
 sysvshm_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		shminit();
 		break;
 	case MOD_UNLOAD:
 		error = shmunload();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t sysvshm_mod = {
 	"sysvshm",
 	&sysvshm_modload,
 	NULL
 };
 
 SYSCALL_MODULE_HELPER(shmsys);
 SYSCALL_MODULE_HELPER(shmat);
 SYSCALL_MODULE_HELPER(shmctl);
 SYSCALL_MODULE_HELPER(shmdt);
 SYSCALL_MODULE_HELPER(shmget);
 
 DECLARE_MODULE(sysvshm, sysvshm_mod,
 	SI_SUB_SYSV_SHM, SI_ORDER_FIRST);
 MODULE_VERSION(sysvshm, 1);
Index: head/sys/kern/uipc_mqueue.c
===================================================================
--- head/sys/kern/uipc_mqueue.c	(revision 167231)
+++ head/sys/kern/uipc_mqueue.c	(revision 167232)
@@ -1,2496 +1,2484 @@
 /*-
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * POSIX message queue implementation.
  *
  * 1) A mqueue filesystem can be mounted, each message queue appears
  *    in mounted directory, user can change queue's permission and
  *    ownership, or remove a queue. Manually creating a file in the
  *    directory causes a message queue to be created in the kernel with
  *    default message queue attributes applied and same name used, this
  *    method is not advocated since mq_open syscall allows user to specify
  *    different attributes. Also the file system can be mounted multiple
  *    times at different mount points but shows same contents.
  *
  * 2) Standard POSIX message queue API. The syscalls do not use vfs layer,
  *    but directly operate on internal data structure, this allows user to
  *    use the IPC facility without having to mount mqueue file system.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/buf.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/posix4.h>
 #include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sysproto.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <machine/atomic.h>
 
 /*
  * Limits and constants
  */
 #define	MQFS_NAMELEN		NAME_MAX
 #define MQFS_DELEN		(8 + MQFS_NAMELEN)
 
 /* node types */
 typedef enum {
 	mqfstype_none = 0,
 	mqfstype_root,
 	mqfstype_dir,
 	mqfstype_this,
 	mqfstype_parent,
 	mqfstype_file,
 	mqfstype_symlink,
 } mqfs_type_t;
 
 struct mqfs_node;
 
 /*
  * mqfs_info: describes a mqfs instance
  */
 struct mqfs_info {
 	struct sx		mi_lock;
 	struct mqfs_node	*mi_root;
 	struct unrhdr		*mi_unrhdr;
 };
 
 struct mqfs_vdata {
 	LIST_ENTRY(mqfs_vdata)	mv_link;
 	struct mqfs_node	*mv_node;
 	struct vnode		*mv_vnode;
 	struct task		mv_task;
 };
 
 /*
  * mqfs_node: describes a node (file or directory) within a mqfs
  */
 struct mqfs_node {
 	char			mn_name[MQFS_NAMELEN+1];
 	struct mqfs_info	*mn_info;
 	struct mqfs_node	*mn_parent;
 	LIST_HEAD(,mqfs_node)	mn_children;
 	LIST_ENTRY(mqfs_node)	mn_sibling;
 	LIST_HEAD(,mqfs_vdata)	mn_vnodes;
 	int			mn_refcount;
 	mqfs_type_t		mn_type;
 	int			mn_deleted;
 	u_int32_t		mn_fileno;
 	void			*mn_data;
 	struct timespec		mn_birth;
 	struct timespec		mn_ctime;
 	struct timespec		mn_atime;
 	struct timespec		mn_mtime;
 	uid_t			mn_uid;
 	gid_t			mn_gid;
 	int			mn_mode;
 };
 
 #define	VTON(vp)	(((struct mqfs_vdata *)((vp)->v_data))->mv_node)
 #define VTOMQ(vp) 	((struct mqueue *)(VTON(vp)->mn_data))
 #define	VFSTOMQFS(m)	((struct mqfs_info *)((m)->mnt_data))
 #define	FPTOMQ(fp)	((struct mqueue *)(((struct mqfs_node *) \
 				(fp)->f_data)->mn_data))
 
 TAILQ_HEAD(msgq, mqueue_msg);
 
 struct mqueue;
 
 struct mqueue_notifier {
 	LIST_ENTRY(mqueue_notifier)	nt_link;
 	struct sigevent			nt_sigev;
 	ksiginfo_t			nt_ksi;
 	struct proc			*nt_proc;
 };
 
 struct mqueue {
 	struct mtx	mq_mutex;
 	int		mq_flags;
 	long		mq_maxmsg;
 	long		mq_msgsize;
 	long		mq_curmsgs;
 	long		mq_totalbytes;
 	struct msgq	mq_msgq;
 	int		mq_receivers;
 	int		mq_senders;
 	struct selinfo	mq_rsel;
 	struct selinfo	mq_wsel;
 	struct mqueue_notifier	*mq_notifier;
 };
 
 #define	MQ_RSEL		0x01
 #define	MQ_WSEL		0x02
 
 struct mqueue_msg {
 	TAILQ_ENTRY(mqueue_msg)	msg_link;
 	unsigned int	msg_prio;
 	unsigned int	msg_size;
 	/* following real data... */
 };
 
 SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
 	"POSIX real time message queue");
 
 static int	default_maxmsg  = 10;
 static int	default_msgsize = 1024;
 
 static int	maxmsg = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
     &maxmsg, 0, "Default maximum messages in queue");
 static int	maxmsgsize = 16384;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
     &maxmsgsize, 0, "Default maximum message size");
 static int	maxmq = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
     &maxmq, 0, "maximum message queues");
 static int	curmq = 0;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
     &curmq, 0, "current message queue number");
 static int	unloadable = 0;
 static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
 
 static eventhandler_tag exit_tag;
 
 /* Only one instance per-system */
 static struct mqfs_info		mqfs_data;
 static uma_zone_t		mqnode_zone;
 static uma_zone_t		mqueue_zone;
 static uma_zone_t		mvdata_zone;
 static uma_zone_t		mqnoti_zone;
 static struct vop_vector	mqfs_vnodeops;
 static struct fileops		mqueueops;
 
 /*
  * Directory structure construction and manipulation
  */
 #ifdef notyet
 static struct mqfs_node	*mqfs_create_dir(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static struct mqfs_node	*mqfs_create_link(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 #endif
 
 static struct mqfs_node	*mqfs_create_file(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static int	mqfs_destroy(struct mqfs_node *mn);
 static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
 static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
 static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
 
 /*
  * Message queue construction and maniplation
  */
 static struct mqueue	*mqueue_alloc(const struct mq_attr *attr);
 static void	mqueue_free(struct mqueue *mq);
 static int	mqueue_send(struct mqueue *mq, const char *msg_ptr,
 			size_t msg_len, unsigned msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	mqueue_receive(struct mqueue *mq, char *msg_ptr,
 			size_t msg_len, unsigned *msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
 			int timo);
 static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
 			int timo);
 static void	mqueue_send_notification(struct mqueue *mq);
 static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
 static void	mq_proc_exit(void *arg, struct proc *p);
 
 /*
  * kqueue filters
  */
 static void	filt_mqdetach(struct knote *kn);
 static int	filt_mqread(struct knote *kn, long hint);
 static int	filt_mqwrite(struct knote *kn, long hint);
 
 struct filterops mq_rfiltops =
 	{ 1, NULL, filt_mqdetach, filt_mqread };
 struct filterops mq_wfiltops =
 	{ 1, NULL, filt_mqdetach, filt_mqwrite };
 
 /*
  * Initialize fileno bitmap
  */
 static void
 mqfs_fileno_init(struct mqfs_info *mi)
 {
 	struct unrhdr *up;
 
 	up = new_unrhdr(1, INT_MAX, NULL);
 	mi->mi_unrhdr = up;
 }
 
 /*
  * Tear down fileno bitmap
  */
 static void
 mqfs_fileno_uninit(struct mqfs_info *mi)
 {
 	struct unrhdr *up;
 
 	up = mi->mi_unrhdr;
 	mi->mi_unrhdr = NULL;
 	delete_unrhdr(up);
 }
 
 /*
  * Allocate a file number
  */
 static void
 mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	/* make sure our parent has a file number */
 	if (mn->mn_parent && !mn->mn_parent->mn_fileno)
 		mqfs_fileno_alloc(mi, mn->mn_parent);
 
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
 		break;
 	case mqfstype_this:
 		KASSERT(mn->mn_parent != NULL,
 		    ("mqfstype_this node has no parent"));
 		mn->mn_fileno = mn->mn_parent->mn_fileno;
 		break;
 	case mqfstype_parent:
 		KASSERT(mn->mn_parent != NULL,
 		    ("mqfstype_parent node has no parent"));
 		if (mn->mn_parent == mi->mi_root) {
 			mn->mn_fileno = mn->mn_parent->mn_fileno;
 			break;
 		}
 		KASSERT(mn->mn_parent->mn_parent != NULL,
 		    ("mqfstype_parent node has no grandparent"));
 		mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_alloc() called for unknown type node: %d",
 			mn->mn_type));
 		break;
 	}
 }
 
 /*
  * Release a file number
  */
 static void
 mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		free_unr(mi->mi_unrhdr, mn->mn_fileno);
 		break;
 	case mqfstype_this:
 	case mqfstype_parent:
 		/* ignore these, as they don't "own" their file number */
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_free() called for unknown type node: %d", 
 			mn->mn_type));
 		break;
 	}
 }
 
 static __inline struct mqfs_node *
 mqnode_alloc(void)
 {
 	return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
 }
 
 static __inline void
 mqnode_free(struct mqfs_node *node)
 {
 	uma_zfree(mqnode_zone, node);
 }
 
 static __inline void
 mqnode_addref(struct mqfs_node *node)
 {
 	atomic_fetchadd_int(&node->mn_refcount, 1);
 }
 
 static __inline void
 mqnode_release(struct mqfs_node *node)
 {
 	int old, exp;
 
 	old = atomic_fetchadd_int(&node->mn_refcount, -1);
 	if (node->mn_type == mqfstype_dir ||
 	    node->mn_type == mqfstype_root)
 		exp = 3; /* include . and .. */
 	else
 		exp = 1;
 	if (old == exp)
 		mqfs_destroy(node);
 }
 
 /*
  * Add a node to a directory
  */
 static int
 mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
 {
 	KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
 	KASSERT(parent->mn_info != NULL,
 	    ("%s(): parent has no mn_info", __func__));
 	KASSERT(parent->mn_type == mqfstype_dir ||
 	    parent->mn_type == mqfstype_root,
 	    ("%s(): parent is not a directory", __func__));
 
 	node->mn_info = parent->mn_info;
 	node->mn_parent = parent;
 	LIST_INIT(&node->mn_children);
 	LIST_INIT(&node->mn_vnodes);
 	LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
 	mqnode_addref(parent);
 	return (0);
 }
 
 static struct mqfs_node *
 mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
 	int nodetype)
 {
 	struct mqfs_node *node;
 
 	node = mqnode_alloc();
 	strncpy(node->mn_name, name, namelen);
 	node->mn_type = nodetype;
 	node->mn_refcount = 1;
 	getnanotime(&node->mn_birth);
 	node->mn_ctime = node->mn_atime = node->mn_mtime
 		= node->mn_birth;
 	node->mn_uid = cred->cr_uid;
 	node->mn_gid = cred->cr_gid;
 	node->mn_mode = mode;
 	return (node);
 }
 
 /*
  * Create a file
  */
 static struct mqfs_node *
 mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 /*
  * Add . and .. to a directory
  */
 static int
 mqfs_fixup_dir(struct mqfs_node *parent)
 {
 	struct mqfs_node *dir;
 
 	dir = mqnode_alloc();
 	dir->mn_name[0] = '.';
 	dir->mn_type = mqfstype_this;
 	dir->mn_refcount = 1;
 	if (mqfs_add_node(parent, dir) != 0) {
 		mqnode_free(dir);
 		return (-1);
 	}
 
 	dir = mqnode_alloc();
 	dir->mn_name[0] = dir->mn_name[1] = '.';
 	dir->mn_type = mqfstype_parent;
 	dir->mn_refcount = 1;
 
 	if (mqfs_add_node(parent, dir) != 0) {
 		mqnode_free(dir);
 		return (-1);
 	}
 
 	return (0);
 }
 
 #ifdef notyet
 
 /*
  * Create a directory
  */
 static struct mqfs_node *
 mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 
 	if (mqfs_fixup_dir(node) != 0) {
 		mqfs_destroy(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 /*
  * Create a symlink
  */
 static struct mqfs_node *
 mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 #endif
 
 /*
  * Destroy a node or a tree of nodes
  */
 static int
 mqfs_destroy(struct mqfs_node *node)
 {
 	struct mqfs_node *parent;
 
 	KASSERT(node != NULL,
 	    ("%s(): node is NULL", __func__));
 	KASSERT(node->mn_info != NULL,
 	    ("%s(): node has no mn_info", __func__));
 
 	/* destroy children */
 	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
 		while (! LIST_EMPTY(&node->mn_children))
 			mqfs_destroy(LIST_FIRST(&node->mn_children));
 
 	/* unlink from parent */
 	if ((parent = node->mn_parent) != NULL) {
 		KASSERT(parent->mn_info == node->mn_info,
 		    ("%s(): parent has different mn_info", __func__));
 		LIST_REMOVE(node, mn_sibling);
 	}
 
 	if (node->mn_fileno != 0)
 		mqfs_fileno_free(node->mn_info, node);
 	if (node->mn_data != NULL)
 		mqueue_free(node->mn_data);
 	mqnode_free(node);
 	return (0);
 }
 
 /*
  * Mount a mqfs instance
  */
 static int
 mqfs_mount(struct mount *mp, struct thread *td)
 {
 	struct statfs *sbp;
 
 	if (mp->mnt_flag & MNT_UPDATE)
 		return (EOPNOTSUPP);
 
 	mp->mnt_data = &mqfs_data;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_MPSAFE;
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 
 	sbp = &mp->mnt_stat;
 	vfs_mountedfrom(mp, "mqueue");
 	sbp->f_bsize = PAGE_SIZE;
 	sbp->f_iosize = PAGE_SIZE;
 	sbp->f_blocks = 1;
 	sbp->f_bfree = 0;
 	sbp->f_bavail = 0;
 	sbp->f_files = 1;
 	sbp->f_ffree = 0;
 	return (0);
 }
 
 /*
  * Unmount a mqfs instance
  */
 static int
 mqfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 {
 	int error;
 
 	error = vflush(mp, 0, (mntflags & MNT_FORCE) ?  FORCECLOSE : 0, td);
 	return (error);
 }
 
 /*
  * Return a root vnode
  */
 static int
 mqfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td)
 {
 	struct mqfs_info *mqfs;
 	int ret;
 
 	mqfs = VFSTOMQFS(mp);
 	sx_xlock(&mqfs->mi_lock);
 	ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
 	sx_xunlock(&mqfs->mi_lock);
 	return (ret);
 }
 
 /*
  * Return filesystem stats
  */
 static int
 mqfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
 {
 	/* XXX update statistics */
 	return (0);
 }
 
 /*
  * Initialize a mqfs instance
  */
 static int
 mqfs_init(struct vfsconf *vfc)
 {
 	struct mqfs_node *root;
 	struct mqfs_info *mi;
 
 	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mvdata_zone = uma_zcreate("mvdata",
 		sizeof(struct mqfs_vdata), NULL, NULL, NULL,
 		NULL, UMA_ALIGN_PTR, 0);
 	mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mi = &mqfs_data;
 	sx_init(&mi->mi_lock, "mqfs lock");
 	/* set up the root diretory */
 	root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
 		mqfstype_root);
 	root->mn_info = mi;
 	LIST_INIT(&root->mn_children);
 	LIST_INIT(&root->mn_vnodes);
 	mi->mi_root = root;
 	mqfs_fileno_init(mi);
 	mqfs_fileno_alloc(mi, root);
 	mqfs_fixup_dir(root);
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	mq_fdclose = mqueue_fdclose;
 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
 	return (0);
 }
 
 /*
  * Destroy a mqfs instance
  */
 static int
 mqfs_uninit(struct vfsconf *vfc)
 {
 	struct mqfs_info *mi;
 
 	if (!unloadable)
 		return (EOPNOTSUPP);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	mi = &mqfs_data;
 	mqfs_destroy(mi->mi_root);
 	mi->mi_root = NULL;
 	mqfs_fileno_uninit(mi);
 	sx_destroy(&mi->mi_lock);
 	uma_zdestroy(mqnode_zone);
 	uma_zdestroy(mqueue_zone);
 	uma_zdestroy(mvdata_zone);
 	uma_zdestroy(mqnoti_zone);
 	return (0);
 }
 
 /*
  * task routine
  */
 static void
 do_recycle(void *context, int pending __unused)
 {
 	struct vnode *vp = (struct vnode *)context;
 
 	vrecycle(vp, curthread);
 	vdrop(vp);
 }
 
 /*
  * Allocate a vnode
  */
 static int
 mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
 {
 	struct mqfs_vdata *vd;
 	int error;
 
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp)
 			break;
 	}
 
 	if (vd != NULL) {
 		if (vget(vd->mv_vnode, 0, curthread) == 0) {
 			*vpp = vd->mv_vnode;
 			vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE,
 			    curthread);
 			return (0);
 		}
 		/* XXX if this can happen, we're in trouble */
 	}
 
 	error = getnewvnode("mqueue", mp, &mqfs_vnodeops, vpp);
 	if (error)
 		return (error);
 	vd = uma_zalloc(mvdata_zone, M_WAITOK);
 	(*vpp)->v_data = vd;
 	vd->mv_vnode = *vpp;
 	vd->mv_node = pn;
 	TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
 	LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
 	mqnode_addref(pn);
 	switch (pn->mn_type) {
 	case mqfstype_root:
 		(*vpp)->v_vflag = VV_ROOT;
 		/* fall through */
 	case mqfstype_dir:
 	case mqfstype_this:
 	case mqfstype_parent:
 		(*vpp)->v_type = VDIR;
 		break;
 	case mqfstype_file:
 		(*vpp)->v_type = VREG;
 		break;
 	case mqfstype_symlink:
 		(*vpp)->v_type = VLNK;
 		break;
 	case mqfstype_none:
 		KASSERT(0, ("mqfs_allocf called for null node\n"));
 	default:
 		panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
 	}
 	vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
 	return (0);
 }
 
 /* 
  * Search a directory entry
  */
 static struct mqfs_node *
 mqfs_search(struct mqfs_node *pd, const char *name, int len)
 {
 	struct mqfs_node *pn;
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		if (strncmp(pn->mn_name, name, len) == 0)
 			return (pn);
 	}
 	return (NULL);
 }
 
 /*
- * Look up a file or directory
+ * Look up a file or directory.
  */
 static int
 mqfs_lookupx(struct vop_cachedlookup_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	int nameiop, flags, error, namelen;
 	char *pname;
 	struct thread *td;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	namelen = cnp->cn_namelen;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	pd = VTON(dvp);
 	pn = NULL;
 	*vpp = NULLVP;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error)
 		return (error);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= MQFS_NAMELEN)
 		return (ENOENT);
 
 	/* self */
 	if (namelen == 1 && pname[0] == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		pn = pd;
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (dvp->v_vflag & VV_ROOT)
 			return (EIO);
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		VOP_UNLOCK(dvp, 0, cnp->cn_thread);
 		KASSERT(pd->mn_parent, ("non-root directory has no parent"));
 		pn = pd->mn_parent;
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
 		return (error);
 	}
 
 	/* named node */
 	pn = mqfs_search(pd, pname, namelen);
 	
 	/* found */
 	if (pn != NULL) {
 		/* DELETE */
 		if (nameiop == DELETE && (flags & ISLASTCN)) {
 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 			if (error)
 				return (error);
 			if (*vpp == dvp) {
 				VREF(dvp);
 				*vpp = dvp;
 				return (0);
 			}
 		}
 
 		/* allocate vnode */
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		if (error == 0 && cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, cnp);
 		return (error);
 	}
 	
 	/* not found */
 
 	/* will create a new entry in the directory ? */
 	if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
 	    && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	return (ENOENT);
 }
 
 #if 0
 struct vop_lookup_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode lookup operation
  */
 static int
 mqfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	int rc;
 
 	sx_xlock(&mqfs->mi_lock);
 	rc = mqfs_lookupx(ap);
 	sx_xunlock(&mqfs->mi_lock);
 	return (rc);
 }
 
 #if 0
 struct vop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * vnode creation operation
  */
 static int
 mqfs_create(struct vop_create_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqueue *mq;
 	int error;
 
 	pd = VTON(ap->a_dvp);
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	mq = mqueue_alloc(NULL);
 	if (mq == NULL)
 		return (EAGAIN);
 	sx_xlock(&mqfs->mi_lock);
 #if 0
 	/* named node */
 	pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen);
 	if (pn != NULL) {
 		mqueue_free(mq);
 		sx_xunlock(&mqfs->mi_lock);
 		return (EEXIST);
 	}
 #else
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 #endif
 	pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		cnp->cn_cred, ap->a_vap->va_mode);
 	if (pn == NULL)
 		error = ENOSPC;
 	else {
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		if (error)
 			mqfs_destroy(pn);
 		else
 			pn->mn_data = mq;
 	}
 	sx_xunlock(&mqfs->mi_lock);
 	if (error)
 		mqueue_free(mq);
 	return (error);
 }
 
 /*
  * Remove an entry
  */
 static
 int do_unlink(struct mqfs_node *pn, struct ucred *ucred)
 {
 	struct mqfs_node *parent;
 	struct mqfs_vdata *vd;
 	int error = 0;
 
 	sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
 
 	/*
 	 * XXXRW: Other instances of the message queue primitive are
 	 * allowed in jail?
 	 */
 	if (ucred->cr_uid != pn->mn_uid &&
 	    (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0)
 		error = EACCES;
 	else if (!pn->mn_deleted) {
 		parent = pn->mn_parent;
 		pn->mn_parent = NULL;
 		pn->mn_deleted = 1;
 		LIST_REMOVE(pn, mn_sibling);
 		LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 			cache_purge(vd->mv_vnode);
 			vhold(vd->mv_vnode);
 			taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
 		}
 		mqnode_release(pn);
 		mqnode_release(parent);
 	} else
 		error = ENOENT;
 	return (error);
 }
 
 #if 0
 struct vop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode removal operation
  */
 static int
 mqfs_remove(struct vop_remove_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn;
 	int error;
 
 	if (ap->a_vp->v_type == VDIR)
                 return (EPERM);
 	pn = VTON(ap->a_vp);
 	sx_xlock(&mqfs->mi_lock);
 	error = do_unlink(pn, ap->a_cnp->cn_cred);
 	sx_xunlock(&mqfs->mi_lock);
 	return (error);
 }
 
 #if 0
 struct vop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_inactive(struct vop_inactive_args *ap)
 {
 	struct mqfs_node *pn = VTON(ap->a_vp);
 
 	if (pn->mn_deleted)
 		vrecycle(ap->a_vp, ap->a_td);
 	return (0);
 }
 
 #if 0
 struct vop_reclaim_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
 	struct vnode *vp = ap->a_vp;
 	struct mqfs_node *pn;
 	struct mqfs_vdata *vd;
 
 	vd = vp->v_data;
 	pn = vd->mv_node;
 	sx_xlock(&mqfs->mi_lock);
 	vp->v_data = NULL;
 	LIST_REMOVE(vd, mv_link);
 	uma_zfree(mvdata_zone, vd);
 	mqnode_release(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	return (0);
 }
 
 #if 0
 struct vop_open_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 	int a_fdidx;
 };
 #endif
 
 static int
 mqfs_open(struct vop_open_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_close_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_access_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Verify permissions
  */
 static int
 mqfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr vattr;
 	int error;
 
 	error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
 	if (error)
 		return (error);
 	error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
 	    vattr.va_gid, ap->a_mode, ap->a_cred, NULL);
 	return (error);
 }
 
 #if 0
 struct vop_getattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Get file attributes
  */
 static int
 mqfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct mqfs_node *pn = VTON(vp);
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 
 	VATTR_NULL(vap);
 	vap->va_type = vp->v_type;
 	vap->va_mode = pn->mn_mode;
 	vap->va_nlink = 1;
 	vap->va_uid = pn->mn_uid;
 	vap->va_gid = pn->mn_gid;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = pn->mn_fileno;
 	vap->va_size = 0;
 	vap->va_blocksize = PAGE_SIZE;
 	vap->va_bytes = vap->va_size = 0;
 	vap->va_atime = pn->mn_atime;
 	vap->va_mtime = pn->mn_mtime;
 	vap->va_ctime = pn->mn_ctime;
 	vap->va_birthtime = pn->mn_birth;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_rdev = 0;
 	vap->va_bytes = 0;
 	vap->va_filerev = 0;
 	vap->va_vaflags = 0;
 	return (error);
 }
 
 #if 0
 struct vop_setattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 /*
  * Set attributes
  */
 static int
 mqfs_setattr(struct vop_setattr_args *ap)
 {
 	struct mqfs_node *pn;
 	struct vattr *vap;
 	struct vnode *vp;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	pn = VTON(vp);
 
 	error = c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = pn->mn_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = pn->mn_gid;
 	else
 		gid = vap->va_gid;
 
 	if (uid != pn->mn_uid || gid != pn->mn_gid) {
 		/*
 		 * To modify the ownership of a file, must possess VADMIN
 		 * for that file.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)))
 			return (error);
 
 		/*
 		 * XXXRW: Why is there a privilege check here: shouldn't the
 		 * check in VOP_ACCESS() be enough?  Also, are the group bits
 		 * below definitely right?
 		 */
 		if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
 		    (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
 		    (error = priv_check_cred(ap->a_td->td_ucred,
 		    PRIV_MQ_ADMIN, SUSER_ALLOWJAIL)) != 0)
 			return (error);
 		pn->mn_uid = uid;
 		pn->mn_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if ((ap->a_cred->cr_uid != pn->mn_uid) &&
 		    (error = priv_check_cred(ap->a_td->td_ucred,
 		    PRIV_MQ_ADMIN, SUSER_ALLOWJAIL)))
 			return (error);
 		pn->mn_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		/* See the comment in ufs_vnops::ufs_setattr(). */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td))))
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			pn->mn_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			pn->mn_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 	if (c) {
 		vfs_timestamp(&pn->mn_ctime);
 	}
 	return (0);
 }
 
 #if 0
 struct vop_read_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int a_ioflag;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Read from a file
  */
 static int
 mqfs_read(struct vop_read_args *ap)
 {
 	char buf[80];
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct mqfs_node *pn;
 	struct mqueue *mq;
 	int len, error;
 
 	if (vp->v_type != VREG)
 		return (EINVAL);
 
 	pn = VTON(vp);
 	mq = VTOMQ(vp);
 	snprintf(buf, sizeof(buf),
 		"QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
 		mq->mq_totalbytes,
 		mq->mq_maxmsg,
 		mq->mq_curmsgs,
 		mq->mq_msgsize);
 	buf[sizeof(buf)-1] = '\0';
 	len = strlen(buf);
 	error = uiomove_frombuf(buf, len, uio);
 	return (error);
 }
 
 #if 0
 struct vop_readdir_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *a_ncookies;
 	u_long **a_cookies;
 };
 #endif
 
 /*
  * Return directory entries.
  */
 static int
 mqfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_info *mi;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct dirent entry;
 	struct uio *uio;
 	int *tmp_ncookies = NULL;
 	off_t offset;
 	int error, i;
 
 	vp = ap->a_vp;
 	mi = VFSTOMQFS(vp->v_mount);
 	pd = VTON(vp);
 	uio = ap->a_uio;
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
         }
 
 	error = 0;
 	offset = 0;
 
 	sx_xlock(&mi->mi_lock);
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		entry.d_reclen = sizeof(entry);
 		if (!pn->mn_fileno)
 			mqfs_fileno_alloc(mi, pn);
 		entry.d_fileno = pn->mn_fileno;
 		for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
 			entry.d_name[i] = pn->mn_name[i];
 		entry.d_name[i] = 0;
 		entry.d_namlen = i;
 		switch (pn->mn_type) {
 		case mqfstype_root:
 		case mqfstype_dir:
 		case mqfstype_this:
 		case mqfstype_parent:
 			entry.d_type = DT_DIR;
 			break;
 		case mqfstype_file:
 			entry.d_type = DT_REG;
 			break;
 		case mqfstype_symlink:
 			entry.d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->mn_name,
 				pn->mn_type);
 		}
 		if (entry.d_reclen > uio->uio_resid)
                         break;
 		if (offset >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, &entry, offset);
                         if (error)
                                 break;
                 }
                 offset += entry.d_reclen;
 	}
 	sx_xunlock(&mi->mi_lock);
 
 	uio->uio_offset = offset;
 
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 #ifdef notyet
 
 #if 0
 struct vop_mkdir_args {
 	struct vnode *a_dvp;
 	struvt vnode **a_vpp;
 	struvt componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * Create a directory.
  */
 static int
 mqfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd = VTON(ap->a_dvp);
 	struct mqfs_node *pn;
 	int error;
 
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	sx_xlock(&mqfs->mi_lock);
 #if 0
 	/* named node */
 	pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen);
 	if (pn != NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (EEXIST);
 	}
 #else
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 #endif
 	pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		ap->a_vap->cn_cred, ap->a_vap->va_mode);
 	if (pn == NULL)
 		error = ENOSPC;
 	else
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 	sx_xunlock(&mqfs->mi_lock);
 	return (error);
 }
 
 #if 0
 struct vop_rmdir_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Remove a directory.
  */
 static int
 mqfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn = VTON(ap->a_vp);
 	struct mqfs_node *pt;
 
 	if (pn->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 
 	sx_xlock(&mqfs->mi_lock);
 	if (pn->mn_deleted) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOENT);
 	}
 
 	pt = LIST_FIRST(&pn->mn_children);
 	pt = LIST_NEXT(pt, mn_sibling);
 	pt = LIST_NEXT(pt, mn_sibling);
 	if (pt != NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOTEMPTY);
 	}
 	pt = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 	mqnode_release(pn);
 	mqnode_release(pt);
 	sx_xunlock(&mqfs->mi_lock);
 	cache_purge(ap->a_vp);
 	return (0);
 }
 
 #endif /* notyet */
 
 /*
  * Allocate a message queue
  */
 static struct mqueue *
 mqueue_alloc(const struct mq_attr *attr)
 {
 	struct mqueue *mq;
 
 	if (curmq >= maxmq)
 		return (NULL);
 	mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&mq->mq_msgq);
 	if (attr != NULL) {
 		mq->mq_maxmsg = attr->mq_maxmsg;
 		mq->mq_msgsize = attr->mq_msgsize;
 	} else {
 		mq->mq_maxmsg = default_maxmsg;
 		mq->mq_msgsize = default_msgsize;
 	}
 	mtx_init(&mq->mq_mutex, "mqueue", NULL, MTX_DEF);
 	knlist_init(&mq->mq_rsel.si_note, &mq->mq_mutex, NULL, NULL, NULL);
 	knlist_init(&mq->mq_wsel.si_note, &mq->mq_mutex, NULL, NULL, NULL);
 	atomic_add_int(&curmq, 1);
 	return (mq);
 }
 
 /*
  * Destroy a message queue
  */
 static void
 mqueue_free(struct mqueue *mq)
 {
 	struct mqueue_msg *msg;
 
 	while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
 		TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
 		FREE(msg, M_MQUEUEDATA);
 	}
 
 	mtx_destroy(&mq->mq_mutex);
 	knlist_destroy(&mq->mq_rsel.si_note);
 	knlist_destroy(&mq->mq_wsel.si_note);
 	uma_zfree(mqueue_zone, mq);
 	atomic_add_int(&curmq, -1);
 }
 
 /*
  * Load a message from user space
  */
 static struct mqueue_msg *
 mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
 {
 	struct mqueue_msg *msg;
 	size_t len;
 	int error;
 
 	len = sizeof(struct mqueue_msg) + msg_size;
 	MALLOC(msg, struct mqueue_msg *, len, M_MQUEUEDATA, M_WAITOK);
 	error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
 	    msg_size);
 	if (error) {
 		FREE(msg, M_MQUEUEDATA);
 		msg = NULL;
 	} else {
 		msg->msg_size = msg_size;
 		msg->msg_prio = msg_prio;
 	}
 	return (msg);
 }
 
 /*
  * Save a message to user space
  */
 static int
 mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
 {
 	int error;
 
 	error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
 		msg->msg_size);
 	if (error == 0 && msg_prio != NULL)
 		error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
 	return (error);
 }
 
 /*
  * Free a message's memory
  */
 static __inline void
 mqueue_freemsg(struct mqueue_msg *msg)
 {
 	FREE(msg, M_MQUEUEDATA);
 }
 
 /*
  * Send a message. if waitok is false, thread will not be
  * blocked if there is no data in queue, otherwise, absolute
  * time will be checked.
  */
 int
 mqueue_send(struct mqueue *mq, const char *msg_ptr,
 	size_t msg_len, unsigned msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ets, ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_prio >= MQ_PRIO_MAX)
 		return (EINVAL);
 	if (msg_len > mq->mq_msgsize)
 		return (EMSGSIZE);
 	msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
 	if (msg == NULL)
 		return (EFAULT);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		error = _mqueue_send(mq, msg, -1);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* we allow a null timeout (wait forever) */
 	if (abs_timeout == NULL) {
 		error = _mqueue_send(mq, msg, 0);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* send it before checking time */
 	error = _mqueue_send(mq, msg, -1);
 	if (error == 0)
 		return (0);
 
 	if (error != EAGAIN)
 		goto bad;
 
 	error = copyin(abs_timeout, &ets, sizeof(ets));
 	if (error != 0)
 		goto bad;
 	if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) {
 		error = EINVAL;
 		goto bad;
 	}
 	for (;;) {
 		ts2 = ets;
 		getnanotime(&ts);
 		timespecsub(&ts2, &ts);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			break;
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_send(mq, msg, tvtohz(&tv));
 		if (error != ETIMEDOUT)
 			break;
 	}
 	if (error == 0)
 		return (0);
 bad:
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to send a message
  */
 static int
 _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
 {	
 	struct mqueue_msg *msg2;
 	int error = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		mq->mq_senders++;
 		error = msleep(&mq->mq_senders, &mq->mq_mutex,
 			    PCATCH, "mqsend", timo);
 		mq->mq_senders--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (mq->mq_curmsgs >= mq->mq_maxmsg) {
 		mtx_unlock(&mq->mq_mutex);
 		return (error);
 	}
 	error = 0;
 	if (TAILQ_EMPTY(&mq->mq_msgq)) {
 		TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
 	} else {
 		if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
 			TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
 		} else {
 			TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
 				if (msg2->msg_prio < msg->msg_prio)
 					break;
 			}
 			TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
 		}
 	}
 	mq->mq_curmsgs++;
 	mq->mq_totalbytes += msg->msg_size;
 	if (mq->mq_receivers)
 		wakeup_one(&mq->mq_receivers);
 	else if (mq->mq_notifier != NULL)
 		mqueue_send_notification(mq);
 	if (mq->mq_flags & MQ_RSEL) {
 		mq->mq_flags &= ~MQ_RSEL;
 		selwakeup(&mq->mq_rsel);
 	}
 	KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
 	mtx_unlock(&mq->mq_mutex);
 	return (0);
 }
 
 /*
  * Send realtime a signal to process which registered itself
  * successfully by mq_notify.
  */
 static void
 mqueue_send_notification(struct mqueue *mq)
 {
 	struct mqueue_notifier *nt;
 	struct proc *p;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	nt = mq->mq_notifier;
 	if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
 		p = nt->nt_proc;
 		PROC_LOCK(p);
 		if (!KSI_ONQ(&nt->nt_ksi))
 			psignal_event(p, &nt->nt_sigev, &nt->nt_ksi);
 		PROC_UNLOCK(p);
 	}
 	mq->mq_notifier = NULL;
 }
 
 /*
  * Get a message. if waitok is false, thread will not be
  * blocked if there is no data in queue, otherwise, absolute
  * time will be checked.
  */
 int
 mqueue_receive(struct mqueue *mq, char *msg_ptr,
 	size_t msg_len, unsigned *msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ets, ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_len < mq->mq_msgsize)
 		return (EMSGSIZE);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		error = _mqueue_recv(mq, &msg, -1);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* we allow a null timeout (wait forever). */
 	if (abs_timeout == NULL) {
 		error = _mqueue_recv(mq, &msg, 0);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* try to get a message before checking time */
 	error = _mqueue_recv(mq, &msg, -1);
 	if (error == 0)
 		goto received;
 
 	if (error != EAGAIN)
 		return (error);
 
 	error = copyin(abs_timeout, &ets, sizeof(ets));
 	if (error != 0)
 		return (error);
 	if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) {
 		error = EINVAL;
 		return (error);
 	}
 
 	for (;;) {
 		ts2 = ets;
 		getnanotime(&ts);
 		timespecsub(&ts2, &ts);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			return (error);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_recv(mq, &msg, tvtohz(&tv));
 		if (error == 0)
 			break;
 		if (error != ETIMEDOUT)
 			return (error);
 	}
 
 received:
 	error = mqueue_savemsg(msg, msg_ptr, msg_prio);
 	if (error == 0) {
 		curthread->td_retval[0] = msg->msg_size;
 		curthread->td_retval[1] = 0;
 	}
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to receive a message
  */
 static int
 _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
 {	
 	int error = 0;
 	
 	mtx_lock(&mq->mq_mutex);
 	while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		mq->mq_receivers++;
 		error = msleep(&mq->mq_receivers, &mq->mq_mutex,
 			    PCATCH, "mqrecv", timo);
 		mq->mq_receivers--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (*msg != NULL) {
 		error = 0;
 		TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
 		mq->mq_curmsgs--;
 		mq->mq_totalbytes -= (*msg)->msg_size;
 		if (mq->mq_senders)
 			wakeup_one(&mq->mq_senders);
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
 	}
 	if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
 	    !TAILQ_EMPTY(&mq->mq_msgq)) {
 		mqueue_send_notification(mq);
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (error);
 }
 
 static __inline struct mqueue_notifier *
 notifier_alloc(void)
 {
 	return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
 }
 
 static __inline void
 notifier_free(struct mqueue_notifier *p)
 {
 	uma_zfree(mqnoti_zone, p);
 }
 
 static struct mqueue_notifier *
 notifier_search(struct proc *p, int fd)
 {
 	struct mqueue_notifier *nt;
 
 	LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
 		if (nt->nt_ksi.ksi_mqd == fd)
 			break;
 	}
 	return (nt);
 }
 
 static __inline void
 notifier_insert(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
 }
 
 static __inline void
 notifier_delete(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_REMOVE(nt, nt_link);
 	notifier_free(nt);
 }
 
 static void
 notifier_remove(struct proc *p, struct mqueue *mq, int fd)
 {
 	struct mqueue_notifier *nt;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	PROC_LOCK(p);
 	nt = notifier_search(p, fd);
 	if (nt != NULL) {
 		if (mq->mq_notifier == nt)
 			mq->mq_notifier = NULL;
 		sigqueue_take(&nt->nt_ksi);
 		notifier_delete(p, nt);
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
- * Syscall to open a message queue
+ * Syscall to open a message queue.
  */
 int
 kmq_open(struct thread *td, struct kmq_open_args *uap)
 {
 	char path[MQFS_NAMELEN + 1];
 	struct mq_attr attr, *pattr;
 	struct mqfs_node *pn;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int fd, error, len, flags, cmode;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE)
 		return (EINVAL);
 
 	fdp = td->td_proc->p_fd;
 	flags = FFLAGS(uap->flags);
 	cmode = (((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
 	mq = NULL;
 	if ((flags & O_CREAT) && (uap->attr != NULL)) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error)
 			return (error);
 		if (attr.mq_maxmsg <= 0 || attr.mq_maxmsg > maxmsg)
 			return (EINVAL);
 		if (attr.mq_msgsize <= 0 || attr.mq_msgsize > maxmsgsize)
 			return (EINVAL);
 		pattr = &attr;
 	} else
 		pattr = NULL;
 
 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	/*
 	 * The first character of name must be a slash  (/) character
 	 * and the remaining characters of name cannot include any slash
 	 * characters. 
 	 */
 	len = strlen(path);
 	if (len < 2  || path[0] != '/' || index(path + 1, '/') != NULL)
 		return (EINVAL);
 
 	error = falloc(td, &fp, &fd);
 	if (error)
 		return (error);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
 	if (pn == NULL) {
 		if (!(flags & O_CREAT)) {
 			error = ENOENT;
 		} else {
 			mq = mqueue_alloc(pattr);
 			if (mq == NULL) {
 				error = ENFILE;
 			} else {
 				pn = mqfs_create_file(mqfs_data.mi_root,
 				         path + 1, len - 1, td->td_ucred,
 					 cmode);
 				if (pn == NULL) {
 					error = ENOSPC;
 					mqueue_free(mq);
 				}
 			}
 		}
 
 		if (error == 0) {
 			pn->mn_data = mq;
 		}
 	} else {
 		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
 			error = EEXIST;
 		} else {
 			int acc_mode = 0;
 
 			if (flags & FREAD)
 				acc_mode |= VREAD;
 			if (flags & FWRITE)
 				acc_mode |= VWRITE;
 			error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
 				    pn->mn_gid, acc_mode, td->td_ucred, NULL);
 		}
 	}
 
 	if (error) {
 		sx_xunlock(&mqfs_data.mi_lock);
 		fdclose(fdp, fp, fd, td);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	mqnode_addref(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 
 	FILE_LOCK(fp);
 	fp->f_flag = (flags & (FREAD | FWRITE | O_NONBLOCK));
 	fp->f_type = DTYPE_MQUEUE;
 	fp->f_ops = &mqueueops;
 	fp->f_data = pn;
 	FILE_UNLOCK(fp);
 
 	FILEDESC_LOCK_FAST(fdp);
 	if (fdp->fd_ofiles[fd] == fp)
 		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
 	FILEDESC_UNLOCK_FAST(fdp);
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 	return (0);
 }
 
 /*
- * Syscall to unlink a message queue
+ * Syscall to unlink a message queue.
  */
 int
 kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
 {
 	char path[MQFS_NAMELEN+1];
 	struct mqfs_node *pn;
 	int error, len;
 
 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	len = strlen(path);
 	if (len < 2  || path[0] != '/' || index(path + 1, '/') != NULL)
 		return (EINVAL);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
 	if (pn != NULL)
 		error = do_unlink(pn, td->td_ucred);
 	else
 		error = ENOENT;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 typedef int (*_fgetf)(struct thread *, int, struct file **);
 
 /*
  * Get message queue by giving file slot
  */
 static int
 _getmq(struct thread *td, int fd, _fgetf func,
        struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = func(td, fd, fpp);
 	if (error)
 		return (error);
 	if (&mqueueops != (*fpp)->f_ops) {
 		fdrop(*fpp, td);
 		return (EBADF);
 	}
 	pn = (*fpp)->f_data;
 	if (ppn)
 		*ppn = pn;
 	if (pmq)
 		*pmq = pn->mn_data;
 	return (0);
 }
 
 static __inline int
 getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
 	struct mqueue **pmq)
 {
 	return _getmq(td, fd, fget, fpp, ppn, pmq);
 }
 
 static __inline int
 getmq_read(struct thread *td, int fd, struct file **fpp,
 	 struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	return _getmq(td, fd, fget_read, fpp, ppn, pmq);
 }
 
 static __inline int
 getmq_write(struct thread *td, int fd, struct file **fpp,
 	struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	return _getmq(td, fd, fget_write, fpp, ppn, pmq);
 }
 
-/*
- * Syscall
- */
 int
 kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct mq_attr attr, oattr;
 	int error;
 
 	if (uap->attr) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error)
 			return (error);
 		if (attr.mq_flags & ~O_NONBLOCK)
 			return (EINVAL);
 	}
 	error = getmq(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	oattr.mq_maxmsg  = mq->mq_maxmsg;
 	oattr.mq_msgsize = mq->mq_msgsize;
 	oattr.mq_curmsgs = mq->mq_curmsgs;
 	FILE_LOCK(fp);
 	oattr.mq_flags = (O_NONBLOCK & fp->f_flag);
 	if (uap->attr) {
 		fp->f_flag &= ~O_NONBLOCK;
 		fp->f_flag |= (attr.mq_flags & O_NONBLOCK);
 	}
 	FILE_UNLOCK(fp);
 	fdrop(fp, td);
 	if (uap->oattr)
 		error = copyout(&oattr, uap->oattr, sizeof(oattr));
 	return (error);
 }
 
-/*
- * Syscall
- */
 int
 kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	int error;
 	int waitok;
 
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, uap->abs_timeout);
 	fdrop(fp, td);
 	return (error);
 }
 
-/*
- * Syscall
- */
 int
 kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	int error, waitok;
 
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, uap->abs_timeout);
 	fdrop(fp, td);
 	return (error);
 }
 
-/*
- * Syscall
- */
 int
 kmq_notify(struct thread *td, struct kmq_notify_args *uap)
 {
 	struct sigevent ev;
 	struct filedesc *fdp;
 	struct proc *p;
 	struct mqueue *mq;
 	struct file *fp;
 	struct mqueue_notifier *nt, *newnt = NULL;
 	int error;
 
 	p = td->td_proc;
 	fdp = td->td_proc->p_fd;
 	if (uap->sigev) {
 		error = copyin(uap->sigev, &ev, sizeof(ev));
 		if (error)
 			return (error);
 		if (ev.sigev_notify != SIGEV_SIGNAL &&
 		    ev.sigev_notify != SIGEV_THREAD_ID &&
 		    ev.sigev_notify != SIGEV_NONE)
 			return (EINVAL);
 		if ((ev.sigev_notify == SIGEV_SIGNAL ||
 		     ev.sigev_notify == SIGEV_THREAD_ID) &&
 			!_SIG_VALID(ev.sigev_signo))
 			return (EINVAL);
 	}
 	error = getmq(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 again:
 	FILEDESC_LOCK_FAST(fdp);
 	if (fget_locked(fdp, uap->mqd) != fp) {
 		FILEDESC_UNLOCK_FAST(fdp);
 		error = EBADF;
 		goto out;
 	}
 	mtx_lock(&mq->mq_mutex);
 	FILEDESC_UNLOCK_FAST(fdp);
 	if (uap->sigev != NULL) {
 		if (mq->mq_notifier != NULL) {
 			error = EBUSY;
 		} else {
 			PROC_LOCK(p);
 			nt = notifier_search(p, uap->mqd);
 			if (nt == NULL) {
 				if (newnt == NULL) {
 					PROC_UNLOCK(p);
 					mtx_unlock(&mq->mq_mutex);
 					newnt = notifier_alloc();
 					goto again;
 				}
 			}
 
 			if (nt != NULL) {
 				sigqueue_take(&nt->nt_ksi);
 				if (newnt != NULL) {
 					notifier_free(newnt);
 					newnt = NULL;
 				}
 			} else {
 				nt = newnt;
 				newnt = NULL;
 				ksiginfo_init(&nt->nt_ksi);
 				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 				nt->nt_ksi.ksi_code = SI_MESGQ;
 				nt->nt_proc = p;
 				nt->nt_ksi.ksi_mqd = uap->mqd;
 				notifier_insert(p, nt);
 			}
 			nt->nt_sigev = ev;
 			mq->mq_notifier = nt;
 			PROC_UNLOCK(p);
 			/*
 			 * if there is no receivers and message queue
 			 * is not empty, we should send notification
 			 * as soon as possible.
 			 */
 			if (mq->mq_receivers == 0 &&
 			    !TAILQ_EMPTY(&mq->mq_msgq))
 				mqueue_send_notification(mq);
 		}
 	} else {
 		notifier_remove(p, mq, uap->mqd);
 	}
 	mtx_unlock(&mq->mq_mutex);
 
 out:
 	fdrop(fp, td);
 	if (newnt != NULL)
 		notifier_free(newnt);
 	return (error);
 }
 
 static void
 mqueue_fdclose(struct thread *td, int fd, struct file *fp)
 {
 	struct filedesc *fdp;
 	struct mqueue *mq;
  
 	fdp = td->td_proc->p_fd;
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 	if (fp->f_ops == &mqueueops) {
 		mq = FPTOMQ(fp);
 		mtx_lock(&mq->mq_mutex);
 		notifier_remove(td->td_proc, mq, fd);
 
 		/* have to wakeup thread in same process */
 		if (mq->mq_flags & MQ_RSEL) {
 			mq->mq_flags &= ~MQ_RSEL;
 			selwakeup(&mq->mq_rsel);
 		}
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		mtx_unlock(&mq->mq_mutex);
 	}
 }
 
 static void
 mq_proc_exit(void *arg __unused, struct proc *p)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int i;
 
 	fdp = p->p_fd;
 	FILEDESC_LOCK_FAST(fdp);
 	for (i = 0; i < fdp->fd_nfiles; ++i) {
 		fp = fget_locked(fdp, i);
 		if (fp != NULL && fp->f_ops == &mqueueops) {
 			mq = FPTOMQ(fp);
 			mtx_lock(&mq->mq_mutex);
 			notifier_remove(p, FPTOMQ(fp), i);
 			mtx_unlock(&mq->mq_mutex);
 		}
 	}
 	FILEDESC_UNLOCK_FAST(fdp);
 	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
 }
 
 static int
 mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
 	int flags, struct thread *td)
 {
 	return (EOPNOTSUPP);
 }
 
 static int
 mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
 	int flags, struct thread *td)
 {
 	return (EOPNOTSUPP);
 }
 
 static int
 mqf_ioctl(struct file *fp, u_long cmd, void *data,
 	struct ucred *active_cred, struct thread *td)
 {
 	return (ENOTTY);
 }
 
 static int
 mqf_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int revents = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (mq->mq_curmsgs) {
 			revents |= events & (POLLIN | POLLRDNORM);
 		} else {
 			mq->mq_flags |= MQ_RSEL;
 			selrecord(td, &mq->mq_rsel);
  		}
 	}
 	if (events & POLLOUT) {
 		if (mq->mq_curmsgs < mq->mq_maxmsg)
 			revents |= POLLOUT;
 		else {
 			mq->mq_flags |= MQ_WSEL;
 			selrecord(td, &mq->mq_wsel);
 		}
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (revents);
 }
 
 static int
 mqf_close(struct file *fp, struct thread *td)
 {
 	struct mqfs_node *pn;
 
 	FILE_LOCK(fp);
 	fp->f_ops = &badfileops;
 	FILE_UNLOCK(fp);
 	pn = fp->f_data;
 	fp->f_data = NULL;
 	sx_xlock(&mqfs_data.mi_lock);
 	mqnode_release(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 static int
 mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqfs_node *pn = fp->f_data;
 
 	bzero(st, sizeof *st);
 	st->st_atimespec = pn->mn_atime;
 	st->st_mtimespec = pn->mn_mtime;
 	st->st_ctimespec = pn->mn_ctime;
 	st->st_birthtimespec = pn->mn_birth;
 	st->st_uid = pn->mn_uid;
 	st->st_gid = pn->mn_gid;
 	st->st_mode = S_IFIFO | pn->mn_mode;
 	return (0);
 }
 
 static int
 mqf_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int error = 0;
 
 	if (kn->kn_filter == EVFILT_READ) {
 		kn->kn_fop = &mq_rfiltops;
 		knlist_add(&mq->mq_rsel.si_note, kn, 0);
 	} else if (kn->kn_filter == EVFILT_WRITE) {
 		kn->kn_fop = &mq_wfiltops;
 		knlist_add(&mq->mq_wsel.si_note, kn, 0);
 	} else
 		error = EINVAL;
 	return (error);
 }
 
 static void
 filt_mqdetach(struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	if (kn->kn_filter == EVFILT_READ)
 		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
 	else if (kn->kn_filter == EVFILT_WRITE)
 		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
 	else
 		panic("filt_mqdetach");
 }
 
 static int
 filt_mqread(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs != 0);
 }
 
 static int
 filt_mqwrite(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs < mq->mq_maxmsg);
 }
 
 static struct fileops mqueueops = {
 	.fo_read		= mqf_read,
 	.fo_write		= mqf_write,
 	.fo_ioctl		= mqf_ioctl,
 	.fo_poll		= mqf_poll,
 	.fo_kqfilter		= mqf_kqfilter,
 	.fo_stat		= mqf_stat,
 	.fo_close		= mqf_close
 };
 
 static struct vop_vector mqfs_vnodeops = {
 	.vop_default 		= &default_vnodeops,
 	.vop_access		= mqfs_access,
 	.vop_cachedlookup	= mqfs_lookup,
 	.vop_lookup		= vfs_cache_lookup,
 	.vop_reclaim		= mqfs_reclaim,
 	.vop_create		= mqfs_create,
 	.vop_remove		= mqfs_remove,
 	.vop_inactive		= mqfs_inactive,
 	.vop_open		= mqfs_open,
 	.vop_close		= mqfs_close,
 	.vop_getattr		= mqfs_getattr,
 	.vop_setattr		= mqfs_setattr,
 	.vop_read		= mqfs_read,
 	.vop_write		= VOP_EOPNOTSUPP,
 	.vop_readdir		= mqfs_readdir,
 	.vop_mkdir		= VOP_EOPNOTSUPP,
 	.vop_rmdir		= VOP_EOPNOTSUPP
 };
 
 static struct vfsops mqfs_vfsops = {
 	.vfs_init 		= mqfs_init,
 	.vfs_uninit		= mqfs_uninit,
 	.vfs_mount		= mqfs_mount,
 	.vfs_unmount		= mqfs_unmount,
 	.vfs_root		= mqfs_root,
 	.vfs_statfs		= mqfs_statfs,
 };
 
 SYSCALL_MODULE_HELPER(kmq_open);
 SYSCALL_MODULE_HELPER(kmq_setattr);
 SYSCALL_MODULE_HELPER(kmq_timedsend);
 SYSCALL_MODULE_HELPER(kmq_timedreceive);
 SYSCALL_MODULE_HELPER(kmq_notify);
 SYSCALL_MODULE_HELPER(kmq_unlink);
 
 VFS_SET(mqfs_vfsops, mqueuefs, VFCF_SYNTHETIC);
 MODULE_VERSION(mqueuefs, 1);
Index: head/sys/kern/uipc_sem.c
===================================================================
--- head/sys/kern/uipc_sem.c	(revision 167231)
+++ head/sys/kern/uipc_sem.c	(revision 167232)
@@ -1,1011 +1,1008 @@
 /*-
  * Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
  * Copyright (c) 2003-2005 SPARTA, Inc.
  * Copyright (c) 2005 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 #include "opt_posix.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/ksem.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/condvar.h>
 #include <sys/sem.h>
 #include <sys/uio.h>
 #include <sys/semaphore.h>
 #include <sys/syscall.h>
 #include <sys/stat.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/_semaphore.h>
 
 #include <security/mac/mac_framework.h>
 
 static int sem_count_proc(struct proc *p);
 static struct ksem *sem_lookup_byname(const char *name);
 static int sem_create(struct thread *td, const char *name,
     struct ksem **ksret, mode_t mode, unsigned int value);
 static void sem_free(struct ksem *ksnew);
 static int sem_perm(struct thread *td, struct ksem *ks);
 static void sem_enter(struct proc *p, struct ksem *ks);
 static int sem_leave(struct proc *p, struct ksem *ks);
 static void sem_exechook(void *arg, struct proc *p, struct image_params *imgp);
 static void sem_exithook(void *arg, struct proc *p);
 static void sem_forkhook(void *arg, struct proc *p1, struct proc *p2,
     int flags);
 static int sem_hasopen(struct thread *td, struct ksem *ks);
 
 static int kern_sem_close(struct thread *td, semid_t id);
 static int kern_sem_post(struct thread *td, semid_t id);
 static int kern_sem_wait(struct thread *td, semid_t id, int tryflag,
     struct timespec *abstime);
 static int kern_sem_init(struct thread *td, int dir, unsigned int value,
     semid_t *idp);
 static int kern_sem_open(struct thread *td, int dir, const char *name,
     int oflag, mode_t mode, unsigned int value, semid_t *idp);
 static int kern_sem_unlink(struct thread *td, const char *name);
 
 #ifndef SEM_MAX
 #define SEM_MAX	30
 #endif
 
 #define SEM_MAX_NAMELEN	14
 
 #define SEM_TO_ID(x)	((intptr_t)(x))
 #define ID_TO_SEM(x)	id_to_sem(x)
 
 /*
  * available semaphores go here, this includes sem_init and any semaphores
  * created via sem_open that have not yet been unlinked.
  */
 LIST_HEAD(, ksem) ksem_head = LIST_HEAD_INITIALIZER(&ksem_head);
 /*
  * semaphores still in use but have been sem_unlink()'d go here.
  */
 LIST_HEAD(, ksem) ksem_deadhead = LIST_HEAD_INITIALIZER(&ksem_deadhead);
 
 static struct mtx sem_lock;
 static MALLOC_DEFINE(M_SEM, "sems", "semaphore data");
 
 static int nsems = 0;
 SYSCTL_DECL(_p1003_1b);
 SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, "");
 
 static eventhandler_tag sem_exit_tag, sem_exec_tag, sem_fork_tag;
 
 #ifdef SEM_DEBUG
 #define DP(x)	printf x
 #else
 #define DP(x)
 #endif
 
 static __inline
 void
 sem_ref(struct ksem *ks)
 {
 
 	mtx_assert(&sem_lock, MA_OWNED);
 	ks->ks_ref++;
 	DP(("sem_ref: ks = %p, ref = %d\n", ks, ks->ks_ref));
 }
 
 static __inline
 void
 sem_rel(struct ksem *ks)
 {
 
 	mtx_assert(&sem_lock, MA_OWNED);
 	DP(("sem_rel: ks = %p, ref = %d\n", ks, ks->ks_ref - 1));
 	if (--ks->ks_ref == 0)
 		sem_free(ks);
 }
 
 static __inline struct ksem *id_to_sem(semid_t id);
 
 static __inline
 struct ksem *
 id_to_sem(semid_t id)
 {
 	struct ksem *ks;
 
 	mtx_assert(&sem_lock, MA_OWNED);
 	DP(("id_to_sem: id = %0x,%p\n", id, (struct ksem *)id));
 	LIST_FOREACH(ks, &ksem_head, ks_entry) {
 		DP(("id_to_sem: ks = %p\n", ks));
 		if (ks == (struct ksem *)id)
 			return (ks);
 	}
 	return (NULL);
 }
 
 static struct ksem *
 sem_lookup_byname(const char *name)
 {
 	struct ksem *ks;
 
 	mtx_assert(&sem_lock, MA_OWNED);
 	LIST_FOREACH(ks, &ksem_head, ks_entry)
 		if (ks->ks_name != NULL && strcmp(ks->ks_name, name) == 0)
 			return (ks);
 	return (NULL);
 }
 
 static int
 sem_create(struct thread *td, const char *name, struct ksem **ksret,
     mode_t mode, unsigned int value)
 {
 	struct ksem *ret;
 	struct proc *p;
 	struct ucred *uc;
 	size_t len;
 	int error;
 
 	DP(("sem_create\n"));
 	p = td->td_proc;
 	uc = td->td_ucred;
 	if (value > SEM_VALUE_MAX)
 		return (EINVAL);
 	ret = malloc(sizeof(*ret), M_SEM, M_WAITOK | M_ZERO);
 	if (name != NULL) {
 		len = strlen(name);
 		if (len > SEM_MAX_NAMELEN) {
 			free(ret, M_SEM);
 			return (ENAMETOOLONG);
 		}
 		/* name must start with a '/' but not contain one. */
 		if (*name != '/' || len < 2 || index(name + 1, '/') != NULL) {
 			free(ret, M_SEM);
 			return (EINVAL);
 		}
 		ret->ks_name = malloc(len + 1, M_SEM, M_WAITOK);
 		strcpy(ret->ks_name, name);
 	} else {
 		ret->ks_name = NULL;
 	}
 	ret->ks_mode = mode;
 	ret->ks_value = value;
 	ret->ks_ref = 1;
 	ret->ks_waiters = 0;
 	ret->ks_uid = uc->cr_uid;
 	ret->ks_gid = uc->cr_gid;
 	ret->ks_onlist = 0;
 	cv_init(&ret->ks_cv, "sem");
 	LIST_INIT(&ret->ks_users);
 #ifdef MAC
 	mac_init_posix_sem(ret);
 	mac_create_posix_sem(uc, ret);
 #endif
 	if (name != NULL)
 		sem_enter(td->td_proc, ret);
 	*ksret = ret;
 	mtx_lock(&sem_lock);
 	if (nsems >= p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX)) {
 		sem_leave(td->td_proc, ret);
 		sem_free(ret);
 		error = ENFILE;
 	} else {
 		nsems++;
 		error = 0;
 	}
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_init_args {
 	unsigned int value;
 	semid_t *idp;
 };
 int ksem_init(struct thread *td, struct ksem_init_args *uap);
 #endif
 int
 ksem_init(struct thread *td, struct ksem_init_args *uap)
 {
 	int error;
 
 	error = kern_sem_init(td, UIO_USERSPACE, uap->value, uap->idp);
 	return (error);
 }
 
 static int
 kern_sem_init(struct thread *td, int dir, unsigned int value, semid_t *idp)
 {
 	struct ksem *ks;
 	semid_t id;
 	int error;
 
 	error = sem_create(td, NULL, &ks, S_IRWXU | S_IRWXG, value);
 	if (error)
 		return (error);
 	id = SEM_TO_ID(ks);
 	if (dir == UIO_USERSPACE) {
 		error = copyout(&id, idp, sizeof(id));
 		if (error) {
 			mtx_lock(&sem_lock);
 			sem_rel(ks);
 			mtx_unlock(&sem_lock);
 			return (error);
 		}
 	} else {
 		*idp = id;
 	}
 	mtx_lock(&sem_lock);
 	LIST_INSERT_HEAD(&ksem_head, ks, ks_entry);
 	ks->ks_onlist = 1;
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_open_args {
 	char *name;
 	int oflag;
 	mode_t mode;
 	unsigned int value;
 	semid_t *idp;	
 };
 int ksem_open(struct thread *td, struct ksem_open_args *uap);
 #endif
 int
 ksem_open(struct thread *td, struct ksem_open_args *uap)
 {
 	char name[SEM_MAX_NAMELEN + 1];
 	size_t done;
 	int error;
 
 	error = copyinstr(uap->name, name, SEM_MAX_NAMELEN + 1, &done);
 	if (error)
 		return (error);
 	DP((">>> sem_open start\n"));
 	error = kern_sem_open(td, UIO_USERSPACE,
 	    name, uap->oflag, uap->mode, uap->value, uap->idp);
 	DP(("<<< sem_open end\n"));
 	return (error);
 }
 
 static int
 kern_sem_open(struct thread *td, int dir, const char *name, int oflag,
     mode_t mode, unsigned int value, semid_t *idp)
 {
 	struct ksem *ksnew, *ks;
 	int error;
 	semid_t id;
 
 	ksnew = NULL;
 	mtx_lock(&sem_lock);
 	ks = sem_lookup_byname(name);
 	/*
 	 * If we found it but O_EXCL is set, error.
 	 */
 	if (ks != NULL && (oflag & O_EXCL) != 0) {
 		mtx_unlock(&sem_lock);
 		return (EEXIST);
 	}
 	/*
 	 * If we didn't find it...
 	 */
 	if (ks == NULL) {
 		/*
 		 * didn't ask for creation? error.
 		 */
 		if ((oflag & O_CREAT) == 0) {
 			mtx_unlock(&sem_lock);
 			return (ENOENT);
 		}
 		/*
 		 * We may block during creation, so drop the lock.
 		 */
 		mtx_unlock(&sem_lock);
 		error = sem_create(td, name, &ksnew, mode, value);
 		if (error != 0)
 			return (error);
 		id = SEM_TO_ID(ksnew);
 		if (dir == UIO_USERSPACE) {
 			DP(("about to copyout! %d to %p\n", id, idp));
 			error = copyout(&id, idp, sizeof(id));
 			if (error) {
 				mtx_lock(&sem_lock);
 				sem_leave(td->td_proc, ksnew);
 				sem_rel(ksnew);
 				mtx_unlock(&sem_lock);
 				return (error);
 			}
 		} else {
 			DP(("about to set! %d to %p\n", id, idp));
 			*idp = id;
 		}
 		/*
 		 * We need to make sure we haven't lost a race while
 		 * allocating during creation.
 		 */
 		mtx_lock(&sem_lock);
 		ks = sem_lookup_byname(name);
 		if (ks != NULL) {
 			/* we lost... */
 			sem_leave(td->td_proc, ksnew);
 			sem_rel(ksnew);
 			/* we lost and we can't loose... */
 			if ((oflag & O_EXCL) != 0) {
 				mtx_unlock(&sem_lock);
 				return (EEXIST);
 			}
 		} else {
 			DP(("sem_create: about to add to list...\n"));
 			LIST_INSERT_HEAD(&ksem_head, ksnew, ks_entry); 
 			DP(("sem_create: setting list bit...\n"));
 			ksnew->ks_onlist = 1;
 			DP(("sem_create: done, about to unlock...\n"));
 		}
 	} else {
 #ifdef MAC
 		error = mac_check_posix_sem_open(td->td_ucred, ks);
 		if (error)
 			goto err_open;
 #endif
 		/*
 		 * if we aren't the creator, then enforce permissions.
 		 */
 		error = sem_perm(td, ks);
 		if (error)
 			goto err_open;
 		sem_ref(ks);
 		mtx_unlock(&sem_lock);
 		id = SEM_TO_ID(ks);
 		if (dir == UIO_USERSPACE) {
 			error = copyout(&id, idp, sizeof(id));
 			if (error) {
 				mtx_lock(&sem_lock);
 				sem_rel(ks);
 				mtx_unlock(&sem_lock);
 				return (error);
 			}
 		} else {
 			*idp = id;
 		}
 		sem_enter(td->td_proc, ks);
 		mtx_lock(&sem_lock);
 		sem_rel(ks);
 	}
 err_open:
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 static int
 sem_perm(struct thread *td, struct ksem *ks)
 {
 	struct ucred *uc;
 
 	/*
 	 * XXXRW: This permission routine appears to be incorrect.  If the
 	 * user matches, we shouldn't go on to the group if the user
 	 * permissions don't allow the action?  Not changed for now.  To fix,
 	 * change from a series of if (); if (); to if () else if () else...
 	 */
 	uc = td->td_ucred;
 	DP(("sem_perm: uc(%d,%d) ks(%d,%d,%o)\n",
 	    uc->cr_uid, uc->cr_gid,
 	     ks->ks_uid, ks->ks_gid, ks->ks_mode));
 	if ((uc->cr_uid == ks->ks_uid) && (ks->ks_mode & S_IWUSR) != 0)
 		return (0);
 	if ((uc->cr_gid == ks->ks_gid) && (ks->ks_mode & S_IWGRP) != 0)
 		return (0);
 	if ((ks->ks_mode & S_IWOTH) != 0)
 		return (0);
 	return (priv_check(td, PRIV_SEM_WRITE));
 }
 
 static void
 sem_free(struct ksem *ks)
 {
 
 	nsems--;
 	if (ks->ks_onlist)
 		LIST_REMOVE(ks, ks_entry);
 	if (ks->ks_name != NULL)
 		free(ks->ks_name, M_SEM);
 	cv_destroy(&ks->ks_cv);
 	free(ks, M_SEM);
 }
 
 static __inline struct kuser *sem_getuser(struct proc *p, struct ksem *ks);
 
 static __inline struct kuser *
 sem_getuser(struct proc *p, struct ksem *ks)
 {
 	struct kuser *k;
 
 	LIST_FOREACH(k, &ks->ks_users, ku_next)
 		if (k->ku_pid == p->p_pid)
 			return (k);
 	return (NULL);
 }
 
 static int
 sem_hasopen(struct thread *td, struct ksem *ks)
 {
 	
 	return ((ks->ks_name == NULL && sem_perm(td, ks) == 0)
 	    || sem_getuser(td->td_proc, ks) != NULL);
 }
 
 static int
 sem_leave(struct proc *p, struct ksem *ks)
 {
 	struct kuser *k;
 
 	DP(("sem_leave: ks = %p\n", ks));
 	k = sem_getuser(p, ks);
 	DP(("sem_leave: ks = %p, k = %p\n", ks, k));
 	if (k != NULL) {
 		LIST_REMOVE(k, ku_next);
 		sem_rel(ks);
 		DP(("sem_leave: about to free k\n"));
 		free(k, M_SEM);
 		DP(("sem_leave: returning\n"));
 		return (0);
 	}
 	return (EINVAL);
 }
 
 static void
 sem_enter(p, ks)
 	struct proc *p;
 	struct ksem *ks;
 {
 	struct kuser *ku, *k;
 
 	ku = malloc(sizeof(*ku), M_SEM, M_WAITOK);
 	ku->ku_pid = p->p_pid;
 	mtx_lock(&sem_lock);
 	k = sem_getuser(p, ks);
 	if (k != NULL) {
 		mtx_unlock(&sem_lock);
 		free(ku, M_TEMP);
 		return;
 	}
 	LIST_INSERT_HEAD(&ks->ks_users, ku, ku_next);
 	sem_ref(ks);
 	mtx_unlock(&sem_lock);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_unlink_args {
 	char *name;
 };
 int ksem_unlink(struct thread *td, struct ksem_unlink_args *uap);
 #endif
-	
 int
 ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
 {
 	char name[SEM_MAX_NAMELEN + 1];
 	size_t done;
 	int error;
 
 	error = copyinstr(uap->name, name, SEM_MAX_NAMELEN + 1, &done);
 	return (error ? error :
 	    kern_sem_unlink(td, name));
 }
 
 static int
 kern_sem_unlink(struct thread *td, const char *name)
 {
 	struct ksem *ks;
 	int error;
 
 	mtx_lock(&sem_lock);
 	ks = sem_lookup_byname(name);
 	if (ks != NULL) {
 #ifdef MAC
 		error = mac_check_posix_sem_unlink(td->td_ucred, ks);
 		if (error) {
 			mtx_unlock(&sem_lock);
 			return (error);
 		}
 #endif
 		error = sem_perm(td, ks);
 	} else
 		error = ENOENT;
 	DP(("sem_unlink: '%s' ks = %p, error = %d\n", name, ks, error));
 	if (error == 0) {
 		LIST_REMOVE(ks, ks_entry);
 		LIST_INSERT_HEAD(&ksem_deadhead, ks, ks_entry); 
 		sem_rel(ks);
 	}
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_close_args {
 	semid_t id;
 };
 int ksem_close(struct thread *td, struct ksem_close_args *uap);
 #endif
-
 int
 ksem_close(struct thread *td, struct ksem_close_args *uap)
 {
 
 	return (kern_sem_close(td, uap->id));
 }
 
 static int
 kern_sem_close(struct thread *td, semid_t id)
 {
 	struct ksem *ks;
 	int error;
 
 	error = EINVAL;
 	mtx_lock(&sem_lock);
 	ks = ID_TO_SEM(id);
 	/* this is not a valid operation for unnamed sems */
 	if (ks != NULL && ks->ks_name != NULL)
 		error = sem_leave(td->td_proc, ks);
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_post_args {
 	semid_t id;
 };
 int ksem_post(struct thread *td, struct ksem_post_args *uap);
 #endif
 int
 ksem_post(struct thread *td, struct ksem_post_args *uap)
 {
 
 	return (kern_sem_post(td, uap->id));
 }
 
 static int
 kern_sem_post(struct thread *td, semid_t id)
 {
 	struct ksem *ks;
 	int error;
 
 	mtx_lock(&sem_lock);
 	ks = ID_TO_SEM(id);
 	if (ks == NULL || !sem_hasopen(td, ks)) {
 		error = EINVAL;
 		goto err;
 	}
 #ifdef MAC
 	error = mac_check_posix_sem_post(td->td_ucred, ks);
 	if (error)
 		goto err;
 #endif
 	if (ks->ks_value == SEM_VALUE_MAX) {
 		error = EOVERFLOW;
 		goto err;
 	}
 	++ks->ks_value;
 	if (ks->ks_waiters > 0)
 		cv_signal(&ks->ks_cv);
 	error = 0;
 err:
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_wait_args {
 	semid_t id;
 };
 int ksem_wait(struct thread *td, struct ksem_wait_args *uap);
 #endif
-
 int
 ksem_wait(struct thread *td, struct ksem_wait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 0, NULL));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_timedwait_args {
 	semid_t id;
 	const struct timespec *abstime;
 };
 int ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap);
 #endif
 int
 ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
 {
 	struct timespec abstime;
 	struct timespec *ts;
 	int error;
 
 	/* We allow a null timespec (wait forever). */
 	if (uap->abstime == NULL)
 		ts = NULL;
 	else {
 		error = copyin(uap->abstime, &abstime, sizeof(abstime));
 		if (error != 0)
 			return (error);
 		if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
 			return (EINVAL);
 		ts = &abstime;
 	}
 	return (kern_sem_wait(td, uap->id, 0, ts));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_trywait_args {
 	semid_t id;
 };
 int ksem_trywait(struct thread *td, struct ksem_trywait_args *uap);
 #endif
 int
 ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 1, NULL));
 }
 
 static int
 kern_sem_wait(struct thread *td, semid_t id, int tryflag,
     struct timespec *abstime)
 {
 	struct timespec ts1, ts2;
 	struct timeval tv;
 	struct ksem *ks;
 	int error;
 
 	DP((">>> kern_sem_wait entered!\n"));
 	mtx_lock(&sem_lock);
 	ks = ID_TO_SEM(id);
 	if (ks == NULL) {
 		DP(("kern_sem_wait ks == NULL\n"));
 		error = EINVAL;
 		goto err;
 	}
 	sem_ref(ks);
 	if (!sem_hasopen(td, ks)) {
 		DP(("kern_sem_wait hasopen failed\n"));
 		error = EINVAL;
 		goto err;
 	}
 #ifdef MAC
 	error = mac_check_posix_sem_wait(td->td_ucred, ks);
 	if (error) {
 		DP(("kern_sem_wait mac failed\n"));
 		goto err;
 	}
 #endif
 	DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
 	if (ks->ks_value == 0) {
 		ks->ks_waiters++;
 		if (tryflag != 0)
 			error = EAGAIN;
 		else if (abstime == NULL)
 			error = cv_wait_sig(&ks->ks_cv, &sem_lock);
 		else {
 			for (;;) {
 				ts1 = *abstime;
 				getnanotime(&ts2);
 				timespecsub(&ts1, &ts2);
 				TIMESPEC_TO_TIMEVAL(&tv, &ts1);
 				if (tv.tv_sec < 0) {
 					error = ETIMEDOUT;
 					break;
 				}
 				error = cv_timedwait_sig(&ks->ks_cv,
 				    &sem_lock, tvtohz(&tv));
 				if (error != EWOULDBLOCK)
 					break;
 			}
 		}
 		ks->ks_waiters--;
 		if (error)
 			goto err;
 	}
 	ks->ks_value--;
 	error = 0;
 err:
 	if (ks != NULL)
 		sem_rel(ks);
 	mtx_unlock(&sem_lock);
 	DP(("<<< kern_sem_wait leaving, error = %d\n", error));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_getvalue_args {
 	semid_t id;
 	int *val;
 };
 int ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap);
 #endif
 int
 ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
 {
 	struct ksem *ks;
 	int error, val;
 
 	mtx_lock(&sem_lock);
 	ks = ID_TO_SEM(uap->id);
 	if (ks == NULL || !sem_hasopen(td, ks)) {
 		mtx_unlock(&sem_lock);
 		return (EINVAL);
 	}
 #ifdef MAC
 	error = mac_check_posix_sem_getvalue(td->td_ucred, ks);
 	if (error) {
 		mtx_unlock(&sem_lock);
 		return (error);
 	}
 #endif
 	val = ks->ks_value;
 	mtx_unlock(&sem_lock);
 	error = copyout(&val, uap->val, sizeof(val));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_destroy_args {
 	semid_t id;
 };
 int ksem_destroy(struct thread *td, struct ksem_destroy_args *uap);
 #endif
 int
 ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
 {
 	struct ksem *ks;
 	int error;
 
 	mtx_lock(&sem_lock);
 	ks = ID_TO_SEM(uap->id);
 	if (ks == NULL || !sem_hasopen(td, ks) ||
 	    ks->ks_name != NULL) {
 		error = EINVAL;
 		goto err;
 	}
 #ifdef MAC
 	error = mac_check_posix_sem_destroy(td->td_ucred, ks);
 	if (error)
 		goto err;
 #endif
 	if (ks->ks_waiters != 0) {
 		error = EBUSY;
 		goto err;
 	}
 	sem_rel(ks);
 	error = 0;
 err:
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 /*
  * Count the number of kusers associated with a proc, so as to guess at how
  * many to allocate when forking.
  */
 static int
 sem_count_proc(struct proc *p)
 {
 	struct ksem *ks;
 	struct kuser *ku;
 	int count;
 
 	mtx_assert(&sem_lock, MA_OWNED);
 
 	count = 0;
 	LIST_FOREACH(ks, &ksem_head, ks_entry) {
 		LIST_FOREACH(ku, &ks->ks_users, ku_next) {
 			if (ku->ku_pid == p->p_pid)
 				count++;
 		}
 	}
 	LIST_FOREACH(ks, &ksem_deadhead, ks_entry) {
 		LIST_FOREACH(ku, &ks->ks_users, ku_next) {
 			if (ku->ku_pid == p->p_pid)
 				count++;
 		}
 	}
 	return (count);
 }
 
 /*
  * When a process forks, the child process must gain a reference to each open
  * semaphore in the parent process, whether it is unlinked or not.  This
  * requires allocating a kuser structure for each semaphore reference in the
  * new process.  Because the set of semaphores in the parent can change while
  * the fork is in progress, we have to handle races -- first we attempt to
  * allocate enough storage to acquire references to each of the semaphores,
  * then we enter the semaphores and release the temporary references.
  */
 static void
 sem_forkhook(void *arg, struct proc *p1, struct proc *p2, int flags)
 {
 	struct ksem *ks, **sem_array;
 	int count, i, new_count;
 	struct kuser *ku;
 
 	mtx_lock(&sem_lock);
 	count = sem_count_proc(p1);
 	if (count == 0) {
 		mtx_unlock(&sem_lock);
 		return;
 	}
 race_lost:
 	mtx_assert(&sem_lock, MA_OWNED);
 	mtx_unlock(&sem_lock);
 	sem_array = malloc(sizeof(struct ksem *) * count, M_TEMP, M_WAITOK);
 	mtx_lock(&sem_lock);
 	new_count = sem_count_proc(p1);
 	if (count < new_count) {
 		/* Lost race, repeat and allocate more storage. */
 		free(sem_array, M_TEMP);
 		count = new_count;
 		goto race_lost;
 	}
 	/*
 	 * Given an array capable of storing an adequate number of semaphore
 	 * references, now walk the list of semaphores and acquire a new
 	 * reference for any semaphore opened by p1.
 	 */
 	count = new_count;
 	i = 0;
 	LIST_FOREACH(ks, &ksem_head, ks_entry) {
 		LIST_FOREACH(ku, &ks->ks_users, ku_next) {
 			if (ku->ku_pid == p1->p_pid) {
 				sem_ref(ks);
 				sem_array[i] = ks;
 				i++;
 				break;
 			}
 		}
 	}
 	LIST_FOREACH(ks, &ksem_deadhead, ks_entry) {
 		LIST_FOREACH(ku, &ks->ks_users, ku_next) {
 			if (ku->ku_pid == p1->p_pid) {
 				sem_ref(ks);
 				sem_array[i] = ks;
 				i++;
 				break;
 			}
 		}
 	}
 	mtx_unlock(&sem_lock);
 	KASSERT(i == count, ("sem_forkhook: i != count (%d, %d)", i, count));
 	/*
 	 * Now cause p2 to enter each of the referenced semaphores, then
 	 * release our temporary reference.  This is pretty inefficient.
 	 * Finally, free our temporary array.
 	 */
 	for (i = 0; i < count; i++) {
 		sem_enter(p2, sem_array[i]);
 		mtx_lock(&sem_lock);
 		sem_rel(sem_array[i]);
 		mtx_unlock(&sem_lock);
 	}
 	free(sem_array, M_TEMP);
 }
 
 static void
 sem_exechook(void *arg, struct proc *p, struct image_params *imgp __unused)
 {
    	sem_exithook(arg, p);   	
 }
 
 static void
 sem_exithook(void *arg, struct proc *p)
 {
 	struct ksem *ks, *ksnext;
 
 	mtx_lock(&sem_lock);
 	ks = LIST_FIRST(&ksem_head);
 	while (ks != NULL) {
 		ksnext = LIST_NEXT(ks, ks_entry);
 		sem_leave(p, ks);
 		ks = ksnext;
 	}
 	ks = LIST_FIRST(&ksem_deadhead);
 	while (ks != NULL) {
 		ksnext = LIST_NEXT(ks, ks_entry);
 		sem_leave(p, ks);
 		ks = ksnext;
 	}
 	mtx_unlock(&sem_lock);
 }
 
 static int
 sem_modload(struct module *module, int cmd, void *arg)
 {
         int error = 0;
 
         switch (cmd) {
         case MOD_LOAD:
 		mtx_init(&sem_lock, "sem", "semaphore", MTX_DEF);
 		p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
 		p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
 		sem_exit_tag = EVENTHANDLER_REGISTER(process_exit, sem_exithook,
 		    NULL, EVENTHANDLER_PRI_ANY);
 		sem_exec_tag = EVENTHANDLER_REGISTER(process_exec, sem_exechook,
 		    NULL, EVENTHANDLER_PRI_ANY);
 		sem_fork_tag = EVENTHANDLER_REGISTER(process_fork, sem_forkhook, NULL, EVENTHANDLER_PRI_ANY);
                 break;
         case MOD_UNLOAD:
 		if (nsems != 0) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		EVENTHANDLER_DEREGISTER(process_exit, sem_exit_tag);
 		EVENTHANDLER_DEREGISTER(process_exec, sem_exec_tag);
 		EVENTHANDLER_DEREGISTER(process_fork, sem_fork_tag);
 		mtx_destroy(&sem_lock);
                 break;
         case MOD_SHUTDOWN:
                 break;
         default:
                 error = EINVAL;
                 break;
         }
         return (error);
 }
 
 static moduledata_t sem_mod = {
         "sem",
         &sem_modload,
         NULL
 };
 
 SYSCALL_MODULE_HELPER(ksem_init);
 SYSCALL_MODULE_HELPER(ksem_open);
 SYSCALL_MODULE_HELPER(ksem_unlink);
 SYSCALL_MODULE_HELPER(ksem_close);
 SYSCALL_MODULE_HELPER(ksem_post);
 SYSCALL_MODULE_HELPER(ksem_wait);
 SYSCALL_MODULE_HELPER(ksem_timedwait);
 SYSCALL_MODULE_HELPER(ksem_trywait);
 SYSCALL_MODULE_HELPER(ksem_getvalue);
 SYSCALL_MODULE_HELPER(ksem_destroy);
 
 DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
 MODULE_VERSION(sem, 1);
Index: head/sys/kern/uipc_syscalls.c
===================================================================
--- head/sys/kern/uipc_syscalls.c	(revision 167231)
+++ head/sys/kern/uipc_syscalls.c	(revision 167232)
@@ -1,2689 +1,2687 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * sendfile(2) and related extensions:
  * Copyright (c) 1998, David Greenman. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_sctp.h"
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/filedesc.h>
 #include <sys/event.h>
 #include <sys/proc.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/sf_buf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_peeloff.h>
 #endif /* SCTP */
 
 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
 
 static int accept1(struct thread *td, struct accept_args *uap, int compat);
 static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
 static int getsockname1(struct thread *td, struct getsockname_args *uap,
 			int compat);
 static int getpeername1(struct thread *td, struct getpeername_args *uap,
 			int compat);
 
 /*
  * NSFBUFS-related variables and associated sysctls
  */
 int nsfbufs;
 int nsfbufspeak;
 int nsfbufsused;
 
 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
     "Maximum number of sendfile(2) sf_bufs available");
 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
     "Number of sendfile(2) sf_bufs at peak usage");
 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
     "Number of sendfile(2) sf_bufs in use");
 
 /*
  * Convert a user file descriptor to a kernel file entry.  A reference on the
  * file entry is held upon returning.  This is lighter weight than
  * fgetsock(), which bumps the socket reference drops the file reference
  * count instead, as this approach avoids several additional mutex operations
  * associated with the additional reference count.  If requested, return the
  * open file flags.
  */
 static int
 getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
 {
 	struct file *fp;
 	int error;
 
 	fp = NULL;
 	if (fdp == NULL)
 		error = EBADF;
 	else {
 		FILEDESC_LOCK_FAST(fdp);
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
 			error = EBADF;
 		else if (fp->f_type != DTYPE_SOCKET) {
 			fp = NULL;
 			error = ENOTSOCK;
 		} else {
 			fhold(fp);
 			if (fflagp != NULL)
 				*fflagp = fp->f_flag;
 			error = 0;
 		}
 		FILEDESC_UNLOCK_FAST(fdp);
 	}
 	*fpp = fp;
 	return (error);
 }
 
 /*
  * System call interface to the socket abstraction.
  */
 #if defined(COMPAT_43)
 #define COMPAT_OLDSOCK
 #endif
 
 int
 socket(td, uap)
 	struct thread *td;
 	register struct socket_args /* {
 		int	domain;
 		int	type;
 		int	protocol;
 	} */ *uap;
 {
 	struct filedesc *fdp;
 	struct socket *so;
 	struct file *fp;
 	int fd, error;
 
 #ifdef MAC
 	error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type,
 	    uap->protocol);
 	if (error)
 		return (error);
 #endif
 	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd);
 	if (error)
 		return (error);
 	/* An extra reference on `fp' has been held for us by falloc(). */
 	NET_LOCK_GIANT();
 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	NET_UNLOCK_GIANT();
 	if (error) {
 		fdclose(fdp, fp, fd, td);
 	} else {
 		FILEDESC_LOCK_FAST(fdp);
 		fp->f_data = so;	/* already has ref count */
 		fp->f_flag = FREAD|FWRITE;
 		fp->f_ops = &socketops;
 		fp->f_type = DTYPE_SOCKET;
 		FILEDESC_UNLOCK_FAST(fdp);
 		td->td_retval[0] = fd;
 	}
 	fdrop(fp, td);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 bind(td, uap)
 	struct thread *td;
 	register struct bind_args /* {
 		int	s;
 		caddr_t	name;
 		int	namelen;
 	} */ *uap;
 {
 	struct sockaddr *sa;
 	int error;
 
 	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
 		return (error);
 
 	error = kern_bind(td, uap->s, sa);
 	free(sa, M_SONAME);
 	return (error);
 }
 
 int
 kern_bind(td, fd, sa)
 	struct thread *td;
 	int fd;
 	struct sockaddr *sa;
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
 		goto done2;
 	so = fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_bind(td->td_ucred, so, sa);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto done1;
 #endif
 	error = sobind(so, sa, td);
 #ifdef MAC
 done1:
 #endif
 	fdrop(fp, td);
 done2:
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 /* ARGSUSED */
 int
 listen(td, uap)
 	struct thread *td;
 	register struct listen_args /* {
 		int	s;
 		int	backlog;
 	} */ *uap;
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 #ifdef MAC
 		SOCK_LOCK(so);
 		error = mac_check_socket_listen(td->td_ucred, so);
 		SOCK_UNLOCK(so);
 		if (error)
 			goto done;
 #endif
 		error = solisten(so, uap->backlog, td);
 #ifdef MAC
 done:
 #endif
 		fdrop(fp, td);
 	}
 	NET_UNLOCK_GIANT();
 	return(error);
 }
 
 /*
  * accept1()
  */
 static int
 accept1(td, uap, compat)
 	struct thread *td;
 	register struct accept_args /* {
 		int	s;
 		struct sockaddr	* __restrict name;
 		socklen_t	* __restrict anamelen;
 	} */ *uap;
 	int compat;
 {
 	struct sockaddr *name;
 	socklen_t namelen;
 	struct file *fp;
 	int error;
 
 	if (uap->name == NULL)
 		return (kern_accept(td, uap->s, NULL, NULL, NULL));
 
 	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
 	if (error)
 		return (error);
 
 	error = kern_accept(td, uap->s, &name, &namelen, &fp);
 
 	/*
 	 * return a namelen of zero for older code which might
 	 * ignore the return value from accept.
 	 */
 	if (error) {
 		(void) copyout(&namelen,
 		    uap->anamelen, sizeof(*uap->anamelen));
 		return (error);
 	}
 
 	if (error == 0 && name != NULL) {
 #ifdef COMPAT_OLDSOCK
 		if (compat)
 			((struct osockaddr *)name)->sa_family =
 			    name->sa_family;
 #endif
 		error = copyout(name, uap->name, namelen);
 	}
 	if (error == 0)
 		error = copyout(&namelen, uap->anamelen,
 		    sizeof(namelen));
 	if (error)
 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
 	fdrop(fp, td);
 	free(name, M_SONAME);
 	return (error);
 }
 
 int
 kern_accept(struct thread *td, int s, struct sockaddr **name,
     socklen_t *namelen, struct file **fp)
 {
 	struct filedesc *fdp;
 	struct file *headfp, *nfp = NULL;
 	struct sockaddr *sa = NULL;
 	int error;
 	struct socket *head, *so;
 	int fd;
 	u_int fflag;
 	pid_t pgid;
 	int tmp;
 
 	if (name) {
 		*name = NULL;
 		if (*namelen < 0)
 			return (EINVAL);
 	}
 
 	fdp = td->td_proc->p_fd;
 	NET_LOCK_GIANT();
 	error = getsock(fdp, s, &headfp, &fflag);
 	if (error)
 		goto done2;
 	head = headfp->f_data;
 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
 		error = EINVAL;
 		goto done;
 	}
 #ifdef MAC
 	SOCK_LOCK(head);
 	error = mac_check_socket_accept(td->td_ucred, head);
 	SOCK_UNLOCK(head);
 	if (error != 0)
 		goto done;
 #endif
 	error = falloc(td, &nfp, &fd);
 	if (error)
 		goto done;
 	ACCEPT_LOCK();
 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
 		ACCEPT_UNLOCK();
 		error = EWOULDBLOCK;
 		goto noconnection;
 	}
 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
 		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			head->so_error = ECONNABORTED;
 			break;
 		}
 		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
 		    "accept", 0);
 		if (error) {
 			ACCEPT_UNLOCK();
 			goto noconnection;
 		}
 	}
 	if (head->so_error) {
 		error = head->so_error;
 		head->so_error = 0;
 		ACCEPT_UNLOCK();
 		goto noconnection;
 	}
 	so = TAILQ_FIRST(&head->so_comp);
 	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
 	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
 
 	/*
 	 * Before changing the flags on the socket, we have to bump the
 	 * reference count.  Otherwise, if the protocol calls sofree(),
 	 * the socket will be released due to a zero refcount.
 	 */
 	SOCK_LOCK(so);			/* soref() and so_state update */
 	soref(so);			/* file descriptor reference */
 
 	TAILQ_REMOVE(&head->so_comp, so, so_list);
 	head->so_qlen--;
 	so->so_state |= (head->so_state & SS_NBIO);
 	so->so_qstate &= ~SQ_COMP;
 	so->so_head = NULL;
 
 	SOCK_UNLOCK(so);
 	ACCEPT_UNLOCK();
 
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	td->td_retval[0] = fd;
 
 	/* connection has been removed from the listen queue */
 	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
 
 	pgid = fgetown(&head->so_sigio);
 	if (pgid != 0)
 		fsetown(pgid, &so->so_sigio);
 
 	FILE_LOCK(nfp);
 	nfp->f_data = so;	/* nfp has ref count from falloc */
 	nfp->f_flag = fflag;
 	nfp->f_ops = &socketops;
 	nfp->f_type = DTYPE_SOCKET;
 	FILE_UNLOCK(nfp);
 	/* Sync socket nonblocking/async state with file flags */
 	tmp = fflag & FNONBLOCK;
 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
 	tmp = fflag & FASYNC;
 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
 	sa = 0;
 	error = soaccept(so, &sa);
 	if (error) {
 		/*
 		 * return a namelen of zero for older code which might
 		 * ignore the return value from accept.
 		 */
 		if (name)
 			*namelen = 0;
 		goto noconnection;
 	}
 	if (sa == NULL) {
 		if (name)
 			*namelen = 0;
 		goto done;
 	}
 	if (name) {
 		/* check sa_len before it is destroyed */
 		if (*namelen > sa->sa_len)
 			*namelen = sa->sa_len;
 		*name = sa;
 		sa = NULL;
 	}
 noconnection:
 	if (sa)
 		FREE(sa, M_SONAME);
 
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
 	if (error)
 		fdclose(fdp, nfp, fd, td);
 
 	/*
 	 * Release explicitly held references before returning.  We return
 	 * a reference on nfp to the caller on success if they request it.
 	 */
 done:
 	if (fp != NULL) {
 		if (error == 0) {
 			*fp = nfp;
 			nfp = NULL;
 		} else
 			*fp = NULL;
 	}
 	if (nfp != NULL)
 		fdrop(nfp, td);
 	fdrop(headfp, td);
 done2:
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 int
 accept(td, uap)
 	struct thread *td;
 	struct accept_args *uap;
 {
 
 	return (accept1(td, uap, 0));
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 oaccept(td, uap)
 	struct thread *td;
 	struct accept_args *uap;
 {
 
 	return (accept1(td, uap, 1));
 }
 #endif /* COMPAT_OLDSOCK */
 
 /* ARGSUSED */
 int
 connect(td, uap)
 	struct thread *td;
 	register struct connect_args /* {
 		int	s;
 		caddr_t	name;
 		int	namelen;
 	} */ *uap;
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error)
 		return (error);
 
 	error = kern_connect(td, uap->s, sa);
 	free(sa, M_SONAME);
 	return (error);
 }
 
 
 int
 kern_connect(td, fd, sa)
 	struct thread *td;
 	int fd;
 	struct sockaddr *sa;
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 	int interrupted = 0;
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
 		goto done2;
 	so = fp->f_data;
 	if (so->so_state & SS_ISCONNECTING) {
 		error = EALREADY;
 		goto done1;
 	}
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_connect(td->td_ucred, so, sa);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto bad;
 #endif
 	error = soconnect(so, sa, td);
 	if (error)
 		goto bad;
 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
 		error = EINPROGRESS;
 		goto done1;
 	}
 	SOCK_LOCK(so);
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
 		    "connec", 0);
 		if (error) {
 			if (error == EINTR || error == ERESTART)
 				interrupted = 1;
 			break;
 		}
 	}
 	if (error == 0) {
 		error = so->so_error;
 		so->so_error = 0;
 	}
 	SOCK_UNLOCK(so);
 bad:
 	if (!interrupted)
 		so->so_state &= ~SS_ISCONNECTING;
 	if (error == ERESTART)
 		error = EINTR;
 done1:
 	fdrop(fp, td);
 done2:
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 int
 socketpair(td, uap)
 	struct thread *td;
 	register struct socketpair_args /* {
 		int	domain;
 		int	type;
 		int	protocol;
 		int	*rsv;
 	} */ *uap;
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *fp1, *fp2;
 	struct socket *so1, *so2;
 	int fd, error, sv[2];
 
 #ifdef MAC
 	/* We might want to have a separate check for socket pairs. */
 	error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type,
 	    uap->protocol);
 	if (error)
 		return (error);
 #endif
 
 	NET_LOCK_GIANT();
 	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error)
 		goto done2;
 	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error)
 		goto free1;
 	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
 	error = falloc(td, &fp1, &fd);
 	if (error)
 		goto free2;
 	sv[0] = fd;
 	fp1->f_data = so1;	/* so1 already has ref count */
 	error = falloc(td, &fp2, &fd);
 	if (error)
 		goto free3;
 	fp2->f_data = so2;	/* so2 already has ref count */
 	sv[1] = fd;
 	error = soconnect2(so1, so2);
 	if (error)
 		goto free4;
 	if (uap->type == SOCK_DGRAM) {
 		/*
 		 * Datagram socket connection is asymmetric.
 		 */
 		 error = soconnect2(so2, so1);
 		 if (error)
 			goto free4;
 	}
 	FILE_LOCK(fp1);
 	fp1->f_flag = FREAD|FWRITE;
 	fp1->f_ops = &socketops;
 	fp1->f_type = DTYPE_SOCKET;
 	FILE_UNLOCK(fp1);
 	FILE_LOCK(fp2);
 	fp2->f_flag = FREAD|FWRITE;
 	fp2->f_ops = &socketops;
 	fp2->f_type = DTYPE_SOCKET;
 	FILE_UNLOCK(fp2);
 	error = copyout(sv, uap->rsv, 2 * sizeof (int));
 	fdrop(fp1, td);
 	fdrop(fp2, td);
 	goto done2;
 free4:
 	fdclose(fdp, fp2, sv[1], td);
 	fdrop(fp2, td);
 free3:
 	fdclose(fdp, fp1, sv[0], td);
 	fdrop(fp1, td);
 free2:
 	(void)soclose(so2);
 free1:
 	(void)soclose(so1);
 done2:
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 static int
 sendit(td, s, mp, flags)
 	register struct thread *td;
 	int s;
 	register struct msghdr *mp;
 	int flags;
 {
 	struct mbuf *control;
 	struct sockaddr *to;
 	int error;
 
 	if (mp->msg_name != NULL) {
 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
 		if (error) {
 			to = NULL;
 			goto bad;
 		}
 		mp->msg_name = to;
 	} else {
 		to = NULL;
 	}
 
 	if (mp->msg_control) {
 		if (mp->msg_controllen < sizeof(struct cmsghdr)
 #ifdef COMPAT_OLDSOCK
 		    && mp->msg_flags != MSG_COMPAT
 #endif
 		) {
 			error = EINVAL;
 			goto bad;
 		}
 		error = sockargs(&control, mp->msg_control,
 		    mp->msg_controllen, MT_CONTROL);
 		if (error)
 			goto bad;
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags == MSG_COMPAT) {
 			register struct cmsghdr *cm;
 
 			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
 			if (control == 0) {
 				error = ENOBUFS;
 				goto bad;
 			} else {
 				cm = mtod(control, struct cmsghdr *);
 				cm->cmsg_len = control->m_len;
 				cm->cmsg_level = SOL_SOCKET;
 				cm->cmsg_type = SCM_RIGHTS;
 			}
 		}
 #endif
 	} else {
 		control = NULL;
 	}
 
 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
 
 bad:
 	if (to)
 		FREE(to, M_SONAME);
 	return (error);
 }
 
 int
 kern_sendit(td, s, mp, flags, control, segflg)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	int flags;
 	struct mbuf *control;
 	enum uio_seg segflg;
 {
 	struct file *fp;
 	struct uio auio;
 	struct iovec *iov;
 	struct socket *so;
 	int i;
 	int len, error;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error)
 		goto bad2;
 	so = (struct socket *)fp->f_data;
 
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto bad;
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	iov = mp->msg_iov;
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			error = EINVAL;
 			goto bad;
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
 	if (error) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			psignal(td->td_proc, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(s, UIO_WRITE, ktruio, error);
 	}
 #endif
 bad:
 	fdrop(fp, td);
 bad2:
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 int
 sendto(td, uap)
 	struct thread *td;
 	register struct sendto_args /* {
 		int	s;
 		caddr_t	buf;
 		size_t	len;
 		int	flags;
 		caddr_t	to;
 		int	tolen;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	msg.msg_name = uap->to;
 	msg.msg_namelen = uap->tolen;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_control = 0;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags = 0;
 #endif
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 	error = sendit(td, uap->s, &msg, uap->flags);
 	return (error);
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 osend(td, uap)
 	struct thread *td;
 	register struct osend_args /* {
 		int	s;
 		caddr_t	buf;
 		int	len;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = 0;
 	error = sendit(td, uap->s, &msg, uap->flags);
 	return (error);
 }
 
 int
 osendmsg(td, uap)
 	struct thread *td;
 	struct osendmsg_args /* {
 		int	s;
 		caddr_t	msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 	if (error)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_iov = iov;
 	msg.msg_flags = MSG_COMPAT;
 	error = sendit(td, uap->s, &msg, uap->flags);
 	free(iov, M_IOV);
 	return (error);
 }
 #endif
 
 int
 sendmsg(td, uap)
 	struct thread *td;
 	struct sendmsg_args /* {
 		int	s;
 		caddr_t	msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
 	if (error)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_iov = iov;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags = 0;
 #endif
 	error = sendit(td, uap->s, &msg, uap->flags);
 	free(iov, M_IOV);
 	return (error);
 }
 
 int
 kern_recvit(td, s, mp, fromseg, controlp)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	enum uio_seg fromseg;
 	struct mbuf **controlp;
 {
 	struct uio auio;
 	struct iovec *iov;
 	int i;
 	socklen_t len;
 	int error;
 	struct mbuf *m, *control = 0;
 	caddr_t ctlbuf;
 	struct file *fp;
 	struct socket *so;
 	struct sockaddr *fromsa = 0;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 
 	if(controlp != NULL)
 		*controlp = 0;
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error) {
 		NET_UNLOCK_GIANT();
 		return (error);
 	}
 	so = fp->f_data;
 
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_receive(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error) {
 		fdrop(fp, td);
 		NET_UNLOCK_GIANT();
 		return (error);
 	}
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	iov = mp->msg_iov;
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			fdrop(fp, td);
 			NET_UNLOCK_GIANT();
 			return (EINVAL);
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
 	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
 	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
 	    &mp->msg_flags);
 	if (error) {
 		if (auio.uio_resid != (int)len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	}
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = (int)len - auio.uio_resid;
 		ktrgenio(s, UIO_READ, ktruio, error);
 	}
 #endif
 	if (error)
 		goto out;
 	td->td_retval[0] = (int)len - auio.uio_resid;
 	if (mp->msg_name) {
 		len = mp->msg_namelen;
 		if (len <= 0 || fromsa == 0)
 			len = 0;
 		else {
 			/* save sa_len before it is destroyed by MSG_COMPAT */
 			len = MIN(len, fromsa->sa_len);
 #ifdef COMPAT_OLDSOCK
 			if (mp->msg_flags & MSG_COMPAT)
 				((struct osockaddr *)fromsa)->sa_family =
 				    fromsa->sa_family;
 #endif
 			if (fromseg == UIO_USERSPACE) {
 				error = copyout(fromsa, mp->msg_name,
 				    (unsigned)len);
 				if (error)
 					goto out;
 			} else
 				bcopy(fromsa, mp->msg_name, len);
 		}
 		mp->msg_namelen = len;
 	}
 	if (mp->msg_control && controlp == NULL) {
 #ifdef COMPAT_OLDSOCK
 		/*
 		 * We assume that old recvmsg calls won't receive access
 		 * rights and other control info, esp. as control info
 		 * is always optional and those options didn't exist in 4.3.
 		 * If we receive rights, trim the cmsghdr; anything else
 		 * is tossed.
 		 */
 		if (control && mp->msg_flags & MSG_COMPAT) {
 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
 			    SOL_SOCKET ||
 			    mtod(control, struct cmsghdr *)->cmsg_type !=
 			    SCM_RIGHTS) {
 				mp->msg_controllen = 0;
 				goto out;
 			}
 			control->m_len -= sizeof (struct cmsghdr);
 			control->m_data += sizeof (struct cmsghdr);
 		}
 #endif
 		len = mp->msg_controllen;
 		m = control;
 		mp->msg_controllen = 0;
 		ctlbuf = mp->msg_control;
 
 		while (m && len > 0) {
 			unsigned int tocopy;
 
 			if (len >= m->m_len)
 				tocopy = m->m_len;
 			else {
 				mp->msg_flags |= MSG_CTRUNC;
 				tocopy = len;
 			}
 
 			if ((error = copyout(mtod(m, caddr_t),
 					ctlbuf, tocopy)) != 0)
 				goto out;
 
 			ctlbuf += tocopy;
 			len -= tocopy;
 			m = m->m_next;
 		}
 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
 	}
 out:
 	fdrop(fp, td);
 	NET_UNLOCK_GIANT();
 	if (fromsa)
 		FREE(fromsa, M_SONAME);
 
 	if (error == 0 && controlp != NULL)  
 		*controlp = control;
 	else  if (control)
 		m_freem(control);
 
 	return (error);
 }
 
 static int
 recvit(td, s, mp, namelenp)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	void *namelenp;
 {
 	int error;
 
 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
 	if (error)
 		return (error);
 	if (namelenp) {
 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags & MSG_COMPAT)
 			error = 0;	/* old recvfrom didn't check */
 #endif
 	}
 	return (error);
 }
 
 int
 recvfrom(td, uap)
 	struct thread *td;
 	register struct recvfrom_args /* {
 		int	s;
 		caddr_t	buf;
 		size_t	len;
 		int	flags;
 		struct sockaddr * __restrict	from;
 		socklen_t * __restrict fromlenaddr;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	if (uap->fromlenaddr) {
 		error = copyin(uap->fromlenaddr,
 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
 		if (error)
 			goto done2;
 	} else {
 		msg.msg_namelen = 0;
 	}
 	msg.msg_name = uap->from;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = uap->flags;
 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
 done2:
 	return(error);
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 orecvfrom(td, uap)
 	struct thread *td;
 	struct recvfrom_args *uap;
 {
 
 	uap->flags |= MSG_COMPAT;
 	return (recvfrom(td, uap));
 }
 #endif
 
-
 #ifdef COMPAT_OLDSOCK
 int
 orecv(td, uap)
 	struct thread *td;
 	register struct orecv_args /* {
 		int	s;
 		caddr_t	buf;
 		int	len;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = uap->flags;
 	error = recvit(td, uap->s, &msg, NULL);
 	return (error);
 }
 
 /*
  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
  * overlays the new one, missing only the flags, and with the (old) access
  * rights where the control fields are now.
  */
 int
 orecvmsg(td, uap)
 	struct thread *td;
 	struct orecvmsg_args /* {
 		int	s;
 		struct	omsghdr *msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 	if (error)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_flags = uap->flags | MSG_COMPAT;
 	msg.msg_iov = iov;
 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
 	if (msg.msg_controllen && error == 0)
 		error = copyout(&msg.msg_controllen,
 		    &uap->msg->msg_accrightslen, sizeof (int));
 	free(iov, M_IOV);
 	return (error);
 }
 #endif
 
 int
 recvmsg(td, uap)
 	struct thread *td;
 	struct recvmsg_args /* {
 		int	s;
 		struct	msghdr *msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *uiov, *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
 	if (error)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error)
 		return (error);
 	msg.msg_flags = uap->flags;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags &= ~MSG_COMPAT;
 #endif
 	uiov = msg.msg_iov;
 	msg.msg_iov = iov;
 	error = recvit(td, uap->s, &msg, NULL);
 	if (error == 0) {
 		msg.msg_iov = uiov;
 		error = copyout(&msg, uap->msg, sizeof(msg));
 	}
 	free(iov, M_IOV);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 shutdown(td, uap)
 	struct thread *td;
 	register struct shutdown_args /* {
 		int	s;
 		int	how;
 	} */ *uap;
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = soshutdown(so, uap->how);
 		fdrop(fp, td);
 	}
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 /* ARGSUSED */
 int
 setsockopt(td, uap)
 	struct thread *td;
 	register struct setsockopt_args /* {
 		int	s;
 		int	level;
 		int	name;
 		caddr_t	val;
 		int	valsize;
 	} */ *uap;
 {
 
 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
 	    uap->val, UIO_USERSPACE, uap->valsize));
 }
 
 int
 kern_setsockopt(td, s, level, name, val, valseg, valsize)
 	struct thread *td;
 	int s;
 	int level;
 	int name;
 	void *val;
 	enum uio_seg valseg;
 	socklen_t valsize;
 {
 	int error;
 	struct socket *so;
 	struct file *fp;
 	struct sockopt sopt;
 
 	if (val == NULL && valsize != 0)
 		return (EFAULT);
 	if ((int)valsize < 0)
 		return (EINVAL);
 
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = val;
 	sopt.sopt_valsize = valsize;
 	switch (valseg) {
 	case UIO_USERSPACE:
 		sopt.sopt_td = td;
 		break;
 	case UIO_SYSSPACE:
 		sopt.sopt_td = NULL;
 		break;
 	default:
 		panic("kern_setsockopt called with bad valseg");
 	}
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sosetopt(so, &sopt);
 		fdrop(fp, td);
 	}
 	NET_UNLOCK_GIANT();
 	return(error);
 }
 
 /* ARGSUSED */
 int
 getsockopt(td, uap)
 	struct thread *td;
 	register struct getsockopt_args /* {
 		int	s;
 		int	level;
 		int	name;
 		void * __restrict	val;
 		socklen_t * __restrict avalsize;
 	} */ *uap;
 {
 	socklen_t valsize;
 	int	error;
 
 	if (uap->val) {
 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
 		if (error)
 			return (error);
 	}
 
 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
 	    uap->val, UIO_USERSPACE, &valsize);
 
 	if (error == 0)
 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
 	return (error);
 }
 
 /*
  * Kernel version of getsockopt.
  * optval can be a userland or userspace. optlen is always a kernel pointer.
  */
 int
 kern_getsockopt(td, s, level, name, val, valseg, valsize)
 	struct thread *td;
 	int s;
 	int level;
 	int name;
 	void *val;
 	enum uio_seg valseg;
 	socklen_t *valsize;
 {
 	int error;
 	struct  socket *so;
 	struct file *fp;
 	struct	sockopt sopt;
 
 	if (val == NULL)
 		*valsize = 0;
 	if ((int)*valsize < 0)
 		return (EINVAL);
 
 	sopt.sopt_dir = SOPT_GET;
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = val;
 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
 	switch (valseg) {
 	case UIO_USERSPACE:
 		sopt.sopt_td = td;
 		break;
 	case UIO_SYSSPACE:
 		sopt.sopt_td = NULL;
 		break;
 	default:
 		panic("kern_getsockopt called with bad valseg");
 	}
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sogetopt(so, &sopt);
 		*valsize = sopt.sopt_valsize;
 		fdrop(fp, td);
 	}
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 /*
  * getsockname1() - Get socket name.
  */
 /* ARGSUSED */
 static int
 getsockname1(td, uap, compat)
 	struct thread *td;
 	register struct getsockname_args /* {
 		int	fdes;
 		struct sockaddr * __restrict asa;
 		socklen_t * __restrict alen;
 	} */ *uap;
 	int compat;
 {
 	struct sockaddr *sa;
 	socklen_t len;
 	int error;
 
 	error = copyin(uap->alen, &len, sizeof(len));
 	if (error)
 		return (error);
 
 	error = kern_getsockname(td, uap->fdes, &sa, &len);
 	if (error)
 		return (error);
 
 	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
 		if (compat)
 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
 		error = copyout(sa, uap->asa, (u_int)len);
 	}
 	free(sa, M_SONAME);
 	if (error == 0)
 		error = copyout(&len, uap->alen, sizeof(len));
 	return (error);
 }
 
 int
 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
     socklen_t *alen)
 {
 	struct socket *so;
 	struct file *fp;
 	socklen_t len;
 	int error;
 
 	if (*alen < 0)
 		return (EINVAL);
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
 		goto done;
 	so = fp->f_data;
 	*sa = NULL;
 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
 	if (error)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
 	else
 		len = MIN(*alen, (*sa)->sa_len);
 	*alen = len;
 bad:
 	fdrop(fp, td);
 	if (error && *sa) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
 done:
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 int
 getsockname(td, uap)
 	struct thread *td;
 	struct getsockname_args *uap;
 {
 
 	return (getsockname1(td, uap, 0));
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 ogetsockname(td, uap)
 	struct thread *td;
 	struct getsockname_args *uap;
 {
 
 	return (getsockname1(td, uap, 1));
 }
 #endif /* COMPAT_OLDSOCK */
 
 /*
  * getpeername1() - Get name of peer for connected socket.
  */
 /* ARGSUSED */
 static int
 getpeername1(td, uap, compat)
 	struct thread *td;
 	register struct getpeername_args /* {
 		int	fdes;
 		struct sockaddr * __restrict	asa;
 		socklen_t * __restrict	alen;
 	} */ *uap;
 	int compat;
 {
 	struct sockaddr *sa;
 	socklen_t len;
 	int error;
 
 	error = copyin(uap->alen, &len, sizeof (len));
 	if (error)
 		return (error);
 
 	error = kern_getpeername(td, uap->fdes, &sa, &len);
 	if (error)
 		return (error);
 
 	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
 		if (compat)
 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
 		error = copyout(sa, uap->asa, (u_int)len);
 	}
 	free(sa, M_SONAME);
 	if (error == 0)
 		error = copyout(&len, uap->alen, sizeof(len));
 	return (error);
 }
 
 int
 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
     socklen_t *alen)
 {
 	struct socket *so;
 	struct file *fp;
 	socklen_t len;
 	int error;
 
 	if (*alen < 0)
 		return (EINVAL);
 
 	NET_LOCK_GIANT();
 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
 		goto done2;
 	so = fp->f_data;
 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
 		error = ENOTCONN;
 		goto done1;
 	}
 	*sa = NULL;
 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
 	if (error)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
 	else
 		len = MIN(*alen, (*sa)->sa_len);
 	*alen = len;
 bad:
 	if (error && *sa) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
 done1:
 	fdrop(fp, td);
 done2:
 	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 int
 getpeername(td, uap)
 	struct thread *td;
 	struct getpeername_args *uap;
 {
 
 	return (getpeername1(td, uap, 0));
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 ogetpeername(td, uap)
 	struct thread *td;
 	struct ogetpeername_args *uap;
 {
 
 	/* XXX uap should have type `getpeername_args *' to begin with. */
 	return (getpeername1(td, (struct getpeername_args *)uap, 1));
 }
 #endif /* COMPAT_OLDSOCK */
 
 int
 sockargs(mp, buf, buflen, type)
 	struct mbuf **mp;
 	caddr_t buf;
 	int buflen, type;
 {
 	register struct sockaddr *sa;
 	register struct mbuf *m;
 	int error;
 
 	if ((u_int)buflen > MLEN) {
 #ifdef COMPAT_OLDSOCK
 		if (type == MT_SONAME && (u_int)buflen <= 112)
 			buflen = MLEN;		/* unix domain compat. hack */
 		else
 #endif
 			if ((u_int)buflen > MCLBYTES)
 				return (EINVAL);
 	}
 	m = m_get(M_TRYWAIT, type);
 	if (m == NULL)
 		return (ENOBUFS);
 	if ((u_int)buflen > MLEN) {
 		MCLGET(m, M_TRYWAIT);
 		if ((m->m_flags & M_EXT) == 0) {
 			m_free(m);
 			return (ENOBUFS);
 		}
 	}
 	m->m_len = buflen;
 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
 	if (error)
 		(void) m_free(m);
 	else {
 		*mp = m;
 		if (type == MT_SONAME) {
 			sa = mtod(m, struct sockaddr *);
 
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
 				sa->sa_family = sa->sa_len;
 #endif
 			sa->sa_len = buflen;
 		}
 	}
 	return (error);
 }
 
 int
 getsockaddr(namp, uaddr, len)
 	struct sockaddr **namp;
 	caddr_t uaddr;
 	size_t len;
 {
 	struct sockaddr *sa;
 	int error;
 
 	if (len > SOCK_MAXADDRLEN)
 		return (ENAMETOOLONG);
 	if (len < offsetof(struct sockaddr, sa_data[0]))
 		return (EINVAL);
 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
 	error = copyin(uaddr, sa, len);
 	if (error) {
 		FREE(sa, M_SONAME);
 	} else {
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
 			sa->sa_family = sa->sa_len;
 #endif
 		sa->sa_len = len;
 		*namp = sa;
 	}
 	return (error);
 }
 
 /*
  * Detach mapped page and release resources back to the system.
  */
 void
 sf_buf_mext(void *addr, void *args)
 {
 	vm_page_t m;
 
 	m = sf_buf_page(args);
 	sf_buf_free(args);
 	vm_page_lock_queues();
 	vm_page_unwire(m, 0);
 	/*
 	 * Check for the object going away on us. This can
 	 * happen since we don't hold a reference to it.
 	 * If so, we're responsible for freeing the page.
 	 */
 	if (m->wire_count == 0 && m->object == NULL)
 		vm_page_free(m);
 	vm_page_unlock_queues();
 }
 
 /*
  * sendfile(2)
  *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  *
  * Send a file specified by 'fd' and starting at 'offset' to a socket
  * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
- * 0. Optionally add a header and/or trailer to the socket output.  If
+ * 0.  Optionally add a header and/or trailer to the socket output.  If
  * specified, write the total number of bytes sent into *sbytes.
- *
  */
 int
 sendfile(struct thread *td, struct sendfile_args *uap)
 {
 
 	return (do_sendfile(td, uap, 0));
 }
 
 static int
 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	int error;
 
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 		if (error)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
 			if (error)
 				goto out;
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
 			if (error)
 				goto out;
 
 		}
 	}
 
 	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
 out:
 	if (hdr_uio)
 		free(hdr_uio, M_IOV);
 	if (trl_uio)
 		free(trl_uio, M_IOV);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 {
 	struct sendfile_args args;
 
 	args.fd = uap->fd;
 	args.s = uap->s;
 	args.offset = uap->offset;
 	args.nbytes = uap->nbytes;
 	args.hdtr = uap->hdtr;
 	args.sbytes = uap->sbytes;
 	args.flags = uap->flags;
 
 	return (do_sendfile(td, &args, 1));
 }
 #endif /* COMPAT_FREEBSD4 */
 
 int
 kern_sendfile(struct thread *td, struct sendfile_args *uap,
     struct uio *hdr_uio, struct uio *trl_uio, int compat)
 {
 	struct file *sock_fp;
 	struct vnode *vp;
 	struct vm_object *obj = NULL;
 	struct socket *so = NULL;
 	struct mbuf *m = NULL;
 	struct sf_buf *sf;
 	struct vm_page *pg;
 	off_t off, xfsize, sbytes = 0, rem = 0;
 	int error, mnw = 0;
 	int vfslocked;
 
 	NET_LOCK_GIANT();
 
 	/*
 	 * The file descriptor must be a regular file and have a
 	 * backing VM object.
 	 * File offset must be positive.  If it goes beyond EOF
 	 * we send only the header/trailer and no payload data.
 	 */
 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
 		goto out;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	obj = vp->v_object;
 	if (obj != NULL) {
 		/*
 		 * Temporarily increase the backing VM object's reference
 		 * count so that a forced reclamation of its vnode does not
 		 * immediately destroy it.
 		 */
 		VM_OBJECT_LOCK(obj);
 		if ((obj->flags & OBJ_DEAD) == 0) {
 			vm_object_reference_locked(obj);
 			VM_OBJECT_UNLOCK(obj);
 		} else {
 			VM_OBJECT_UNLOCK(obj);
 			obj = NULL;
 		}
 	}
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (obj == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 	if (uap->offset < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * The socket must be a stream socket and connected.
 	 * Remember if it a blocking or non-blocking socket.
 	 */
 	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
 	    NULL)) != 0)
 		goto out;
 	so = sock_fp->f_data;
 	if (so->so_type != SOCK_STREAM) {
 		error = EINVAL;
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		error = ENOTCONN;
 		goto out;
 	}
 	/*
 	 * Do not wait on memory allocations but return ENOMEM for
 	 * caller to retry later.
 	 * XXX: Experimental.
 	 */
 	if (uap->flags & SF_MNOWAIT)
 		mnw = 1;
 
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto out;
 #endif
 
 	/* If headers are specified copy them into mbufs. */
 	if (hdr_uio != NULL) {
 		hdr_uio->uio_td = td;
 		hdr_uio->uio_rw = UIO_WRITE;
 		if (hdr_uio->uio_resid > 0) {
 			/*
 			 * In FBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
 			    0, 0, 0);
 			if (m == NULL) {
 				error = mnw ? EAGAIN : ENOBUFS;
 				goto out;
 			}
 		}
 	}
 
 	/* Protect against multiple writers to the socket. */
 	SOCKBUF_LOCK(&so->so_snd);
 	(void) sblock(&so->so_snd, M_WAITOK);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/*
 	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
 	 * This is done in two loops.  The inner loop turns as many pages
 	 * as it can, up to available socket buffer space, without blocking
 	 * into mbufs to have it bulk delivered into the socket send buffer.
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
 	for (off = uap->offset; ; ) {
 		int loopbytes = 0;
 		int space = 0;
 		int done = 0;
 
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
 		 * If space is low allow for the remainder of the
 		 * file to be processed if it fits the socket buffer.
 		 * Otherwise block in waiting for sufficient space
 		 * to proceed, or if the socket is nonblocking, return
 		 * to userland with EAGAIN while reporting how far
 		 * we've come.
 		 * We wait until the socket buffer has significant free
 		 * space to do bulk sends.  This makes good use of file
 		 * system read ahead and allows packet segmentation
 		 * offloading hardware to take over lots of work.  If
 		 * we were not careful here we would send off only one
 		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 retry_space:
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		} else if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
 		space = sbspace(&so->so_snd);
 		if (space < rem &&
 		    (space <= 0 ||
 		     space < so->so_snd.sb_lowat)) {
 			if (so->so_state & SS_NBIO) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EAGAIN;
 				goto done;
 			}
 			/*
 			 * sbwait drops the lock while sleeping.
 			 * When we loop back to retry_space the
 			 * state may have changed and we retest
 			 * for it.
 			 */
 			error = sbwait(&so->so_snd);
 			/*
 			 * An error from sbwait usually indicates that we've
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
 			if (error) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			goto retry_space;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
 		while(space > loopbytes) {
 			vm_pindex_t pindex;
 			vm_offset_t pgoff;
 			struct mbuf *m0;
 
 			VM_OBJECT_LOCK(obj);
 			/*
 			 * Calculate the amount to transfer.
 			 * Not to exceed a page, the EOF,
 			 * or the passed in nbytes.
 			 */
 			pgoff = (vm_offset_t)(off & PAGE_MASK);
 			xfsize = omin(PAGE_SIZE - pgoff,
 			    obj->un_pager.vnp.vnp_size - off -
 			    sbytes - loopbytes);
 			if (uap->nbytes)
 				rem = (uap->nbytes - sbytes - loopbytes);
 			else
 				rem = obj->un_pager.vnp.vnp_size - off -
 				    sbytes - loopbytes;
 			xfsize = omin(rem, xfsize);
 			if (xfsize <= 0) {
 				VM_OBJECT_UNLOCK(obj);
 				done = 1;		/* all data sent */
 				break;
 			}
 			/*
 			 * Don't overflow the send buffer.
 			 * Stop here and send out what we've
 			 * already got.
 			 */
 			if (space < loopbytes + xfsize) {
 				VM_OBJECT_UNLOCK(obj);
 				break;
 			}
 retry_lookup:
 			/*
 			 * Attempt to look up the page.
 			 * Allocate if not found or
 			 * wait and loop if busy.
 			 */
 			pindex = OFF_TO_IDX(off);
 			pg = vm_page_lookup(obj, pindex);
 			if (pg == NULL) {
 				pg = vm_page_alloc(obj, pindex,
 				    VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL |
 				    VM_ALLOC_WIRED);
 				if (pg == NULL) {
 					VM_OBJECT_UNLOCK(obj);
 					VM_WAIT;
 					VM_OBJECT_LOCK(obj);
 					goto retry_lookup;
 				}
 			} else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
 				goto retry_lookup;
 			else {
 				/*
 				 * Wire the page so it does not get
 				 * ripped out from under us.
 				 */
 				vm_page_lock_queues();
 				vm_page_wire(pg);
 				vm_page_unlock_queues();
 			}
 
 			/*
 			 * Check if page is valid for what we need,
 			 * otherwise initiate I/O.
 			 * If we already turned some pages into mbufs,
 			 * send them off before we come here again and
 			 * block.
 			 */
 			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
 				VM_OBJECT_UNLOCK(obj);
 			else if (m != NULL)
 				error = EAGAIN;	/* send what we already got */
 			else if (uap->flags & SF_NODISKIO)
 				error = EBUSY;
 			else {
 				int bsize, resid;
 
 				/*
 				 * Ensure that our page is still around
 				 * when the I/O completes.
 				 */
 				vm_page_io_start(pg);
 				VM_OBJECT_UNLOCK(obj);
 
 				/*
 				 * Get the page from backing store.
 				 */
 				bsize = vp->v_mount->mnt_stat.f_iosize;
 				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 				vn_lock(vp, LK_SHARED | LK_RETRY, td);
 
 				/*
 				 * XXXMAC: Because we don't have fp->f_cred
 				 * here, we pass in NOCRED.  This is probably
 				 * wrong, but is consistent with our original
 				 * implementation.
 				 */
 				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
 				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
 				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
 				    td->td_ucred, NOCRED, &resid, td);
 				VOP_UNLOCK(vp, 0, td);
 				VFS_UNLOCK_GIANT(vfslocked);
 				VM_OBJECT_LOCK(obj);
 				vm_page_io_finish(pg);
 				if (!error)
 					VM_OBJECT_UNLOCK(obj);
 				mbstat.sf_iocnt++;
 			}
 			if (error) {
 				vm_page_lock_queues();
 				vm_page_unwire(pg, 0);
 				/*
 				 * See if anyone else might know about
 				 * this page.  If not and it is not valid,
 				 * then free it.
 				 */
 				if (pg->wire_count == 0 && pg->valid == 0 &&
 				    pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
 				    pg->hold_count == 0) {
 					vm_page_free(pg);
 				}
 				vm_page_unlock_queues();
 				VM_OBJECT_UNLOCK(obj);
 				if (error == EAGAIN)
 					error = 0;	/* not a real error */
 				break;
 			}
 
 			/*
 			 * Get a sendfile buf.  We usually wait as long
 			 * as necessary, but this wait can be interrupted.
 			 */
 			if ((sf = sf_buf_alloc(pg,
 			    (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) {
 				mbstat.sf_allocfail++;
 				vm_page_lock_queues();
 				vm_page_unwire(pg, 0);
 				/*
 				 * XXX: Not same check as above!?
 				 */
 				if (pg->wire_count == 0 && pg->object == NULL)
 					vm_page_free(pg);
 				vm_page_unlock_queues();
 				error = (mnw ? EAGAIN : EINTR);
 				break;
 			}
 
 			/*
 			 * Get an mbuf and set it up as having
 			 * external storage.
 			 */
 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
 			if (m0 == NULL) {
 				error = (mnw ? EAGAIN : ENOBUFS);
 				sf_buf_mext((void *)sf_buf_kva(sf), sf);
 				break;
 			}
 			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
 			    sf, M_RDONLY, EXT_SFBUF);
 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
 			m0->m_len = xfsize;
 
 			/* Append to mbuf chain. */
 			if (m != NULL)
 				m_cat(m, m0);
 			else
 				m = m0;
 
 			/* Keep track of bits processed. */
 			loopbytes += xfsize;
 			off += xfsize;
 		}
 
 		/* Add the buffer chain to the socket buffer. */
 		if (m != NULL) {
 			int mlen;
 
 			mlen = m_length(m, NULL);
 			SOCKBUF_LOCK(&so->so_snd);
 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 				error = EPIPE;
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = (*so->so_proto->pr_usrreqs->pru_send)
 				    (so, 0, m, NULL, NULL, td);
 			if (!error)
 				sbytes += mlen;
 			m = NULL;	/* pru_send always consumes */
 		}
 
 		/* Quit outer loop on error or when we're done. */
 		if (error || done)
 			goto done;
 	}
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 */
 	if (trl_uio != NULL) {
 		error = kern_writev(td, uap->s, trl_uio);
 		if (error)
 			goto done;
 		sbytes += td->td_retval[0];
 	}
 
 done:
 	SOCKBUF_LOCK(&so->so_snd);
 	sbunlock(&so->so_snd);
 	SOCKBUF_UNLOCK(&so->so_snd);
 out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
 	 */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
 	if (uap->sbytes != NULL) {
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
 	}
 	if (obj != NULL)
 		vm_object_deallocate(obj);
 	if (vp != NULL) {
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (so)
 		fdrop(sock_fp, td);
 	if (m)
 		m_freem(m);
 
 	NET_UNLOCK_GIANT();
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 /*
  * SCTP syscalls.
  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
  * otherwise all return EOPNOTSUPP.
  * XXX: We should make this loadable one day.
  */
 int
 sctp_peeloff(td, uap)
 	struct thread *td;
 	struct sctp_peeloff_args /* {
 		int	sd;
 		caddr_t	name;
 	} */ *uap;
 {
 #ifdef SCTP
 	struct filedesc *fdp;
 	struct file *nfp = NULL;
 	int error;
 	struct socket *head, *so;
 	int fd;
 	u_int fflag;
 
 	fdp = td->td_proc->p_fd;
 	error = fgetsock(td, uap->sd, &head, &fflag);
 	if (error)
 		goto done2;
 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
 	if (error)
 		goto done2;
 	/*
 	 * At this point we know we do have a assoc to pull
 	 * we proceed to get the fd setup. This may block
 	 * but that is ok.
 	 */
 
 	error = falloc(td, &nfp, &fd);
 	if (error)
 		goto done;
 	td->td_retval[0] = fd;
 
 	so = sonewconn(head, SS_ISCONNECTED);
 	if (so == NULL) 
 		goto noconnection;
 	/*
 	 * Before changing the flags on the socket, we have to bump the
 	 * reference count.  Otherwise, if the protocol calls sofree(),
 	 * the socket will be released due to a zero refcount.
 	 */
         SOCK_LOCK(so);
         soref(so);                      /* file descriptor reference */
         SOCK_UNLOCK(so);
 
 	ACCEPT_LOCK();
 
 	TAILQ_REMOVE(&head->so_comp, so, so_list);
 	head->so_qlen--;
 	so->so_state |= (head->so_state & SS_NBIO);
 	so->so_state &= ~SS_NOFDREF;
 	so->so_qstate &= ~SQ_COMP;
 	so->so_head = NULL;
 
 	ACCEPT_UNLOCK();
 
 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
 	if (error)
 		goto noconnection;
 	if (head->so_sigio != NULL)
 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
 
 	FILE_LOCK(nfp);
 	nfp->f_data = so;
 	nfp->f_flag = fflag;
 	nfp->f_ops = &socketops;
 	nfp->f_type = DTYPE_SOCKET;
 	FILE_UNLOCK(nfp);
 
 noconnection:
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
 	if (error)
 		fdclose(fdp, nfp, fd, td);
 
 	/*
 	 * Release explicitly held references before returning.
 	 */
 done:
 	if (nfp != NULL)
 		fdrop(nfp, td);
 	fputsock(head);
 done2:
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 int
 sctp_generic_sendmsg (td, uap)
 	struct thread *td;
 	struct sctp_generic_sendmsg_args /* {
 		int sd, 
 		caddr_t msg, 
 		int mlen, 
 		caddr_t to, 
 		__socklen_t tolen, 
 		struct sctp_sndrcvinfo *sinfo, 
 		int flags
 	} */ *uap;
 {
 #ifdef SCTP
 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
 	struct socket *so;
 	struct file *fp;
 	int use_rcvinfo = 1;
 	int error = 0, len;
 	struct sockaddr *to = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	struct uio auio;
 	struct iovec iov[1];
 
 	if (uap->sinfo) {
 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
 		if (error)
 			return (error);
 		u_sinfo = &sinfo;
 	}
 	if (uap->tolen) {
 		error = getsockaddr(&to, uap->to, uap->tolen);
 		if (error) {
 			to = NULL;
 			goto sctp_bad2;
 		}
 	}
 
 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
 	if (error)
 		goto sctp_bad;
 
 	iov[0].iov_base = uap->msg;
 	iov[0].iov_len = uap->mlen;
 
 	so = (struct socket *)fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto sctp_bad;
 #endif /* MAC */
 
 	auio.uio_iov =  iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	len = auio.uio_resid = uap->mlen;
 	error = sctp_lower_sosend(so, to, &auio,
 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
 		    uap->flags, use_rcvinfo, u_sinfo, td);
 	if (error) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket. */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(uap->flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			psignal(td->td_proc, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
 	}
 #endif /* KTRACE */
 sctp_bad:
 	fdrop(fp, td);
 sctp_bad2:
 	if (to)
 		free(to, M_SONAME);
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 int
 sctp_generic_sendmsg_iov(td, uap)
 	struct thread *td;
 	struct sctp_generic_sendmsg_iov_args /* {
 		int sd, 
 		struct iovec *iov, 
 		int iovlen, 
 		caddr_t to, 
 		__socklen_t tolen, 
 		struct sctp_sndrcvinfo *sinfo, 
 		int flags
 	} */ *uap;
 {
 #ifdef SCTP
 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
 	struct socket *so;
 	struct file *fp;
 	int use_rcvinfo = 1;
 	int error=0, len, i;
 	struct sockaddr *to = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	struct uio auio;
 	struct iovec *iov, *tiov;
 
 	if (uap->sinfo) {
 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
 		if (error)
 			return (error);
 		u_sinfo = &sinfo;
 	}
 	if (uap->tolen) {
 		error = getsockaddr(&to, uap->to, uap->tolen);
 		if (error) {
 			to = NULL;
 			goto sctp_bad2;
 		}
 	}
 
 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
 	if (error)
 		goto sctp_bad1;
 
 	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
 	if (error)
 		goto sctp_bad1;
 
 	so = (struct socket *)fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error)
 		goto sctp_bad;
 #endif /* MAC */
 
 	auio.uio_iov =  iov;
 	auio.uio_iovcnt = uap->iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	tiov = iov;
 	for (i = 0; i <uap->iovlen; i++, tiov++) {
 		if ((auio.uio_resid += tiov->iov_len) < 0) {
 			error = EINVAL;
 			goto sctp_bad;
 		}
 	}
 	len = auio.uio_resid;
 	error = sctp_lower_sosend(so, to, &auio,
 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
 		    uap->flags, use_rcvinfo, u_sinfo, td);
 	if (error) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(uap->flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			psignal(td->td_proc, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
 	}
 #endif /* KTRACE */
 sctp_bad:
 	free(iov, M_IOV);
 sctp_bad1:
 	fdrop(fp, td);
 sctp_bad2:
 	if (to)
 		free(to, M_SONAME);
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 int
 sctp_generic_recvmsg(td, uap)
 	struct thread *td;
 	struct sctp_generic_recvmsg_args /* {
 		int sd, 
 		struct iovec *iov, 
 		int iovlen,
 		struct sockaddr *from, 
 		__socklen_t *fromlenaddr,
 		struct sctp_sndrcvinfo *sinfo, 
 		int *msg_flags
 	} */ *uap;
 {
 #ifdef SCTP
 	u_int8_t sockbufstore[256];
 	struct uio auio;
 	struct iovec *iov, *tiov;
 	struct sctp_sndrcvinfo sinfo;
 	struct socket *so;
 	struct file *fp;
 	struct sockaddr *fromsa;
 	int fromlen;
 	int len, i, msg_flags;
 	int error = 0;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
 	if (error) {
 		return (error);
 	}
 	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
 	if (error) {
 		goto out1;
 	}
 
 	so = fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_receive(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error) {
 		goto out;
 		return (error);
 	}
 #endif /* MAC */
 
 	if (uap->fromlenaddr) {
 		error = copyin(uap->fromlenaddr,
 		    &fromlen, sizeof (fromlen));
 		if (error) {
 			goto out;
 		}
 	} else {
 		fromlen = 0;
 	}
 	if(uap->msg_flags) {
 		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
 		if (error) {
 			goto out;
 		}
 	} else {
 		msg_flags = 0;
 	}
 	auio.uio_iov = iov;
 	auio.uio_iovcnt = uap->iovlen;
   	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	tiov = iov;
 	for (i = 0; i <uap->iovlen; i++, tiov++) {
 		if ((auio.uio_resid += tiov->iov_len) < 0) {
 			error = EINVAL;
 			goto out;
 		}
 	}
 	len = auio.uio_resid;
 	fromsa = (struct sockaddr *)sockbufstore;
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
 		    fromsa, fromlen, &msg_flags,
 		    (struct sctp_sndrcvinfo *)&sinfo, 1);
 	if (error) {
 		if (auio.uio_resid != (int)len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	} else {
 		if (uap->sinfo)
 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
 	}
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = (int)len - auio.uio_resid;
 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
 	}
 #endif /* KTRACE */
 	if (error)
 		goto out;
 	td->td_retval[0] = (int)len - auio.uio_resid;
 
 	if (fromlen && uap->from) {
 		len = fromlen;
 		if (len <= 0 || fromsa == 0)
 			len = 0;
 		else {
 			len = MIN(len, fromsa->sa_len);
 			error = copyout(fromsa, uap->from, (unsigned)len);
 			if (error)
 				goto out;
 		}
 		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
 		if (error) {
 			goto out;
 		}
 	}
 	if (uap->msg_flags) {
 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
 		if (error) {
 			goto out;
 		}
 	}
 out:
 	free(iov, M_IOV);
 out1:
 	fdrop(fp, td);
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
Index: head/sys/kern/vfs_aio.c
===================================================================
--- head/sys/kern/vfs_aio.c	(revision 167231)
+++ head/sys/kern/vfs_aio.c	(revision 167232)
@@ -1,2322 +1,2322 @@
 /*-
  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. John S. Dyson's name may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * DISCLAIMER:  This code isn't warranted to do anything useful.  Anything
  * bad that happens because of using this software isn't the responsibility
  * of the author.  This software is distributed AS-IS.
  */
 
 /*
  * This file contains support for the POSIX 1003.1B AIO/LIO facility.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/eventhandler.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/kthread.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/unistd.h>
 #include <sys/posix4.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/protosw.h>
 #include <sys/sema.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/uma.h>
 #include <sys/aio.h>
 
 #include "opt_vfs_aio.h"
 
 /*
  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
  * overflow. (XXX will be removed soon.)
  */
 static u_long jobrefid;
 
 /*
  * Counter for aio_fsync.
  */
 static uint64_t jobseqno;
 
 #define JOBST_NULL		0
 #define JOBST_JOBQSOCK		1
 #define JOBST_JOBQGLOBAL	2
 #define JOBST_JOBRUNNING	3
 #define JOBST_JOBFINISHED	4
 #define JOBST_JOBQBUF		5
 #define JOBST_JOBQSYNC		6
 
 #ifndef MAX_AIO_PER_PROC
 #define MAX_AIO_PER_PROC	32
 #endif
 
 #ifndef MAX_AIO_QUEUE_PER_PROC
 #define MAX_AIO_QUEUE_PER_PROC	256 /* Bigger than AIO_LISTIO_MAX */
 #endif
 
 #ifndef MAX_AIO_PROCS
 #define MAX_AIO_PROCS		32
 #endif
 
 #ifndef MAX_AIO_QUEUE
 #define	MAX_AIO_QUEUE		1024 /* Bigger than AIO_LISTIO_MAX */
 #endif
 
 #ifndef TARGET_AIO_PROCS
 #define TARGET_AIO_PROCS	4
 #endif
 
 #ifndef MAX_BUF_AIO
 #define MAX_BUF_AIO		16
 #endif
 
 #ifndef AIOD_TIMEOUT_DEFAULT
 #define	AIOD_TIMEOUT_DEFAULT	(10 * hz)
 #endif
 
 #ifndef AIOD_LIFETIME_DEFAULT
 #define AIOD_LIFETIME_DEFAULT	(30 * hz)
 #endif
 
 static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
 
 static int max_aio_procs = MAX_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
 	CTLFLAG_RW, &max_aio_procs, 0,
 	"Maximum number of kernel threads to use for handling async IO ");
 
 static int num_aio_procs = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
 	CTLFLAG_RD, &num_aio_procs, 0,
 	"Number of presently active kernel threads for async IO");
 
 /*
  * The code will adjust the actual number of AIO processes towards this
  * number when it gets a chance.
  */
 static int target_aio_procs = TARGET_AIO_PROCS;
 SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
 	0, "Preferred number of ready kernel threads for async IO");
 
 static int max_queue_count = MAX_AIO_QUEUE;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
     "Maximum number of aio requests to queue, globally");
 
 static int num_queue_count = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
     "Number of queued aio requests");
 
 static int num_buf_aio = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
     "Number of aio requests presently handled by the buf subsystem");
 
 /* Number of async I/O thread in the process of being started */
 /* XXX This should be local to aio_aqueue() */
 static int num_aio_resv_start = 0;
 
 static int aiod_timeout;
 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
     "Timeout value for synchronous aio operations");
 
 static int aiod_lifetime;
 SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
     "Maximum lifetime for idle aiod");
 
 static int unloadable = 0;
 SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
     "Allow unload of aio (not recommended)");
 
 
 static int max_aio_per_proc = MAX_AIO_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
     0, "Maximum active aio requests per process (stored in the process)");
 
 static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
     &max_aio_queue_per_proc, 0,
     "Maximum queued aio requests per process (stored in the process)");
 
 static int max_buf_aio = MAX_BUF_AIO;
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
     "Maximum buf aio requests per process (stored in the process)");
 
 typedef struct oaiocb {
 	int	aio_fildes;		/* File descriptor */
 	off_t	aio_offset;		/* File offset for I/O */
 	volatile void *aio_buf;         /* I/O buffer in process space */
 	size_t	aio_nbytes;		/* Number of bytes for I/O */
 	struct	osigevent aio_sigevent;	/* Signal to deliver */
 	int	aio_lio_opcode;		/* LIO opcode */
 	int	aio_reqprio;		/* Request priority -- ignored */
 	struct	__aiocb_private	_aiocb_private;
 } oaiocb_t;
 
 /*
  * Below is a key of locks used to protect each member of struct aiocblist
  * aioliojob and kaioinfo and any backends.
  *
  * * - need not protected
  * a - locked by kaioinfo lock
  * b - locked by backend lock, the backend lock can be null in some cases,
  *     for example, BIO belongs to this type, in this case, proc lock is
  *     reused.
  * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
  */
 
 /*
  * Current, there is only two backends: BIO and generic file I/O.
  * socket I/O is served by generic file I/O, this is not a good idea, since
  * disk file I/O and any other types without O_NONBLOCK flag can block daemon
  * threads, if there is no thread to serve socket I/O, the socket I/O will be
  * delayed too long or starved, we should create some threads dedicated to
  * sockets to do non-blocking I/O, same for pipe and fifo, for these I/O
  * systems we really need non-blocking interface, fiddling O_NONBLOCK in file
  * structure is not safe because there is race between userland and aio
  * daemons.
  */
 
 struct aiocblist {
 	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list of for backend */
 	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
 	TAILQ_ENTRY(aiocblist) allist;  /* (a) list of all jobs in proc */
 	int	jobflags;		/* (a) job flags */
 	int	jobstate;		/* (b) job state */
 	int	inputcharge;		/* (*) input blockes */
 	int	outputcharge;		/* (*) output blockes */
 	struct	buf *bp;		/* (*) private to BIO backend,
 				  	 * buffer pointer
 					 */
 	struct	proc *userproc;		/* (*) user process */
 	struct  ucred *cred;		/* (*) active credential when created */
 	struct	file *fd_file;		/* (*) pointer to file structure */
 	struct	aioliojob *lio;		/* (*) optional lio job */
 	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
 	struct	knlist klist;		/* (a) list of knotes */
 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
 	ksiginfo_t ksi;			/* (a) realtime signal info */
 	struct	task biotask;		/* (*) private to BIO backend */
 	uint64_t seqno;			/* (*) job number */
 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
 };
 
 /* jobflags */
 #define AIOCBLIST_DONE		0x01
 #define AIOCBLIST_BUFDONE	0x02
 #define AIOCBLIST_RUNDOWN	0x04
 #define AIOCBLIST_CHECKSYNC	0x08
 
 /*
  * AIO process info
  */
 #define AIOP_FREE	0x1			/* proc on free queue */
 
 struct aiothreadlist {
 	int aiothreadflags;			/* (c) AIO proc flags */
 	TAILQ_ENTRY(aiothreadlist) list;	/* (c) list of processes */
 	struct thread *aiothread;		/* (*) the AIO thread */
 };
 
 /*
  * data-structure for lio signal management
  */
 struct aioliojob {
 	int	lioj_flags;			/* (a) listio flags */
 	int	lioj_count;			/* (a) listio flags */
 	int	lioj_finished_count;		/* (a) listio flags */
 	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
 	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
 	struct  knlist klist;			/* (a) list of knotes */
 	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
 };
 
 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
 #define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
 
 /*
  * per process aio data structure
  */
 struct kaioinfo {
 	struct mtx	kaio_mtx;	/* the lock to protect this struct */
 	int	kaio_flags;		/* (a) per process kaio flags */
 	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
 	int	kaio_active_count;	/* (c) number of currently used AIOs */
 	int	kaio_qallowed_count;	/* (*) maxiumu size of AIO queue */
 	int	kaio_count;		/* (a) size of AIO queue */
 	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
 	int	kaio_buffer_count;	/* (a) number of physio buffers */
 	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
 	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
 	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
 	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
 	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
 	TAILQ_HEAD(,aiocblist) kaio_sockqueue;  /* (a) queue for aios waiting on sockets,
 						 *  NOT USED YET.
 						 */
 	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
 	struct	task	kaio_task;	/* (*) task to kick aio threads */
 };
 
 #define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
 #define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
 #define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
 #define AIO_MTX(ki)		(&(ki)->kaio_mtx)
 
 #define KAIO_RUNDOWN	0x1	/* process is being run down */
 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
 
 static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* (c) Idle daemons */
 static struct sema aio_newproc_sem;
 static struct mtx aio_job_mtx;
 static struct mtx aio_sock_mtx;
 static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
 static struct unrhdr *aiod_unr;
 
 void		aio_init_aioinfo(struct proc *p);
 static void	aio_onceonly(void);
 static int	aio_free_entry(struct aiocblist *aiocbe);
 static void	aio_process(struct aiocblist *aiocbe);
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *job,
 			struct aioliojob *lio, int type, int osigev);
 static void	aio_physwakeup(struct buf *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
 static void	biohelper(void *, int);
 static void	aio_daemon(void *param);
 static void	aio_swake_cb(struct socket *, struct sockbuf *);
 static int	aio_unload(void);
 static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
 #define DONE_BUF	1
 #define DONE_QUEUE	2
 static int	do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev);
 static int	aio_kick(struct proc *userp);
 static void	aio_kick_nowait(struct proc *userp);
 static void	aio_kick_helper(void *context, int pending);
 static int	filt_aioattach(struct knote *kn);
 static void	filt_aiodetach(struct knote *kn);
 static int	filt_aio(struct knote *kn, long hint);
 static int	filt_lioattach(struct knote *kn);
 static void	filt_liodetach(struct knote *kn);
 static int	filt_lio(struct knote *kn, long hint);
 
 /*
  * Zones for:
  * 	kaio	Per process async io info
  *	aiop	async io thread data
  *	aiocb	async io jobs
  *	aiol	list io job pointer - internal to aio_suspend XXX
  *	aiolio	list io jobs
  */
 static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
 
 /* kqueue filters for aio */
 static struct filterops aio_filtops =
 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
 static struct filterops lio_filtops =
 	{ 0, filt_lioattach, filt_liodetach, filt_lio };
 
 static eventhandler_tag exit_tag, exec_tag;
 
 TASKQUEUE_DEFINE_THREAD(aiod_bio);
 
 /*
  * Main operations function for use as a kernel module.
  */
 static int
 aio_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		aio_onceonly();
 		break;
 	case MOD_UNLOAD:
 		error = aio_unload();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t aio_mod = {
 	"aio",
 	&aio_modload,
 	NULL
 };
 
 SYSCALL_MODULE_HELPER(aio_cancel);
 SYSCALL_MODULE_HELPER(aio_error);
 SYSCALL_MODULE_HELPER(aio_fsync);
 SYSCALL_MODULE_HELPER(aio_read);
 SYSCALL_MODULE_HELPER(aio_return);
 SYSCALL_MODULE_HELPER(aio_suspend);
 SYSCALL_MODULE_HELPER(aio_waitcomplete);
 SYSCALL_MODULE_HELPER(aio_write);
 SYSCALL_MODULE_HELPER(lio_listio);
 SYSCALL_MODULE_HELPER(oaio_read);
 SYSCALL_MODULE_HELPER(oaio_write);
 SYSCALL_MODULE_HELPER(olio_listio);
 
 DECLARE_MODULE(aio, aio_mod,
 	SI_SUB_VFS, SI_ORDER_ANY);
 MODULE_VERSION(aio, 1);
 
 /*
  * Startup initialization
  */
 static void
 aio_onceonly(void)
 {
 
 	/* XXX: should probably just use so->callback */
 	aio_swake = &aio_swake_cb;
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
 	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
 	TAILQ_INIT(&aio_freeproc);
 	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
 	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
 	mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
 	TAILQ_INIT(&aio_jobs);
 	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
 	jobrefid = 1;
 	async_io_version = _POSIX_VERSION;
 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
 }
 
 /*
  * Callback for unload of AIO when used as a module.
  */
 static int
 aio_unload(void)
 {
 	int error;
 
 	/*
 	 * XXX: no unloads by default, it's too dangerous.
 	 * perhaps we could do it if locked out callers and then
 	 * did an aio_proc_rundown() on each process.
 	 *
 	 * jhb: aio_proc_rundown() needs to run on curproc though,
 	 * so I don't think that would fly.
 	 */
 	if (!unloadable)
 		return (EOPNOTSUPP);
 
 	error = kqueue_del_filteropts(EVFILT_AIO);
 	if (error)
 		return error;
 	error = kqueue_del_filteropts(EVFILT_LIO);
 	if (error)
 		return error;
 	async_io_version = 0;
 	aio_swake = NULL;
 	taskqueue_free(taskqueue_aiod_bio);
 	delete_unrhdr(aiod_unr);
 	uma_zdestroy(kaio_zone);
 	uma_zdestroy(aiop_zone);
 	uma_zdestroy(aiocb_zone);
 	uma_zdestroy(aiol_zone);
 	uma_zdestroy(aiolio_zone);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
 	mtx_destroy(&aio_job_mtx);
 	mtx_destroy(&aio_sock_mtx);
 	sema_destroy(&aio_newproc_sem);
 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
 	return (0);
 }
 
 /*
  * Init the per-process aioinfo structure.  The aioinfo limits are set
  * per-process for user limit (resource) management.
  */
 void
 aio_init_aioinfo(struct proc *p)
 {
 	struct kaioinfo *ki;
 
 	ki = uma_zalloc(kaio_zone, M_WAITOK);
 	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
 	ki->kaio_flags = 0;
 	ki->kaio_maxactive_count = max_aio_per_proc;
 	ki->kaio_active_count = 0;
 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
 	ki->kaio_count = 0;
 	ki->kaio_ballowed_count = max_buf_aio;
 	ki->kaio_buffer_count = 0;
 	TAILQ_INIT(&ki->kaio_all);
 	TAILQ_INIT(&ki->kaio_done);
 	TAILQ_INIT(&ki->kaio_jobqueue);
 	TAILQ_INIT(&ki->kaio_bufqueue);
 	TAILQ_INIT(&ki->kaio_liojoblist);
 	TAILQ_INIT(&ki->kaio_sockqueue);
 	TAILQ_INIT(&ki->kaio_syncqueue);
 	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
 	PROC_LOCK(p);
 	if (p->p_aioinfo == NULL) {
 		p->p_aioinfo = ki;
 		PROC_UNLOCK(p);
 	} else {
 		PROC_UNLOCK(p);
 		mtx_destroy(&ki->kaio_mtx);
 		uma_zfree(kaio_zone, ki);
 	}
 
 	while (num_aio_procs < target_aio_procs)
 		aio_newproc(NULL);
 }
 
 static int
 aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
 {
 	int ret = 0;
 
 	PROC_LOCK(p);
 	if (!KSI_ONQ(ksi)) {
 		ksi->ksi_code = SI_ASYNCIO;
 		ksi->ksi_flags |= KSI_EXT | KSI_INS;
 		ret = psignal_event(p, sigev, ksi);
 	}
 	PROC_UNLOCK(p);
 	return (ret);
 }
 
 /*
  * Free a job entry.  Wait for completion if it is currently active, but don't
  * delay forever.  If we delay, we return a flag that says that we have to
  * restart the queue scan.
  */
 static int
 aio_free_entry(struct aiocblist *aiocbe)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct proc *p;
 
 	p = aiocbe->userproc;
 	MPASS(curproc == p);
 	ki = p->p_aioinfo;
 	MPASS(ki != NULL);
 
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
 
 	atomic_subtract_int(&num_queue_count, 1);
 
 	ki->kaio_count--;
 	MPASS(ki->kaio_count >= 0);
 
 	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
 	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
 
 	lj = aiocbe->lio;
 	if (lj) {
 		lj->lioj_count--;
 		lj->lioj_finished_count--;
 
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			/* lio is going away, we need to destroy any knotes */
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		}
 	}
 
 	/* aiocbe is going away, we need to destroy any knotes */
 	knlist_delete(&aiocbe->klist, curthread, 1);
 	PROC_LOCK(p);
 	sigqueue_take(&aiocbe->ksi);
 	PROC_UNLOCK(p);
 
 	MPASS(aiocbe->bp == NULL);
 	aiocbe->jobstate = JOBST_NULL;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * The thread argument here is used to find the owning process
 	 * and is also passed to fo_close() which may pass it to various
 	 * places such as devsw close() routines.  Because of that, we
 	 * need a thread pointer from the process owning the job that is
 	 * persistent and won't disappear out from under us or move to
 	 * another process.
 	 *
 	 * Currently, all the callers of this function call it to remove
 	 * an aiocblist from the current process' job list either via a
 	 * syscall or due to the current process calling exit() or
 	 * execve().  Thus, we know that p == curproc.  We also know that
 	 * curthread can't exit since we are curthread.
 	 *
 	 * Therefore, we use curthread as the thread to pass to
 	 * knlist_delete().  This does mean that it is possible for the
 	 * thread pointer at close time to differ from the thread pointer
 	 * at open time, but this is already true of file descriptors in
 	 * a multithreaded process.
 	 */
 	fdrop(aiocbe->fd_file, curthread);
 	crfree(aiocbe->cred);
 	uma_zfree(aiocb_zone, aiocbe);
 	AIO_LOCK(ki);
 
 	return (0);
 }
 
 static void
 aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
 {
    	aio_proc_rundown(arg, p);
 }
 
 /*
  * Rundown the jobs for a given process.
  */
 static void
 aio_proc_rundown(void *arg, struct proc *p)
 {
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct aiocblist *cbe, *cbn;
 	struct file *fp;
 	struct socket *so;
 	int remove;
 
 	KASSERT(curthread->td_proc == p,
 	    ("%s: called on non-curproc", __func__));
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return;
 
 	AIO_LOCK(ki);
 	ki->kaio_flags |= KAIO_RUNDOWN;
 
 restart:
 
 	/*
 	 * Try to cancel all pending requests. This code simulates
 	 * aio_cancel on all pending I/O requests.
 	 */
 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
 		remove = 0;
 		mtx_lock(&aio_job_mtx);
 		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
 			TAILQ_REMOVE(&aio_jobs, cbe, list);
 			remove = 1;
 		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
 			fp = cbe->fd_file;
 			MPASS(fp->f_type == DTYPE_SOCKET);
 			so = fp->f_data;
 			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
 			remove = 1;
 		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
 			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
 			remove = 1;
 		}
 		mtx_unlock(&aio_job_mtx);
 
 		if (remove) {
 			cbe->jobstate = JOBST_JOBFINISHED;
 			cbe->uaiocb._aiocb_private.status = -1;
 			cbe->uaiocb._aiocb_private.error = ECANCELED;
 			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
 			aio_bio_done_notify(p, cbe, DONE_QUEUE);
 		}
 	}
 
 	/* Wait for all running I/O to be finished */
 	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
 	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
 		ki->kaio_flags |= KAIO_WAKEUP;
 		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
 		goto restart;
 	}
 
 	/* Free all completed I/O requests. */
 	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
 		aio_free_entry(cbe);
 
 	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
 		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 			knlist_delete(&lj->klist, curthread, 1);
 			PROC_LOCK(p);
 			sigqueue_take(&lj->lioj_ksi);
 			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		} else {
 			panic("LIO job not cleaned up: C:%d, FC:%d\n",
 			    lj->lioj_count, lj->lioj_finished_count);
 		}
 	}
 	AIO_UNLOCK(ki);
 	taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
 	uma_zfree(kaio_zone, ki);
 	p->p_aioinfo = NULL;
 }
 
 /*
  * Select a job to run (called by an AIO daemon).
  */
 static struct aiocblist *
 aio_selectjob(struct aiothreadlist *aiop)
 {
 	struct aiocblist *aiocbe;
 	struct kaioinfo *ki;
 	struct proc *userp;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
 		userp = aiocbe->userproc;
 		ki = userp->p_aioinfo;
 
 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
 			/* Account for currently active jobs. */
 			ki->kaio_active_count++;
 			aiocbe->jobstate = JOBST_JOBRUNNING;
 			break;
 		}
 	}
 	return (aiocbe);
 }
 
 /*
  *  Move all data to a permanent storage device, this code
  *  simulates fsync syscall.
  */
 static int
 aio_fsync_vnode(struct thread *td, struct vnode *vp)
 {
 	struct mount *mp;
 	int vfslocked;
 	int error;
 
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_LOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_UNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 drop:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * The AIO processing activity.  This is the code that does the I/O request for
  * the non-physio version of the operations.  The normal vn operations are used,
  * and this code should work in all instances for every type of file, including
  * pipes, sockets, fifos, and regular files.
  *
  * XXX I don't think it works well for socket, pipe, and fifo.
  */
 static void
 aio_process(struct aiocblist *aiocbe)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct proc *mycp;
 	struct aiocb *cb;
 	struct file *fp;
 	struct socket *so;
 	struct uio auio;
 	struct iovec aiov;
 	int cnt;
 	int error;
 	int oublock_st, oublock_end;
 	int inblock_st, inblock_end;
 
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = aiocbe->cred;
 	mycp = td->td_proc;
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
 
 	if (cb->aio_lio_opcode == LIO_SYNC) {
 		error = 0;
 		cnt = 0;
 		if (fp->f_vnode != NULL)
 			error = aio_fsync_vnode(td, fp->f_vnode);
 		cb->_aiocb_private.error = error;
 		cb->_aiocb_private.status = 0;
 		td->td_ucred = td_savedcred;
 		return;
 	}
 
 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
 	aiov.iov_len = cb->aio_nbytes;
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = cb->aio_offset;
 	auio.uio_resid = cb->aio_nbytes;
 	cnt = cb->aio_nbytes;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 
 	inblock_st = mycp->p_stats->p_ru.ru_inblock;
 	oublock_st = mycp->p_stats->p_ru.ru_oublock;
 	/*
 	 * aio_aqueue() acquires a reference to the file that is
 	 * released in aio_free_entry().
 	 */
 	if (cb->aio_lio_opcode == LIO_READ) {
 		auio.uio_rw = UIO_READ;
 		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	} else {
 		if (fp->f_type == DTYPE_VNODE)
 			bwillwrite();
 		auio.uio_rw = UIO_WRITE;
 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	}
 	inblock_end = mycp->p_stats->p_ru.ru_inblock;
 	oublock_end = mycp->p_stats->p_ru.ru_oublock;
 
 	aiocbe->inputcharge = inblock_end - inblock_st;
 	aiocbe->outputcharge = oublock_end - oublock_st;
 
 	if ((error) && (auio.uio_resid != cnt)) {
 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
 			error = 0;
 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
 			int sigpipe = 1;
 			if (fp->f_type == DTYPE_SOCKET) {
 				so = fp->f_data;
 				if (so->so_options & SO_NOSIGPIPE)
 					sigpipe = 0;
 			}
 			if (sigpipe) {
 				PROC_LOCK(aiocbe->userproc);
 				psignal(aiocbe->userproc, SIGPIPE);
 				PROC_UNLOCK(aiocbe->userproc);
 			}
 		}
 	}
 
 	cnt -= auio.uio_resid;
 	cb->_aiocb_private.error = error;
 	cb->_aiocb_private.status = cnt;
 	td->td_ucred = td_savedcred;
 }
 
 static void
 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
 {
 	struct aioliojob *lj;
 	struct kaioinfo *ki;
 	struct aiocblist *scb, *scbn;
 	int lj_done;
 
 	ki = userp->p_aioinfo;
 	AIO_LOCK_ASSERT(ki, MA_OWNED);
 	lj = aiocbe->lio;
 	lj_done = 0;
 	if (lj) {
 		lj->lioj_finished_count++;
 		if (lj->lioj_count == lj->lioj_finished_count)
 			lj_done = 1;
 	}
 	if (type == DONE_QUEUE) {
 		aiocbe->jobflags |= AIOCBLIST_DONE;
 	} else {
 		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
 	}
 	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
 	aiocbe->jobstate = JOBST_JOBFINISHED;
 
 	if (ki->kaio_flags & KAIO_RUNDOWN)
 		goto notification_done;
 
 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
 		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
 
 	KNOTE_LOCKED(&aiocbe->klist, 1);
 
 	if (lj_done) {
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 			KNOTE_LOCKED(&lj->klist, 1);
 		}
 		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 		    == LIOJ_SIGNAL
 		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
 			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 		}
 	}
 
 notification_done:
 	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
 		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
 			if (aiocbe->fd_file == scb->fd_file &&
 			    aiocbe->seqno < scb->seqno) {
 				if (--scb->pending == 0) {
 					mtx_lock(&aio_job_mtx);
 					scb->jobstate = JOBST_JOBQGLOBAL;
 					TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
 					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
 					aio_kick_nowait(userp);
 					mtx_unlock(&aio_job_mtx);
 				}
 			}
 		}
 	}
 	if (ki->kaio_flags & KAIO_WAKEUP) {
 		ki->kaio_flags &= ~KAIO_WAKEUP;
 		wakeup(&userp->p_aioinfo);
 	}
 }
 
 /*
  * The AIO daemon, most of the actual work is done in aio_process,
  * but the setup (and address space mgmt) is done in this routine.
  */
 static void
 aio_daemon(void *_id)
 {
 	struct aiocblist *aiocbe;
 	struct aiothreadlist *aiop;
 	struct kaioinfo *ki;
 	struct proc *curcp, *mycp, *userp;
 	struct vmspace *myvm, *tmpvm;
 	struct thread *td = curthread;
 	int id = (intptr_t)_id;
 
 	/*
 	 * Local copies of curproc (cp) and vmspace (myvm)
 	 */
 	mycp = td->td_proc;
 	myvm = mycp->p_vmspace;
 
 	KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
 
 	/*
 	 * Allocate and ready the aio control info.  There is one aiop structure
 	 * per daemon.
 	 */
 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
 	aiop->aiothread = td;
 	aiop->aiothreadflags = 0;
 
 	/* The daemon resides in its own pgrp. */
 	setsid(td, NULL);
 
 	/*
 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
 	 * and creating too many daemons.)
 	 */
 	sema_post(&aio_newproc_sem);
 
 	mtx_lock(&aio_job_mtx);
 	for (;;) {
 		/*
 		 * curcp is the current daemon process context.
 		 * userp is the current user process context.
 		 */
 		curcp = mycp;
 
 		/*
 		 * Take daemon off of free queue
 		 */
 		if (aiop->aiothreadflags & AIOP_FREE) {
 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
 			aiop->aiothreadflags &= ~AIOP_FREE;
 		}
 
 		/*
 		 * Check for jobs.
 		 */
 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
 			mtx_unlock(&aio_job_mtx);
 			userp = aiocbe->userproc;
 
 			/*
 			 * Connect to process address space for user program.
 			 */
 			if (userp != curcp) {
 				/*
 				 * Save the current address space that we are
 				 * connected to.
 				 */
 				tmpvm = mycp->p_vmspace;
 
 				/*
 				 * Point to the new user address space, and
 				 * refer to it.
 				 */
 				mycp->p_vmspace = userp->p_vmspace;
 				atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
 
 				/* Activate the new mapping. */
 				pmap_activate(FIRST_THREAD_IN_PROC(mycp));
 
 				/*
 				 * If the old address space wasn't the daemons
 				 * own address space, then we need to remove the
 				 * daemon's reference from the other process
 				 * that it was acting on behalf of.
 				 */
 				if (tmpvm != myvm) {
 					vmspace_free(tmpvm);
 				}
 				curcp = userp;
 			}
 
 			ki = userp->p_aioinfo;
 
 			/* Do the I/O function. */
 			aio_process(aiocbe);
 
 			mtx_lock(&aio_job_mtx);
 			/* Decrement the active job count. */
 			ki->kaio_active_count--;
 			mtx_unlock(&aio_job_mtx);
 
 			AIO_LOCK(ki);
 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
 			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
 			AIO_UNLOCK(ki);
 
 			mtx_lock(&aio_job_mtx);
 		}
 
 		/*
 		 * Disconnect from user address space.
 		 */
 		if (curcp != mycp) {
 
 			mtx_unlock(&aio_job_mtx);
 
 			/* Get the user address space to disconnect from. */
 			tmpvm = mycp->p_vmspace;
 
 			/* Get original address space for daemon. */
 			mycp->p_vmspace = myvm;
 
 			/* Activate the daemon's address space. */
 			pmap_activate(FIRST_THREAD_IN_PROC(mycp));
 #ifdef DIAGNOSTIC
 			if (tmpvm == myvm) {
 				printf("AIOD: vmspace problem -- %d\n",
 				    mycp->p_pid);
 			}
 #endif
 			/* Remove our vmspace reference. */
 			vmspace_free(tmpvm);
 
 			curcp = mycp;
 
 			mtx_lock(&aio_job_mtx);
 			/*
 			 * We have to restart to avoid race, we only sleep if
 			 * no job can be selected, that should be
 			 * curcp == mycp.
 			 */
 			continue;
 		}
 
 		mtx_assert(&aio_job_mtx, MA_OWNED);
 
 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
 		aiop->aiothreadflags |= AIOP_FREE;
 
 		/*
 		 * If daemon is inactive for a long time, allow it to exit,
 		 * thereby freeing resources.
 		 */
 		if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
 		    aiod_lifetime)) {
 			if (TAILQ_EMPTY(&aio_jobs)) {
 				if ((aiop->aiothreadflags & AIOP_FREE) &&
 				    (num_aio_procs > target_aio_procs)) {
 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
 					num_aio_procs--;
 					mtx_unlock(&aio_job_mtx);
 					uma_zfree(aiop_zone, aiop);
 					free_unr(aiod_unr, id);
 #ifdef DIAGNOSTIC
 					if (mycp->p_vmspace->vm_refcnt <= 1) {
 						printf("AIOD: bad vm refcnt for"
 						    " exiting daemon: %d\n",
 						    mycp->p_vmspace->vm_refcnt);
 					}
 #endif
 					kthread_exit(0);
 				}
 			}
 		}
 	}
 	mtx_unlock(&aio_job_mtx);
 	panic("shouldn't be here\n");
 }
 
 /*
  * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
  * AIO daemon modifies its environment itself.
  */
 static int
 aio_newproc(int *start)
 {
 	int error;
 	struct proc *p;
 	int id;
 
 	id = alloc_unr(aiod_unr);
 	error = kthread_create(aio_daemon, (void *)(intptr_t)id, &p,
 		RFNOWAIT, 0, "aiod%d", id);
 	if (error == 0) {
 		/*
 		 * Wait until daemon is started.
 		 */
 		sema_wait(&aio_newproc_sem);
 		mtx_lock(&aio_job_mtx);
 		num_aio_procs++;
 		if (start != NULL)
 			(*start)--;
 		mtx_unlock(&aio_job_mtx);
 	} else {
 		free_unr(aiod_unr, id);
 	}
 	return (error);
 }
 
 /*
  * Try the high-performance, low-overhead physio method for eligible
  * VCHR devices.  This method doesn't use an aio helper thread, and
  * thus has very low overhead.
  *
  * Assumes that the caller, aio_aqueue(), has incremented the file
  * structure's reference count, preventing its deallocation for the
  * duration of this call.
  */
 static int
 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
 {
 	struct aiocb *cb;
 	struct file *fp;
 	struct buf *bp;
 	struct vnode *vp;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	int error;
 
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
 
 	if (fp->f_type != DTYPE_VNODE)
 		return (-1);
 
 	vp = fp->f_vnode;
 
 	/*
 	 * If its not a disk, we don't want to return a positive error.
 	 * It causes the aio code to not fall through to try the thread
 	 * way when you're talking to a regular file.
 	 */
 	if (!vn_isdisk(vp, &error)) {
 		if (error == ENOTBLK)
 			return (-1);
 		else
 			return (error);
 	}
 
 	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
 
  	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
 	if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
 		return (-1);
 
 	if (cb->aio_nbytes >
 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
 		return (-1);
 
 	ki = p->p_aioinfo;
 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
 		return (-1);
 
 	/* Create and build a buffer header for a transfer. */
 	bp = (struct buf *)getpbuf(NULL);
 	BUF_KERNPROC(bp);
 
 	AIO_LOCK(ki);
 	ki->kaio_count++;
 	ki->kaio_buffer_count++;
 	lj = aiocbe->lio;
 	if (lj)
 		lj->lioj_count++;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Get a copy of the kva from the physical buffer.
 	 */
 	error = 0;
 
 	bp->b_bcount = cb->aio_nbytes;
 	bp->b_bufsize = cb->aio_nbytes;
 	bp->b_iodone = aio_physwakeup;
 	bp->b_saveaddr = bp->b_data;
 	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
 	bp->b_offset = cb->aio_offset;
 	bp->b_iooffset = cb->aio_offset;
 	bp->b_blkno = btodb(cb->aio_offset);
 	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
 
 	/*
 	 * Bring buffer into kernel space.
 	 */
 	if (vmapbuf(bp) < 0) {
 		error = EFAULT;
 		goto doerror;
 	}
 
 	AIO_LOCK(ki);
 	aiocbe->bp = bp;
 	bp->b_caller1 = (void *)aiocbe;
 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 	aiocbe->jobstate = JOBST_JOBQBUF;
 	cb->_aiocb_private.status = cb->aio_nbytes;
 	AIO_UNLOCK(ki);
 
 	atomic_add_int(&num_queue_count, 1);
 	atomic_add_int(&num_buf_aio, 1);
 
 	bp->b_error = 0;
 
 	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
 
 	/* Perform transfer. */
 	dev_strategy(vp->v_rdev, bp);
 	return (0);
 
 doerror:
 	AIO_LOCK(ki);
 	ki->kaio_count--;
 	ki->kaio_buffer_count--;
 	if (lj)
 		lj->lioj_count--;
 	aiocbe->bp = NULL;
 	AIO_UNLOCK(ki);
 	relpbuf(bp, NULL);
 	return (error);
 }
 
 /*
  * Wake up aio requests that may be serviceable now.
  */
 static void
 aio_swake_cb(struct socket *so, struct sockbuf *sb)
 {
 	struct aiocblist *cb, *cbn;
 	int opcode;
 
 	if (sb == &so->so_snd)
 		opcode = LIO_WRITE;
 	else
 		opcode = LIO_READ;
 
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_AIO;
 	mtx_lock(&aio_job_mtx);
 	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
 		if (opcode == cb->uaiocb.aio_lio_opcode) {
 			if (cb->jobstate != JOBST_JOBQSOCK)
 				panic("invalid queue value");
 			/* XXX
 			 * We don't have actual sockets backend yet,
 			 * so we simply move the requests to the generic
 			 * file I/O backend.
 			 */
 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
 			aio_kick_nowait(cb->userproc);
 		}
 	}
 	mtx_unlock(&aio_job_mtx);
 	SOCKBUF_UNLOCK(sb);
 }
 
 /*
  * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
  * technique is done in this code.
  */
 int
 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
 	int type, int oldsigev)
 {
 	struct proc *p = td->td_proc;
 	struct file *fp;
 	struct socket *so;
 	struct aiocblist *aiocbe, *cb;
 	struct kaioinfo *ki;
 	struct kevent kev;
 	struct sockbuf *sb;
 	int opcode;
 	int error;
 	int fd, kqfd;
 	int jid;
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	suword(&job->_aiocb_private.status, -1);
 	suword(&job->_aiocb_private.error, 0);
 	suword(&job->_aiocb_private.kernelinfo, -1);
 
 	if (num_queue_count >= max_queue_count ||
 	    ki->kaio_count >= ki->kaio_qallowed_count) {
 		suword(&job->_aiocb_private.error, EAGAIN);
 		return (EAGAIN);
 	}
 
 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
 	aiocbe->inputcharge = 0;
 	aiocbe->outputcharge = 0;
 	knlist_init(&aiocbe->klist, AIO_MTX(ki), NULL, NULL, NULL);
 
 	if (oldsigev) {
 		bzero(&aiocbe->uaiocb, sizeof(struct aiocb));
 		error = copyin(job, &aiocbe->uaiocb, sizeof(struct oaiocb));
 		bcopy(&aiocbe->uaiocb.__spare__, &aiocbe->uaiocb.aio_sigevent,
 			sizeof(struct osigevent));
 	} else {
 		error = copyin(job, &aiocbe->uaiocb, sizeof(struct aiocb));
 	}
 	if (error) {
 		suword(&job->_aiocb_private.error, error);
 		uma_zfree(aiocb_zone, aiocbe);
 		return (error);
 	}
 
 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
 		suword(&job->_aiocb_private.error, EINVAL);
 		uma_zfree(aiocb_zone, aiocbe);
 		return (EINVAL);
 	}
 	
 	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
 	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
 		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
 		uma_zfree(aiocb_zone, aiocbe);
 		return (EINVAL);
 	}
 
 	ksiginfo_init(&aiocbe->ksi);
 
 	/* Save userspace address of the job info. */
 	aiocbe->uuaiocb = job;
 
 	/* Get the opcode. */
 	if (type != LIO_NOP)
 		aiocbe->uaiocb.aio_lio_opcode = type;
 	opcode = aiocbe->uaiocb.aio_lio_opcode;
 
 	/* Fetch the file object for the specified file descriptor. */
 	fd = aiocbe->uaiocb.aio_fildes;
 	switch (opcode) {
 	case LIO_WRITE:
 		error = fget_write(td, fd, &fp);
 		break;
 	case LIO_READ:
 		error = fget_read(td, fd, &fp);
 		break;
 	default:
 		error = fget(td, fd, &fp);
 	}
 	if (error) {
 		uma_zfree(aiocb_zone, aiocbe);
 		suword(&job->_aiocb_private.error, error);
 		return (error);
 	}
 
 	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	aiocbe->fd_file = fp;
 
 	mtx_lock(&aio_job_mtx);
 	jid = jobrefid++;
 	aiocbe->seqno = jobseqno++;
 	mtx_unlock(&aio_job_mtx);
 	error = suword(&job->_aiocb_private.kernelinfo, jid);
 	if (error) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
 
 	if (opcode == LIO_NOP) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, aiocbe);
 		return (0);
 	}
 	if ((opcode != LIO_READ) && (opcode != LIO_WRITE) &&
 	    (opcode != LIO_SYNC)) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
 		goto no_kqueue;
 	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
 	kev.filter = EVFILT_AIO;
 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 	kev.data = (intptr_t)aiocbe;
 	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
 	error = kqfd_register(kqfd, &kev, td, 1);
 aqueue_fail:
 	if (error) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, aiocbe);
 		suword(&job->_aiocb_private.error, error);
 		goto done;
 	}
 no_kqueue:
 
 	suword(&job->_aiocb_private.error, EINPROGRESS);
 	aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
 	aiocbe->userproc = p;
 	aiocbe->cred = crhold(td->td_ucred);
 	aiocbe->jobflags = 0;
 	aiocbe->lio = lj;
 
 	if (opcode == LIO_SYNC)
 		goto queueit;
 
 	if (fp->f_type == DTYPE_SOCKET) {
 		/*
 		 * Alternate queueing for socket ops: Reach down into the
 		 * descriptor to get the socket data.  Then check to see if the
 		 * socket is ready to be read or written (based on the requested
 		 * operation).
 		 *
 		 * If it is not ready for io, then queue the aiocbe on the
 		 * socket, and set the flags so we get a call when sbnotify()
 		 * happens.
 		 *
 		 * Note if opcode is neither LIO_WRITE nor LIO_READ we lock
 		 * and unlock the snd sockbuf for no reason.
 		 */
 		so = fp->f_data;
 		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
 		SOCKBUF_LOCK(sb);
 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
 		    LIO_WRITE) && (!sowriteable(so)))) {
 			sb->sb_flags |= SB_AIO;
 
 			mtx_lock(&aio_job_mtx);
 			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
 			mtx_unlock(&aio_job_mtx);
 
 			AIO_LOCK(ki);
 			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
 			aiocbe->jobstate = JOBST_JOBQSOCK;
 			ki->kaio_count++;
 			if (lj)
 				lj->lioj_count++;
 			AIO_UNLOCK(ki);
 			SOCKBUF_UNLOCK(sb);
 			atomic_add_int(&num_queue_count, 1);
 			error = 0;
 			goto done;
 		}
 		SOCKBUF_UNLOCK(sb);
 	}
 
 	if ((error = aio_qphysio(p, aiocbe)) == 0)
 		goto done;
 #if 0
 	if (error > 0) {
 		aiocbe->uaiocb._aiocb_private.error = error;
 		suword(&job->_aiocb_private.error, error);
 		goto done;
 	}
 #endif
 queueit:
 	/* No buffer for daemon I/O. */
 	aiocbe->bp = NULL;
 	atomic_add_int(&num_queue_count, 1);
 
 	AIO_LOCK(ki);
 	ki->kaio_count++;
 	if (lj)
 		lj->lioj_count++;
 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
 	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 	if (opcode == LIO_SYNC) {
 		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
 			if (cb->fd_file == aiocbe->fd_file &&
 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
 			    cb->seqno < aiocbe->seqno) {
 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
 				aiocbe->pending++;
 			}
 		}
 		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
 			if (cb->fd_file == aiocbe->fd_file &&
 			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
 			    cb->seqno < aiocbe->seqno) {
 				cb->jobflags |= AIOCBLIST_CHECKSYNC;
 				aiocbe->pending++;
 			}
 		}
 		if (aiocbe->pending != 0) {
 			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
 			aiocbe->jobstate = JOBST_JOBQSYNC;
 			AIO_UNLOCK(ki);
 			goto done;
 		}
 	}
 	mtx_lock(&aio_job_mtx);
 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
 	aio_kick_nowait(p);
 	mtx_unlock(&aio_job_mtx);
 	AIO_UNLOCK(ki);
 	error = 0;
 done:
 	return (error);
 }
 
 static void
 aio_kick_nowait(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aiothreadlist *aiop;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
 		aiop->aiothreadflags &= ~AIOP_FREE;
 		wakeup(aiop->aiothread);
 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
 	    ((ki->kaio_active_count + num_aio_resv_start) <
 	    ki->kaio_maxactive_count)) {
 		taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
 	}
 }
 
 static int
 aio_kick(struct proc *userp)
 {
 	struct kaioinfo *ki = userp->p_aioinfo;
 	struct aiothreadlist *aiop;
 	int error, ret = 0;
 
 	mtx_assert(&aio_job_mtx, MA_OWNED);
 retryproc:
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
 		aiop->aiothreadflags &= ~AIOP_FREE;
 		wakeup(aiop->aiothread);
 	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
 	    ((ki->kaio_active_count + num_aio_resv_start) <
 	    ki->kaio_maxactive_count)) {
 		num_aio_resv_start++;
 		mtx_unlock(&aio_job_mtx);
 		error = aio_newproc(&num_aio_resv_start);
 		mtx_lock(&aio_job_mtx);
 		if (error) {
 			num_aio_resv_start--;
 			goto retryproc;
 		}
 	} else {
 		ret = -1;
 	}
 	return (ret);
 }
 
 static void
 aio_kick_helper(void *context, int pending)
 {
 	struct proc *userp = context;
 
 	mtx_lock(&aio_job_mtx);
 	while (--pending >= 0) {
 		if (aio_kick(userp))
 			break;
 	}
 	mtx_unlock(&aio_job_mtx);
 }
 
 /*
  * Support the aio_return system call, as a side-effect, kernel resources are
  * released.
  */
 int
 aio_return(struct thread *td, struct aio_return_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct aiocblist *cb;
 	struct aiocb *uaiocb;
 	struct kaioinfo *ki;
 	int status, error;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EINVAL);
 	uaiocb = uap->aiocbp;
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
 		if (cb->uuaiocb == uaiocb)
 			break;
 	}
 	if (cb != NULL) {
 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
 		status = cb->uaiocb._aiocb_private.status;
 		error = cb->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
 			p->p_stats->p_ru.ru_oublock +=
 			    cb->outputcharge;
 			cb->outputcharge = 0;
 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
 			p->p_stats->p_ru.ru_inblock += cb->inputcharge;
 			cb->inputcharge = 0;
 		}
 		aio_free_entry(cb);
 		AIO_UNLOCK(ki);
 		suword(&uaiocb->_aiocb_private.error, error);
 		suword(&uaiocb->_aiocb_private.status, status);
 	} else {
 		error = EINVAL;
 		AIO_UNLOCK(ki);
 	}
 	return (error);
 }
 
 /*
  * Allow a process to wakeup when any of the I/O requests are completed.
  */
 int
 aio_suspend(struct thread *td, struct aio_suspend_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct timespec ts;
 	struct aiocb *const *cbptr, *cbp;
 	struct kaioinfo *ki;
 	struct aiocblist *cb, *cbfirst;
 	struct aiocb **ujoblist;
 	int njoblist;
 	int error;
 	int timo;
 	int i;
 
 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	timo = 0;
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
 			return (error);
 
 		if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000)
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EAGAIN);
 
 	njoblist = 0;
 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
 	cbptr = uap->aiocbp;
 
 	for (i = 0; i < uap->nent; i++) {
 		cbp = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
 		if (cbp == 0)
 			continue;
 		ujoblist[njoblist] = cbp;
 		njoblist++;
 	}
 
 	if (njoblist == 0) {
 		uma_zfree(aiol_zone, ujoblist);
 		return (0);
 	}
 
 	AIO_LOCK(ki);
 	for (;;) {
 		cbfirst = NULL;
 		error = 0;
 		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
 			for (i = 0; i < njoblist; i++) {
 				if (cb->uuaiocb == ujoblist[i]) {
 					if (cbfirst == NULL)
 						cbfirst = cb;
 					if (cb->jobstate == JOBST_JOBFINISHED)
 						goto RETURN;
 				}
 			}
 		}
 		/* All tasks were finished. */
 		if (cbfirst == NULL)
 			break;
 
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiospn", timo);
 		if (error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 RETURN:
 	AIO_UNLOCK(ki);
 	uma_zfree(aiol_zone, ujoblist);
 	return (error);
 }
 
 /*
  * aio_cancel cancels any non-physio aio operations not currently in
  * progress.
  */
 int
 aio_cancel(struct thread *td, struct aio_cancel_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct kaioinfo *ki;
 	struct aiocblist *cbe, *cbn;
 	struct file *fp;
 	struct socket *so;
 	int error;
 	int remove;
 	int cancelled = 0;
 	int notcancelled = 0;
 	struct vnode *vp;
 
 	/* Lookup file object. */
 	error = fget(td, uap->fd, &fp);
 	if (error)
 		return (error);
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		goto done;
 
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if (vn_isdisk(vp, &error)) {
 			fdrop(fp, td);
 			td->td_retval[0] = AIO_NOTCANCELED;
 			return (0);
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
 		    ((uap->aiocbp == NULL) ||
 		     (uap->aiocbp == cbe->uuaiocb))) {
 			remove = 0;
 
 			mtx_lock(&aio_job_mtx);
 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
 				TAILQ_REMOVE(&aio_jobs, cbe, list);
 				remove = 1;
 			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
 				MPASS(fp->f_type == DTYPE_SOCKET);
 				so = fp->f_data;
 				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
 				remove = 1;
 			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
 				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
 				remove = 1;
 			}
 			mtx_unlock(&aio_job_mtx);
 
 			if (remove) {
 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
 				cbe->uaiocb._aiocb_private.status = -1;
 				cbe->uaiocb._aiocb_private.error = ECANCELED;
 				aio_bio_done_notify(p, cbe, DONE_QUEUE);
 				cancelled++;
 			} else {
 				notcancelled++;
 			}
 			if (uap->aiocbp != NULL)
 				break;
 		}
 	}
 	AIO_UNLOCK(ki);
 
 done:
 	fdrop(fp, td);
 
 	if (uap->aiocbp != NULL) {
 		if (cancelled) {
 			td->td_retval[0] = AIO_CANCELED;
 			return (0);
 		}
 	}
 
 	if (notcancelled) {
 		td->td_retval[0] = AIO_NOTCANCELED;
 		return (0);
 	}
 
 	if (cancelled) {
 		td->td_retval[0] = AIO_CANCELED;
 		return (0);
 	}
 
 	td->td_retval[0] = AIO_ALLDONE;
 
 	return (0);
 }
 
 /*
- * aio_error is implemented in the kernel level for compatibility purposes only.
- * For a user mode async implementation, it would be best to do it in a userland
- * subroutine.
+ * aio_error is implemented in the kernel level for compatibility purposes
+ * only.  For a user mode async implementation, it would be best to do it in
+ * a userland subroutine.
  */
 int
 aio_error(struct thread *td, struct aio_error_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct aiocblist *cb;
 	struct kaioinfo *ki;
 	int status;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL) {
 		td->td_retval[0] = EINVAL;
 		return (0);
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
 		if (cb->uuaiocb == uap->aiocbp) {
 			if (cb->jobstate == JOBST_JOBFINISHED)
 				td->td_retval[0] =
 					cb->uaiocb._aiocb_private.error;
 			else
 				td->td_retval[0] = EINPROGRESS;
 			AIO_UNLOCK(ki);
 			return (0);
 		}
 	}
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Hack for failure of aio_aqueue.
 	 */
 	status = fuword(&uap->aiocbp->_aiocb_private.status);
 	if (status == -1) {
 		td->td_retval[0] = fuword(&uap->aiocbp->_aiocb_private.error);
 		return (0);
 	}
 
 	td->td_retval[0] = EINVAL;
 	return (0);
 }
 
 /* syscall - asynchronous read from a file (REALTIME) */
 int
 oaio_read(struct thread *td, struct oaio_read_args *uap)
 {
 
 	return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 1);
 }
 
 int
 aio_read(struct thread *td, struct aio_read_args *uap)
 {
 
 	return aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, 0);
 }
 
 /* syscall - asynchronous write to a file (REALTIME) */
 int
 oaio_write(struct thread *td, struct oaio_write_args *uap)
 {
 
 	return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 1);
 }
 
 int
 aio_write(struct thread *td, struct aio_write_args *uap)
 {
 
 	return aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, 0);
 }
 
 /* syscall - list directed I/O (REALTIME) */
 int
 olio_listio(struct thread *td, struct olio_listio_args *uap)
 {
 	return do_lio_listio(td, (struct lio_listio_args *)uap, 1);
 }
 
 /* syscall - list directed I/O (REALTIME) */
 int
 lio_listio(struct thread *td, struct lio_listio_args *uap)
 {
 	return do_lio_listio(td, uap, 0);
 }
 
 static int
 do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev)
 {
 	struct proc *p = td->td_proc;
 	struct aiocb *iocb, * const *cbptr;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
 	struct kevent kev;
 	int nent;
 	int error;
 	int nerror;
 	int i;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
 
 	nent = uap->nent;
 	if (nent < 0 || nent > AIO_LISTIO_MAX)
 		return (EINVAL);
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
 	ki = p->p_aioinfo;
 
 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
 	lj->lioj_flags = 0;
 	lj->lioj_count = 0;
 	lj->lioj_finished_count = 0;
 	knlist_init(&lj->klist, AIO_MTX(ki), NULL, NULL, NULL);
 	ksiginfo_init(&lj->lioj_ksi);
 
 	/*
 	 * Setup signal.
 	 */
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
 		bzero(&lj->lioj_signal, sizeof(&lj->lioj_signal));
 		error = copyin(uap->sig, &lj->lioj_signal,
 				oldsigev ? sizeof(struct osigevent) :
 					   sizeof(struct sigevent));
 		if (error) {
 			uma_zfree(aiolio_zone, lj);
 			return (error);
 		}
 
 		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 			/* Assume only new style KEVENT */
 			kev.filter = EVFILT_LIO;
 			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 			kev.ident = (uintptr_t)uap->acb_list; /* something unique */
 			kev.data = (intptr_t)lj;
 			/* pass user defined sigval data */
 			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
 			error = kqfd_register(
 			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
 			if (error) {
 				uma_zfree(aiolio_zone, lj);
 				return (error);
 			}
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
 			;
 		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
 				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
 					uma_zfree(aiolio_zone, lj);
 					return EINVAL;
 				}
 				lj->lioj_flags |= LIOJ_SIGNAL;
 		} else {
 			uma_zfree(aiolio_zone, lj);
 			return EINVAL;
 		}
 	}
 
 	AIO_LOCK(ki);
 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 	/*
 	 * Add extra aiocb count to avoid the lio to be freed
 	 * by other threads doing aio_waitcomplete or aio_return,
 	 * and prevent event from being sent until we have queued
 	 * all tasks.
 	 */
 	lj->lioj_count = 1;
 	AIO_UNLOCK(ki);
 
 	/*
 	 * Get pointers to the list of I/O requests.
 	 */
 	nerror = 0;
 	cbptr = uap->acb_list;
 	for (i = 0; i < uap->nent; i++) {
 		iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
 			error = aio_aqueue(td, iocb, lj, LIO_NOP, oldsigev);
 			if (error != 0)
 				nerror++;
 		}
 	}
 
 	error = 0;
 	AIO_LOCK(ki);
 	if (uap->mode == LIO_WAIT) {
 		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
 			ki->kaio_flags |= KAIO_WAKEUP;
 			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
 			    PRIBIO | PCATCH, "aiospn", 0);
 			if (error == ERESTART)
 				error = EINTR;
 			if (error)
 				break;
 		}
 	} else {
 		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
 			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
 				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
 				KNOTE_LOCKED(&lj->klist, 1);
 			}
 			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
 			    == LIOJ_SIGNAL
 			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
 			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
 				aio_sendsig(p, &lj->lioj_signal,
 					    &lj->lioj_ksi);
 				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
 			}
 		}
 	}
 	lj->lioj_count--;
 	if (lj->lioj_count == 0) {
 		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
 		knlist_delete(&lj->klist, curthread, 1);
 		PROC_LOCK(p);
 		sigqueue_take(&lj->lioj_ksi);
 		PROC_UNLOCK(p);
 		AIO_UNLOCK(ki);
 		uma_zfree(aiolio_zone, lj);
 	} else
 		AIO_UNLOCK(ki);
 
 	if (nerror)
 		return (EIO);
 	return (error);
 }
 
 /*
  * Called from interrupt thread for physio, we should return as fast
  * as possible, so we schedule a biohelper task.
  */
 static void
 aio_physwakeup(struct buf *bp)
 {
 	struct aiocblist *aiocbe;
 
 	aiocbe = (struct aiocblist *)bp->b_caller1;
 	taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
 }
 
 /*
  * Task routine to perform heavy tasks, process wakeup, and signals.
  */
 static void
 biohelper(void *context, int pending)
 {
 	struct aiocblist *aiocbe = context;
 	struct buf *bp;
 	struct proc *userp;
 	struct kaioinfo *ki;
 	int nblks;
 
 	bp = aiocbe->bp;
 	userp = aiocbe->userproc;
 	ki = userp->p_aioinfo;
 	AIO_LOCK(ki);
 	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
 	aiocbe->uaiocb._aiocb_private.error = 0;
 	if (bp->b_ioflags & BIO_ERROR)
 		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
 		aiocbe->outputcharge += nblks;
 	else
 		aiocbe->inputcharge += nblks;
 	aiocbe->bp = NULL;
 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
 	ki->kaio_buffer_count--;
 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
 	AIO_UNLOCK(ki);
 
 	/* Release mapping into kernel space. */
 	vunmapbuf(bp);
 	relpbuf(bp, NULL);
 	atomic_subtract_int(&num_buf_aio, 1);
 }
 
 /* syscall - wait for the next completion of an aio request */
 int
 aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct timeval atv;
 	struct timespec ts;
 	struct kaioinfo *ki;
 	struct aiocblist *cb;
 	struct aiocb *uuaiocb;
 	int error, status, timo;
 
 	suword(uap->aiocbp, (long)NULL);
 
 	timo = 0;
 	if (uap->timeout) {
 		/* Get timespec struct. */
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 
 		if ((ts.tv_nsec < 0) || (ts.tv_nsec >= 1000000000))
 			return (EINVAL);
 
 		TIMESPEC_TO_TIMEVAL(&atv, &ts);
 		if (itimerfix(&atv))
 			return (EINVAL);
 		timo = tvtohz(&atv);
 	}
 
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 	ki = p->p_aioinfo;
 
 	error = 0;
 	cb = NULL;
 	AIO_LOCK(ki);
 	while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
 		ki->kaio_flags |= KAIO_WAKEUP;
 		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
 		    "aiowc", timo);
 		if (timo && error == ERESTART)
 			error = EINTR;
 		if (error)
 			break;
 	}
 
 	if (cb != NULL) {
 		MPASS(cb->jobstate == JOBST_JOBFINISHED);
 		uuaiocb = cb->uuaiocb;
 		status = cb->uaiocb._aiocb_private.status;
 		error = cb->uaiocb._aiocb_private.error;
 		td->td_retval[0] = status;
 		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
 			p->p_stats->p_ru.ru_oublock += cb->outputcharge;
 			cb->outputcharge = 0;
 		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
 			p->p_stats->p_ru.ru_inblock += cb->inputcharge;
 			cb->inputcharge = 0;
 		}
 		aio_free_entry(cb);
 		AIO_UNLOCK(ki);
 		suword(uap->aiocbp, (long)uuaiocb);
 		suword(&uuaiocb->_aiocb_private.error, error);
 		suword(&uuaiocb->_aiocb_private.status, status);
 	} else
 		AIO_UNLOCK(ki);
 
 	return (error);
 }
 
 int
 aio_fsync(struct thread *td, struct aio_fsync_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct kaioinfo *ki;
 
 	if (uap->op != O_SYNC) /* XXX lack of O_DSYNC */
 		return (EINVAL);
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		aio_init_aioinfo(p);
 	return aio_aqueue(td, uap->aiocbp, NULL, LIO_SYNC, 0);
 }
 
 /* kqueue attach function */
 static int
 filt_aioattach(struct knote *kn)
 {
 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
 
 	/*
 	 * The aiocbe pointer must be validated before using it, so
 	 * registration is restricted to the kernel; the user cannot
 	 * set EV_FLAG1.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&aiocbe->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_aiodetach(struct knote *kn)
 {
 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
 
 	if (!knlist_empty(&aiocbe->klist))
 		knlist_remove(&aiocbe->klist, kn, 0);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_aio(struct knote *kn, long hint)
 {
 	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
 
 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
 	if (aiocbe->jobstate != JOBST_JOBFINISHED)
 		return (0);
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
 
 /* kqueue attach function */
 static int
 filt_lioattach(struct knote *kn)
 {
 	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
 
 	/*
 	 * The aioliojob pointer must be validated before using it, so
 	 * registration is restricted to the kernel; the user cannot
 	 * set EV_FLAG1.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&lj->klist, kn, 0);
 
 	return (0);
 }
 
 /* kqueue detach function */
 static void
 filt_liodetach(struct knote *kn)
 {
 	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
 
 	if (!knlist_empty(&lj->klist))
 		knlist_remove(&lj->klist, kn, 0);
 }
 
 /* kqueue filter function */
 /*ARGSUSED*/
 static int
 filt_lio(struct knote *kn, long hint)
 {
 	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
 
 	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
 }
Index: head/sys/kern/vfs_cache.c
===================================================================
--- head/sys/kern/vfs_cache.c	(revision 167231)
+++ head/sys/kern/vfs_cache.c	(revision 167232)
@@ -1,871 +1,871 @@
 /*-
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Poul-Henning Kamp of the FreeBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 
 #include <vm/uma.h>
 
 /*
  * This structure describes the elements in the cache of recent
  * names looked up by namei.
  */
 
 struct	namecache {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	struct	vnode *nc_vp;		/* vnode the name refers to */
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	char	nc_name[0];		/* segment name */
 };
 
 /*
  * Name caching works as follows:
  *
  * Names found by directory scans are retained in a cache
  * for future reference.  It is managed LRU, so frequently
  * used names will hang around.  Cache is indexed by hash value
  * obtained from (vp, name) where vp refers to the directory
  * containing name.
  *
  * If it is a "negative" entry, (i.e. for a name that is known NOT to
  * exist) the vnode pointer will be NULL.
  *
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
  */
 
 /*
  * Structures associated with name cacheing.
  */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
 static TAILQ_HEAD(, namecache) ncneg;	/* Hash Table */
 static u_long	nchash;			/* size of hash table */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
 SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
 static u_long	numneg;			/* number of cache entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
 static u_long	numcache;		/* number of cache entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
 static u_long	numcachehv;		/* number of cache entries with vnodes held */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
 #if 0
 static u_long	numcachepl;		/* number of cache purge for leaf entries */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
 #endif
 struct	nchstats nchstats;		/* cache effectiveness statistics */
 
 static struct mtx cache_lock;
 MTX_SYSINIT(vfscache, &cache_lock, "Name Cache", MTX_DEF);
 
 #define	CACHE_LOCK()	mtx_lock(&cache_lock)
 #define	CACHE_UNLOCK()	mtx_unlock(&cache_lock)
 
 /*
  * UMA zones for the VFS cache.
  *
  * The small cache is used for entries with short names, which are the
  * most common.  The large cache is used for entries which are too big to
  * fit in the small cache.
  */
 static uma_zone_t cache_zone_small;
 static uma_zone_t cache_zone_large;
 
 #define	CACHE_PATH_CUTOFF	32
 #define	CACHE_ZONE_SMALL	(sizeof(struct namecache) + CACHE_PATH_CUTOFF)
 #define	CACHE_ZONE_LARGE	(sizeof(struct namecache) + NAME_MAX)
 
 #define cache_alloc(len)	uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
 	cache_zone_small : cache_zone_large, M_WAITOK)
 #define cache_free(ncp)		do { \
 	if (ncp != NULL) \
 		uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
 		    cache_zone_small : cache_zone_large, (ncp)); \
 } while (0)
 
 static int	doingcache = 1;		/* 1 => enable the cache */
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
 
 /* Export size information to userland */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
 	sizeof(struct namecache), "");
 
 /*
  * The new name cache statistics
  */
 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
 #define STATNODE(mode, name, var) \
 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
 STATNODE(CTLFLAG_RD, numneg, &numneg);
 STATNODE(CTLFLAG_RD, numcache, &numcache);
 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
 
 SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
 	sizeof(nchstats), "LU", "VFS cache effectiveness statistics");
 
 
 
 static void cache_zap(struct namecache *ncp);
 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen);
 
 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 /*
  * Flags in namecache.nc_flag
  */
 #define NCF_WHITE	1
 
 /*
  * Grab an atomic snapshot of the name cache hash chain lengths
  */
 SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
 
 static int
 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count;
 
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		error = SYSCTL_OUT(req, &count, sizeof(count));
 		if (error)
 			return (error);
 	}
 	return (0);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
 	0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");
 
 static int
 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count, maxlength, used, pct;
 
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	used = 0;
 	maxlength = 0;
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		if (count)
 			used++;
 		if (maxlength < count)
 			maxlength = count;
 	}
 	n_nchash = nchash + 1;
 	pct = (used * 100 * 100) / n_nchash;
 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &used, sizeof(used));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
 	if (error)
 		return (error);
 	return (0);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
 	0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");
 
 /*
  * cache_zap():
  *
  *   Removes a namecache entry from cache, whether it contains an actual
  *   pointer to a vnode or if it is just a negative cache entry.
  */
 static void
 cache_zap(ncp)
 	struct namecache *ncp;
 {
 	struct vnode *vp;
 
 	mtx_assert(&cache_lock, MA_OWNED);
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
 	vp = NULL;
 	LIST_REMOVE(ncp, nc_hash);
 	LIST_REMOVE(ncp, nc_src);
 	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
 		vp = ncp->nc_dvp;
 		numcachehv--;
 	}
 	if (ncp->nc_vp) {
 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
 		ncp->nc_vp->v_dd = NULL;
 	} else {
 		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 		numneg--;
 	}
 	numcache--;
 	cache_free(ncp);
 	if (vp)
 		vdrop(vp);
 }
 
 /*
  * cache_leaf_test()
  *
  *      Test whether this (directory) vnode's namei cache entry contains
  *      subdirectories or not.  Used to determine whether the directory is
  *      a leaf in the namei cache or not.  Note: the directory may still
  *      contain files in the namei cache.
  *
  *      Returns 0 if the directory is a leaf, -1 if it isn't.
  */
 int
 cache_leaf_test(struct vnode *vp)
 {
 	struct namecache *ncpc;
 	int leaf;
 
 	leaf = 0;
 	CACHE_LOCK();
 	for (ncpc = LIST_FIRST(&vp->v_cache_src);
 	     ncpc != NULL;
 	     ncpc = LIST_NEXT(ncpc, nc_src)
 	 ) {
 		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR) {
 			leaf = -1;
 			break;
 		}
 	}
 	CACHE_UNLOCK();
 	return (leaf);
 }
 
 /*
  * Lookup an entry in the cache
  *
  * Lookup is called with dvp pointing to the directory to search,
  * cnp pointing to the name of the entry being sought. If the lookup
  * succeeds, the vnode is returned in *vpp, and a status of -1 is
  * returned. If the lookup determines that the name does not exist
  * (negative cacheing), a status of ENOENT is returned. If the lookup
  * fails, a status of zero is returned.
  *
  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
  * unlocked.  If we're looking up . an extra ref is taken, but the lock is
  * not recursively acquired.
  */
 
 int
 cache_lookup(dvp, vpp, cnp)
 	struct vnode *dvp;
 	struct vnode **vpp;
 	struct componentname *cnp;
 {
 	struct namecache *ncp;
 	u_int32_t hash;
 	int error;
 
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 retry:
 	CACHE_LOCK();
 	numcalls++;
 
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1) {
 			*vpp = dvp;
 			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 			    dvp, cnp->cn_nameptr);
 			dothits++;
 			goto success;
 		}
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			dotdothits++;
 			if (dvp->v_dd == NULL ||
 			    (cnp->cn_flags & MAKEENTRY) == 0) {
 				CACHE_UNLOCK();
 				return (0);
 			}
 			*vpp = dvp->v_dd;
 			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 			    dvp, cnp->cn_nameptr, *vpp);
 			goto success;
 		}
 	}
 
 	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		numchecks++;
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	/* We failed to find an entry */
 	if (ncp == 0) {
 		if ((cnp->cn_flags & MAKEENTRY) == 0) {
 			nummisszap++;
 		} else {
 			nummiss++;
 		}
 		nchstats.ncs_miss++;
 		CACHE_UNLOCK();
 		return (0);
 	}
 
 	/* We don't want to have an entry, so dump it */
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		numposzaps++;
 		nchstats.ncs_badhits++;
 		cache_zap(ncp);
 		CACHE_UNLOCK();
 		return (0);
 	}
 
 	/* We found a "positive" match, return the vnode */
 	if (ncp->nc_vp) {
 		numposhits++;
 		nchstats.ncs_goodhits++;
 		*vpp = ncp->nc_vp;
 		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 		    dvp, cnp->cn_nameptr, *vpp, ncp);
 		goto success;
 	}
 
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		numnegzaps++;
 		nchstats.ncs_badhits++;
 		cache_zap(ncp);
 		CACHE_UNLOCK();
 		return (0);
 	}
 
 	numneghits++;
 	/*
 	 * We found a "negative" match, so we shift it to the end of
 	 * the "negative" cache entries queue to satisfy LRU.  Also,
 	 * check to see if the entry is a whiteout; indicate this to
 	 * the componentname, if so.
 	 */
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	nchstats.ncs_neghits++;
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	CACHE_UNLOCK();
 	return (ENOENT);
 
 success:
 	/*
 	 * On success we return a locked and ref'd vnode as per the lookup
 	 * protocol.
 	 */
 	if (dvp == *vpp) {   /* lookup on "." */
 		VREF(*vpp);
 		CACHE_UNLOCK();
 		return (-1);
 	}
 	if (cnp->cn_flags & ISDOTDOT)
 		VOP_UNLOCK(dvp, 0, cnp->cn_thread);
 	VI_LOCK(*vpp);
 	CACHE_UNLOCK();
 	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
 	if (cnp->cn_flags & ISDOTDOT)
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread);
 	if (error) {
 		*vpp = NULL;
 		goto retry;
 	}
 	return (-1);
 }
 
 /*
  * Add an entry to the cache.
  */
 void
 cache_enter(dvp, vp, cnp)
 	struct vnode *dvp;
 	struct vnode *vp;
 	struct componentname *cnp;
 {
 	struct namecache *ncp;
 	struct nchashhead *ncpp;
 	u_int32_t hash;
 	int hold;
 	int zap;
 	int len;
 
 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
 	    ("cahe_enter: Adding a doomed vnode"));
 
 	if (!doingcache)
 		return;
 
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1) {
 			return;
 		}
 		/*
 		 * For dotdot lookups only cache the v_dd pointer if the
 		 * directory has a link back to its parent via v_cache_dst.
 		 * Without this an unlinked directory would keep a soft
 		 * reference to its parent which could not be NULLd at
 		 * cache_purge() time.
 		 */
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			CACHE_LOCK();
 			if (!TAILQ_EMPTY(&dvp->v_cache_dst))
 				dvp->v_dd = vp;
 			CACHE_UNLOCK();
 			return;
 		}
 	}
 
 	hold = 0;
 	zap = 0;
 	ncp = cache_alloc(cnp->cn_namelen);
 	CACHE_LOCK();
 	numcache++;
 	if (!vp) {
 		numneg++;
 		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
 	} else if (vp->v_type == VDIR) {
 		vp->v_dd = dvp;
 	} else {
 		vp->v_dd = NULL;
 	}
 
 	/*
 	 * Set the rest of the namecache entry elements, calculate it's
 	 * hash key and insert it into the appropriate chain within
 	 * the cache entries table.
 	 */
 	ncp->nc_vp = vp;
 	ncp->nc_dvp = dvp;
 	len = ncp->nc_nlen = cnp->cn_namelen;
 	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
 	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	ncpp = NCHHASH(hash);
 	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 	if (LIST_EMPTY(&dvp->v_cache_src)) {
 		hold = 1;
 		numcachehv++;
 	}
 	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
 	 */
 	if (vp) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 	} else {
 		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	}
 	if (numneg * ncnegfactor > numcache) {
 		ncp = TAILQ_FIRST(&ncneg);
 		zap = 1;
 	}
 	if (hold)
 		vhold(dvp);
 	if (zap)
 		cache_zap(ncp);
 	CACHE_UNLOCK();
 }
 
 /*
  * Name cache initialization, from vfs_init() when we are booting
  */
 static void
 nchinit(void *dummy __unused)
 {
 
 	TAILQ_INIT(&ncneg);
 
 	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)
 
 
 /*
  * Invalidate all entries to a particular vnode.
  */
 void
 cache_purge(vp)
 	struct vnode *vp;
 {
 
 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
 	CACHE_LOCK();
 	while (!LIST_EMPTY(&vp->v_cache_src))
 		cache_zap(LIST_FIRST(&vp->v_cache_src));
 	while (!TAILQ_EMPTY(&vp->v_cache_dst))
 		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
 	vp->v_dd = NULL;
 	CACHE_UNLOCK();
 }
 
 /*
  * Flush all entries referencing a particular filesystem.
  *
  * Since we need to check it anyway, we will flush all the invalid
  * entries at the same time.
  */
 void
 cache_purgevfs(mp)
 	struct mount *mp;
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp, *nnp;
 	struct nchashhead mplist;
 
 	LIST_INIT(&mplist);
 	ncp = NULL;
 
 	/* Scan hash tables for applicable entries */
 	CACHE_LOCK();
 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
 		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
 			nnp = LIST_NEXT(ncp, nc_hash);
 			if (ncp->nc_dvp->v_mount == mp) {
 				LIST_REMOVE(ncp, nc_hash);
 				LIST_INSERT_HEAD(&mplist, ncp, nc_hash);
 			}
 		}
 	}
 	while (!LIST_EMPTY(&mplist))
 		cache_zap(LIST_FIRST(&mplist));
 	CACHE_UNLOCK();
 }
 
 /*
  * Perform canonical checks and cache lookup and pass on to filesystem
  * through the vop_cachedlookup only if needed.
  */
 
 int
 vfs_cache_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *dvp;
 	int error;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	struct thread *td = cnp->cn_thread;
 
 	*vpp = NULL;
 	dvp = ap->a_dvp;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
 	if (error)
 		return (error);
 
 	error = cache_lookup(dvp, vpp, cnp);
 	if (error == 0)
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 	if (error == ENOENT)
 		return (error);
 	return (0);
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct  __getcwd_args {
 	u_char	*buf;
 	u_int	buflen;
 };
 #endif
 
 /*
  * XXX All of these sysctls would probably be more productive dead.
  */
 static int disablecwd;
 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");
 
-/* Implementation of the getcwd syscall */
+/* Implementation of the getcwd syscall. */
 int
 __getcwd(td, uap)
 	struct thread *td;
 	struct __getcwd_args *uap;
 {
 
 	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
 }
 
 int
 kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
 {
 	char *bp, *tmpbuf;
 	struct filedesc *fdp;
 	int error;
 
 	if (disablecwd)
 		return (ENODEV);
 	if (buflen < 2)
 		return (EINVAL);
 	if (buflen > MAXPATHLEN)
 		buflen = MAXPATHLEN;
 
 	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	mtx_lock(&Giant);
 	FILEDESC_LOCK(fdp);
 	error = vn_fullpath1(td, fdp->fd_cdir, fdp->fd_rdir, tmpbuf,
 	    &bp, buflen);
 	FILEDESC_UNLOCK(fdp);
 	mtx_unlock(&Giant);
 
 	if (!error) {
 		if (bufseg == UIO_SYSSPACE)
 			bcopy(bp, buf, strlen(bp) + 1);
 		else
 			error = copyout(bp, buf, strlen(bp) + 1);
 	}
 	free(tmpbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Thus begins the fullpath magic.
  */
 
 #undef STATNODE
 #define STATNODE(name)							\
 	static u_int name;						\
 	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
 
 static int disablefullpath;
 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
 	"Disable the vn_fullpath function");
 
 /* These count for kern___getcwd(), too. */
 STATNODE(numfullpathcalls);
 STATNODE(numfullpathfail1);
 STATNODE(numfullpathfail2);
 STATNODE(numfullpathfail4);
 STATNODE(numfullpathfound);
 
 /*
  * Retrieve the full filesystem path that correspond to a vnode from the name
  * cache (if available)
  */
 int
 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 {
 	char *buf;
 	struct filedesc *fdp;
 	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
 	if (vn == NULL)
 		return (EINVAL);
 
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_LOCK(fdp);
 	error = vn_fullpath1(td, vn, fdp->fd_rdir, buf, retbuf, MAXPATHLEN);
 	FILEDESC_UNLOCK(fdp);
 
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * The magic behind kern___getcwd() and vn_fullpath().
  */
 static int
 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen)
 {
 	char *bp;
 	int error, i, slash_prefixed;
 	struct namecache *ncp;
 
 	bp = buf + buflen - 1;
 	*bp = '\0';
 	error = 0;
 	slash_prefixed = 0;
 
 	CACHE_LOCK();
 	numfullpathcalls++;
 	if (vp->v_type != VDIR) {
 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
 		if (!ncp) {
 			numfullpathfail2++;
 			CACHE_UNLOCK();
 			return (ENOENT);
 		}
 		for (i = ncp->nc_nlen - 1; i >= 0 && bp > buf; i--)
 			*--bp = ncp->nc_name[i];
 		if (bp == buf) {
 			numfullpathfail4++;
 			CACHE_UNLOCK();
 			return (ENOMEM);
 		}
 		*--bp = '/';
 		slash_prefixed = 1;
 		vp = ncp->nc_dvp;
 	}
 	while (vp != rdir && vp != rootvnode) {
 		if (vp->v_vflag & VV_ROOT) {
 			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
 				error = EBADF;
 				break;
 			}
 			vp = vp->v_mount->mnt_vnodecovered;
 			continue;
 		}
 		if (vp->v_dd == NULL) {
 			numfullpathfail1++;
 			error = ENOTDIR;
 			break;
 		}
 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
 		if (!ncp) {
 			numfullpathfail2++;
 			error = ENOENT;
 			break;
 		}
 		MPASS(ncp->nc_dvp == vp->v_dd);
 		for (i = ncp->nc_nlen - 1; i >= 0 && bp != buf; i--)
 			*--bp = ncp->nc_name[i];
 		if (bp == buf) {
 			numfullpathfail4++;
 			error = ENOMEM;
 			break;
 		}
 		*--bp = '/';
 		slash_prefixed = 1;
 		vp = ncp->nc_dvp;
 	}
 	if (error) {
 		CACHE_UNLOCK();
 		return (error);
 	}
 	if (!slash_prefixed) {
 		if (bp == buf) {
 			numfullpathfail4++;
 			CACHE_UNLOCK();
 			return (ENOMEM);
 		} else {
 			*--bp = '/';
 		}
 	}
 	numfullpathfound++;
 	CACHE_UNLOCK();
 
 	*retbuf = bp;
 	return (0);
 }
Index: head/sys/kern/vfs_mount.c
===================================================================
--- head/sys/kern/vfs_mount.c	(revision 167231)
+++ head/sys/kern/vfs_mount.c	(revision 167232)
@@ -1,2192 +1,2189 @@
 /*-
  * Copyright (c) 1999-2004 Poul-Henning Kamp
  * Copyright (c) 1999 Michael Smith
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/clock.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/reboot.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include "opt_rootdevname.h"
 #include "opt_ddb.h"
 #include "opt_mac.h"
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #define	ROOTNAME		"root_device"
 #define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)
 
 static int	vfs_domount(struct thread *td, const char *fstype,
 		    char *fspath, int fsflags, void *fsdata);
 static struct mount *vfs_mount_alloc(struct vnode *dvp, struct vfsconf *vfsp,
 		    const char *fspath, struct thread *td);
 static int	vfs_mountroot_ask(void);
 static int	vfs_mountroot_try(const char *mountfrom);
 static int	vfs_donmount(struct thread *td, int fsflags,
 		    struct uio *fsoptions);
 static void	free_mntarg(struct mntarg *ma);
 static void	vfs_mount_destroy(struct mount *);
 static int	vfs_getopt_pos(struct vfsoptlist *opts, const char *name);
 
 static int	usermount = 0;
 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
     "Unprivileged users may mount and unmount file systems");
 
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
 static uma_zone_t mount_zone;
 
 /* List of mounted filesystems. */
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
 
 /* For any iteration/modification of mountlist */
 struct mtx mountlist_mtx;
 MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
 
 TAILQ_HEAD(vfsoptlist, vfsopt);
 struct vfsopt {
 	TAILQ_ENTRY(vfsopt) link;
 	char	*name;
 	void	*value;
 	int	len;
 };
 
 /*
  * The vnode of the system's root (/ in the filesystem, without chroot
  * active.)
  */
 struct vnode	*rootvnode;
 
 /*
  * The root filesystem is detailed in the kernel environment variable
  * vfs.root.mountfrom, which is expected to be in the general format
  *
  * <vfsname>:[<path>]
  * vfsname   := the name of a VFS known to the kernel and capable
  *              of being mounted as root
  * path      := disk device name or other data used by the filesystem
  *              to locate its physical store
  */
 
 /*
  * Global opts, taken by all filesystems
  */
 static const char *global_opts[] = {
 	"errmsg",
 	"fstype",
 	"fspath",
 	"rdonly",
 	"ro",
 	"rw",
 	"suid",
 	"exec",
 	"update",
 	NULL
 };
 
 /*
  * The root specifiers we will try if RB_CDROM is specified.
  */
 static char *cdrom_rootdevnames[] = {
 	"cd9660:cd0",
 	"cd9660:acd0",
 	NULL
 };
 
 /* legacy find-root code */
 char		*rootdevnames[2] = {NULL, NULL};
 #ifndef ROOTDEVNAME
 #  define ROOTDEVNAME NULL
 #endif
 static const char	*ctrootdevname = ROOTDEVNAME;
 
 /*
  * ---------------------------------------------------------------------
  * Functions for building and sanitizing the mount options
  */
 
 /* Remove one mount option. */
 static void
 vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
 {
 
 	TAILQ_REMOVE(opts, opt, link);
 	free(opt->name, M_MOUNT);
 	if (opt->value != NULL)
 		free(opt->value, M_MOUNT);
 #ifdef INVARIANTS
 	else if (opt->len != 0)
 		panic("%s: mount option with NULL value but length != 0",
 		    __func__);
 #endif
 	free(opt, M_MOUNT);
 }
 
 /* Release all resources related to the mount options. */
 static void
 vfs_freeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt;
 
 	while (!TAILQ_EMPTY(opts)) {
 		opt = TAILQ_FIRST(opts);
 		vfs_freeopt(opts, opt);
 	}
 	free(opts, M_MOUNT);
 }
 
 void
 vfs_deleteopt(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *opt, *temp;
   
 	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
 		if (strcmp(opt->name, name) == 0)
 			vfs_freeopt(opts, opt);
 	}
 }
 
 /*
  * Check if options are equal (with or without the "no" prefix).
  */
 static int
 vfs_equalopts(const char *opt1, const char *opt2)
 {
 
 	/* "opt" vs. "opt" or "noopt" vs. "noopt" */
 	if (strcmp(opt1, opt2) == 0)
 		return (1);
 	/* "noopt" vs. "opt" */
 	if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 		return (1);
 	/* "opt" vs. "noopt" */
 	if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 		return (1);
 	return (0);
 }
 
 /*
  * If a mount option is specified several times,
  * (with or without the "no" prefix) only keep
  * the last occurence of it.
  */
 static void
 vfs_sanitizeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt, *opt2, *tmp;
 
 	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
 		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
 		while (opt2 != NULL) {
 			if (vfs_equalopts(opt->name, opt2->name)) {
 				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
 				vfs_freeopt(opts, opt2);
 				opt2 = tmp;
 			} else {
 				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
 			}
 		}
 	}
 }
 
 /*
  * Build a linked list of mount options from a struct uio.
  */
 static int
 vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
 {
 	struct vfsoptlist *opts;
 	struct vfsopt *opt;
 	size_t memused;
 	unsigned int i, iovcnt;
 	int error, namelen, optlen;
 
 	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 	TAILQ_INIT(opts);
 	memused = 0;
 	iovcnt = auio->uio_iovcnt;
 	for (i = 0; i < iovcnt; i += 2) {
 		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		namelen = auio->uio_iov[i].iov_len;
 		optlen = auio->uio_iov[i + 1].iov_len;
 		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
 		opt->value = NULL;
 		opt->len = 0;
 
 		/*
 		 * Do this early, so jumps to "bad" will free the current
 		 * option.
 		 */
 		TAILQ_INSERT_TAIL(opts, opt, link);
 		memused += sizeof(struct vfsopt) + optlen + namelen;
 
 		/*
 		 * Avoid consuming too much memory, and attempts to overflow
 		 * memused.
 		 */
 		if (memused > VFS_MOUNTARG_SIZE_MAX ||
 		    optlen > VFS_MOUNTARG_SIZE_MAX ||
 		    namelen > VFS_MOUNTARG_SIZE_MAX) {
 			error = EINVAL;
 			goto bad;
 		}
 
 		if (auio->uio_segflg == UIO_SYSSPACE) {
 			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
 		} else {
 			error = copyin(auio->uio_iov[i].iov_base, opt->name,
 			    namelen);
 			if (error)
 				goto bad;
 		}
 		/* Ensure names are null-terminated strings. */
 		if (opt->name[namelen - 1] != '\0') {
 			error = EINVAL;
 			goto bad;
 		}
 		if (optlen != 0) {
 			opt->len = optlen;
 			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
 			if (auio->uio_segflg == UIO_SYSSPACE) {
 				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
 				    optlen);
 			} else {
 				error = copyin(auio->uio_iov[i + 1].iov_base,
 				    opt->value, optlen);
 				if (error)
 					goto bad;
 			}
 		}
 	}
 	vfs_sanitizeopts(opts);
 	*options = opts;
 	return (0);
 bad:
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * Merge the old mount options with the new ones passed
  * in the MNT_UPDATE case.
  */
 static void
 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts)
 {
 	struct vfsopt *opt, *opt2, *new;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		/*
 		 * Check that this option hasn't been redefined
 		 * nor cancelled with a "no" mount option.
 		 */
 		opt2 = TAILQ_FIRST(toopts);
 		while (opt2 != NULL) {
 			if (strcmp(opt2->name, opt->name) == 0)
 				goto next;
 			if (strncmp(opt2->name, "no", 2) == 0 &&
 			    strcmp(opt2->name + 2, opt->name) == 0) {
 				vfs_freeopt(toopts, opt2);
 				goto next;
 			}
 			opt2 = TAILQ_NEXT(opt2, link);
 		}
 		/* We want this option, duplicate it. */
 		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK);
 		strcpy(new->name, opt->name);
 		if (opt->len != 0) {
 			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
 			bcopy(opt->value, new->value, opt->len);
 		} else {
 			new->value = NULL;
 		}
 		new->len = opt->len;
 		TAILQ_INSERT_TAIL(toopts, new, link);
 next:
 		continue;
 	}
 }
 
 /*
- * ---------------------------------------------------------------------
- * Mount a filesystem
+ * Mount a filesystem.
  */
 int
 nmount(td, uap)
 	struct thread *td;
 	struct nmount_args /* {
 		struct iovec *iovp;
 		unsigned int iovcnt;
 		int flags;
 	} */ *uap;
 {
 	struct uio *auio;
 	struct iovec *iov;
 	unsigned int i;
 	int error;
 	u_int iovcnt;
 
 	AUDIT_ARG(fflags, uap->flags);
 
 	/* Kick out MNT_ROOTFS early as it is legal internally */
 	if (uap->flags & MNT_ROOTFS)
 		return (EINVAL);
 
 	iovcnt = uap->iovcnt;
 	/*
 	 * Check that we have an even number of iovec's
 	 * and that we have at least two options.
 	 */
 	if ((iovcnt & 1) || (iovcnt < 4))
 		return (EINVAL);
 
 	error = copyinuio(uap->iovp, iovcnt, &auio);
 	if (error)
 		return (error);
 	iov = auio->uio_iov;
 	for (i = 0; i < iovcnt; i++) {
 		if (iov->iov_len > MMAXOPTIONLEN) {
 			free(auio, M_IOV);
 			return (EINVAL);
 		}
 		iov++;
 	}
 	error = vfs_donmount(td, uap->flags, auio);
 
 	free(auio, M_IOV);
 	return (error);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Various utility functions
  */
 
 void
 vfs_ref(struct mount *mp)
 {
 
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
 }
 
 void
 vfs_rel(struct mount *mp)
 {
 
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 }
 
 static int
 mount_init(void *mem, int size, int flags)
 {
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
 	return (0);
 }
 
 static void
 mount_fini(void *mem, int size)
 {
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
 	lockdestroy(&mp->mnt_lock);
 	mtx_destroy(&mp->mnt_mtx);
 }
 
 /*
  * Allocate and initialize the mount point struct.
  */
 static struct mount *
 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp,
     const char *fspath, struct thread *td)
 {
 	struct mount *mp;
 
 	mp = uma_zalloc(mount_zone, M_WAITOK);
 	bzero(&mp->mnt_startzero,
 	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	mp->mnt_nvnodelistsize = 0;
 	mp->mnt_ref = 0;
 	(void) vfs_busy(mp, LK_NOWAIT, 0, td);
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_vfc = vfsp;
 	vfsp->vfc_refcount++;	/* XXX Unlocked */
 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
 	MNT_IUNLOCK(mp);
 	mp->mnt_gen++;
 	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 	mp->mnt_vnodecovered = vp;
 	mp->mnt_cred = crdup(td->td_ucred);
 	mp->mnt_stat.f_owner = td->td_ucred->cr_uid;
 	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
 	mp->mnt_iosize_max = DFLTPHYS;
 #ifdef MAC
 	mac_init_mount(mp);
 	mac_create_mount(td->td_ucred, mp);
 #endif
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
 	return (mp);
 }
 
 /*
  * Destroy the mount struct previously allocated by vfs_mount_alloc().
  */
 static void
 vfs_mount_destroy(struct mount *mp)
 {
 	int i;
 
 	MNT_ILOCK(mp);
 	for (i = 0; mp->mnt_ref && i < 3; i++)
 		msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz);
 	/*
 	 * This will always cause a 3 second delay in rebooting due to
 	 * refs on the root mountpoint that never go away.  Most of these
 	 * are held by init which never exits.
 	 */
 	if (i == 3 && (!rebooting || bootverbose))
 		printf("Mount point %s had %d dangling refs\n",
 		    mp->mnt_stat.f_mntonname, mp->mnt_ref);
 	if (mp->mnt_holdcnt != 0) {
 		printf("Waiting for mount point to be unheld\n");
 		while (mp->mnt_holdcnt != 0) {
 			mp->mnt_holdcntwaiters++;
 			msleep(&mp->mnt_holdcnt, MNT_MTX(mp),
 			       PZERO, "mntdestroy", 0);
 			mp->mnt_holdcntwaiters--;
 		}
 		printf("mount point unheld\n");
 	}
 	if (mp->mnt_writeopcount > 0) {
 		printf("Waiting for mount point write ops\n");
 		while (mp->mnt_writeopcount > 0) {
 			mp->mnt_kern_flag |= MNTK_SUSPEND;
 			msleep(&mp->mnt_writeopcount,
 			       MNT_MTX(mp),
 			       PZERO, "mntdestroy2", 0);
 		}
 		printf("mount point write ops completed\n");
 	}
 	if (mp->mnt_secondary_writes > 0) {
 		printf("Waiting for mount point secondary write ops\n");
 		while (mp->mnt_secondary_writes > 0) {
 			mp->mnt_kern_flag |= MNTK_SUSPEND;
 			msleep(&mp->mnt_secondary_writes,
 			       MNT_MTX(mp),
 			       PZERO, "mntdestroy3", 0);
 		}
 		printf("mount point secondary write ops completed\n");
 	}
 	MNT_IUNLOCK(mp);
 	mp->mnt_vfc->vfc_refcount--;
 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
 		struct vnode *vp;
 
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
 			vprint("", vp);
 		panic("unmount: dangling vnode");
 	}
 	MNT_ILOCK(mp);
 	if (mp->mnt_kern_flag & MNTK_MWAIT)
 		wakeup(mp);
 	if (mp->mnt_writeopcount != 0)
 		panic("vfs_mount_destroy: nonzero writeopcount");
 	if (mp->mnt_secondary_writes != 0)
 		panic("vfs_mount_destroy: nonzero secondary_writes");
 	if (mp->mnt_nvnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero nvnodelistsize");
 	mp->mnt_writeopcount = -1000;
 	mp->mnt_nvnodelistsize = -1000;
 	mp->mnt_secondary_writes = -1000;
 	MNT_IUNLOCK(mp);
 #ifdef MAC
 	mac_destroy_mount(mp);
 #endif
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	crfree(mp->mnt_cred);
 	uma_zfree(mount_zone, mp);
 }
 
 static int
 vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
 {
 	struct vfsoptlist *optlist;
 	struct vfsopt *opt, *noro_opt;
 	char *fstype, *fspath, *errmsg;
 	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
 	int has_rw, has_noro;
 
 	errmsg = NULL;
 	errmsg_len = 0;
 	errmsg_pos = -1;
 	has_rw = 0;
 	has_noro = 0;
 
 	error = vfs_buildopts(fsoptions, &optlist);
 	if (error)
 		return (error);
 
 	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) 
 		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
 
 	/*
 	 * We need these two options before the others,
 	 * and they are mandatory for any filesystem.
 	 * Ensure they are NUL terminated as well.
 	 */
 	fstypelen = 0;
 	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
 	if (error || fstype[fstypelen - 1] != '\0') {
 		error = EINVAL;
 		if (errmsg != NULL)
 			strncpy(errmsg, "Invalid fstype", errmsg_len);
 		goto bail;
 	}
 	fspathlen = 0;
 	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
 	if (error || fspath[fspathlen - 1] != '\0') {
 		error = EINVAL;
 		if (errmsg != NULL)
 			strncpy(errmsg, "Invalid fspath", errmsg_len);
 		goto bail;
 	}
 
 	/*
 	 * We need to see if we have the "update" option
 	 * before we call vfs_domount(), since vfs_domount() has special
 	 * logic based on MNT_UPDATE.  This is very important
 	 * when we want to update the root filesystem.
 	 */ 
 	TAILQ_FOREACH(opt, optlist, link) {
 		if (strcmp(opt->name, "update") == 0)
 			fsflags |= MNT_UPDATE;
 		else if (strcmp(opt->name, "async") == 0)
 			fsflags |= MNT_ASYNC;
 		else if (strcmp(opt->name, "force") == 0)
 			fsflags |= MNT_FORCE;
 		else if (strcmp(opt->name, "multilabel") == 0)
 			fsflags |= MNT_MULTILABEL;
 		else if (strcmp(opt->name, "noasync") == 0)
 			fsflags &= ~MNT_ASYNC;
 		else if (strcmp(opt->name, "noatime") == 0)
 			fsflags |= MNT_NOATIME;
 		else if (strcmp(opt->name, "noclusterr") == 0)
 			fsflags |= MNT_NOCLUSTERR;
 		else if (strcmp(opt->name, "noclusterw") == 0)
 			fsflags |= MNT_NOCLUSTERW;
 		else if (strcmp(opt->name, "noexec") == 0)
 			fsflags |= MNT_NOEXEC;
 		else if (strcmp(opt->name, "nosuid") == 0)
 			fsflags |= MNT_NOSUID;
 		else if (strcmp(opt->name, "nosymfollow") == 0)
 			fsflags |= MNT_NOSYMFOLLOW;
 		else if (strcmp(opt->name, "noro") == 0) {
 			fsflags &= ~MNT_RDONLY;
 			has_noro = 1;
 		}
 		else if (strcmp(opt->name, "rw") == 0) {
 			fsflags &= ~MNT_RDONLY;
 			has_rw = 1;
 		}
 		else if (strcmp(opt->name, "ro") == 0 ||
 		    strcmp(opt->name, "rdonly") == 0)
 			fsflags |= MNT_RDONLY;
 		else if (strcmp(opt->name, "snapshot") == 0)
 			fsflags |= MNT_SNAPSHOT;
 		else if (strcmp(opt->name, "suiddir") == 0)
 			fsflags |= MNT_SUIDDIR;
 		else if (strcmp(opt->name, "sync") == 0)
 			fsflags |= MNT_SYNCHRONOUS;
 		else if (strcmp(opt->name, "union") == 0)
 			fsflags |= MNT_UNION;
 	}
 
 	/*
 	 * If "rw" was specified as a mount option, and we
 	 * are trying to update a mount-point from "ro" to "rw",
 	 * we need a mount option "noro", since in vfs_mergeopts(),
 	 * "noro" will cancel "ro", but "rw" will not do anything.
 	 */
 	if (has_rw && !has_noro) {
 		noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		noro_opt->name = strdup("noro", M_MOUNT);
 		noro_opt->value = NULL;
 		noro_opt->len = 0;
 		TAILQ_INSERT_TAIL(optlist, noro_opt, link);
 	}
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
 		error = ENAMETOOLONG;
 		goto bail;
 	}
 
 	mtx_lock(&Giant);
 	error = vfs_domount(td, fstype, fspath, fsflags, optlist);
 	mtx_unlock(&Giant);
 bail:
 	/* copyout the errmsg */
 	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
 	    && errmsg_len > 0 && errmsg != NULL) {
 		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
 			bcopy(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		} else {
 			copyout(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		} 
 	}
 
 	if (error != 0)
 		vfs_freeopts(optlist);
 	return (error);
 }
 
 /*
- * ---------------------------------------------------------------------
  * Old mount API.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mount_args {
 	char	*type;
 	char	*path;
 	int	flags;
 	caddr_t	data;
 };
 #endif
 /* ARGSUSED */
 int
 mount(td, uap)
 	struct thread *td;
 	struct mount_args /* {
 		char *type;
 		char *path;
 		int flags;
 		caddr_t data;
 	} */ *uap;
 {
 	char *fstype;
 	struct vfsconf *vfsp = NULL;
 	struct mntarg *ma = NULL;
 	int error;
 
 	AUDIT_ARG(fflags, uap->flags);
 
 	/* Kick out MNT_ROOTFS early as it is legal internally */
 	uap->flags &= ~MNT_ROOTFS;
 
 	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
 	if (error) {
 		free(fstype, M_TEMP);
 		return (error);
 	}
 
 	AUDIT_ARG(text, fstype);
 	mtx_lock(&Giant);
 	vfsp = vfs_byname_kld(fstype, td, &error);
 	free(fstype, M_TEMP);
 	if (vfsp == NULL) {
 		mtx_unlock(&Giant);
 		return (ENOENT);
 	}
 	if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
 		mtx_unlock(&Giant);
 		return (EOPNOTSUPP);
 	}
 
 	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
 	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
 	ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
 	ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
 	ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
 
 	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 
 /*
  * vfs_domount(): actually attempt a filesystem mount.
  */
 static int
 vfs_domount(
 	struct thread *td,	/* Calling thread. */
 	const char *fstype,	/* Filesystem type. */
 	char *fspath,		/* Mount path. */
 	int fsflags,		/* Flags common to all filesystems. */
 	void *fsdata		/* Options local to the filesystem. */
 	)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vfsconf *vfsp;
 	struct export_args export;
 	int error, flag = 0;
 	struct vattr va;
 	struct nameidata nd;
 
 	mtx_assert(&Giant, MA_OWNED);
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
 		return (ENAMETOOLONG);
 
 	if (jailed(td->td_ucred))
 		return (EPERM);
 	if (usermount == 0) {
 		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
 	 */
 	if (fsflags & MNT_EXPORTED) {
 		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
 		if (error)
 			return (error);
 	}
 	if (fsflags & MNT_SUIDDIR) {
 		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
 		if (error)
 			return (error);
 
 	}
 	/*
 	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
 	 */
 	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
 		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
 			fsflags |= MNT_NOSUID | MNT_USER;
 	}
 
 	/* Load KLDs before we lock the covered vnode to avoid reversals. */
 	vfsp = NULL;
 	if ((fsflags & MNT_UPDATE) == 0) {
 		/* Don't try to load KLDs if we're mounting the root. */
 		if (fsflags & MNT_ROOTFS)
 			vfsp = vfs_byname(fstype);
 		else
 			vfsp = vfs_byname_kld(fstype, td, &error);
 		if (vfsp == NULL)
 			return (ENODEV);
 	}
 	/*
 	 * Get vnode to be covered
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE,
 	    fspath, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if (fsflags & MNT_UPDATE) {
 		if ((vp->v_vflag & VV_ROOT) == 0) {
 			vput(vp);
 			return (EINVAL);
 		}
 		mp = vp->v_mount;
 		MNT_ILOCK(mp);
 		flag = mp->mnt_flag;
 		/*
 		 * We only allow the filesystem to be reloaded if it
 		 * is currently mounted read-only.
 		 */
 		if ((fsflags & MNT_RELOAD) &&
 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
 			MNT_IUNLOCK(mp);
 			vput(vp);
 			return (EOPNOTSUPP);	/* Needs translation */
 		}
 		MNT_IUNLOCK(mp);
 		/*
 		 * Only privileged root, or (if MNT_USER is set) the user that
 		 * did the original mount is permitted to update it.
 		 */
 		error = vfs_suser(mp, td);
 		if (error) {
 			vput(vp);
 			return (error);
 		}
 		if (vfs_busy(mp, LK_NOWAIT, 0, td)) {
 			vput(vp);
 			return (EBUSY);
 		}
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_MOUNT) != 0 ||
 		    vp->v_mountedhere != NULL) {
 			VI_UNLOCK(vp);
 			vfs_unbusy(mp, td);
 			vput(vp);
 			return (EBUSY);
 		}
 		vp->v_iflag |= VI_MOUNT;
 		VI_UNLOCK(vp);
 		MNT_ILOCK(mp);
 		mp->mnt_flag |= fsflags &
 		    (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS);
 		MNT_IUNLOCK(mp);
 		VOP_UNLOCK(vp, 0, td);
 		mp->mnt_optnew = fsdata;
 		vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
 	} else {
 		/*
 		 * If the user is not root, ensure that they own the directory
 		 * onto which we are attempting to mount.
 		 */
 		error = VOP_GETATTR(vp, &va, td->td_ucred, td);
 		if (error) {
 			vput(vp);
 			return (error);
 		}
 		if (va.va_uid != td->td_ucred->cr_uid) {
 			error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN,
 			    SUSER_ALLOWJAIL);
 			if (error) {
 				vput(vp);
 				return (error);
 			}
 		}
 		error = vinvalbuf(vp, V_SAVE, td, 0, 0);
 		if (error != 0) {
 			vput(vp);
 			return (error);
 		}
 		if (vp->v_type != VDIR) {
 			vput(vp);
 			return (ENOTDIR);
 		}
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_MOUNT) != 0 ||
 		    vp->v_mountedhere != NULL) {
 			VI_UNLOCK(vp);
 			vput(vp);
 			return (EBUSY);
 		}
 		vp->v_iflag |= VI_MOUNT;
 		VI_UNLOCK(vp);
 
 		/*
 		 * Allocate and initialize the filesystem.
 		 */
 		mp = vfs_mount_alloc(vp, vfsp, fspath, td);
 		VOP_UNLOCK(vp, 0, td);
 
 		/* XXXMAC: pass to vfs_mount_alloc? */
 		mp->mnt_optnew = fsdata;
 	}
 
 	/*
 	 * Set the mount level flags.
 	 */
 	MNT_ILOCK(mp);
 	mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) |
 		(fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS |
 			    MNT_RDONLY));
 	if ((mp->mnt_flag & MNT_ASYNC) == 0)
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
         error = VFS_MOUNT(mp, td);
 
 	/*
 	 * Process the export option only if we are
 	 * updating mount options.
 	 */
 	if (!error && (fsflags & MNT_UPDATE)) {
 		if (vfs_copyopt(mp->mnt_optnew, "export", &export,
 		    sizeof(export)) == 0)
 			error = vfs_export(mp, &export);
 	}
 
 	if (!error) {
 		if (mp->mnt_opt != NULL)
 			vfs_freeopts(mp->mnt_opt);
 		mp->mnt_opt = mp->mnt_optnew;
 		(void)VFS_STATFS(mp, &mp->mnt_stat, td);
 	}
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
 	*/
 	mp->mnt_optnew = NULL;
 	if (mp->mnt_flag & MNT_UPDATE) {
 		MNT_ILOCK(mp);
 		if (error)
 			mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) |
 				(flag & ~MNT_QUOTA);
 		else
 			mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD |
 					  MNT_FORCE | MNT_SNAPSHOT);
 		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 			mp->mnt_kern_flag |= MNTK_ASYNC;
 		else
 			mp->mnt_kern_flag &= ~MNTK_ASYNC;
 		MNT_IUNLOCK(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 			if (mp->mnt_syncer == NULL)
 				error = vfs_allocate_syncvnode(mp);
 		} else {
 			if (mp->mnt_syncer != NULL)
 				vrele(mp->mnt_syncer);
 			mp->mnt_syncer = NULL;
 		}
 		vfs_unbusy(mp, td);
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		vrele(vp);
 		return (error);
 	}
 	MNT_ILOCK(mp);
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	else
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	/*
 	 * Put the new filesystem on the mount list after root.
 	 */
 	cache_purge(vp);
 	if (!error) {
 		struct vnode *newdp;
 
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		vp->v_mountedhere = mp;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 		mtx_unlock(&mountlist_mtx);
 		vfs_event_signal(NULL, VQ_MOUNT, 0);
 		if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td))
 			panic("mount: lost mount");
 		mountcheckdirs(vp, newdp);
 		vput(newdp);
 		VOP_UNLOCK(vp, 0, td);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0)
 			error = vfs_allocate_syncvnode(mp);
 		vfs_unbusy(mp, td);
 		if (error)
 			vrele(vp);
 	} else {
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		vfs_unbusy(mp, td);
 		vfs_mount_destroy(mp);
 		vput(vp);
 	}
 	return (error);
 }
 
 /*
- * ---------------------------------------------------------------------
  * Unmount a filesystem.
  *
- * Note: unmount takes a path to the vnode mounted on as argument,
- * not special file (as before).
+ * Note: unmount takes a path to the vnode mounted on as argument, not
+ * special file (as before).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unmount_args {
 	char	*path;
 	int	flags;
 };
 #endif
 /* ARGSUSED */
 int
 unmount(td, uap)
 	struct thread *td;
 	register struct unmount_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 	struct mount *mp;
 	char *pathbuf;
 	int error, id0, id1;
 
 	if (jailed(td->td_ucred))
 		return (EPERM);
 	if (usermount == 0) {
 		error = priv_check(td, PRIV_VFS_UNMOUNT);
 		if (error)
 			return (error);
 	}
 
 	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
 	if (error) {
 		free(pathbuf, M_TEMP);
 		return (error);
 	}
 	AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1);
 	mtx_lock(&Giant);
 	if (uap->flags & MNT_BYFSID) {
 		/* Decode the filesystem ID. */
 		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
 			mtx_unlock(&Giant);
 			free(pathbuf, M_TEMP);
 			return (EINVAL);
 		}
 
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
 			    mp->mnt_stat.f_fsid.val[1] == id1)
 				break;
 		}
 		mtx_unlock(&mountlist_mtx);
 	} else {
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
 				break;
 		}
 		mtx_unlock(&mountlist_mtx);
 	}
 	free(pathbuf, M_TEMP);
 	if (mp == NULL) {
 		/*
 		 * Previously we returned ENOENT for a nonexistent path and
 		 * EINVAL for a non-mountpoint.  We cannot tell these apart
 		 * now, so in the !MNT_BYFSID case return the more likely
 		 * EINVAL for compatibility.
 		 */
 		mtx_unlock(&Giant);
 		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
 	}
 
 	/*
 	 * Don't allow unmounting the root filesystem.
 	 */
 	if (mp->mnt_flag & MNT_ROOTFS) {
 		mtx_unlock(&Giant);
 		return (EINVAL);
 	}
 	error = dounmount(mp, uap->flags, td);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Do the actual filesystem unmount.
  */
 int
 dounmount(mp, flags, td)
 	struct mount *mp;
 	int flags;
 	struct thread *td;
 {
 	struct vnode *coveredvp, *fsrootvp;
 	int error;
 	int async_flag;
 	int mnt_gen_r;
 
 	mtx_assert(&Giant, MA_OWNED);
 
 	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
 		mnt_gen_r = mp->mnt_gen;
 		VI_LOCK(coveredvp);
 		vholdl(coveredvp);
 		error = vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK, td);
 		vdrop(coveredvp);
 		/*
 		 * Check for mp being unmounted while waiting for the
 		 * covered vnode lock.
 		 */
 		if (error)
 			return (error);
 		if (coveredvp->v_mountedhere != mp ||
 		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
 			VOP_UNLOCK(coveredvp, 0, td);
 			return (EBUSY);
 		}
 	}
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that did the
 	 * original mount is permitted to unmount this filesystem.
 	 */
 	error = vfs_suser(mp, td);
 	if (error) {
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (error);
 	}
 
 	MNT_ILOCK(mp);
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (EBUSY);
 	}
 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
 	/* Allow filesystems to detect that a forced unmount is in progress. */
 	if (flags & MNT_FORCE)
 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
 	error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK |
 	    ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td);
 	if (error) {
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 		if (mp->mnt_kern_flag & MNTK_MWAIT)
 			wakeup(mp);
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (error);
 	}
 	vn_start_write(NULL, &mp, V_WAIT);
 
 	if (mp->mnt_flag & MNT_EXPUBLIC)
 		vfs_setpublicfs(NULL, NULL, NULL);
 
 	vfs_msync(mp, MNT_WAIT);
 	MNT_ILOCK(mp);
 	async_flag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
 	if (mp->mnt_syncer != NULL)
 		vrele(mp->mnt_syncer);
 	/*
 	 * For forced unmounts, move process cdir/rdir refs on the fs root
 	 * vnode to the covered vnode.  For non-forced unmounts we want
 	 * such references to cause an EBUSY error.
 	 */
 	if ((flags & MNT_FORCE) &&
 	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
 		if (mp->mnt_vnodecovered != NULL)
 			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
 		if (fsrootvp == rootvnode) {
 			vrele(rootvnode);
 			rootvnode = NULL;
 		}
 		vput(fsrootvp);
 	}
 	if (((mp->mnt_flag & MNT_RDONLY) ||
 	     (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) ||
 	    (flags & MNT_FORCE)) {
 		error = VFS_UNMOUNT(mp, flags, td);
 	}
 	vn_finished_write(mp);
 	if (error) {
 		/* Undo cdir/rdir and rootvnode changes made above. */
 		if ((flags & MNT_FORCE) &&
 		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
 			if (mp->mnt_vnodecovered != NULL)
 				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
 			if (rootvnode == NULL) {
 				rootvnode = fsrootvp;
 				vref(rootvnode);
 			}
 			vput(fsrootvp);
 		}
 		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
 			(void) vfs_allocate_syncvnode(mp);
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 		mp->mnt_flag |= async_flag;
 		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
 			mp->mnt_kern_flag |= MNTK_ASYNC;
 		lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 		if (mp->mnt_kern_flag & MNTK_MWAIT)
 			wakeup(mp);
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (error);
 	}
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	if (coveredvp != NULL) {
 		coveredvp->v_mountedhere = NULL;
 		vput(coveredvp);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 	vfs_mount_destroy(mp);
 	return (0);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Mounting of root filesystem
  *
  */
 
 struct root_hold_token {
 	const char 			*who;
 	LIST_ENTRY(root_hold_token)	list;
 };
 
 static LIST_HEAD(, root_hold_token)	root_holds =
     LIST_HEAD_INITIALIZER(&root_holds);
 
 struct root_hold_token *
 root_mount_hold(const char *identifier)
 {
 	struct root_hold_token *h;
 
 	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
 	h->who = identifier;
 	mtx_lock(&mountlist_mtx);
 	LIST_INSERT_HEAD(&root_holds, h, list);
 	mtx_unlock(&mountlist_mtx);
 	return (h);
 }
 
 void
 root_mount_rel(struct root_hold_token *h)
 {
 
 	mtx_lock(&mountlist_mtx);
 	LIST_REMOVE(h, list);
 	wakeup(&root_holds);
 	mtx_unlock(&mountlist_mtx);
 	free(h, M_DEVBUF);
 }
 
 static void
 root_mount_wait(void)
 {
 	struct root_hold_token *h;
 
 	for (;;) {
 		DROP_GIANT();
 		g_waitidle();
 		PICKUP_GIANT();
 		mtx_lock(&mountlist_mtx);
 		if (LIST_EMPTY(&root_holds)) {
 			mtx_unlock(&mountlist_mtx);
 			break;
 		}
 		printf("Root mount waiting for:");
 		LIST_FOREACH(h, &root_holds, list)
 			printf(" %s", h->who);
 		printf("\n");
 		msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
 		    hz);
 	}
 }
 
 static void
 set_rootvnode(struct thread *td)
 {
 	struct proc *p;
 
 	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td))
 		panic("Cannot find root vnode");
 
 	p = td->td_proc;
 	FILEDESC_LOCK(p->p_fd);
 
 	if (p->p_fd->fd_cdir != NULL)
 		vrele(p->p_fd->fd_cdir);
 	p->p_fd->fd_cdir = rootvnode;
 	VREF(rootvnode);
 
 	if (p->p_fd->fd_rdir != NULL)
 		vrele(p->p_fd->fd_rdir);
 	p->p_fd->fd_rdir = rootvnode;
 	VREF(rootvnode);
 
 	FILEDESC_UNLOCK(p->p_fd);
 
 	VOP_UNLOCK(rootvnode, 0, td);
 }
 
 /*
  * Mount /devfs as our root filesystem, but do not put it on the mountlist
  * yet.  Create a /dev -> / symlink so that absolute pathnames will lookup.
  */
 
 static void
 devfs_first(void)
 {
 	struct thread *td = curthread;
 	struct vfsoptlist *opts;
 	struct vfsconf *vfsp;
 	struct mount *mp = NULL;
 	int error;
 
 	vfsp = vfs_byname("devfs");
 	KASSERT(vfsp != NULL, ("Could not find devfs by name"));
 	if (vfsp == NULL) 
 		return;
 
 	mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td);
 
 	error = VFS_MOUNT(mp, td);
 	KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
 	if (error)
 		return;
 
 	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 	TAILQ_INIT(opts);
 	mp->mnt_opt = opts;
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 
 	set_rootvnode(td);
 
 	error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
 	if (error)
 		printf("kern_symlink /dev -> / returns %d\n", error);
 }
 
 /*
  * Surgically move our devfs to be mounted on /dev.
  */
 
 static void
 devfs_fixup(struct thread *td)
 {
 	struct nameidata nd;
 	int error;
 	struct vnode *vp, *dvp;
 	struct mount *mp;
 
 	/* Remove our devfs mount from the mountlist and purge the cache */
 	mtx_lock(&mountlist_mtx);
 	mp = TAILQ_FIRST(&mountlist);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	cache_purgevfs(mp);
 
 	VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td);
 	VI_LOCK(dvp);
 	dvp->v_iflag &= ~VI_MOUNT;
 	dvp->v_mountedhere = NULL;
 	VI_UNLOCK(dvp);
 
 	/* Set up the real rootvnode, and purge the cache */
 	TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL;
 	set_rootvnode(td);
 	cache_purgevfs(rootvnode->v_mount);
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
 	error = namei(&nd);
 	if (error) {
 		printf("Lookup of /dev for devfs, error: %d\n", error);
 		return;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		vput(vp);
 	}
 	error = vinvalbuf(vp, V_SAVE, td, 0, 0);
 	if (error) {
 		vput(vp);
 	}
 	cache_purge(vp);
 	mp->mnt_vnodecovered = vp;
 	vp->v_mountedhere = mp;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	VOP_UNLOCK(vp, 0, td);
 	vput(dvp);
 	vfs_unbusy(mp, td);
 
 	/* Unlink the no longer needed /dev/dev -> / symlink */
 	kern_unlink(td, "/dev/dev", UIO_SYSSPACE);
 }
 
 /*
  * Report errors during filesystem mounting. 
  */
 void
 vfs_mount_error(struct mount *mp, const char *fmt, ...)
 {
 	struct vfsoptlist *moptlist = mp->mnt_optnew;
 	va_list ap;
 	int error, len;
 	char *errmsg;
 
 	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
 	if (error || errmsg == NULL || len <= 0)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(errmsg, (size_t)len, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * Find and mount the root filesystem
  */
 void
 vfs_mountroot(void)
 {
 	char *cp;
 	int error, i, asked = 0;
 
 	root_mount_wait();
 
 	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount),
 	    NULL, NULL, mount_init, mount_fini,
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	devfs_first();
 
 	/*
 	 * We are booted with instructions to prompt for the root filesystem.
 	 */
 	if (boothowto & RB_ASKNAME) {
 		if (!vfs_mountroot_ask())
 			return;
 		asked = 1;
 	}
 
 	/*
 	 * The root filesystem information is compiled in, and we are
 	 * booted with instructions to use it.
 	 */
 	if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) {
 		if (!vfs_mountroot_try(ctrootdevname))
 			return;
 		ctrootdevname = NULL;
 	}
 
 	/*
 	 * We've been given the generic "use CDROM as root" flag.  This is
 	 * necessary because one media may be used in many different
 	 * devices, so we need to search for them.
 	 */
 	if (boothowto & RB_CDROM) {
 		for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
 			if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
 				return;
 		}
 	}
 
 	/*
 	 * Try to use the value read by the loader from /etc/fstab, or
 	 * supplied via some other means.  This is the preferred
 	 * mechanism.
 	 */
 	cp = getenv("vfs.root.mountfrom");
 	if (cp != NULL) {
 		error = vfs_mountroot_try(cp);
 		freeenv(cp);
 		if (!error)
 			return;
 	}
 
 	/*
 	 * Try values that may have been computed by code during boot
 	 */
 	if (!vfs_mountroot_try(rootdevnames[0]))
 		return;
 	if (!vfs_mountroot_try(rootdevnames[1]))
 		return;
 
 	/*
 	 * If we (still) have a compiled-in default, try it.
 	 */
 	if (ctrootdevname != NULL)
 		if (!vfs_mountroot_try(ctrootdevname))
 			return;
 	/*
 	 * Everything so far has failed, prompt on the console if we haven't
 	 * already tried that.
 	 */
 	if (!asked)
 		if (!vfs_mountroot_ask())
 			return;
 
 	panic("Root mount failed, startup aborted.");
 }
 
 /*
  * Mount (mountfrom) as the root filesystem.
  */
 static int
 vfs_mountroot_try(const char *mountfrom)
 {
 	struct mount	*mp;
 	char		*vfsname, *path;
 	time_t		timebase;
 	int		error;
 	char		patt[32];
 
 	vfsname = NULL;
 	path    = NULL;
 	mp      = NULL;
 	error   = EINVAL;
 
 	if (mountfrom == NULL)
 		return (error);		/* don't complain */
 	printf("Trying to mount root from %s\n", mountfrom);
 
 	/* parse vfs name and path */
 	vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK);
 	path = malloc(MNAMELEN, M_MOUNT, M_WAITOK);
 	vfsname[0] = path[0] = 0;
 	sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN);
 	if (sscanf(mountfrom, patt, vfsname, path) < 1)
 		goto out;
 
 	if (path[0] == '\0')
 		strcpy(path, ROOTNAME);
 
 	error = kernel_vmount(
 	    MNT_RDONLY | MNT_ROOTFS,
 	    "fstype", vfsname,
 	    "fspath", "/",
 	    "from", path,
 	    NULL);
 	if (error == 0) {
 		/*
 		 * We mount devfs prior to mounting the / FS, so the first
 		 * entry will typically be devfs.
 		 */
 		mp = TAILQ_FIRST(&mountlist);
 		KASSERT(mp != NULL, ("%s: mountlist is empty", __func__));
 
 		/*
 		 * Iterate over all currently mounted file systems and use
 		 * the time stamp found to check and/or initialize the RTC.
 		 * Typically devfs has no time stamp and the only other FS
 		 * is the actual / FS.
 		 * Call inittodr() only once and pass it the largest of the
 		 * timestamps we encounter.
 		 */
 		timebase = 0;
 		do {
 			if (mp->mnt_time > timebase)
 				timebase = mp->mnt_time;
 			mp = TAILQ_NEXT(mp, mnt_list);
 		} while (mp != NULL);
 		inittodr(timebase);
 
 		devfs_fixup(curthread);
 	}
 out:
 	free(path, M_MOUNT);
 	free(vfsname, M_MOUNT);
 	return (error);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Interactive root filesystem selection code.
  */
 
 static int
 vfs_mountroot_ask(void)
 {
 	char name[128];
 
 	for(;;) {
 		printf("\nManual root filesystem specification:\n");
 		printf("  <fstype>:<device>  Mount <device> using filesystem <fstype>\n");
 #if defined(__amd64__) || defined(__i386__) || defined(__ia64__)
 		printf("                       eg. ufs:da0s1a\n");
 #else
 		printf("                       eg. ufs:/dev/da0a\n");
 #endif
 		printf("  ?                  List valid disk boot devices\n");
 		printf("  <empty line>       Abort manual input\n");
 		printf("\nmountroot> ");
 		gets(name, sizeof(name), 1);
 		if (name[0] == '\0')
 			return (1);
 		if (name[0] == '?') {
 			printf("\nList of GEOM managed disk devices:\n  ");
 			g_dev_print();
 			continue;
 		}
 		if (!vfs_mountroot_try(name))
 			return (0);
 	}
 }
 
 /*
  * ---------------------------------------------------------------------
  * Functions for querying mount options/arguments from filesystems.
  */
 
 /*
  * Check that no unknown options are given
  */
 int
 vfs_filteropt(struct vfsoptlist *opts, const char **legal)
 {
 	struct vfsopt *opt;
 	const char **t, *p;
 	
 
 	TAILQ_FOREACH(opt, opts, link) {
 		p = opt->name;
 		if (p[0] == 'n' && p[1] == 'o')
 			p += 2;
 		for(t = global_opts; *t != NULL; t++)
 			if (!strcmp(*t, p))
 				break;
 		if (*t != NULL)
 			continue;
 		for(t = legal; *t != NULL; t++)
 			if (!strcmp(*t, p))
 				break;
 		if (*t != NULL)
 			continue;
 		printf("mount option <%s> is unknown\n", p);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Get a mount option by its name.
  *
  * Return 0 if the option was found, ENOENT otherwise.
  * If len is non-NULL it will be filled with the length
  * of the option. If buf is non-NULL, it will be filled
  * with the address of the option.
  */
 int
 vfs_getopt(opts, name, buf, len)
 	struct vfsoptlist *opts;
 	const char *name;
 	void **buf;
 	int *len;
 {
 	struct vfsopt *opt;
 
 	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			if (len != NULL)
 				*len = opt->len;
 			if (buf != NULL)
 				*buf = opt->value;
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 static int
 vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *opt;
 	int i;
 
 	if (opts == NULL)
 		return (-1);
 
 	i = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0)
 			return (i);
 		++i;
 	}
 	return (-1);
 }
 
 char *
 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
 {
 	struct vfsopt *opt;
 
 	*error = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		if (((char *)opt->value)[opt->len - 1] != '\0') {
 			*error = EINVAL;
 			return (NULL);
 		}
 		return (opt->value);
 	}
 	*error = ENOENT;
 	return (NULL);
 }
 
 int
 vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			if (w != NULL)
 				*w |= val;
 			return (1);
 		}
 	}
 	if (w != NULL)
 		*w &= ~val;
 	return (0);
 }
 
 int
 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct vfsopt *opt;
 	int ret;
 
 	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		if (((char *)opt->value)[opt->len - 1] != '\0')
 			return (0);
 		va_start(ap, fmt);
 		ret = vsscanf(opt->value, fmt, ap);
 		va_end(ap);
 		return (ret);
 	}
 	return (0);
 }
 
 /*
  * Find and copy a mount option.
  *
  * The size of the buffer has to be specified
  * in len, if it is not the same length as the
  * mount option, EINVAL is returned.
  * Returns ENOENT if the option is not found.
  */
 int
 vfs_copyopt(opts, name, dest, len)
 	struct vfsoptlist *opts;
 	const char *name;
 	void *dest;
 	int len;
 {
 	struct vfsopt *opt;
 
 	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			if (len != opt->len)
 				return (EINVAL);
 			bcopy(opt->value, dest, opt->len);
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 /*
  * This is a helper function for filesystems to traverse their
  * vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h
  */
 
 struct vnode *
 __mnt_vnode_next(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
 	while (vp != NULL && vp->v_type == VMARKER) 
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
 	
 	/* Check if we are done */
 	if (vp == NULL) {
 		__mnt_vnode_markerfree(mvp, mp);
 		return (NULL);
 	}
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	return (vp);
 }
 
 struct vnode *
 __mnt_vnode_first(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 	while (vp != NULL && vp->v_type == VMARKER) 
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
 	
 	/* Check if we are done */
 	if (vp == NULL) {
 		*mvp = NULL;
 		return (NULL);
 	}
 	mp->mnt_holdcnt++;
 	MNT_IUNLOCK(mp);
 	*mvp = (struct vnode *) malloc(sizeof(struct vnode),
 				       M_VNODE_MARKER,
 				       M_WAITOK | M_ZERO);
 	MNT_ILOCK(mp);
 	(*mvp)->v_type = VMARKER;
 
 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 	while (vp != NULL && vp->v_type == VMARKER) 
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
 	
 	/* Check if we are done */
 	if (vp == NULL) {
 		MNT_IUNLOCK(mp);
 		free(*mvp, M_VNODE_MARKER);
 		MNT_ILOCK(mp);
 		*mvp = NULL;
 		mp->mnt_holdcnt--;
 		if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
 			wakeup(&mp->mnt_holdcnt);
 		return (NULL);
 	}
 	mp->mnt_markercnt++;
 	(*mvp)->v_mount = mp;
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	return (vp);
 }
 
 
 void
 __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
 {
 
 	if (*mvp == NULL)
 		return;
 	
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 	free(*mvp, M_VNODE_MARKER);
 	MNT_ILOCK(mp);
 	*mvp = NULL;
 
 	mp->mnt_markercnt--;
 	mp->mnt_holdcnt--;
 	if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
 		wakeup(&mp->mnt_holdcnt);
 }
 
 
 int
 __vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
 {
 	int error;
 
 	error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td);
 	if (sbp != &mp->mnt_stat)
 		*sbp = mp->mnt_stat;
 	return (error);
 }
 
 void
 vfs_mountedfrom(struct mount *mp, const char *from)
 {
 
 	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
 	strlcpy(mp->mnt_stat.f_mntfromname, from,
 	    sizeof mp->mnt_stat.f_mntfromname);
 }
 
 /*
  * ---------------------------------------------------------------------
  * This is the api for building mount args and mounting filesystems from
  * inside the kernel.
  *
  * The API works by accumulation of individual args.  First error is
  * latched.
  *
  * XXX: should be documented in new manpage kernel_mount(9)
  */
 
 /* A memory allocation which must be freed when we are done */
 struct mntaarg {
 	SLIST_ENTRY(mntaarg)	next;
 };
 
 /* The header for the mount arguments */
 struct mntarg {
 	struct iovec *v;
 	int len;
 	int error;
 	SLIST_HEAD(, mntaarg)	list;
 };
 
 /*
  * Add a boolean argument.
  *
  * flag is the boolean value.
  * name must start with "no".
  */
 struct mntarg *
 mount_argb(struct mntarg *ma, int flag, const char *name)
 {
 
 	KASSERT(name[0] == 'n' && name[1] == 'o',
 	    ("mount_argb(...,%s): name must start with 'no'", name));
 
 	return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
 }
 
 /*
  * Add an argument printf style
  */
 struct mntarg *
 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct mntaarg *maa;
 	struct sbuf *sb;
 	int len;
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
 	va_start(ap, fmt);
 	sbuf_vprintf(sb, fmt, ap);
 	va_end(ap);
 	sbuf_finish(sb);
 	len = sbuf_len(sb) + 1;
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	bcopy(sbuf_data(sb), maa + 1, len);
 	sbuf_delete(sb);
 
 	ma->v[ma->len].iov_base = maa + 1;
 	ma->v[ma->len].iov_len = len;
 	ma->len++;
 
 	return (ma);
 }
 
 /*
  * Add an argument which is a userland string.
  */
 struct mntarg *
 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
 {
 	struct mntaarg *maa;
 	char *tbuf;
 
 	if (val == NULL)
 		return (ma);
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	tbuf = (void *)(maa + 1);
 	ma->error = copyinstr(val, tbuf, len, NULL);
 	return (mount_arg(ma, name, tbuf, -1));
 }
 
 /*
  * Plain argument.
  *
  * If length is -1, use printf.
  */
 struct mntarg *
 mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
 {
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
 	if (len < 0)
 		ma->v[ma->len].iov_len = strlen(val) + 1;
 	else
 		ma->v[ma->len].iov_len = len;
 	ma->len++;
 	return (ma);
 }
 
 /*
  * Free a mntarg structure
  */
 static void
 free_mntarg(struct mntarg *ma)
 {
 	struct mntaarg *maa;
 
 	while (!SLIST_EMPTY(&ma->list)) {
 		maa = SLIST_FIRST(&ma->list);
 		SLIST_REMOVE_HEAD(&ma->list, next);
 		free(maa, M_MOUNT);
 	}
 	free(ma->v, M_MOUNT);
 	free(ma, M_MOUNT);
 }
 
 /*
  * Mount a filesystem
  */
 int
 kernel_mount(struct mntarg *ma, int flags)
 {
 	struct uio auio;
 	int error;
 
 	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
 	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
 	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
 
 	auio.uio_iov = ma->v;
 	auio.uio_iovcnt = ma->len;
 	auio.uio_segflg = UIO_SYSSPACE;
 
 	error = ma->error;
 	if (!error)
 		error = vfs_donmount(curthread, flags, &auio);
 	free_mntarg(ma);
 	return (error);
 }
 
 /*
  * A printflike function to mount a filesystem.
  */
 int
 kernel_vmount(int flags, ...)
 {
 	struct mntarg *ma = NULL;
 	va_list ap;
 	const char *cp;
 	const void *vp;
 	int error;
 
 	va_start(ap, flags);
 	for (;;) {
 		cp = va_arg(ap, const char *);
 		if (cp == NULL)
 			break;
 		vp = va_arg(ap, const void *);
 		ma = mount_arg(ma, cp, vp, -1);
 	}
 	va_end(ap);
 
 	error = kernel_mount(ma, flags);
 	return (error);
 }
Index: head/sys/kern/vfs_syscalls.c
===================================================================
--- head/sys/kern/vfs_syscalls.c	(revision 167231)
+++ head/sys/kern/vfs_syscalls.c	(revision 167232)
@@ -1,4319 +1,4305 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sysent.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/namei.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/dirent.h>
 #include <sys/jail.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 static int chroot_refuse_vdir_fds(struct filedesc *fdp);
 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
 static int setfown(struct thread *td, struct vnode *, uid_t, gid_t);
 static int setfmode(struct thread *td, struct vnode *, int);
 static int setfflags(struct thread *td, struct vnode *, int);
 static int setutimes(struct thread *td, struct vnode *,
     const struct timespec *, int, int);
 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
     struct thread *td);
 
 /*
  * The module initialization routine for POSIX asynchronous I/O will
  * set this to the version of AIO that it implements.  (Zero means
  * that it is not implemented.)  This value is used here by pathconf()
  * and in kern_descrip.c by fpathconf().
  */
 int async_io_version;
 
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
 /*
  * Sync each mounted filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sync_args {
 	int     dummy;
 };
 #endif
-
-#ifdef DEBUG
-static int syncprt = 0;
-SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
-#endif
-
 /* ARGSUSED */
 int
 sync(td, uap)
 	struct thread *td;
 	struct sync_args *uap;
 {
 	struct mount *mp, *nmp;
 	int vfslocked;
 
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		vfslocked = VFS_LOCK_GIANT(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 			MNT_ILOCK(mp);
 			mp->mnt_noasync++;
 			mp->mnt_kern_flag &= ~MNTK_ASYNC;
 			MNT_IUNLOCK(mp);
 			vfs_msync(mp, MNT_NOWAIT);
 			VFS_SYNC(mp, MNT_NOWAIT, td);
 			MNT_ILOCK(mp);
 			mp->mnt_noasync--;
 			if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 			    mp->mnt_noasync == 0)
 				mp->mnt_kern_flag |= MNTK_ASYNC;
 			MNT_IUNLOCK(mp);
 			vn_finished_write(mp);
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, td);
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (0);
 }
 
 /* XXX PRISON: could be per prison flag */
 static int prison_quotas;
 #if 0
 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
 #endif
 
 /*
  * Change filesystem quotas.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct quotactl_args {
 	char *path;
 	int cmd;
 	int uid;
 	caddr_t arg;
 };
 #endif
 int
 quotactl(td, uap)
 	struct thread *td;
 	register struct quotactl_args /* {
 		char *path;
 		int cmd;
 		int uid;
 		caddr_t arg;
 	} */ *uap;
 {
 	struct mount *mp, *vmp;
 	int vfslocked;
 	int error;
 	struct nameidata nd;
 
 	AUDIT_ARG(cmd, uap->cmd);
 	AUDIT_ARG(uid, uap->uid);
 	if (jailed(td->td_ucred) && !prison_quotas)
 		return (EPERM);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1,
 	   UIO_USERSPACE, uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = vn_start_write(nd.ni_vp, &vmp, V_WAIT | PCATCH);
 	mp = nd.ni_vp->v_mount;
 	vrele(nd.ni_vp);
 	if (error)
 		goto out;
 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td);
 	vn_finished_write(vmp);
 out:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct statfs_args {
 	char *path;
 	struct statfs *buf;
 };
 #endif
 int
 statfs(td, uap)
 	struct thread *td;
 	register struct statfs_args /* {
 		char *path;
 		struct statfs *buf;
 	} */ *uap;
 {
 	struct statfs sf;
 	int error;
 
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 	if (error == 0)
 		error = copyout(&sf, uap->buf, sizeof(sf));
 	return (error);
 }
 
 int
 kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
     struct statfs *buf)
 {
 	struct mount *mp;
 	struct statfs *sp, sb;
 	int vfslocked;
 	int error;
 	struct nameidata nd;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 #ifdef MAC
 	error = mac_check_mount_stat(td->td_ucred, mp);
 	if (error)
 		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sp = &mp->mnt_stat;
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
 	if (error)
 		goto out;
 	if (priv_check(td, PRIV_VFS_GENERATION)) {
 		bcopy(sp, &sb, sizeof(sb));
 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, &sb);
 		sp = &sb;
 	}
 	*buf = *sp;
 out:
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (mtx_owned(&Giant))
 		printf("statfs(%d): %s: %d\n", vfslocked, path, error);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstatfs_args {
 	int fd;
 	struct statfs *buf;
 };
 #endif
 int
 fstatfs(td, uap)
 	struct thread *td;
 	register struct fstatfs_args /* {
 		int fd;
 		struct statfs *buf;
 	} */ *uap;
 {
 	struct statfs sf;
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &sf);
 	if (error == 0)
 		error = copyout(&sf, uap->buf, sizeof(sf));
 	return (error);
 }
 
 int
 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 {
 	struct file *fp;
 	struct mount *mp;
 	struct statfs *sp, sb;
 	int vfslocked;
 	struct vnode *vp;
 	int error;
 
 	AUDIT_ARG(fd, fd);
 	error = getvnode(td->td_proc->p_fd, fd, &fp);
 	if (error)
 		return (error);
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 #ifdef AUDIT
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 #endif
 	mp = vp->v_mount;
 	if (mp)
 		vfs_ref(mp);
 	VOP_UNLOCK(vp, 0, td);
 	fdrop(fp, td);
 	if (vp->v_iflag & VI_DOOMED) {
 		error = EBADF;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_check_mount_stat(td->td_ucred, mp);
 	if (error)
 		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sp = &mp->mnt_stat;
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
 	if (error)
 		goto out;
 	if (priv_check(td, PRIV_VFS_GENERATION)) {
 		bcopy(sp, &sb, sizeof(sb));
 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, &sb);
 		sp = &sb;
 	}
 	*buf = *sp;
 out:
 	if (mp)
 		vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Get statistics on all filesystems.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getfsstat_args {
 	struct statfs *buf;
 	long bufsize;
 	int flags;
 };
 #endif
 int
 getfsstat(td, uap)
 	struct thread *td;
 	register struct getfsstat_args /* {
 		struct statfs *buf;
 		long bufsize;
 		int flags;
 	} */ *uap;
 {
 
 	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
 	    uap->flags));
 }
 
 /*
  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
  * 	The caller is responsible for freeing memory which will be allocated
  *	in '*buf'.
  */
 int
 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
     enum uio_seg bufseg, int flags)
 {
 	struct mount *mp, *nmp;
 	struct statfs *sfsp, *sp, sb;
 	size_t count, maxcount;
 	int vfslocked;
 	int error;
 
 	maxcount = bufsize / sizeof(struct statfs);
 	if (bufsize == 0)
 		sfsp = NULL;
 	else if (bufseg == UIO_USERSPACE)
 		sfsp = *buf;
 	else /* if (bufseg == UIO_SYSSPACE) */ {
 		count = 0;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 			count++;
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (maxcount > count)
 			maxcount = count;
 		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
 		    M_WAITOK);
 	}
 	count = 0;
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (prison_canseemount(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #ifdef MAC
 		if (mac_check_mount_stat(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #endif
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		vfslocked = VFS_LOCK_GIANT(mp);
 		if (sfsp && count < maxcount) {
 			sp = &mp->mnt_stat;
 			/*
 			 * Set these in case the underlying filesystem
 			 * fails to do so.
 			 */
 			sp->f_version = STATFS_VERSION;
 			sp->f_namemax = NAME_MAX;
 			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 			/*
 			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
 			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
 			 * overrides MNT_WAIT.
 			 */
 			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 			    (flags & MNT_WAIT)) &&
 			    (error = VFS_STATFS(mp, sp, td))) {
 				VFS_UNLOCK_GIANT(vfslocked);
 				mtx_lock(&mountlist_mtx);
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				vfs_unbusy(mp, td);
 				continue;
 			}
 			if (priv_check(td, PRIV_VFS_GENERATION)) {
 				bcopy(sp, &sb, sizeof(sb));
 				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 				prison_enforce_statfs(td->td_ucred, mp, &sb);
 				sp = &sb;
 			}
 			if (bufseg == UIO_SYSSPACE)
 				bcopy(sp, sfsp, sizeof(*sp));
 			else /* if (bufseg == UIO_USERSPACE) */ {
 				error = copyout(sp, sfsp, sizeof(*sp));
 				if (error) {
 					vfs_unbusy(mp, td);
 					VFS_UNLOCK_GIANT(vfslocked);
 					return (error);
 				}
 			}
 			sfsp++;
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 		count++;
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, td);
 	}
 	mtx_unlock(&mountlist_mtx);
 	if (sfsp && count > maxcount)
 		td->td_retval[0] = maxcount;
 	else
 		td->td_retval[0] = count;
 	return (0);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Get old format filesystem statistics.
  */
 static void cvtstatfs(struct statfs *, struct ostatfs *);
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_statfs_args {
 	char *path;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_statfs(td, uap)
 	struct thread *td;
 	struct freebsd4_statfs_args /* {
 		char *path;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	int error;
 
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
 	if (error)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fstatfs_args {
 	int fd;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fstatfs(td, uap)
 	struct thread *td;
 	struct freebsd4_fstatfs_args /* {
 		int fd;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &sf);
 	if (error)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Get statistics on all filesystems.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_getfsstat_args {
 	struct ostatfs *buf;
 	long bufsize;
 	int flags;
 };
 #endif
 int
 freebsd4_getfsstat(td, uap)
 	struct thread *td;
 	register struct freebsd4_getfsstat_args /* {
 		struct ostatfs *buf;
 		long bufsize;
 		int flags;
 	} */ *uap;
 {
 	struct statfs *buf, *sp;
 	struct ostatfs osb;
 	size_t count, size;
 	int error;
 
 	count = uap->bufsize / sizeof(struct ostatfs);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
 	if (size > 0) {
 		count = td->td_retval[0];
 		sp = buf;
 		while (count > 0 && error == 0) {
 			cvtstatfs(sp, &osb);
 			error = copyout(&osb, uap->buf, sizeof(osb));
 			sp++;
 			uap->buf++;
 			count--;
 		}
 		free(buf, M_TEMP);
 	}
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fhstatfs(td, uap)
 	struct thread *td;
 	struct freebsd4_fhstatfs_args /* {
 		struct fhandle *u_fhp;
 		struct ostatfs *buf;
 	} */ *uap;
 {
 	struct ostatfs osb;
 	struct statfs sf;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
 	error = kern_fhstatfs(td, fh, &sf);
 	if (error)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
 }
 
 /*
  * Convert a new format statfs structure to an old format statfs structure.
  */
 static void
 cvtstatfs(nsp, osp)
 	struct statfs *nsp;
 	struct ostatfs *osp;
 {
 
 	bzero(osp, sizeof(*osp));
 	osp->f_bsize = MIN(nsp->f_bsize, LONG_MAX);
 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
 	osp->f_blocks = MIN(nsp->f_blocks, LONG_MAX);
 	osp->f_bfree = MIN(nsp->f_bfree, LONG_MAX);
 	osp->f_bavail = MIN(nsp->f_bavail, LONG_MAX);
 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 	osp->f_owner = nsp->f_owner;
 	osp->f_type = nsp->f_type;
 	osp->f_flags = nsp->f_flags;
 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
 	    MIN(MFSNAMELEN, OMFSNAMELEN));
 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	osp->f_fsid = nsp->f_fsid;
 }
 #endif /* COMPAT_FREEBSD4 */
 
 /*
  * Change current working directory to a given file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchdir_args {
 	int	fd;
 };
 #endif
 int
 fchdir(td, uap)
 	struct thread *td;
 	struct fchdir_args /* {
 		int fd;
 	} */ *uap;
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
 	struct vnode *vp, *tdp, *vpold;
 	struct mount *mp;
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	if ((error = getvnode(fdp, uap->fd, &fp)) != 0)
 		return (error);
 	vp = fp->f_vnode;
 	VREF(vp);
 	fdrop(fp, td);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	error = change_dir(vp, td);
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
 		int tvfslocked;
 		if (vfs_busy(mp, 0, 0, td))
 			continue;
 		tvfslocked = VFS_LOCK_GIANT(mp);
 		error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdp, td);
 		vfs_unbusy(mp, td);
 		if (error) {
 			VFS_UNLOCK_GIANT(tvfslocked);
 			break;
 		}
 		vput(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		vp = tdp;
 		vfslocked = tvfslocked;
 	}
 	if (error) {
 		vput(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	FILEDESC_LOCK_FAST(fdp);
 	vpold = fdp->fd_cdir;
 	fdp->fd_cdir = vp;
 	FILEDESC_UNLOCK_FAST(fdp);
 	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
 	vrele(vpold);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
 /*
  * Change current working directory (``.'').
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chdir_args {
 	char	*path;
 };
 #endif
 int
 chdir(td, uap)
 	struct thread *td;
 	struct chdir_args /* {
 		char *path;
 	} */ *uap;
 {
 
 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
 }
 
 int
 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 	struct nameidata nd;
 	struct vnode *vp;
 	int vfslocked;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | MPSAFE,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
 		vput(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		return (error);
 	}
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	FILEDESC_LOCK_FAST(fdp);
 	vp = fdp->fd_cdir;
 	fdp->fd_cdir = nd.ni_vp;
 	FILEDESC_UNLOCK_FAST(fdp);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
 /*
  * Helper function for raised chroot(2) security function:  Refuse if
  * any filedescriptors are open directories.
  */
 static int
 chroot_refuse_vdir_fds(fdp)
 	struct filedesc *fdp;
 {
 	struct vnode *vp;
 	struct file *fp;
 	int fd;
 
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
 			continue;
 		if (fp->f_type == DTYPE_VNODE) {
 			vp = fp->f_vnode;
 			if (vp->v_type == VDIR)
 				return (EPERM);
 		}
 	}
 	return (0);
 }
 
 /*
  * This sysctl determines if we will allow a process to chroot(2) if it
  * has a directory open:
  *	0: disallowed for all processes.
  *	1: allowed for processes that were not already chroot(2)'ed.
  *	2: allowed for all processes.
  */
 
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
      &chroot_allow_open_directories, 0, "");
 
 /*
  * Change notion of root (``/'') directory.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chroot_args {
 	char	*path;
 };
 #endif
 int
 chroot(td, uap)
 	struct thread *td;
 	struct chroot_args /* {
 		char *path;
 	} */ *uap;
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT,
 	    SUSER_ALLOWJAIL);
 	if (error)
 		return (error);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		goto error;
 	vfslocked = NDHASGIANT(&nd);
 	if ((error = change_dir(nd.ni_vp, td)) != 0)
 		goto e_vunlock;
 #ifdef MAC
 	if ((error = mac_check_vnode_chroot(td->td_ucred, nd.ni_vp)))
 		goto e_vunlock;
 #endif
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	error = change_root(nd.ni_vp, td);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 e_vunlock:
 	vput(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 error:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 }
 
 /*
  * Common routine for chroot and chdir.  Callers must provide a locked vnode
  * instance.
  */
 int
 change_dir(vp, td)
 	struct vnode *vp;
 	struct thread *td;
 {
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 #ifdef MAC
 	error = mac_check_vnode_chdir(td->td_ucred, vp);
 	if (error)
 		return (error);
 #endif
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	return (error);
 }
 
 /*
  * Common routine for kern_chroot() and jail_attach().  The caller is
  * responsible for invoking priv_check() and mac_check_chroot() to authorize
  * this operation.
  */
 int
 change_root(vp, td)
 	struct vnode *vp;
 	struct thread *td;
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
 	int vfslocked;
 	int error;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_LOCK(fdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		if (error) {
 			FILEDESC_UNLOCK(fdp);
 			return (error);
 		}
 	}
 	oldvp = fdp->fd_rdir;
 	fdp->fd_rdir = vp;
 	VREF(fdp->fd_rdir);
 	if (!fdp->fd_jdir) {
 		fdp->fd_jdir = vp;
 		VREF(fdp->fd_jdir);
 	}
 	FILEDESC_UNLOCK(fdp);
 	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
 	vrele(oldvp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
 /*
- * Check permissions, allocate an open file structure,
- * and call the device open routine if any.
- *
- * MP SAFE
+ * Check permissions, allocate an open file structure, and call the device
+ * open routine if any.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct open_args {
 	char	*path;
 	int	flags;
 	int	mode;
 };
 #endif
 int
 open(td, uap)
 	struct thread *td;
 	register struct open_args /* {
 		char *path;
 		int flags;
 		int mode;
 	} */ *uap;
 {
 
 	return kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode);
 }
 
 int
 kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
     int mode)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	struct file *fp;
 	struct vnode *vp;
 	struct vattr vat;
 	struct mount *mp;
 	int cmode;
 	struct file *nfp;
 	int type, indx, error;
 	struct flock lf;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(fflags, flags);
 	AUDIT_ARG(mode, mode);
 	if ((flags & O_ACCMODE) == O_ACCMODE)
 		return (EINVAL);
 	flags = FFLAGS(flags);
 	error = falloc(td, &nfp, &indx);
 	if (error)
 		return (error);
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	fp = nfp;
 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td);
 	td->td_dupfd = -1;		/* XXX check for fdopen */
 	error = vn_open(&nd, &flags, cmode, indx);
 	if (error) {
 		/*
 		 * If the vn_open replaced the method vector, something
 		 * wonderous happened deep below and we just pass it up
 		 * pretending we know what we do.
 		 */
 		if (error == ENXIO && fp->f_ops != &badfileops) {
 			fdrop(fp, td);
 			td->td_retval[0] = indx;
 			return (0);
 		}
 
 		/*
 		 * release our own reference
 		 */
 		fdrop(fp, td);
 
 		/*
 		 * handle special fdopen() case.  bleh.  dupfdopen() is
 		 * responsible for dropping the old contents of ofiles[indx]
 		 * if it succeeds.
 		 */
 		if ((error == ENODEV || error == ENXIO) &&
 		    td->td_dupfd >= 0 &&		/* XXX from fdopen */
 		    (error =
 			dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) {
 			td->td_retval[0] = indx;
 			return (0);
 		}
 		/*
 		 * Clean up the descriptor, but only if another thread hadn't
 		 * replaced or closed it.
 		 */
 		fdclose(fdp, fp, indx, td);
 
 		if (error == ERESTART)
 			error = EINTR;
 		return (error);
 	}
 	td->td_dupfd = 0;
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * There should be 2 references on the file, one from the descriptor
 	 * table, and one for us.
 	 *
 	 * Handle the case where someone closed the file (via its file
 	 * descriptor) while we were blocked.  The end result should look
 	 * like opening the file succeeded but it was immediately closed.
 	 * We call vn_close() manually because we haven't yet hooked up
 	 * the various 'struct file' fields.
 	 */
 	FILEDESC_LOCK(fdp);
 	FILE_LOCK(fp);
 	if (fp->f_count == 1) {
 		mp = vp->v_mount;
 		KASSERT(fdp->fd_ofiles[indx] != fp,
 		    ("Open file descriptor lost all refs"));
 		FILE_UNLOCK(fp);
 		FILEDESC_UNLOCK(fdp);
 		VOP_UNLOCK(vp, 0, td);
 		vn_close(vp, flags & FMASK, fp->f_cred, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		td->td_retval[0] = indx;
 		return (0);
 	}
 	fp->f_vnode = vp;
 	if (fp->f_data == NULL)
 		fp->f_data = vp;
 	fp->f_flag = flags & FMASK;
 	if (fp->f_ops == &badfileops)
 		fp->f_ops = &vnops;
 	fp->f_seqcount = 1;
 	fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
 	FILE_UNLOCK(fp);
 	FILEDESC_UNLOCK(fdp);
 
 	VOP_UNLOCK(vp, 0, td);
 	if (flags & (O_EXLOCK | O_SHLOCK)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		if (flags & O_EXLOCK)
 			lf.l_type = F_WRLCK;
 		else
 			lf.l_type = F_RDLCK;
 		type = F_FLOCK;
 		if ((flags & FNONBLOCK) == 0)
 			type |= F_WAIT;
 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 			    type)) != 0)
 			goto bad;
 		fp->f_flag |= FHASLOCK;
 	}
 	if (flags & O_TRUNC) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 			goto bad;
 		VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 		VATTR_NULL(&vat);
 		vat.va_size = 0;
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 #ifdef MAC
 		error = mac_check_vnode_write(td->td_ucred, fp->f_cred, vp);
 		if (error == 0)
 #endif
 			error = VOP_SETATTR(vp, &vat, td->td_ucred, td);
 		VOP_UNLOCK(vp, 0, td);
 		vn_finished_write(mp);
 		if (error)
 			goto bad;
 	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	/*
 	 * Release our private reference, leaving the one associated with
 	 * the descriptor table intact.
 	 */
 	fdrop(fp, td);
 	td->td_retval[0] = indx;
 	return (0);
 bad:
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdclose(fdp, fp, indx, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /*
  * Create a file.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ocreat_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 ocreat(td, uap)
 	struct thread *td;
 	register struct ocreat_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_open(td, uap->path, UIO_USERSPACE,
 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Create a special file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mknod_args {
 	char	*path;
 	int	mode;
 	int	dev;
 };
 #endif
 int
 mknod(td, uap)
 	struct thread *td;
 	register struct mknod_args /* {
 		char *path;
 		int mode;
 		int dev;
 	} */ *uap;
 {
 
 	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
 }
 
 int
 kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
     int dev)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 	int whiteout = 0;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, mode);
 	AUDIT_ARG(dev, dev);
 	switch (mode & S_IFMT) {
 	case S_IFCHR:
 	case S_IFBLK:
 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
 		break;
 	case S_IFMT:
 		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
 		break;
 	case S_IFWHT:
 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (error)
 		return (error);
 restart:
 	bwillwrite();
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	} else {
 		VATTR_NULL(&vattr);
 		FILEDESC_LOCK_FAST(td->td_proc->p_fd);
 		vattr.va_mode = (mode & ALLPERMS) &
 		    ~td->td_proc->p_fd->fd_cmask;
 		FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
 		vattr.va_rdev = dev;
 		whiteout = 0;
 
 		switch (mode & S_IFMT) {
 		case S_IFMT:	/* used by badsect to flag bad sectors */
 			vattr.va_type = VBAD;
 			break;
 		case S_IFCHR:
 			vattr.va_type = VCHR;
 			break;
 		case S_IFBLK:
 			vattr.va_type = VBLK;
 			break;
 		case S_IFWHT:
 			whiteout = 1;
 			break;
 		default:
 			panic("kern_mknod: invalid mode");
 		}
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 #ifdef MAC
 	if (error == 0 && !whiteout)
 		error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp,
 		    &nd.ni_cnd, &vattr);
 #endif
 	if (!error) {
 		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 		if (whiteout)
 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
 		else {
 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
 						&nd.ni_cnd, &vattr);
 			if (error == 0)
 				vput(nd.ni_vp);
 		}
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Create a named pipe.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkfifo_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 mkfifo(td, uap)
 	struct thread *td;
 	register struct mkfifo_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 int
 kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, mode);
 restart:
 	bwillwrite();
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	if (nd.ni_vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VFIFO;
 	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
 	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
 #ifdef MAC
 	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error)
 		goto out;
 #endif
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 	if (error == 0)
 		vput(nd.ni_vp);
 #ifdef MAC
 out:
 #endif
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 }
 
 /*
  * Make a hard file link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct link_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 link(td, uap)
 	struct thread *td;
 	register struct link_args /* {
 		char *path;
 		char *link;
 	} */ *uap;
 {
 	int error;
 
 	error = kern_link(td, uap->path, uap->link, UIO_USERSPACE);
 	return (error);
 }
 
 static int hardlink_check_uid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
     &hardlink_check_uid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "users");
 static int hardlink_check_gid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
     &hardlink_check_gid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "groups");
 
 static int
 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
 {
 	struct vattr va;
 	int error;
 
 	if (!hardlink_check_uid && !hardlink_check_gid)
 		return (0);
 
 	error = VOP_GETATTR(vp, &va, cred, td);
 	if (error != 0)
 		return (error);
 
 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			return (error);
 	}
 
 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 int
 kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct nameidata nd;
 	int vfslocked;
 	int lvfslocked;
 	int error;
 
 	bwillwrite();
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, segflg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR) {
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EPERM);		/* POSIX */
 	}
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
 	    segflg, link, td);
 	if ((error = namei(&nd)) == 0) {
 		lvfslocked = NDHASGIANT(&nd);
 		if (nd.ni_vp != NULL) {
 			if (nd.ni_dvp == nd.ni_vp)
 				vrele(nd.ni_dvp);
 			else
 				vput(nd.ni_dvp);
 			vrele(nd.ni_vp);
 			error = EEXIST;
 		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td))
 		    == 0) {
 			VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 			VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 			error = can_hardlink(vp, td, td->td_ucred);
 			if (error == 0)
 #ifdef MAC
 				error = mac_check_vnode_link(td->td_ucred,
 				    nd.ni_dvp, vp, &nd.ni_cnd);
 			if (error == 0)
 #endif
 				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 			VOP_UNLOCK(vp, 0, td);
 			vput(nd.ni_dvp);
 		}
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		VFS_UNLOCK_GIANT(lvfslocked);
 	}
 	vrele(vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Make a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct symlink_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 symlink(td, uap)
 	struct thread *td;
 	register struct symlink_args /* {
 		char *path;
 		char *link;
 	} */ *uap;
 {
 
 	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
 }
 
 int
 kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	char *syspath;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	if (segflg == UIO_SYSSPACE) {
 		syspath = path;
 	} else {
 		syspath = uma_zalloc(namei_zone, M_WAITOK);
 		if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0)
 			goto out;
 	}
 	AUDIT_ARG(text, syspath);
 restart:
 	bwillwrite();
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    segflg, link, td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	vfslocked = NDHASGIANT(&nd);
 	if (nd.ni_vp) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		error = EEXIST;
 		goto out;
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
 	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
 #ifdef MAC
 	vattr.va_type = VLNK;
 	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error)
 		goto out2;
 #endif
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
 	if (error == 0)
 		vput(nd.ni_vp);
 #ifdef MAC
 out2:
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 out:
 	if (segflg != UIO_SYSSPACE)
 		uma_zfree(namei_zone, syspath);
 	return (error);
 }
 
 /*
  * Delete a whiteout from the filesystem.
  */
 int
 undelete(td, uap)
 	struct thread *td;
 	register struct undelete_args /* {
 		char *path;
 	} */ *uap;
 {
 	int error;
 	struct mount *mp;
 	struct nameidata nd;
 	int vfslocked;
 
 restart:
 	bwillwrite();
 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 
 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if (nd.ni_vp)
 			vrele(nd.ni_vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Delete a name from the filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unlink_args {
 	char	*path;
 };
 #endif
 int
 unlink(td, uap)
 	struct thread *td;
 	struct unlink_args /* {
 		char *path;
 	} */ *uap;
 {
 	int error;
 
 	error = kern_unlink(td, uap->path, UIO_USERSPACE);
 	return (error);
 }
 
 int
 kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 restart:
 	bwillwrite();
 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error == EINVAL ? EPERM : error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR)
 		error = EPERM;		/* POSIX */
 	else {
 		/*
 		 * The root of a mounted filesystem cannot be deleted.
 		 *
 		 * XXX: can this only be a VDIR case?
 		 */
 		if (vp->v_vflag & VV_ROOT)
 			error = EBUSY;
 	}
 	if (error == 0) {
 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vput(nd.ni_dvp);
 			if (vp == nd.ni_dvp)
 				vrele(vp);
 			else
 				vput(vp);
 			VFS_UNLOCK_GIANT(vfslocked);
 			if ((error = vn_start_write(NULL, &mp,
 			    V_XSLEEP | PCATCH)) != 0)
 				return (error);
 			goto restart;
 		}
 #ifdef MAC
 		error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp,
 		    &nd.ni_cnd);
 		if (error)
 			goto out;
 #endif
 		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 #ifdef MAC
 out:
 #endif
 		vn_finished_write(mp);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lseek_args {
 	int	fd;
 	int	pad;
 	off_t	offset;
 	int	whence;
 };
 #endif
 int
 lseek(td, uap)
 	struct thread *td;
 	register struct lseek_args /* {
 		int fd;
 		int pad;
 		off_t offset;
 		int whence;
 	} */ *uap;
 {
 	struct ucred *cred = td->td_ucred;
 	struct file *fp;
 	struct vnode *vp;
 	struct vattr vattr;
 	off_t offset;
 	int error, noneg;
 	int vfslocked;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
 		fdrop(fp, td);
 		return (ESPIPE);
 	}
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	noneg = (vp->v_type != VCHR);
 	offset = uap->offset;
 	switch (uap->whence) {
 	case L_INCR:
 		if (noneg &&
 		    (fp->f_offset < 0 ||
 		    (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += fp->f_offset;
 		break;
 	case L_XTND:
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 		error = VOP_GETATTR(vp, &vattr, cred, td);
 		VOP_UNLOCK(vp, 0, td);
 		if (error)
 			break;
 		if (noneg &&
 		    (vattr.va_size > OFF_MAX ||
 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += vattr.va_size;
 		break;
 	case L_SET:
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0 && noneg && offset < 0)
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
 	fp->f_offset = offset;
 	*(off_t *)(td->td_retval) = fp->f_offset;
 drop:
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olseek_args {
 	int	fd;
 	long	offset;
 	int	whence;
 };
 #endif
 int
 olseek(td, uap)
 	struct thread *td;
 	register struct olseek_args /* {
 		int fd;
 		long offset;
 		int whence;
 	} */ *uap;
 {
 	struct lseek_args /* {
 		int fd;
 		int pad;
 		off_t offset;
 		int whence;
 	} */ nuap;
 	int error;
 
 	nuap.fd = uap->fd;
 	nuap.offset = uap->offset;
 	nuap.whence = uap->whence;
 	error = lseek(td, &nuap);
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Check access permissions using passed credentials.
  */
 static int
 vn_access(vp, user_flags, cred, td)
 	struct vnode	*vp;
 	int		user_flags;
 	struct ucred	*cred;
 	struct thread	*td;
 {
 	int error, flags;
 
 	/* Flags == 0 means only check for existence. */
 	error = 0;
 	if (user_flags) {
 		flags = 0;
 		if (user_flags & R_OK)
 			flags |= VREAD;
 		if (user_flags & W_OK)
 			flags |= VWRITE;
 		if (user_flags & X_OK)
 			flags |= VEXEC;
 #ifdef MAC
 		error = mac_check_vnode_access(cred, vp, flags);
 		if (error)
 			return (error);
 #endif
 		if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
 			error = VOP_ACCESS(vp, flags, cred, td);
 	}
 	return (error);
 }
 
 /*
  * Check access permissions using "real" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct access_args {
 	char	*path;
 	int	flags;
 };
 #endif
 int
 access(td, uap)
 	struct thread *td;
 	register struct access_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 
 	return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags));
 }
 
 int
 kern_access(struct thread *td, char *path, enum uio_seg pathseg, int flags)
 {
 	struct ucred *cred, *tmpcred;
 	register struct vnode *vp;
 	struct nameidata nd;
 	int vfslocked;
 	int error;
 
 	/*
 	 * Create and modify a temporary credential instead of one that
 	 * is potentially shared.  This could also mess up socket
 	 * buffer accounting which can run in an interrupt context.
 	 */
 	cred = td->td_ucred;
 	tmpcred = crdup(cred);
 	tmpcred->cr_uid = cred->cr_ruid;
 	tmpcred->cr_groups[0] = cred->cr_rgid;
 	td->td_ucred = tmpcred;
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		goto out1;
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 
 	error = vn_access(vp, flags, tmpcred, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 out1:
 	td->td_ucred = cred;
 	crfree(tmpcred);
 	return (error);
 }
 
 /*
  * Check access permissions using "effective" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct eaccess_args {
 	char	*path;
 	int	flags;
 };
 #endif
 int
 eaccess(td, uap)
 	struct thread *td;
 	register struct eaccess_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 
 	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags));
 }
 
 int
 kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags)
 {
 	struct nameidata nd;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	vfslocked = NDHASGIANT(&nd);
 	error = vn_access(vp, flags, td->td_ucred, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Get file status; this version follows links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ostat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 ostat(td, uap)
 	struct thread *td;
 	register struct ostat_args /* {
 		char *path;
 		struct ostat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	cvtstat(&sb, &osb);
 	error = copyout(&osb, uap->ub, sizeof (osb));
 	return (error);
 }
 
 /*
  * Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olstat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 olstat(td, uap)
 	struct thread *td;
 	register struct olstat_args /* {
 		char *path;
 		struct ostat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	cvtstat(&sb, &osb);
 	error = copyout(&osb, uap->ub, sizeof (osb));
 	return (error);
 }
 
 /*
  * Convert from an old to a new stat structure.
  */
 void
 cvtstat(st, ost)
 	struct stat *st;
 	struct ostat *ost;
 {
 
 	ost->st_dev = st->st_dev;
 	ost->st_ino = st->st_ino;
 	ost->st_mode = st->st_mode;
 	ost->st_nlink = st->st_nlink;
 	ost->st_uid = st->st_uid;
 	ost->st_gid = st->st_gid;
 	ost->st_rdev = st->st_rdev;
 	if (st->st_size < (quad_t)1 << 32)
 		ost->st_size = st->st_size;
 	else
 		ost->st_size = -2;
 	ost->st_atime = st->st_atime;
 	ost->st_mtime = st->st_mtime;
 	ost->st_ctime = st->st_ctime;
 	ost->st_blksize = st->st_blksize;
 	ost->st_blocks = st->st_blocks;
 	ost->st_flags = st->st_flags;
 	ost->st_gen = st->st_gen;
 }
 #endif /* COMPAT_43 */
 
 /*
  * Get file status; this version follows links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct stat_args {
 	char	*path;
 	struct stat *ub;
 };
 #endif
 int
 stat(td, uap)
 	struct thread *td;
 	register struct stat_args /* {
 		char *path;
 		struct stat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error == 0)
 		error = copyout(&sb, uap->ub, sizeof (sb));
 	return (error);
 }
 
 int
 kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
 {
 	struct nameidata nd;
 	struct stat sb;
 	int error, vfslocked;
 
 	NDINIT(&nd, LOOKUP,
 	    FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (mtx_owned(&Giant))
 		printf("stat(%d): %s\n", vfslocked, path);
 	if (error)
 		return (error);
 	*sbp = sb;
 	return (0);
 }
 
 /*
  * Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lstat_args {
 	char	*path;
 	struct stat *ub;
 };
 #endif
 int
 lstat(td, uap)
 	struct thread *td;
 	register struct lstat_args /* {
 		char *path;
 		struct stat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error == 0)
 		error = copyout(&sb, uap->ub, sizeof (sb));
 	return (error);
 }
 
 int
 kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
 {
 	struct vnode *vp;
 	struct stat sb;
 	struct nameidata nd;
 	int error, vfslocked;
 
 	NDINIT(&nd, LOOKUP,
 	    NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
 		return (error);
 	*sbp = sb;
 	return (0);
 }
 
 /*
  * Implementation of the NetBSD [l]stat() functions.
  */
 void
 cvtnstat(sb, nsb)
 	struct stat *sb;
 	struct nstat *nsb;
 {
 	bzero(nsb, sizeof *nsb);
 	nsb->st_dev = sb->st_dev;
 	nsb->st_ino = sb->st_ino;
 	nsb->st_mode = sb->st_mode;
 	nsb->st_nlink = sb->st_nlink;
 	nsb->st_uid = sb->st_uid;
 	nsb->st_gid = sb->st_gid;
 	nsb->st_rdev = sb->st_rdev;
 	nsb->st_atimespec = sb->st_atimespec;
 	nsb->st_mtimespec = sb->st_mtimespec;
 	nsb->st_ctimespec = sb->st_ctimespec;
 	nsb->st_size = sb->st_size;
 	nsb->st_blocks = sb->st_blocks;
 	nsb->st_blksize = sb->st_blksize;
 	nsb->st_flags = sb->st_flags;
 	nsb->st_gen = sb->st_gen;
 	nsb->st_birthtimespec = sb->st_birthtimespec;
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct nstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 int
 nstat(td, uap)
 	struct thread *td;
 	register struct nstat_args /* {
 		char *path;
 		struct nstat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	cvtnstat(&sb, &nsb);
 	error = copyout(&nsb, uap->ub, sizeof (nsb));
 	return (error);
 }
 
 /*
  * NetBSD lstat.  Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lstat_args {
 	char	*path;
 	struct stat *ub;
 };
 #endif
 int
 nlstat(td, uap)
 	struct thread *td;
 	register struct nlstat_args /* {
 		char *path;
 		struct nstat *ub;
 	} */ *uap;
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
 	if (error)
 		return (error);
 	cvtnstat(&sb, &nsb);
 	error = copyout(&nsb, uap->ub, sizeof (nsb));
 	return (error);
 }
 
 /*
  * Get configurable pathname variables.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pathconf_args {
 	char	*path;
 	int	name;
 };
 #endif
 int
 pathconf(td, uap)
 	struct thread *td;
 	register struct pathconf_args /* {
 		char *path;
 		int name;
 	} */ *uap;
 {
 
 	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name));
 }
 
 int
 kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name)
 {
 	struct nameidata nd;
 	int error, vfslocked;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	/* If asynchronous I/O is available, it works for all files. */
 	if (name == _PC_ASYNC_IO)
 		td->td_retval[0] = async_io_version;
 	else
 		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
 	vput(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct readlink_args {
 	char	*path;
 	char	*buf;
 	int	count;
 };
 #endif
 int
 readlink(td, uap)
 	struct thread *td;
 	register struct readlink_args /* {
 		char *path;
 		char *buf;
 		int count;
 	} */ *uap;
 {
 
 	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
 	    UIO_USERSPACE, uap->count));
 }
 
 int
 kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
     enum uio_seg bufseg, int count)
 {
 	register struct vnode *vp;
 	struct iovec aiov;
 	struct uio auio;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 #ifdef MAC
 	error = mac_check_vnode_readlink(td->td_ucred, vp);
 	if (error) {
 		vput(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 #endif
 	if (vp->v_type != VLNK)
 		error = EINVAL;
 	else {
 		aiov.iov_base = buf;
 		aiov.iov_len = count;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = bufseg;
 		auio.uio_td = td;
 		auio.uio_resid = count;
 		error = VOP_READLINK(vp, &auio, td->td_ucred);
 	}
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	td->td_retval[0] = count - auio.uio_resid;
 	return (error);
 }
 
 /*
  * Common implementation code for chflags() and fchflags().
  */
 static int
 setfflags(td, vp, flags)
 	struct thread *td;
 	struct vnode *vp;
 	int flags;
 {
 	int error;
 	struct mount *mp;
 	struct vattr vattr;
 
 	/*
 	 * Prevent non-root users from setting flags on devices.  When
 	 * a device is reused, users can retain ownership of the device
 	 * if they are allowed to set flags and programs assume that
 	 * chown can't fail when done as root.
 	 */
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		error = priv_check_cred(td->td_ucred, PRIV_VFS_CHFLAGS_DEV,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			return (error);
 	}
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	VATTR_NULL(&vattr);
 	vattr.va_flags = flags;
 #ifdef MAC
 	error = mac_check_vnode_setflags(td->td_ucred, vp, vattr.va_flags);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change flags of a file given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chflags_args {
 	char	*path;
 	int	flags;
 };
 #endif
 int
 chflags(td, uap)
 	struct thread *td;
 	register struct chflags_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(fflags, uap->flags);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vfslocked = NDHASGIANT(&nd);
 	error = setfflags(td, nd.ni_vp, uap->flags);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Same as chflags() but doesn't follow symlinks.
  */
 int
 lchflags(td, uap)
 	struct thread *td;
 	register struct lchflags_args /* {
 		char *path;
 		int flags;
 	} */ *uap;
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(fflags, uap->flags);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfflags(td, nd.ni_vp, uap->flags);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Change flags of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchflags_args {
 	int	fd;
 	int	flags;
 };
 #endif
 int
 fchflags(td, uap)
 	struct thread *td;
 	register struct fchflags_args /* {
 		int fd;
 		int flags;
 	} */ *uap;
 {
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(fflags, uap->flags);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
 	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
 	VOP_UNLOCK(fp->f_vnode, 0, td);
 #endif
 	error = setfflags(td, fp->f_vnode, uap->flags);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for chmod(), lchmod() and fchmod().
  */
 static int
 setfmode(td, vp, mode)
 	struct thread *td;
 	struct vnode *vp;
 	int mode;
 {
 	int error;
 	struct mount *mp;
 	struct vattr vattr;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	VATTR_NULL(&vattr);
 	vattr.va_mode = mode & ALLPERMS;
 #ifdef MAC
 	error = mac_check_vnode_setmode(td->td_ucred, vp, vattr.va_mode);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change mode of a file given path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 chmod(td, uap)
 	struct thread *td;
 	register struct chmod_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 int
 kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, mode);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfmode(td, nd.ni_vp, mode);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Change mode of a file given path name (don't follow links.)
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 lchmod(td, uap)
 	struct thread *td;
 	register struct lchmod_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, (mode_t)uap->mode);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfmode(td, nd.ni_vp, uap->mode);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Change mode of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchmod_args {
 	int	fd;
 	int	mode;
 };
 #endif
 int
 fchmod(td, uap)
 	struct thread *td;
 	register struct fchmod_args /* {
 		int fd;
 		int mode;
 	} */ *uap;
 {
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(mode, uap->mode);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
 	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
 	VOP_UNLOCK(fp->f_vnode, 0, td);
 #endif
 	error = setfmode(td, fp->f_vnode, uap->mode);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation for chown(), lchown(), and fchown()
  */
 static int
 setfown(td, vp, uid, gid)
 	struct thread *td;
 	struct vnode *vp;
 	uid_t uid;
 	gid_t gid;
 {
 	int error;
 	struct mount *mp;
 	struct vattr vattr;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	VATTR_NULL(&vattr);
 	vattr.va_uid = uid;
 	vattr.va_gid = gid;
 #ifdef MAC
 	error = mac_check_vnode_setowner(td->td_ucred, vp, vattr.va_uid,
 	    vattr.va_gid);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set ownership given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 chown(td, uap)
 	struct thread *td;
 	register struct chown_args /* {
 		char *path;
 		int uid;
 		int gid;
 	} */ *uap;
 {
 
 	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
 }
 
 int
 kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
     int gid)
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(owner, uid, gid);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfown(td, nd.ni_vp, uid, gid);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Set ownership given a path name, do not cross symlinks.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 lchown(td, uap)
 	struct thread *td;
 	register struct lchown_args /* {
 		char *path;
 		int uid;
 		int gid;
 	} */ *uap;
 {
 
 	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
 }
 
 int
 kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
     int gid)
 {
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(owner, uid, gid);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfown(td, nd.ni_vp, uid, gid);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Set ownership given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchown_args {
 	int	fd;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 fchown(td, uap)
 	struct thread *td;
 	register struct fchown_args /* {
 		int fd;
 		int uid;
 		int gid;
 	} */ *uap;
 {
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	AUDIT_ARG(owner, uap->uid, uap->gid);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
 	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
 	VOP_UNLOCK(fp->f_vnode, 0, td);
 #endif
 	error = setfown(td, fp->f_vnode, uap->uid, uap->gid);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), and futimes().
  */
 static int
 getutimes(usrtvp, tvpseg, tsp)
 	const struct timeval *usrtvp;
 	enum uio_seg tvpseg;
 	struct timespec *tsp;
 {
 	struct timeval tv[2];
 	const struct timeval *tvp;
 	int error;
 
 	if (usrtvp == NULL) {
 		microtime(&tv[0]);
 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
 		tsp[1] = tsp[0];
 	} else {
 		if (tvpseg == UIO_SYSSPACE) {
 			tvp = usrtvp;
 		} else {
 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
 				return (error);
 			tvp = tv;
 		}
 
 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
 			return (EINVAL);
 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
 	}
 	return (0);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), and futimes().
  */
 static int
 setutimes(td, vp, ts, numtimes, nullflag)
 	struct thread *td;
 	struct vnode *vp;
 	const struct timespec *ts;
 	int numtimes;
 	int nullflag;
 {
 	int error, setbirthtime;
 	struct mount *mp;
 	struct vattr vattr;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	setbirthtime = 0;
 	if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 &&
 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
 		setbirthtime = 1;
 	VATTR_NULL(&vattr);
 	vattr.va_atime = ts[0];
 	vattr.va_mtime = ts[1];
 	if (setbirthtime)
 		vattr.va_birthtime = ts[1];
 	if (numtimes > 2)
 		vattr.va_birthtime = ts[2];
 	if (nullflag)
 		vattr.va_vaflags |= VA_UTIMES_NULL;
 #ifdef MAC
 	error = mac_check_vnode_setutimes(td->td_ucred, vp, vattr.va_atime,
 	    vattr.va_mtime);
 #endif
 	if (error == 0)
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct utimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 utimes(td, uap)
 	struct thread *td;
 	register struct utimes_args /* {
 		char *path;
 		struct timeval *tptr;
 	} */ *uap;
 {
 
 	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 	    UIO_USERSPACE));
 }
 
 int
 kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lutimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 lutimes(td, uap)
 	struct thread *td;
 	register struct lutimes_args /* {
 		char *path;
 		struct timeval *tptr;
 	} */ *uap;
 {
 
 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 	    UIO_USERSPACE));
 }
 
 int
 kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct futimes_args {
 	int	fd;
 	struct	timeval *tptr;
 };
 #endif
 int
 futimes(td, uap)
 	struct thread *td;
 	register struct futimes_args /* {
 		int  fd;
 		struct timeval *tptr;
 	} */ *uap;
 {
 
 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
 }
 
 int
 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
     enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, fd);
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
 	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
 	VOP_UNLOCK(fp->f_vnode, 0, td);
 #endif
 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Truncate a file given its path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct truncate_args {
 	char	*path;
 	int	pad;
 	off_t	length;
 };
 #endif
 int
 truncate(td, uap)
 	struct thread *td;
 	register struct truncate_args /* {
 		char *path;
 		int pad;
 		off_t length;
 	} */ *uap;
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 
 int
 kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct vattr vattr;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	if (length < 0)
 		return(EINVAL);
 	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (vp->v_type == VDIR)
 		error = EISDIR;
 #ifdef MAC
 	else if ((error = mac_check_vnode_write(td->td_ucred, NOCRED, vp))) {
 	}
 #endif
 	else if ((error = vn_writechk(vp)) == 0 &&
 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred, td);
 	}
 	vput(vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Truncate a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ftruncate_args {
 	int	fd;
 	int	pad;
 	off_t	length;
 };
 #endif
 int
 ftruncate(td, uap)
 	struct thread *td;
 	register struct ftruncate_args /* {
 		int fd;
 		int pad;
 		off_t length;
 	} */ *uap;
 {
 	struct mount *mp;
 	struct vattr vattr;
 	struct vnode *vp;
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	if (uap->length < 0)
 		return(EINVAL);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	if ((fp->f_flag & FWRITE) == 0) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	if (vp->v_type == VDIR)
 		error = EISDIR;
 #ifdef MAC
 	else if ((error = mac_check_vnode_write(td->td_ucred, fp->f_cred,
 	    vp))) {
 	}
 #endif
 	else if ((error = vn_writechk(vp)) == 0) {
 		VATTR_NULL(&vattr);
 		vattr.va_size = uap->length;
 		error = VOP_SETATTR(vp, &vattr, fp->f_cred, td);
 	}
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 drop:
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Truncate a file given its path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct otruncate_args {
 	char	*path;
 	long	length;
 };
 #endif
 int
 otruncate(td, uap)
 	struct thread *td;
 	register struct otruncate_args /* {
 		char *path;
 		long length;
 	} */ *uap;
 {
 	struct truncate_args /* {
 		char *path;
 		int pad;
 		off_t length;
 	} */ nuap;
 
 	nuap.path = uap->path;
 	nuap.length = uap->length;
 	return (truncate(td, &nuap));
 }
 
 /*
  * Truncate a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct oftruncate_args {
 	int	fd;
 	long	length;
 };
 #endif
 int
 oftruncate(td, uap)
 	struct thread *td;
 	register struct oftruncate_args /* {
 		int fd;
 		long length;
 	} */ *uap;
 {
 	struct ftruncate_args /* {
 		int fd;
 		int pad;
 		off_t length;
 	} */ nuap;
 
 	nuap.fd = uap->fd;
 	nuap.length = uap->length;
 	return (ftruncate(td, &nuap));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Sync an open file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fsync_args {
 	int	fd;
 };
 #endif
 int
 fsync(td, uap)
 	struct thread *td;
 	struct fsync_args /* {
 		int fd;
 	} */ *uap;
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	int vfslocked;
 	int error;
 
 	AUDIT_ARG(fd, uap->fd);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_LOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_UNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 drop:
 	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
- * Rename files.  Source and destination must either both be directories,
- * or both not be directories.  If target is a directory, it must be empty.
+ * Rename files.  Source and destination must either both be directories, or
+ * both not be directories.  If target is a directory, it must be empty.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rename_args {
 	char	*from;
 	char	*to;
 };
 #endif
 int
 rename(td, uap)
 	struct thread *td;
 	register struct rename_args /* {
 		char *from;
 		char *to;
 	} */ *uap;
 {
 
 	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
 }
 
 int
 kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
 {
 	struct mount *mp = NULL;
 	struct vnode *tvp, *fvp, *tdvp;
 	struct nameidata fromnd, tond;
 	int tvfslocked;
 	int fvfslocked;
 	int error;
 
 	bwillwrite();
 #ifdef MAC
 	NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE |
 	    AUDITVNODE1, pathseg, from, td);
 #else
 	NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
 	    AUDITVNODE1, pathseg, from, td);
 #endif
 	if ((error = namei(&fromnd)) != 0)
 		return (error);
 	fvfslocked = NDHASGIANT(&fromnd);
 	tvfslocked = 0;
 #ifdef MAC
 	error = mac_check_vnode_rename_from(td->td_ucred, fromnd.ni_dvp,
 	    fromnd.ni_vp, &fromnd.ni_cnd);
 	VOP_UNLOCK(fromnd.ni_dvp, 0, td);
 	if (fromnd.ni_dvp != fromnd.ni_vp)
 		VOP_UNLOCK(fromnd.ni_vp, 0, td);
 #endif
 	fvp = fromnd.ni_vp;
 	if (error == 0)
 		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
 	if (error != 0) {
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		goto out1;
 	}
 	NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART |
 	    MPSAFE | AUDITVNODE2, pathseg, to, td);
 	if (fromnd.ni_vp->v_type == VDIR)
 		tond.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&tond)) != 0) {
 		/* Translate error code for rename("dir1", "dir2/."). */
 		if (error == EISDIR && fvp->v_type == VDIR)
 			error = EINVAL;
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		vn_finished_write(mp);
 		goto out1;
 	}
 	tvfslocked = NDHASGIANT(&tond);
 	tdvp = tond.ni_dvp;
 	tvp = tond.ni_vp;
 	if (tvp != NULL) {
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto out;
 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
 			error = EISDIR;
 			goto out;
 		}
 	}
 	if (fvp == tdvp)
 		error = EINVAL;
 	/*
 	 * If the source is the same as the destination (that is, if they
 	 * are links to the same vnode), then there is nothing to do.
 	 */
 	if (fvp == tvp)
 		error = -1;
 #ifdef MAC
 	else
 		error = mac_check_vnode_rename_to(td->td_ucred, tdvp,
 		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
 #endif
 out:
 	if (!error) {
 		VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE);
 		if (fromnd.ni_dvp != tdvp) {
 			VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 		}
 		if (tvp) {
 			VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE);
 		}
 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
 				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 	} else {
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 		if (tvp)
 			vput(tvp);
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 	}
 	vrele(tond.ni_startdir);
 	vn_finished_write(mp);
 out1:
 	if (fromnd.ni_startdir)
 		vrele(fromnd.ni_startdir);
 	VFS_UNLOCK_GIANT(fvfslocked);
 	VFS_UNLOCK_GIANT(tvfslocked);
 	if (error == -1)
 		return (0);
 	return (error);
 }
 
 /*
  * Make a directory file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkdir_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 mkdir(td, uap)
 	struct thread *td;
 	register struct mkdir_args /* {
 		char *path;
 		int mode;
 	} */ *uap;
 {
 
 	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 int
 kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct vattr vattr;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	AUDIT_ARG(mode, mode);
 restart:
 	bwillwrite();
 	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    segflg, path, td);
 	nd.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		/*
 		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
 		 * the strange behaviour of leaving the vnode unlocked
 		 * if the target is the same vnode as the parent.
 		 */
 		if (vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VDIR;
 	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
 	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
 #ifdef MAC
 	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error)
 		goto out;
 #endif
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 #ifdef MAC
 out:
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	if (!error)
 		vput(nd.ni_vp);
 	vn_finished_write(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Remove a directory file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rmdir_args {
 	char	*path;
 };
 #endif
 int
 rmdir(td, uap)
 	struct thread *td;
 	struct rmdir_args /* {
 		char *path;
 	} */ *uap;
 {
 
 	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
 }
 
 int
 kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 restart:
 	bwillwrite();
 	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 	/*
 	 * No rmdir "." please.
 	 */
 	if (nd.ni_dvp == vp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The root of a mounted filesystem cannot be deleted.
 	 */
 	if (vp->v_vflag & VV_ROOT) {
 		error = EBUSY;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp,
 	    &nd.ni_cnd);
 	if (error)
 		goto out;
 #endif
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(vp);
 		if (nd.ni_dvp == vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 	vn_finished_write(mp);
 out:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	if (nd.ni_dvp == vp)
 		vrele(nd.ni_dvp);
 	else
 		vput(nd.ni_dvp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /*
  * Read a block of directory entries in a filesystem independent format.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ogetdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 ogetdirentries(td, uap)
 	struct thread *td;
 	register struct ogetdirentries_args /* {
 		int fd;
 		char *buf;
 		u_int count;
 		long *basep;
 	} */ *uap;
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio, kuio;
 	struct iovec aiov, kiov;
 	struct dirent *dp, *edp;
 	caddr_t dirbuf;
 	int error, eofflag, readcnt, vfslocked;
 	long loff;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	if (uap->count > 64 * 1024)
 		return (EINVAL);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = uap->count;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	loff = auio.uio_offset = fp->f_offset;
 #ifdef MAC
 	error = mac_check_vnode_readdir(td->td_ucred, vp);
 	if (error) {
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 #	if (BYTE_ORDER != LITTLE_ENDIAN)
 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
 			    NULL, NULL);
 			fp->f_offset = auio.uio_offset;
 		} else
 #	endif
 	{
 		kuio = auio;
 		kuio.uio_iov = &kiov;
 		kuio.uio_segflg = UIO_SYSSPACE;
 		kiov.iov_len = uap->count;
 		MALLOC(dirbuf, caddr_t, uap->count, M_TEMP, M_WAITOK);
 		kiov.iov_base = dirbuf;
 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
 			    NULL, NULL);
 		fp->f_offset = kuio.uio_offset;
 		if (error == 0) {
 			readcnt = uap->count - kuio.uio_resid;
 			edp = (struct dirent *)&dirbuf[readcnt];
 			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
 #				if (BYTE_ORDER == LITTLE_ENDIAN)
 					/*
 					 * The expected low byte of
 					 * dp->d_namlen is our dp->d_type.
 					 * The high MBZ byte of dp->d_namlen
 					 * is our dp->d_namlen.
 					 */
 					dp->d_type = dp->d_namlen;
 					dp->d_namlen = 0;
 #				else
 					/*
 					 * The dp->d_type is the high byte
 					 * of the expected dp->d_namlen,
 					 * so must be zero'ed.
 					 */
 					dp->d_type = 0;
 #				endif
 				if (dp->d_reclen > 0) {
 					dp = (struct dirent *)
 					    ((char *)dp + dp->d_reclen);
 				} else {
 					error = EIO;
 					break;
 				}
 			}
 			if (dp >= edp)
 				error = uiomove(dirbuf, readcnt, &auio);
 		}
 		FREE(dirbuf, M_TEMP);
 	}
 	if (error) {
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (error);
 	}
 	if (uap->count == auio.uio_resid &&
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
 		fp->f_offset = 0;
 		vput(tvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	error = copyout(&loff, uap->basep, sizeof(long));
 	fdrop(fp, td);
 	td->td_retval[0] = uap->count - auio.uio_resid;
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Read a block of directory entries in a filesystem independent format.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 getdirentries(td, uap)
 	struct thread *td;
 	register struct getdirentries_args /* {
 		int fd;
 		char *buf;
 		u_int count;
 		long *basep;
 	} */ *uap;
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	int vfslocked;
 	long loff;
 	int error, eofflag;
 
 	AUDIT_ARG(fd, uap->fd);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		error = EINVAL;
 		goto fail;
 	}
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 	auio.uio_resid = uap->count;
 	/* vn_lock(vp, LK_SHARED | LK_RETRY, td); */
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	loff = auio.uio_offset = fp->f_offset;
 #ifdef MAC
 	error = mac_check_vnode_readdir(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
 	fp->f_offset = auio.uio_offset;
 	if (error) {
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto fail;
 	}
 	if (uap->count == auio.uio_resid &&
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
 		fp->f_offset = 0;
 		vput(tvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (uap->basep != NULL) {
 		error = copyout(&loff, uap->basep, sizeof(long));
 	}
 	td->td_retval[0] = uap->count - auio.uio_resid;
 fail:
 	fdrop(fp, td);
 	return (error);
 }
+
 #ifndef _SYS_SYSPROTO_H_
 struct getdents_args {
 	int fd;
 	char *buf;
 	size_t count;
 };
 #endif
 int
 getdents(td, uap)
 	struct thread *td;
 	register struct getdents_args /* {
 		int fd;
 		char *buf;
 		u_int count;
 	} */ *uap;
 {
 	struct getdirentries_args ap;
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	return (getdirentries(td, &ap));
 }
 
 /*
  * Set the mode mask for creation of filesystem nodes.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct umask_args {
 	int	newmask;
 };
 #endif
 int
 umask(td, uap)
 	struct thread *td;
 	struct umask_args /* {
 		int newmask;
 	} */ *uap;
 {
 	register struct filedesc *fdp;
 
 	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
 	fdp = td->td_proc->p_fd;
 	td->td_retval[0] = fdp->fd_cmask;
 	fdp->fd_cmask = uap->newmask & ALLPERMS;
 	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
 	return (0);
 }
 
 /*
- * Void all references to file by ripping underlying filesystem
- * away from vnode.
+ * Void all references to file by ripping underlying filesystem away from
+ * vnode.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct revoke_args {
 	char	*path;
 };
 #endif
 int
 revoke(td, uap)
 	struct thread *td;
 	register struct revoke_args /* {
 		char *path;
 	} */ *uap;
 {
 	struct vnode *vp;
 	struct vattr vattr;
 	int error;
 	struct nameidata nd;
 	int vfslocked;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (vp->v_type != VCHR) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_check_vnode_revoke(td->td_ucred, vp);
 	if (error)
 		goto out;
 #endif
 	error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 	if (error)
 		goto out;
 	if (td->td_ucred->cr_uid != vattr.va_uid) {
 		error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN,
 		    SUSER_ALLOWJAIL);
 		if (error)
 			goto out;
 	}
 	if (vcount(vp) > 1)
 		VOP_REVOKE(vp, REVOKEALL);
 out:
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Convert a user file descriptor to a kernel file entry.
  * A reference on the file entry is held upon returning.
  */
 int
 getvnode(fdp, fd, fpp)
 	struct filedesc *fdp;
 	int fd;
 	struct file **fpp;
 {
 	int error;
 	struct file *fp;
 
 	fp = NULL;
 	if (fdp == NULL)
 		error = EBADF;
 	else {
 		FILEDESC_LOCK(fdp);
 		if ((u_int)fd >= fdp->fd_nfiles ||
 		    (fp = fdp->fd_ofiles[fd]) == NULL)
 			error = EBADF;
 		else if (fp->f_vnode == NULL) {
 			fp = NULL;
 			error = EINVAL;
 		} else {
 			fhold(fp);
 			error = 0;
 		}
 		FILEDESC_UNLOCK(fdp);
 	}
 	*fpp = fp;
 	return (error);
 }
 
 /*
- * Get (NFS) file handle
+ * Get an (NFS) file handle.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lgetfh_args {
 	char	*fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 lgetfh(td, uap)
 	struct thread *td;
 	register struct lgetfh_args *uap;
 {
 	struct nameidata nd;
 	fhandle_t fh;
 	register struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error)
 		return (error);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->fname, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
 		return (error);
 	error = copyout(&fh, uap->fhp, sizeof (fh));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getfh_args {
 	char	*fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 getfh(td, uap)
 	struct thread *td;
 	register struct getfh_args *uap;
 {
 	struct nameidata nd;
 	fhandle_t fh;
 	register struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error)
 		return (error);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->fname, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
 		return (error);
 	error = copyout(&fh, uap->fhp, sizeof (fh));
 	return (error);
 }
 
 /*
  * syscall for the rpc.lockd to use to translate a NFS file handle into an
  * open descriptor.
  *
  * warning: do not remove the priv_check() call or this becomes one giant
  * security hole.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhopen_args {
 	const struct fhandle *u_fhp;
 	int flags;
 };
 #endif
 int
 fhopen(td, uap)
 	struct thread *td;
 	struct fhopen_args /* {
 		const struct fhandle *u_fhp;
 		int flags;
 	} */ *uap;
 {
 	struct proc *p = td->td_proc;
 	struct mount *mp;
 	struct vnode *vp;
 	struct fhandle fhp;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	struct flock lf;
 	struct file *fp;
 	register struct filedesc *fdp = p->p_fd;
 	int fmode, mode, error, type;
 	struct file *nfp;
 	int vfslocked;
 	int indx;
 
 	error = priv_check(td, PRIV_VFS_FHOPEN);
 	if (error)
 		return (error);
 	fmode = FFLAGS(uap->flags);
 	/* why not allow a non-read/write open for our lockd? */
 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
 		return (EINVAL);
 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
 	if (error)
 		return(error);
 	/* find the mount point */
 	mp = vfs_getvfs(&fhp.fh_fsid);
 	if (mp == NULL)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
 	if (error)
 		goto out;
 	/*
 	 * from now on we have to make sure not
 	 * to forget about the vnode
 	 * any error that causes an abort must vput(vp)
 	 * just set error = err and 'goto bad;'.
 	 */
 
 	/*
 	 * from vn_open
 	 */
 	if (vp->v_type == VLNK) {
 		error = EMLINK;
 		goto bad;
 	}
 	if (vp->v_type == VSOCK) {
 		error = EOPNOTSUPP;
 		goto bad;
 	}
 	mode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
 		if (vp->v_type == VDIR) {
 			error = EISDIR;
 			goto bad;
 		}
 		error = vn_writechk(vp);
 		if (error)
 			goto bad;
 		mode |= VWRITE;
 	}
 	if (fmode & FREAD)
 		mode |= VREAD;
 	if (fmode & O_APPEND)
 		mode |= VAPPEND;
 #ifdef MAC
 	error = mac_check_vnode_open(td->td_ucred, vp, mode);
 	if (error)
 		goto bad;
 #endif
 	if (mode) {
 		error = VOP_ACCESS(vp, mode, td->td_ucred, td);
 		if (error)
 			goto bad;
 	}
 	if (fmode & O_TRUNC) {
 		VOP_UNLOCK(vp, 0, td);				/* XXX */
 		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
 			vrele(vp);
 			goto out;
 		}
 		VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);	/* XXX */
 #ifdef MAC
 		/*
 		 * We don't yet have fp->f_cred, so use td->td_ucred, which
 		 * should be right.
 		 */
 		error = mac_check_vnode_write(td->td_ucred, td->td_ucred, vp);
 		if (error == 0) {
 #endif
 			VATTR_NULL(vap);
 			vap->va_size = 0;
 			error = VOP_SETATTR(vp, vap, td->td_ucred, td);
 #ifdef MAC
 		}
 #endif
 		vn_finished_write(mp);
 		if (error)
 			goto bad;
 	}
 	error = VOP_OPEN(vp, fmode, td->td_ucred, td, -1);
 	if (error)
 		goto bad;
 
 	if (fmode & FWRITE)
 		vp->v_writecount++;
 
 	/*
 	 * end of vn_open code
 	 */
 
 	if ((error = falloc(td, &nfp, &indx)) != 0) {
 		if (fmode & FWRITE)
 			vp->v_writecount--;
 		goto bad;
 	}
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	fp = nfp;
 
 	nfp->f_vnode = vp;
 	nfp->f_data = vp;
 	nfp->f_flag = fmode & FMASK;
 	nfp->f_ops = &vnops;
 	nfp->f_type = DTYPE_VNODE;
 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		if (fmode & O_EXLOCK)
 			lf.l_type = F_WRLCK;
 		else
 			lf.l_type = F_RDLCK;
 		type = F_FLOCK;
 		if ((fmode & FNONBLOCK) == 0)
 			type |= F_WAIT;
 		VOP_UNLOCK(vp, 0, td);
 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 			    type)) != 0) {
 			/*
 			 * The lock request failed.  Normally close the
 			 * descriptor but handle the case where someone might
 			 * have dup()d or close()d it when we weren't looking.
 			 */
 			fdclose(fdp, fp, indx, td);
 
 			/*
 			 * release our private reference
 			 */
 			fdrop(fp, td);
 			goto out;
 		}
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 		fp->f_flag |= FHASLOCK;
 	}
 
 	VOP_UNLOCK(vp, 0, td);
 	fdrop(fp, td);
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	td->td_retval[0] = indx;
 	return (0);
 
 bad:
 	vput(vp);
 out:
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Stat an (NFS) file handle.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstat_args {
 	struct fhandle *u_fhp;
 	struct stat *sb;
 };
 #endif
 int
 fhstat(td, uap)
 	struct thread *td;
 	register struct fhstat_args /* {
 		struct fhandle *u_fhp;
 		struct stat *sb;
 	} */ *uap;
 {
 	struct stat sb;
 	fhandle_t fh;
 	struct mount *mp;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTAT);
 	if (error)
 		return (error);
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) {
 		vfs_rel(mp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
 	vput(vp);
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
 		return (error);
 	error = copyout(&sb, uap->sb, sizeof(sb));
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct statfs *buf;
 };
 #endif
 int
 fhstatfs(td, uap)
 	struct thread *td;
 	struct fhstatfs_args /* {
 		struct fhandle *u_fhp;
 		struct statfs *buf;
 	} */ *uap;
 {
 	struct statfs sf;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
 	error = kern_fhstatfs(td, fh, &sf);
 	if (error)
 		return (error);
 	return (copyout(&sf, uap->buf, sizeof(sf)));
 }
 
 int
 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
 {
 	struct statfs *sp;
 	struct mount *mp;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTATFS);
 	if (error)
 		return (error);
 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, &vp);
 	if (error) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfs_rel(mp);
 		return (error);
 	}
 	vput(vp);
 	error = prison_canseemount(td->td_ucred, mp);
 	if (error)
 		goto out;
 #ifdef MAC
 	error = mac_check_mount_stat(td->td_ucred, mp);
 	if (error)
 		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
 	 */
 	sp = &mp->mnt_stat;
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
 	if (error == 0)
 		*buf = *sp;
 out:
 	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }